Как ускорить обучение / оценку модели данных?

Я использовал caret::train для обучения 11 моделей из набора данных с 32000 наблюдений и 250 переменными. Я использую R на машине под Linux.

Программное обеспечение сильно замедлилось. Я сохранил все модели в списке и записал на диск, заняв около 1,3 Гб.

У кого-нибудь есть опыт создания / использования RAM диска с R? Повлияет ли RAM-диск на производительность?

Я предполагаю, что я достигаю предела подкачки памяти. Любые другие предложения о том, как ускорить R?

Я хотел бы закончить команды прогнозирования / производительности, вместе с графиком тестирования ROC и табулированием производительности ROC для 11 моделей. Кроме того, я хотел бы запустить процесс для нескольких других зависимых переменных.

Я считаю, что есть 80 Гб оперативной памяти и 7 ядер. При обучении моделей я использовал параллельную обработку на 6 ядрах, но обучение моделей случайного леса всё равно заняло несколько часов.

Код похож на gripComp из этого примера кода, за исключением того, что cor используется для удаления независимой переменной, имеющей корреляцию выше 0,9 до обучения. У меня нет набора данных для обмена, но есть около 30000 obs и 250 независимых переменных и около 20 зависимых переменных, но я обрабатываю только зависимые переменные по одной за раз.

Псевдокод. Основное различие заключается в том, что модели сохраняются на диск по мере их расчета, а диск проверяется на предмет того, рассчитывалась ли модель ранее. Кроме того, у меня включен многоядерный, и это не указано в этом примере кода. Обучение заняло около 30 часов с 6 ядрами. Я подождал около 3 часов команд предсказания / производительности, а затем пошел домой.

#' Train a panel of 11 caret classifiers and tabulate test-set ROC performance.
#'
#' Splits `df` into train/test, trains 11 model types on the independent
#' columns against the `cName` column, saves the fitted models to
#' "models.RData", writes a combined ROC plot to `oplot`, and writes a
#' per-model confusion/AUC table to `ostats`.
#'
#' @param df          Data frame: independent variables plus the class column.
#' @param cName       Name of the dependent (class) column in `df`.
#' @param l           `tuneLength` passed to caret::train.
#' @param m           Resampling method for caret::trainControl (e.g. "cv").
#' @param n           Unused; kept for backward compatibility.
#' @param ostats      Output path for the performance table.
#' @param oplot       Output path for the ROC plot (PNG).
#' @param pw, ph      Plot width/height in pixels.
#' @param plty        Line type for the ROC curves.
#' @param secAllowed  Unused; kept for backward compatibility.
#' @param useParellel Unused here; kept for backward compatibility (parallel
#'                    backend is registered by the caller, per the post).
#' @return Matrix (11 x 9) of TP/FN/FP/TN/AUC/Accuracy/Sensitivity/
#'         Specificity/phi, one row per model; also written to `ostats`.
caretApproach <- function(df, cName, l = 3, m = "cv", n = 3/4,
                          ostats = "gripComp.txt", oplot = "gripComp.png",
                          pw = 700, ph = 700, plty = 1, secAllowed = 5,
                          useParellel = TRUE) {

  # Seed BEFORE the partition so the train/test split is reproducible
  # (the original seeded after createDataPartition, which left the split
  # unseeded).
  set.seed(1)
  # Stratify on the requested class column, not a hard-coded df$INCOME —
  # the function is meant to be reused for several dependent variables.
  inTrain <- createDataPartition(df[[cName]], p = .8,
                                 list = FALSE,
                                 times = 1)

  indep <- subset(df, select = setdiff(names(df), cName))
  classes <- unlist(subset(df, select = cName))  # convert 1-col frame to vector

  # Correlation matrix of predictors; the original post removed predictors
  # with pairwise correlation above 0.9 before training (fragment lost).
  d_cor <- as.matrix(cor(indep))

  trainDescr <- indep[inTrain, ]
  testDescr  <- indep[-inTrain, ]
  trainClass <- classes[inTrain]
  testClass  <- classes[-inTrain]

  # The 11 caret methods, in the order they are plotted and tabulated.
  # NOTE(review): the original gated 'multinom' behind a factor check on the
  # class column — reinstate that guard if non-factor targets are passed.
  methodNames <- c("rpart", "ctree2", "rf", "cforest", "treebag", "nnet",
                   "svmRadial", "svmLinear", "knn", "glm", "multinom")

  set.seed(2)
  ctrl <- trainControl(method = m)
  models <- lapply(methodNames, function(mthd) {
    train(trainDescr, trainClass, method = mthd,
          tuneLength = l, trControl = ctrl)
  })
  names(models) <- methodNames

  # ~30 h of clock time to reach this point in the original run; persist the
  # fitted models (~1.3 GB) so a crash later does not lose the work.
  save(models, file = "models.RData")

  ### Test-set predictions and class probabilities for every model at once.
  predValues <- extractPrediction(models, testX = testDescr, testY = testClass)
  testValues <- subset(predValues, dataType == "Test")
  probValues <- extractProb(models, testX = testDescr, testY = testClass)
  testProbs  <- subset(probValues, dataType == "Test")

  plotColors <- colors()[c(9, 17, 19, 24, 27, 33, 51, 62, 84, 254, 552)]
  png(filename = oplot, width = pw, height = ph, units = "px")
  # Close the device even if a later step errors, so the PNG is not left open.
  on.exit(dev.off(), add = TRUE)

  # One stats row per model: confusion counts, AUC, and derived rates.
  statRows <- vector("list", length(methodNames))
  for (i in seq_along(methodNames)) {
    mthd <- methodNames[i]

    modelPred <- subset(testValues, model == mthd)
    cm <- confusionMatrix(modelPred$pred, modelPred$obs)
    tp <- cm$table[1, 1]
    fn <- cm$table[2, 1]
    fp <- cm$table[1, 2]
    tn <- cm$table[2, 2]
    acc  <- (tp + tn) / (tp + fn + fp + tn)
    sens <- tp / (tp + fn)
    spec <- tn / (tn + fp)
    # Matthews correlation coefficient (phi); sqrt factors kept separate to
    # avoid integer overflow in the product of the four marginals.
    phi <- (tp * tn - fp * fn) /
      (sqrt(tp + fn) * sqrt(tn + fp) * sqrt(tp + fp) * sqrt(tn + fn))

    # ROC via ROCR, using the probability of the "Control" class.
    myProb <- subset(testProbs, model == mthd)
    rocrObject <- prediction(myProb$Control, myProb$obs)
    rocCurve   <- performance(rocrObject, "tpr", "fpr")
    modelAUC   <- performance(rocrObject, "auc")@y.values

    # First curve creates the plot and legend; the rest overlay onto it.
    plot(rocCurve, col = plotColors[i], main = "", lty = plty, add = (i > 1))
    if (i == 1) {
      legend("bottomright", methodNames, pch = 15, col = plotColors)
    }

    statRows[[i]] <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
  }

  ### Assemble and persist the performance table.
  s <- matrix(unlist(statRows),
              nrow = length(methodNames), ncol = 9, byrow = TRUE,
              dimnames = list(methodNames,
                              c("TP", "FN", "FP", "TN", "AUC", "Accuracy",
                                "Sensitivity", "Specificity", "phi")))
  write.table(s, ostats)
  s
}

0 ответов

Другие вопросы по тегам