Как ускорить обучение / оценку модели данных?
Я использовал caret::train для обучения 11 моделей из набора данных с 32000 obs и 250 переменных. Я использую R на машине под Linux.
Программное обеспечение сильно замедлилось. Я сохранил все модели в списке и записал на диск, заняв около 1,3 Гб.
У кого-нибудь есть опыт создания / использования RAM диска с R? Повлияет ли RAM-диск на производительность?
Я предполагаю, что я достигаю предела подкачки памяти. Любые другие предложения о том, как ускорить R?
Я хотел бы закончить команды прогнозирования / производительности, вместе с графиком тестирования ROC и табулированием производительности ROC для 11 моделей. Кроме того, я хотел бы запустить процесс для нескольких других зависимых переменных.
Я считаю, что есть 80 Гб оперативной памяти, и есть 7 ядер. При обучении моделей я использовал параллельную обработку на 6 ядрах, но для обучения моделей леса потребовалось несколько часов.
Код похож на gripComp из этого примера кода, за исключением того, что cor используется для удаления независимых переменных, имеющих корреляцию выше 0,9, до обучения. У меня нет набора данных для обмена, но есть около 30000 obs и 250 независимых переменных и около 20 зависимых переменных, но я обрабатываю только зависимые переменные по одной за раз.
Псевдокод. Основное различие заключается в том, что модели сохраняются на диск по мере их расчета, а диск проверяется на предмет того, рассчитывалась ли модель ранее. Кроме того, у меня включен многоядерный, и это не указано в этом примере кода. Обучение заняло около 30 часов с 6 ядрами. Я подождал около 3 часов команд предсказания / производительности, а затем пошел домой.
#' Train and compare 11 caret classification models on one dependent variable.
#'
#' Splits `df` into train/test sets, fits 11 model types via caret::train,
#' saves the fitted model list to disk, then — for each model — tabulates
#' confusion-matrix statistics and AUC on the held-out test set and draws an
#' overlaid ROC plot. Returns (and writes) an 11x9 summary matrix.
#'
#' @param df data frame holding the predictors plus the class column `cName`.
#' @param cName name of the dependent (class) column in `df`.
#' @param l tuneLength forwarded to every caret::train call.
#' @param m resampling method forwarded to trainControl (e.g. "cv").
#' @param n intended train fraction — NOTE(review): unused; the partition
#'   below hard-codes p = .8. Confirm which value is intended.
#' @param ostats output file path for the statistics table (write.table).
#' @param oplot output file path for the PNG ROC plot.
#' @param pw plot width in pixels.
#' @param ph plot height in pixels.
#' @param plty line type used for every ROC curve.
#' @param secAllowed NOTE(review): unused anywhere in this body.
#' @param useParellel NOTE(review): unused here (and misspelled); per the
#'   surrounding text, multicore registration happens outside this pseudocode.
#' @return matrix with one row per model and columns
#'   TP, FN, FP, TN, AUC, Accuracy, Sensitivity, Specificity, phi.
caretApproach <- function(df, cName, l=3, m="cv", n=3/4,
ostats="gripComp.txt", oplot="gripComp.png",
pw=700, ph=700, plty=1, secAllowed=5,
useParellel=TRUE){
# NOTE(review): the partition is created BEFORE set.seed(1), so the
# train/test split is not reproducible across runs; also df$INCOME is
# hard-coded here instead of the cName argument — confirm intent.
inTrain <- createDataPartition(df$INCOME, p = .8,
list = FALSE,
times = 1)
set.seed(1)
# browser()
# Split the frame into predictors (everything except cName) and the class
# column, then flatten the one-column class frame into a plain vector as
# required by caret::train's (x, y) interface.
indep <- subset(df, select = setdiff(names(df),c(cName)))
classes <- subset(df, select = c(cName))
classes <- unlist(classes) # this converts to a vector
# NOTE(review): d_cor is computed but never used below — the intended
# removal of predictors with |cor| > 0.9 (cf. caret::findCorrelation)
# appears to be missing from this version of the code.
d_cor <- as.matrix(cor(indep))
# I don't remember this code fragment, but cor is used to removed correlated columns of classes
# Row-index the predictors and class vector into train/test partitions.
trainDescr <- indep[inTrain,]
testDescr <- indep[-inTrain,]
trainClass <- classes[inTrain]
testClass <- classes[-inTrain]
set.seed(2)
# Eleven sequential train() calls, identical except for `method`. Each call
# constructs a fresh (but identical) trainControl; hoisting one shared
# control object would not change behavior. These calls dominate runtime
# (~30 h per the author); rf/cforest are the slowest.
###1 Recursive partitioning rpart rpart
rpartFit <- train(trainDescr, trainClass, method='rpart', tuneLength = l,trControl = trainControl(method = m))
###2 Recursive partitioning ctree2 party
ctreeFit <- train(trainDescr, trainClass, method='ctree2', tuneLength = l,trControl = trainControl(method = m))
###3 Random forests rf randomForest
rfFit <- train(trainDescr, trainClass, method='rf', tuneLength = l,trControl = trainControl(method = m))
###4 Random forests cforest party
cforestFit <- train(trainDescr, trainClass, method='cforest', tuneLength = l,trControl = trainControl(method = m))
###5 Bagging treebag ipred
treebagFit <- train(trainDescr, trainClass, method='treebag', tuneLength = l,trControl = trainControl(method = m))
###6 Neural networks nnet nnet
nnetFit <- train(trainDescr, trainClass, method='nnet', tuneLength = l,trControl = trainControl(method = m))
###7 Support vector machines svmRadial kernlab
svmRadialFit <- train(trainDescr, trainClass, method='svmRadial', tuneLength = l,trControl = trainControl(method = m))
###8 Support vector machines svmLinear kernlab
svmLinearFit <- train(trainDescr, trainClass, method='svmLinear', tuneLength = l,trControl = trainControl(method = m))
###9 k nearest neighbors knn caret
knnFit <- train(trainDescr, trainClass, method='knn', tuneLength = l,trControl = trainControl(method = m))
###10 Generalized linear model glm stats
glmFit <- train(trainDescr, trainClass, method='glm', tuneLength = l,trControl = trainControl(method = m))
###11 Logistic/Multinomial Regression multinom nnet
# I have an if statement here to check if trainDescr is factor
multinomFit <- train(trainDescr, trainClass, method='multinom', tuneLength = l,trControl = trainControl(method = m))
### models
# Collect all fitted models; the list names become the `model` column in
# extractPrediction/extractProb output and the legend labels below.
models <- list(rpart=rpartFit, ctree2=ctreeFit, rf=rfFit, cforest=cforestFit, treebag=treebagFit, nnet = nnetFit, svmRadial = svmRadialFit, svmLinear = svmLinearFit, knn = knnFit, glm = glmFit, multinom=multinomFit)
# It takes about 30 hrs of clock time to get to this point
# At this point, models was written to the disk, and the file size is 1.3 GB
save(models,file='models.RData')
### predict values
# extractPrediction/extractProb score every model on both train and test
# data; only the "Test" rows are kept for the statistics below.
predValues <- extractPrediction(models, testX = testDescr, testY = testClass)
testValues <- subset(predValues, dataType == "Test")
### predict probability
probValues <- extractProb(models, testX = testDescr, testY = testClass)
testProbs <- subset(probValues, dataType == "Test")
# I waited about 3 hours for the above 4 lines, then went home
# The rest of the code may need some correcting
############stats
# The eleven blocks below repeat one pattern per model:
#   1. subset the test predictions for that model;
#   2. confusionMatrix -> pull TP/FN/FP/TN from the 2x2 table (assumes a
#      binary outcome; row/column order follows the factor level order);
#   3. derive accuracy, sensitivity, specificity and the phi (Matthews)
#      coefficient by hand;
#   4. build a ROCR prediction from the class-probability column — NOTE(review):
#      myProb$Control assumes the positive class level is literally named
#      "Control"; verify against the actual factor levels;
#   5. extract AUC and add the ROC curve to the shared plot.
# NOTE(review): performance(...)@y.values returns a LIST, so each x1..x11
# becomes a list rather than a numeric vector; unlist() before matrix()
# would likely be needed for a clean numeric result matrix.
###1 rpart
rpartPred <- subset(testValues, model == "rpart")
x <- confusionMatrix(rpartPred$pred, rpartPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "rpart")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x1 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
# Open the PNG device and draw the first ROC curve plus the legend for all
# eleven models; subsequent blocks overlay their curves with add=TRUE.
png(filename = oplot, width = pw, height = ph, units = "px")
plotColors <-colors()[c(9,17,19,24,27,33,51,62,84,254,552)]
plot(rocCurve, col=plotColors[1], main="", lty=plty)
legend('bottomright', c('rpart', 'ctree2', 'rf', 'cforest', 'treebag', 'nnet', 'svmRadial', 'svmLinear', 'knn', 'glm', 'multinom'), pch=c(15), col=plotColors)
###2 ctree2
ctreePred <- subset(testValues, model == "ctree2")
x=confusionMatrix(ctreePred$pred, ctreePred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "ctree2")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x2 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[2], main="", lty=plty, add=TRUE)
###3 rf
rfPred <- subset(testValues, model == "rf")
x=confusionMatrix(rfPred$pred, rfPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "rf")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x3 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[3], main="", lty=plty, add=TRUE)
###4 cforest
cforestPred <- subset(testValues, model == "cforest")
x=confusionMatrix(cforestPred$pred, cforestPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "cforest")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x4 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[4], main="", lty=plty, add=TRUE)
###5 treebag
treebagPred <- subset(testValues, model == "treebag")
x=confusionMatrix(treebagPred$pred, treebagPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "treebag")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x5 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[5], main="", lty=plty, add=TRUE)
###6 nnet
nnetPred <- subset(testValues, model == "nnet")
x=confusionMatrix(nnetPred$pred, nnetPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "nnet")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x6 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[6], main="", lty=plty, add=TRUE)
###7 svmRadial
svmRadialPred <- subset(testValues, model == "svmRadial")
x=confusionMatrix(svmRadialPred$pred, svmRadialPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "svmRadial")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x7 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[7], main="", lty=plty, add=TRUE)
###8 svmLinear
svmLinearPred <- subset(testValues, model == "svmLinear")
x=confusionMatrix(svmLinearPred$pred, svmLinearPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "svmLinear")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x8 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[8], main="", lty=plty, add=TRUE)
###9 knn
knnPred <- subset(testValues, model == "knn")
x=confusionMatrix(knnPred$pred, knnPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "knn")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x9 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[9], main="", lty=plty, add=TRUE)
###10 glm
glmPred <- subset(testValues, model == "glm")
x=confusionMatrix(glmPred$pred, glmPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "glm")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x10 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[10], main="", lty=plty, add=TRUE)
###11 multinom
# Needs if statement to check for factors
multinomPred <- subset(testValues, model == "multinom")
x=confusionMatrix(multinomPred$pred, multinomPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "multinom")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x11 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[11], main="", lty=plty, add=TRUE)
# Close the PNG device opened before the first curve.
dev.off()
### output
# Assemble the 11 per-model stat vectors into an 11x9 matrix, one row per
# model, in the same order as the model list and the legend.
# NOTE(review): byrow=T uses the reassignable T alias (prefer TRUE), and if
# modelAUC stayed a list this matrix will be of mode list rather than numeric.
b <- c(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11)
s <- matrix(b, 11, 9, byrow=T, dimnames=list(c("rpart", "ctree2", "rf", "cforest", "treebag", "nnet", "svmRadial", "svmLinear", "knn", "glm", "multinom"), c("TP", "FN", "FP", "TN", "AUC", "Accuracy", "Sensitivity", "Specificity", "phi")))
write.table(s, ostats)
# NOTE(review): write.table without a file argument prints the table to the
# console — if on-screen display is the intent, print(s) would be clearer.
write.table(s)
return(s)