#BINARY CLASSIFICATION SCRIPT ----
#From:
#A practical guide to machine learning for neurosurgeons
#Acta Neurochirurgica
#Authors: Victor E. Staartjes (Zurich), Julius M. Kernbach (Aachen)
#Contact for questions or clarification:
#victoregon.staartjes@usz.ch
#January 2020
#Executed on: R Version 3.6.2 "Dark and Stormy Night" (64 bit)
#OS = Windows 10; CPU = 16 physical cores @ 3.40 GHz; GPU = NVIDIA 980 Ti; RAM = 64 GB
#Total Runtime (4 CPU Cores): 14 minutes, 44 seconds

#1 SETUP & PRE-PROCESSING DATA ----

#1.1 Load packages ----
if (!require("pacman")) install.packages("pacman")
pacman::p_load("readxl", "caret", "Hmisc", "doParallel", "mlbench", "rlang", "dplyr", "VIM", "randomForest", "klaR", "devtools", "gbm", "gam", "pROC", "rms")
#These packages are required to execute this code. If they are unavailable and you have an internet connection, they will be installed automatically.
#If you want to update R to the latest version, delete the hashtag (#) in the next line of code and run it. You will then be guided through the update process.
#pacman::p_load("installr"); updateR(fast = T, GUI = T, quit_R = T, install_R = T, keep_old_packages = T, copy_packages = T, start_new_R = T, silent = T, print_R_versions = T)

#1.2 Import data from .xlsx ----
#In Excel, make sure that your dependent (target/endpoint) variable is in the very last column of your Excel database
#Also, convert any categorical variables into numerical form, i.e. 1/0 or 1/2/3/4/etc. instead of strings/text
#Both are the case in the Glioblastoma database
#Import the dataset from the downloaded Excel file using the RStudio GUI, OR run this code:
#A - Store the .xlsx file in the same folder as the script
df <- as.data.frame(read_excel("GlioblastomaData.xlsx"))
#B - Alternatively, you may have to enter the full path to the .xlsx file. For example:
#df <- as.data.frame(read_excel("C:/Machine Learning/Folder/GlioblastomaData.xlsx")) #Replace this string with the path to the downloaded Glioblastoma dataset!

#1.3 Check data format ----
#Continuous variables should be "numeric" in R, and categorical ones should be "factor"
str(df) #For the Glioblastoma dataset, we see that we have 22 variables, but that IDH, MGMT, Male, TERTp, Midline, Comorbidity, Epilepsy, PriorSurgery, Married, ActiveWorker, Chemotherapy, HigherEducation, and the binary outcome TwelveMonths are still handled as "numeric".

#1.4 Reformat the categorical variables to "factor", and rename the endpoint ----
cols <- c("IDH", "MGMT", "Male", "TERTp", "Midline", "Comorbidity", "Epilepsy", "PriorSurgery", "Married", "ActiveWorker", "Chemotherapy", "HigherEducation", "TwelveMonths")
df[cols] <- lapply(df[cols], factor)
df$TwelveMonths <- factor(df$TwelveMonths, levels = c(0,1), labels = c("no", "yes")) #Here, we rename the endpoint to "yes" for 12-month survival and "no" for 12-month death
str(df) #We now see that all categorical variables are correctly labeled as "factor", and R will handle them correctly from here on

#1.5 Remove unnecessary columns ----
#In the binary Glioblastoma example, the column "Survival" is a continuous version of our endpoint of interest - TwelveMonths (twelve-month survival) - and would thus be highly correlated with our endpoint. We must therefore delete it, as well as any other highly multicollinear variables.
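#An optional sanity check (an illustration on our part, not part of the original workflow): before dropping "Survival", its relationship with the binary endpoint can be visualized to confirm the redundancy. Delete the hashtag to run it.
#boxplot(Survival ~ TwelveMonths, data = df) #Patients surviving >= 12 months should show systematically longer survival times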
df <- df[,-21] #Survival was in the 21st column, so we remove it
str(df) #And check the data again - We still have 10'000 patients, but now 21 instead of 22 variables.

#1.6 OPTIONAL Enable multicore processing ----
#If you have a device with multiple CPU cores, you can enable multicore parallel processing by entering the number of cores to be allocated. For example, on our test PC, we have 16 physical cores with 32 virtual cores, so we could choose to allocate 28 to model training. Here we choose 4 as a common number of cores.
#If you want to enable multicore processing, delete the two hashtags below!
#cl <- makePSOCKcluster(4) #Enter the number of cores to be allocated; here, we choose 4 cores, a common number
#registerDoParallel(cl)

#1.7 Split training and testing set. Here, we choose an 80%/20% random split ----
set.seed(123) #Setting seeds allows reproducibility
dt <- sort(sample(nrow(df), nrow(df)*.80)) #Allocate 80% split
train <- df[dt,] #Split
test <- df[-dt,]
train <- train[sample(nrow(train)),] #Shuffle
test <- test[sample(nrow(test)),]
#Check for an approximately equal distribution of the endpoint - Here we see that 12-month survival was virtually equal in the training and testing sets
prop.table(table(train$TwelveMonths))
prop.table(table(test$TwelveMonths))

#1.8 If missing data is present, impute missing data using a KNN imputer (k = 5) ----
#We choose a KNN imputer because later on, a KNN imputer will be co-trained with the prediction model to impute missing data for future predictions
#Again, make sure that your dependent (target/endpoint) variable is in the very last column of your database
trainx <- train[,1:(ncol(train)-1)]
TwelveMonths <- train[,ncol(train)]
trainx <- VIM::kNN(trainx, k = 5, imp_var = F) #We see that the Glioblastoma database contains no missing data - thus, no imputation was performed
train <- cbind(trainx, TwelveMonths)

#1.9 Variable selection using Recursive Feature Elimination (RFE) ----
set.seed(123) #Set a seed
rfecontrol <- rfeControl(functions = nbFuncs, method = "boot", number = 25, rerank = F, verbose = T, allowParallel = T) #RFE using a naive Bayes classifier, with 25 bootstrap resamples
set.seed(123)
RFE <- rfe(x = train[,1:(ncol(train)-1)], y = train[,ncol(train)], sizes = c(10:(ncol(train)-1)), rfeControl = rfecontrol) #Run the RFE algorithm
print(RFE) #Results
plot(RFE, type = c("g", "o")) #Plot performance over the number of variables
predictors(RFE) #Variables to be kept
#Apparently, this combination of 13 selected variables explains the highest amount of variance in our endpoint.
#Thus, we should keep only these 13 independent variables and our dependent variable, and remove the 7 "garbage"/"low-impact"/"multicollinear" independent variables that were not selected by RFE
#Keep only the RFE-selected 13 variables & our endpoint "TwelveMonths"
keepvars <- c(predictors(RFE), "TwelveMonths")
train <- train[keepvars]

#1.10 Get a final overview ----
summary(train)
summary(test)

#2 MODEL TRAINING ----

#2.1 Set Up the Training and Resampling Structure ----
#As a standard, we choose bootstrapping with replacement as the resampling method, with 25 repeats.
#We also choose random upsampling as a standard to adjust for class imbalance if present. Alternatively, sampling = "smote" could be used for SMOTE upsampling, as sketched below.
ctrl <- trainControl(method = "boot", number = 25, classProbs = T, summaryFunction = twoClassSummary, allowParallel = T, sampling = "up", returnResamp = "all")
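#A minimal sketch of the SMOTE alternative mentioned above ("ctrlSMOTE" is a hypothetical object name; we assume that caret's internal sampling = "smote" relies on the DMwR package, which would need to be installed). Delete the hashtags and pass ctrlSMOTE instead of ctrl to caret::train() to use it.
#pacman::p_load("DMwR")
#ctrlSMOTE <- trainControl(method = "boot", number = 25, classProbs = T, summaryFunction = twoClassSummary, allowParallel = T, sampling = "smote", returnResamp = "all")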
#2.2.1 Train Model #1: GLM (Logistic Regression) ----
set.seed(123)
lrfit <- caret::train(TwelveMonths ~ ., data = train, method = "glm", trControl = ctrl, tuneLength = 5, metric = "ROC", preProcess = "knnImpute", na.action = na.pass)
#We automatically tune our models based on the AUC (= ROC), and co-train a KNN imputer that then automatically imputes any potential missing data for future samples

#Assess Discrimination
conf <- print(confusionMatrix.train(lrfit, norm = "none", positive = "yes")[1])[[1]]; tp <- conf[2,2]; tn <- conf[1,1]; fp <- conf[2,1]; fn <- conf[1,2]; sens <- tp/(tp+fn); spec <- tn/(tn+fp); prev <- as.numeric(prop.table(table(train[,ncol(train)])))[2]

#Assess Calibration
prob <- predict(lrfit, train, type = "prob", na.action = na.pass)
calLogReg <- val.prob(prob[,2], as.numeric(train$TwelveMonths)-1, ylab = "Observed Frequency", xlab = "Predicted Probability", g = 10)
LogReg <- as.data.frame(cbind(max(lrfit$results$ROC), sens * prev + spec * (1-prev), sens, spec, (sens*prev)/(sens*prev+(1-spec)*(1-prev)), (spec*(1-prev))/((1-sens)*prev+spec*(1-prev)), (2*((sens*prev)/(sens*prev+(1-spec)*(1-prev)))*(sens))/(((sens*prev)/(sens*prev+(1-spec)*(1-prev)))+sens)))
LogReg <- as.data.frame(cbind(LogReg, calLogReg[12], calLogReg[13]))
colnames(LogReg) <- c("AUC", "Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "F1 Score", "Intercept", "Slope")

#Summary
print(lrfit)
print(LogReg)

#Save Model File
save(lrfit, file = "GLM.Rdata")
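#The long cbind() expressions above - repeated for every model below - derive accuracy, PPV, NPV and the F1 score from sensitivity, specificity and prevalence. Purely as an optional illustration of that arithmetic (a hypothetical helper of our own; the original inline formulas are kept throughout this script):
#deriveMetrics <- function(sens, spec, prev) {
#  acc <- sens * prev + spec * (1 - prev)                               #Accuracy
#  ppv <- (sens * prev) / (sens * prev + (1 - spec) * (1 - prev))       #Positive predictive value
#  npv <- (spec * (1 - prev)) / ((1 - sens) * prev + spec * (1 - prev)) #Negative predictive value
#  f1  <- (2 * ppv * sens) / (ppv + sens)                               #F1 score
#  c(Accuracy = acc, PPV = ppv, NPV = npv, F1 = f1)
#}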
#2.2.2 Train Model #2: Random Forest ----
set.seed(123)
rffit <- caret::train(TwelveMonths ~ ., data = train, method = "rf", trControl = ctrl, tuneLength = 12, metric = "ROC", preProcess = "knnImpute", na.action = na.pass)

#Assess Discrimination
conf <- print(confusionMatrix.train(rffit, norm = "none", positive = "yes")[1])[[1]]; tp <- conf[2,2]; tn <- conf[1,1]; fp <- conf[2,1]; fn <- conf[1,2]; sens <- tp/(tp+fn); spec <- tn/(tn+fp); prev <- as.numeric(prop.table(table(train[,ncol(train)])))[2]

#Assess Calibration
prob <- predict(rffit, train, type = "prob", na.action = na.pass)
calRF <- val.prob(prob[,2], as.numeric(train$TwelveMonths)-1, ylab = "Observed Frequency", xlab = "Predicted Probability", g = 10)
RF <- as.data.frame(cbind(max(rffit$results$ROC), sens * prev + spec * (1-prev), sens, spec, (sens*prev)/(sens*prev+(1-spec)*(1-prev)), (spec*(1-prev))/((1-sens)*prev+spec*(1-prev)), (2*((sens*prev)/(sens*prev+(1-spec)*(1-prev)))*(sens))/(((sens*prev)/(sens*prev+(1-spec)*(1-prev)))+sens)))
RF <- as.data.frame(cbind(RF, calRF[12], calRF[13]))
colnames(RF) <- c("AUC", "Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "F1 Score", "Intercept", "Slope")

#Summary
print(rffit)
print(RF)

#Save Model File
save(rffit, file = "RF.Rdata")

#2.2.3 Train Model #3: Gradient Boosting Machine (GBM) ----
set.seed(123)
gbmfit <- caret::train(TwelveMonths ~ ., data = train, method = "gbm", trControl = ctrl, tuneLength = 8, metric = "ROC", preProcess = "knnImpute", na.action = na.pass)

#Assess Discrimination
conf <- print(confusionMatrix.train(gbmfit, norm = "none", positive = "yes")[1])[[1]]; tp <- conf[2,2]; tn <- conf[1,1]; fp <- conf[2,1]; fn <- conf[1,2]; sens <- tp/(tp+fn); spec <- tn/(tn+fp); prev <- as.numeric(prop.table(table(train[,ncol(train)])))[2]

#Assess Calibration
prob <- predict(gbmfit, train, type = "prob", na.action = na.pass)
calGBM <- val.prob(prob[,2], as.numeric(train$TwelveMonths)-1, ylab = "Observed Frequency", xlab = "Predicted Probability", g = 10)
GBM <- as.data.frame(cbind(max(gbmfit$results$ROC), sens * prev + spec * (1-prev), sens, spec, (sens*prev)/(sens*prev+(1-spec)*(1-prev)), (spec*(1-prev))/((1-sens)*prev+spec*(1-prev)), (2*((sens*prev)/(sens*prev+(1-spec)*(1-prev)))*(sens))/(((sens*prev)/(sens*prev+(1-spec)*(1-prev)))+sens)))
GBM <- as.data.frame(cbind(GBM, calGBM[12], calGBM[13]))
colnames(GBM) <- c("AUC", "Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "F1 Score", "Intercept", "Slope")

#Summary
print(gbmfit)
print(GBM)

#Save Model File
save(gbmfit, file = "GBM.Rdata")

#2.2.4 Train Model #4: Generalized Additive Model (GAM) ----
set.seed(123)
gamfit <- caret::train(TwelveMonths ~ ., data = train, method = "gamLoess", trControl = ctrl, tuneLength = 10, metric = "ROC", preProcess = "knnImpute", na.action = na.pass)

#Assess Discrimination
conf <- print(confusionMatrix.train(gamfit, norm = "none", positive = "yes")[1])[[1]]; tp <- conf[2,2]; tn <- conf[1,1]; fp <- conf[2,1]; fn <- conf[1,2]; sens <- tp/(tp+fn); spec <- tn/(tn+fp); prev <- as.numeric(prop.table(table(train[,ncol(train)])))[2]

#Assess Calibration
prob <- predict(gamfit, train, type = "prob", na.action = na.pass)
calGAM <- val.prob(prob[,2], as.numeric(train$TwelveMonths)-1, ylab = "Observed Frequency", xlab = "Predicted Probability", g = 10)
GAM <- as.data.frame(cbind(max(gamfit$results$ROC), sens * prev + spec * (1-prev), sens, spec, (sens*prev)/(sens*prev+(1-spec)*(1-prev)), (spec*(1-prev))/((1-sens)*prev+spec*(1-prev)), (2*((sens*prev)/(sens*prev+(1-spec)*(1-prev)))*(sens))/(((sens*prev)/(sens*prev+(1-spec)*(1-prev)))+sens)))
GAM <- as.data.frame(cbind(GAM, calGAM[12], calGAM[13]))
colnames(GAM) <- c("AUC", "Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "F1 Score", "Intercept", "Slope")

#Summary
print(gamfit)
print(GAM)

#Save Model File
save(gamfit, file = "GAM.Rdata")

#2.2.5 Train Model #5: Naive Bayes Classifier ----
set.seed(123)
NBfit <- caret::train(TwelveMonths ~ ., data = train, method = "nb", trControl = ctrl, tuneLength = 5, metric = "ROC", preProcess = "knnImpute", na.action = na.pass)

#Assess Discrimination
conf <- print(confusionMatrix.train(NBfit, norm = "none", positive = "yes")[1])[[1]]; tp <- conf[2,2]; tn <- conf[1,1]; fp <- conf[2,1]; fn <- conf[1,2]; sens <- tp/(tp+fn); spec <- tn/(tn+fp); prev <- as.numeric(prop.table(table(train[,ncol(train)])))[2]

#Assess Calibration
prob <- predict(NBfit, train, type = "prob", na.action = na.pass)
calNB <- val.prob(prob[,2], as.numeric(train$TwelveMonths)-1, ylab = "Observed Frequency", xlab = "Predicted Probability", g = 10)
NB <- as.data.frame(cbind(max(NBfit$results$ROC), sens * prev + spec * (1-prev), sens, spec, (sens*prev)/(sens*prev+(1-spec)*(1-prev)), (spec*(1-prev))/((1-sens)*prev+spec*(1-prev)), (2*((sens*prev)/(sens*prev+(1-spec)*(1-prev)))*(sens))/(((sens*prev)/(sens*prev+(1-spec)*(1-prev)))+sens)))
NB <- as.data.frame(cbind(NB, calNB[12], calNB[13]))
colnames(NB) <- c("AUC", "Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "F1 Score", "Intercept", "Slope")

#Summary
print(NBfit)
print(NB)

#Save Model File
save(NBfit, file = "NB.Rdata")
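#Before building the summary table in section 3.1, caret's resamples() interface offers an optional, complementary way of comparing the five trained models across the 25 bootstrap resamples (a sketch on our part, not part of the original evaluation; delete the hashtags to run it):
#resamp <- resamples(list(GLM = lrfit, RF = rffit, GBM = gbmfit, GAM = gamfit, NB = NBfit))
#summary(resamp)                #Distribution of ROC, sensitivity and specificity per model
#bwplot(resamp, metric = "ROC") #Box-and-whisker plot of the bootstrapped AUCs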
#3 MODEL EVALUATION AND SELECTION ----

#3.1 Evaluate the models in comparison (Discrimination & Calibration) ----
#Print Results Summary
results <- as.data.frame(rbind(LogReg, RF, GBM, GAM, NB))
rownames(results) <- c("GLM", "Random Forest", "GBM", "GAM", "Naive Bayes")
print(results)
#Plot Results (opens an external plot viewer)
dev.new(width = 12, height = 7, noRStudioGD = T)
par(mfrow = c(1,2))
barplot(as.matrix(results[,-c(8:9)]), cex.axis = 1, cex.names = 0.7, beside = T, col = c("darkgray", "black", "darkorange", "darkblue", "darkgreen"), ylim = c(0,1))
abline(a = 1, b = 0)
legend("top", legend = rownames(results), col = c("darkgray", "black", "darkorange", "darkblue", "darkgreen"), pch = c(19), bty = "n", pt.cex = 0.7, cex = 0.7, text.col = "black", horiz = T)
barplot(abs(as.matrix(-results[,-c(1:7)])), beside = T, col = c("darkgray", "black", "darkorange", "darkblue", "darkgreen"))
abline(a = 1, b = 0)
legend("top", legend = rownames(results), col = c("darkgray", "black", "darkorange", "darkblue", "darkgreen"), pch = c(19), bty = "n", pt.cex = 0.7, cex = 0.7, text.col = "black", horiz = F)
dev.off() #This line closes the plot viewer
#Select a suitable model based on the discrimination and calibration measures
#In this example, we might choose the GLM or the GAM - both have similar discrimination, an intercept very close to zero, and a slope of approximately 1.0. We chose the GAM, but from here onwards, any model type can be used.

#3.2 Select the final model ----
finalmodel <- gamfit #Here we chose the GAM (generalized additive model)
finalmodelstats <- GAM #And its training evaluation

#3.3 Internally validate the selected final model on the hitherto unseen test set ----
prob <- predict(finalmodel, test, type = "prob", na.action = na.pass) #Prediction

#Assess Discrimination
result.roc <- roc(test$TwelveMonths, prob$yes) #Draw ROC curve
auc <- result.roc$auc
pred <- factor(ifelse(prob$yes > 0.50, "yes", "no"))
conf <- print(confusionMatrix(pred, test$TwelveMonths, positive = "yes")[[2]]); tp <- conf[2,2]; tn <- conf[1,1]; fp <- conf[2,1]; fn <- conf[1,2]; sens <- tp/(tp+fn); spec <- tn/(tn+fp); prev <- as.numeric(prop.table(table(test[,ncol(test)])))[2]

#Assess Calibration
prob <- predict(finalmodel, test, type = "prob", na.action = na.pass)
calFinal <- val.prob(prob[,2], as.numeric(test$TwelveMonths)-1, ylab = "Observed Frequency", xlab = "Predicted Probability", g = 10)
Final <- as.data.frame(cbind(auc, sens * prev + spec * (1-prev), sens, spec, (sens*prev)/(sens*prev+(1-spec)*(1-prev)), (spec*(1-prev))/((1-sens)*prev+spec*(1-prev)), (2*((sens*prev)/(sens*prev+(1-spec)*(1-prev)))*(sens))/(((sens*prev)/(sens*prev+(1-spec)*(1-prev)))+sens)))
Final <- as.data.frame(cbind(Final, calFinal[12], calFinal[13]))
colnames(Final) <- c("AUC", "Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "F1 Score", "Intercept", "Slope")

#Summary
print(finalmodel)
print(Final)

#Save Final Model File
save(finalmodel, file = "FINALMODEL.Rdata")

#4 COMPILE ALL EVALUATION DATA AND FIGURES REQUIRED FOR REPORTING ----

#4.1.1 Bootstrapping (Training) performance measures ----
print(finalmodelstats)

#4.1.2 Training calibration plot ----
prob <- predict(finalmodel, train, type = "prob", na.action = na.pass)
val.prob(prob[,2], as.numeric(train$TwelveMonths)-1, ylab = "Observed Frequency", xlab = "Predicted Probability", g = 10)

#4.2.1 Testing (Internal Validation) performance measures ----
print(Final)

#4.2.2 Testing calibration plot ----
prob <- predict(finalmodel, test, type = "prob", na.action = na.pass)
jpeg("Fig_4_7_Calibration.jpeg", width = 5, height = 5, res = 2000, units = "in")
val.prob(prob[,2], as.numeric(test$TwelveMonths)-1, ylab = "Observed Frequency", xlab = "Predicted Probability", g = 10)
dev.off()

#4.3 Assess variable importance ----
imp <- varImp(finalmodel, scale = T)
imp #Calculated variable importance based on AUC
plot(imp) #Plot variable importance
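#OPTIONAL Clean up and re-use the model ----
#If multicore processing was enabled in section 1.6, the allocated cores can be released once training is complete. The saved final model can later be reloaded to generate predictions for new patients. A minimal sketch (assuming "newpatients" is a hypothetical data frame containing the same 13 predictor columns used for training); delete the hashtags to run it:
#stopCluster(cl); registerDoSEQ() #Release the parallel workers and return to sequential processing
#load("FINALMODEL.Rdata") #Restores the object "finalmodel"
#predict(finalmodel, newpatients, type = "prob", na.action = na.pass) #The co-trained KNN imputer handles any missing values in new data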