# R code: train an XGBoost regression model (via caret) to predict NPVpl.
setwd("C:/Users/243886/OneDrive - Universitetet i Stavanger/ML-5spot-SGS/Rcode")
# The objects 'N' (NPV results) and 'input' (simulation inputs) are assumed
# to already be in the workspace (e.g. imported via RStudio).
NPVpl <- data.frame(N)
NPVpl <- NPVpl[, 1]
input1 <- data.frame(input)
geoin <- read.csv('INPUTPCA489.csv')
Final_Input <- cbind(input1, NPVpl)
install.packages(c("e1071", "caret", "doSNOW", "ipred", "xgboost"))
install.packages(c('lattice','ggplot2'))
library(caret)
library(doSNOW)
# 'INPUTMEAN' and 'INPUT' are likewise assumed to be in the workspace;
# 'INPUT' holds the modeling table used below.
INPUTMEAN <- data.frame(INPUTMEAN)
train <- INPUT
#=================================================================
# Data Wrangling
#=================================================================
# Subset the data to the features we wish to keep/use.
features <- c('MC1', 'MC2', 'MC3', 'MC4', 'MPV', 'NPVpl')
# Alternative, fuller feature set (per-ensemble columns); defined for
# reference but not used below:
features.full <- c('En1C1','En2C1','En3C1','En4C1','En5C1','En6C1','En7C1',
                   'En8C1','En9C1','En10C1',
                   'En1C2','En2C2','En3C2','En4C2','En5C2','En6C2','En7C2',
                   'En8C2','En9C2','En10C2',
                   'En1C3','En2C3','En3C3','En4C3','En5C3','En6C3','En7C3',
                   'En8C3','En9C3','En10C3',
                   'En1C4','En2C4','En3C4','En4C4','En5C4','En6C4','En7C4',
                   'En8C4','En9C4','En10C4',
                   'En1PV','En2PV','En3PV','En4PV','En5PV','En6PV','En7PV',
                   'En8PV','En9PV','En10PV',
                   'NPVpl')
train <- train[, features]
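# Sanity check (optional): the subset should now contain exactly the
# requested feature columns.
stopifnot(identical(names(train), features))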
#=================================================================
# Split Data
#=================================================================
# Make sure the response column (the last one) is named 'NPVpl'.
names(train)[ncol(train)] <- "NPVpl"
# Use caret to create a 70%/30% split of the data, keeping the
# distribution of the NPVpl response similar across the splits
# (createDataPartition stratifies a numeric outcome by percentiles).
set.seed(54321)
indexes <- createDataPartition(train$NPVpl,
                               times = 1,
                               p = 0.7,
                               list = FALSE)
profs.train <- train[indexes,]
profs.test <- train[-indexes,]
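# Quick check (optional): the row counts should reflect the 70/30 split.
nrow(profs.train)
nrow(profs.test)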
# Examine the distribution of the NPVpl response across the datasets
# (NPVpl is continuous, so summaries are more informative than class
# proportions).
summary(train$NPVpl)
summary(profs.train$NPVpl)
summary(profs.test$NPVpl)
#=================================================================
# Train Model
#=================================================================
# bestTune values from earlier runs, for reference:
#    nrounds max_depth   eta gamma colsample_bytree min_child_weight subsample
# 4     4000         6 0.010     0              0.4             2.00         1
# 10    4000         6 0.025     0              0.4             2.25         1
# Set up caret to perform 10-fold cross-validation repeated 3
# times and to use a grid search for optimal model hyperparameter
# values.
train.control <- trainControl(method = "repeatedcv",
                              number = 10,
                              repeats = 3,
                              search = "grid")
# Leverage a grid search of hyperparameters for xgboost. See
# the following presentation for more information:
# https://www.slideshare.net/odsc/owen-zhangopen-sourcetoolsanddscompetitions1
tune.grid <- expand.grid(eta = c(0.0025),
                         nrounds = c(4000),
                         max_depth = 6,
                         min_child_weight = c(2.25),
                         colsample_bytree = c(0.4),
                         gamma = 0,
                         subsample = 1)
View(tune.grid)
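# If compute allows, a wider grid can be searched the same way; the
# ranges below are illustrative only (swap this in for 'tune.grid'
# above to use it).
tune.grid.wide <- expand.grid(eta = c(0.0025, 0.01, 0.025),
                              nrounds = c(2000, 4000),
                              max_depth = c(4, 6, 8),
                              min_child_weight = c(2, 2.25),
                              colsample_bytree = c(0.4, 0.6),
                              gamma = 0,
                              subsample = 1)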
# Use the doSNOW package to enable caret to train in parallel.
# While there are many package options in this space, doSNOW
# has the advantage of working on both Windows and Mac OS X.
#
# Create a socket cluster using 10 processes.
#
# NOTE - Tune this number based on the number of cores/threads
# available on your machine!!!
#
cl <- makeCluster(10, type = "SOCK")
# Register cluster so that caret will know to train in parallel.
registerDoSNOW(cl)
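# A portable way to size the cluster (a sketch): derive a worker count
# from the machine, leaving one core free (detectCores() comes with the
# base 'parallel' package).
n.workers <- max(1, parallel::detectCores() - 1)
# cl <- makeCluster(n.workers, type = "SOCK")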
# (doParallel is an equally valid backend, but only one registered
# backend is needed; the doSNOW cluster above is used here.)
# Train the xgboost model using 10-fold CV repeated 3 times
# and a hyperparameter grid search to train the optimal model.
library(xgboost)
caret.cv <- train(NPVpl ~ .,
                  data = profs.train,
                  method = "xgbTree",
                  tuneGrid = tune.grid,
                  trControl = train.control)
stopCluster(cl)
# Examine caret's processing results.
caret.cv$bestTune
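# Optionally inspect which features drive the fitted model; caret's
# varImp() supports xgbTree models.
plot(varImp(caret.cv))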
# Make predictions on the test set using a xgboost model
# trained on all 625 rows of the training set using the
# found optimal hyperparameter values.
preds <- predict(caret.cv, profs.test)
plot(preds, profs.test$NPVpl, col = 'red', type = 'p', pch = 1,
     xlab = 'NPV predicted by ML ($MM)',
     ylab = 'NPV of the real Test Data ($MM)',
     main = 'Test Data vs. ML Prediction')
abline(a = 0, b = 1, col = 4, lwd = 3)
# Compute R^2 on the test set, then annotate the plot with it.
r2 <- caret::R2(preds, profs.test$NPVpl)
mylabel <- bquote(italic(R)^2 == .(format(r2, digits = 2)))
text(x = 40, y = 15, labels = mylabel)
r2
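# A further diagnostic (a sketch): residuals against observed NPV should
# scatter around zero with no visible trend.
plot(profs.test$NPVpl, profs.test$NPVpl - preds,
     xlab = 'NPV of the real Test Data ($MM)',
     ylab = 'Residual ($MM)',
     main = 'Residuals vs. Test Data NPV')
abline(h = 0, col = 4, lwd = 2)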
# NPVpl is continuous, so classification tools such as caret's
# confusionMatrix() do not apply here; postResample() reports the
# regression metrics (RMSE, R^2, MAE) on the held-out test data.
postResample(preds, profs.test$NPVpl)
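# Persist the tuned model for later reuse (the file name here is
# illustrative).
saveRDS(caret.cv, 'caret_xgb_NPVpl.rds')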