# caret-and-bigstatsr-workshop/caret_tutorial.R

library(caret)
library(data.table)

## Data Loading

# Directory that holds the workshop CSV files. It is defined once so that both
# loads below stay in sync; edit this single line to run on another machine.
data_dir <- "/home/ami/Projects/P0004 - caret and bigstatsR Workshop"

## Data Loading [Predictors]

# Genotype Data
# The file is read as comma-separated with no header row (header = FALSE).
gen_path <- file.path(data_dir, "Test_Genotype_Data_Imputed.csv")
gen_data <- fread(gen_path, sep = ",", header = FALSE)

## Data Loading [Traits]

# Phenotype Data
# Read the same way; its first column is later accessed as `V1`.
phn_path <- file.path(data_dir, "Test_Phenotype_Data.csv")
phn_data <- fread(phn_path, sep = ",", header = FALSE)

## Data Overview

# Preview of Genotype Data.
# str() prints the structure of the table; the subset on the next line
# auto-prints rows 1-10 of columns 1-10 when run at top level.
# NOTE(review): assumes the genotype table has at least 10 rows and
# 10 columns -- confirm against the input file.
str(gen_data)
gen_data[1:10, 1:10]

# Preview of Phenotype Data.
# Same idea: structure first, then rows 1-10 of the first column.
# NOTE(review): assumes at least 10 phenotype rows -- confirm.
str(phn_data)
phn_data[1:10, 1]

# Find missing data (if any) in the loaded datasets.
# Each expression yields the total number of NA cells in the table;
# a result of 0 means no values are missing.
sum(is.na(gen_data))
sum(is.na(phn_data))

## Pre-Processing

# Fix the RNG state so that the partition drawn below is reproducible.
set.seed(226)

# Draw the row indices of the training portion: 80% of the samples are used
# for training and the remaining 20% are held out for testing.
# `list = FALSE` makes createDataPartition() return the indices as a matrix
# rather than a list.
train_index <- createDataPartition(
  y = phn_data[["V1"]],
  p = 0.8,
  list = FALSE
)
train_index

# Apply the same indices to the predictors and to the traits; negating the
# indices selects every row that was not drawn for training.
x_train <- gen_data[train_index, ]
y_train <- phn_data[train_index, ]
x_test <- gen_data[-train_index, ]
y_test <- phn_data[-train_index, ]

## Model Training [glmnet]

# Define a custom tuning grid: five evenly spaced alpha values between 0.0001
# and 1, each paired with a single fixed lambda.
# `length.out` is spelled out in full; the abbreviated `length` only works
# through partial argument matching.
# NOTE(review): lambda is fixed at one value (5); confirm that this penalty
# strength is appropriate for the scale of the trait.
tune_grid <- expand.grid(alpha = seq(0.0001, 1, length.out = 5),
                         lambda = 5)

# Resampling scheme used to evaluate every grid row:
# 2-fold cross-validation repeated 5 times, with progress printed per fit.
param_tune <- trainControl(method = "repeatedcv",
                           number = 2,
                           repeats = 5,
                           trim = TRUE,
                           search = "grid",
                           verboseIter = TRUE)

# Train a model. The grid row with the best MAE under the resampling scheme
# above is selected.
glmnet_model <- train(x_train, y_train$V1,
                      method = "glmnet",
                      metric = "MAE",
                      tuneGrid = tune_grid,
                      trControl = param_tune)

# Print the resampling results and the selected tuning parameters.
glmnet_model

# Predict on the held-out rows and compare against the observed trait values.
glmnet_prediction <- predict(glmnet_model, x_test)
postResample(pred = glmnet_prediction, obs = y_test$V1)

## Model Training [earth]

# Tuning grid: every combination of nprune in 1..10 and degree in 1..10
# (100 candidate settings in total).
tune_grid <- expand.grid(
  nprune = seq_len(10),
  degree = seq_len(10)
)

# Resampling scheme: 2-fold cross-validation repeated 5 times, with a
# progress line printed for each fit.
param_tune <- trainControl(
  method = "repeatedcv",
  number = 2,
  repeats = 5,
  trim = TRUE,
  search = "grid",
  verboseIter = TRUE
)

# Fit the model, selecting the grid row with the best RMSE.
earth_model <- train(
  x = x_train,
  y = y_train[["V1"]],
  method = "earth",
  metric = "RMSE",
  trControl = param_tune,
  tuneGrid = tune_grid
)

# Score the fitted model on the held-out rows.
earth_prediction <- predict(earth_model, newdata = x_test)
postResample(pred = earth_prediction, obs = y_test[["V1"]])

## Model Training [mlpKerasDropout]

# Resampling control: tuning parameters are drawn by random search; every
# other trainControl() setting is left at its default.
param_tune <- trainControl(search = "random")

# Fit the network. `tuneLength = 3` asks for three candidate parameter sets.
# `callbacks` and `epochs` are not train() arguments; they are passed through
# `...` to the underlying model fit. The early-stopping callback monitors the
# training loss with a patience of 20 epochs and restores the best weights.
keras_model <- train(
  x = x_train,
  y = y_train[["V1"]],
  method = "mlpKerasDropout",
  metric = "RMSE",
  trControl = param_tune,
  tuneLength = 3,
  callbacks = list(
    keras::callback_early_stopping(
      monitor = "loss",
      mode = "auto",
      patience = 20,
      restore_best_weights = TRUE
    )
  ),
  epochs = 50
)

# Score the fitted model on the held-out rows.
keras_prediction <- predict(keras_model, newdata = x_test)
postResample(pred = keras_prediction, obs = y_test[["V1"]])

## Model Training [xgbDART]

# Custom tuning grid. Only rate_drop and skip_drop vary (20 values each, so
# 400 combinations); every other parameter is held at a single value.
# nrounds is the number of boosting iterations and must be positive: the
# previous value of 0 requested no boosting rounds at all, so no meaningful
# model could be trained. 50 is a modest starting point -- tune it for your
# data (with eta this small, more rounds are likely needed).
# `length.out` is spelled out instead of relying on partial matching of
# `length`.
tune_grid <- expand.grid(nrounds = 50,
                         max_depth = 2,
                         eta = 0.0001,
                         gamma = 0,
                         subsample = 0.5,
                         colsample_bytree = 0.5,
                         rate_drop = seq(0.1, 1, length.out = 20),
                         skip_drop = seq(0.1, 1, length.out = 20),
                         min_child_weight = 9)

# Resampling scheme: 2-fold cross-validation repeated 5 times, with a
# progress line printed for each fit.
param_tune <- trainControl(method = "repeatedcv",
                           number = 2,
                           repeats = 5,
                           verboseIter = TRUE)

# Train a model, selecting the grid row with the best RMSE.
# `verbosity = 0` is not a train() argument; it is passed through `...` to
# the underlying fit.
xgb_model <- train(x_train, y_train$V1,
                   # xgbTree may be faster.
                   method = "xgbDART",
                   metric = "RMSE",
                   tuneGrid = tune_grid,
                   trControl = param_tune,
                   verbosity = 0)

# Predict on the held-out rows and compare against the observed trait values.
xgb_prediction <- predict(xgb_model, x_test)
postResample(pred = xgb_prediction, obs = y_test$V1)