## caret_tutorial.R (caret-and-bigstatsr-workshop)
library(caret)
library(data.table)
## Data Loading [Predictors]
# Genotype Data
gen_path <- "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Genotype_Data_Imputed.csv"
gen_data <- fread(gen_path, sep = ",", header = FALSE)
## Data Loading [Traits]
# Phenotype Data
phn_path <- "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Phenotype_Data.csv"
phn_data <- fread(phn_path, sep = ",", header = FALSE)
## Data Overview
# Preview of Genotype Data.
str(gen_data)
gen_data[1:10, 1:10]
# Preview of Phenotype Data.
str(phn_data)
phn_data[1:10, 1]
# Find missing data (if any) in the loaded datasets.
sum(is.na(gen_data))
sum(is.na(phn_data))
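# Optional: a per-column breakdown of missing values can help locate
# problem markers. A minimal sketch (not part of the original workflow);
# head() keeps the output manageable for wide genotype data.
head(colSums(is.na(gen_data)))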
## Pre-Processing
# Set random seed for reproducibility.
set.seed(226)
# Perform an 80% / 20% split of the data.
# Get the indices of the rows to be used in the training data.
# createDataPartition() returns a one-column matrix when list = FALSE;
# flatten it to an integer vector so data.table can subset with it.
train_index <- createDataPartition(phn_data$V1,
                                   p = 0.8,
                                   list = FALSE)[, 1]
train_index
# Now retrieve the corresponding training and testing data.
x_train <- gen_data[train_index, ]
x_test <- gen_data[-train_index, ]
y_train <- phn_data[train_index, ]
y_test <- phn_data[-train_index, ]
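# Optional sanity check (a small sketch, not in the original script):
# the predictor and trait splits should line up row for row.
dim(x_train)
dim(x_test)
stopifnot(nrow(x_train) == nrow(y_train),
          nrow(x_test) == nrow(y_test))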
## Model Training [glmnet]
# Define a custom tuning grid: vary the elastic-net mixing parameter alpha
# (0 = ridge, 1 = lasso) and hold the penalty lambda fixed.
tune_grid <- expand.grid(alpha = seq(0.0001, 1, length = 5),
                         lambda = 5)
# Resampling setup: 2-fold cross-validation, repeated 5 times.
param_tune <- trainControl(method = "repeatedcv",
                           number = 2,
                           repeats = 5,
                           trim = TRUE,
                           search = "grid",
                           verboseIter = TRUE)
# Train a model.
glmnet_model <- train(x_train, y_train$V1,
                      method = "glmnet",
                      metric = "MAE",
                      tuneGrid = tune_grid,
                      trControl = param_tune)
glmnet_model
# Predict and check model accuracy.
glmnet_prediction <- predict(glmnet_model, x_test)
postResample(pred = glmnet_prediction, obs = y_test$V1)
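# Optional: inspect the chosen hyper-parameters and the resampling profile.
# plot() on a train object charts performance across the tuning grid; a
# standard caret call, though not part of the original script.
glmnet_model$bestTune
plot(glmnet_model)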
## Model Training [earth]
# Custom tuning grid: number of retained model terms (nprune) and the
# maximum degree of interaction (degree).
tune_grid <- expand.grid(nprune = 1:10,
                         degree = 1:10)
# Resampling setup: 2-fold cross-validation, repeated 5 times.
param_tune <- trainControl(method = "repeatedcv",
                           number = 2,
                           repeats = 5,
                           trim = TRUE,
                           search = "grid",
                           verboseIter = TRUE)
# Train a model.
earth_model <- train(x_train, y_train$V1,
                     method = "earth",
                     metric = "RMSE",
                     tuneGrid = tune_grid,
                     trControl = param_tune)
# Predict and check model accuracy.
earth_prediction <- predict(earth_model, x_test)
postResample(pred = earth_prediction, obs = y_test$V1)
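# Optional: visualise how RMSE moves across nprune and degree, and check
# the winning combination (standard caret calls, not in the original script).
plot(earth_model)
earth_model$bestTune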
## Model Training [mlpKerasDropout]
# Random hyper-parameter search; the resampling method is left at caret's
# default (25 bootstrap resamples).
param_tune <- trainControl(search = "random")
# Train a model. Early stopping halts training once the loss has not
# improved for 20 epochs and restores the best weights seen.
keras_model <- train(x_train, y_train$V1,
                     method = "mlpKerasDropout",
                     metric = "RMSE",
                     callbacks = list(
                       keras::callback_early_stopping(monitor = "loss",
                                                      mode = "auto",
                                                      patience = 20,
                                                      restore_best_weights = TRUE)
                     ),
                     trControl = param_tune,
                     tuneLength = 3,
                     epochs = 50)
# Predict and check model accuracy.
keras_prediction <- predict(keras_model, x_test)
postResample(pred = keras_prediction, obs = y_test$V1)
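# Optional diagnostic (a minimal sketch, not in the original script):
# predicted vs. observed values on the hold-out set; points close to the
# dashed identity line indicate well-calibrated predictions.
plot(y_test$V1, keras_prediction,
     xlab = "Observed", ylab = "Predicted")
abline(0, 1, lty = 2)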
## Model Training [xgbDART]
# Custom tuning grid. nrounds must be a positive number of boosting
# iterations (0 fits no trees), so a small fixed value is used here; the
# two drop-rate sequences give a 20 x 20 = 400-point grid.
tune_grid <- expand.grid(nrounds = 50,
                         max_depth = 2,
                         eta = 0.0001,
                         gamma = 0,
                         subsample = 0.5,
                         colsample_bytree = 0.5,
                         rate_drop = seq(0.1, 1, length = 20),
                         skip_drop = seq(0.1, 1, length = 20),
                         min_child_weight = 9)
# Resampling setup: 2-fold cross-validation, repeated 5 times.
param_tune <- trainControl(method = "repeatedcv",
                           number = 2,
                           repeats = 5,
                           verboseIter = TRUE)
# Train a model.
xgb_model <- train(x_train, y_train$V1,
                   # xgbTree may be faster.
                   method = "xgbDART",
                   metric = "RMSE",
                   tuneGrid = tune_grid,
                   trControl = param_tune,
                   verbosity = 0)
# Predict and check model accuracy.
xgb_prediction <- predict(xgb_model, x_test)
postResample(pred = xgb_prediction, obs = y_test$V1)
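## Model Comparison
# A closing sketch (not part of the original script): gather the hold-out
# metrics of all four models into one table for a side-by-side view.
test_metrics <- rbind(
  glmnet = postResample(glmnet_prediction, y_test$V1),
  earth  = postResample(earth_prediction, y_test$V1),
  keras  = postResample(keras_prediction, y_test$V1),
  xgb    = postResample(xgb_prediction, y_test$V1)
)
test_metrics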