1
0
caret-and-bigstatsr-workshop/caret_tutorial.R
2025-01-10 23:05:22 -05:00

155 lines
4.7 KiB
R
Executable File

library(caret)
library(data.table)
## Data Loading [Predictors]
# Genotype Data
gen_path <- "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Genotype_Data_Imputed.csv"
# Pass the path through `file =` instead of positionally. fread()'s first
# argument (`input`) interprets a string that contains spaces and does not
# name an existing file as a shell command; this path contains spaces, so a
# missing file should fail with a clear "file does not exist" error instead.
# header = FALSE: the first row is data, so columns get default names
# (V1, V2, ...).
gen_data <- fread(file = gen_path, sep = ",", header = FALSE)

## Data Loading [Traits]
# Phenotype Data
phn_path <- "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Phenotype_Data.csv"
# Same reasoning as above for `file =`. The trait column is referred to as
# V1 by the rest of the script.
phn_data <- fread(file = phn_path, sep = ",", header = FALSE)

## Data Overview
# Preview of Genotype Data: structure, then the first 10 rows x 10 columns.
str(gen_data)
gen_data[1:10, 1:10]
# Preview of Phenotype Data: structure, then the first 10 rows of column 1.
str(phn_data)
phn_data[1:10, 1]
# Count missing values in each loaded dataset (0 means none).
sum(is.na(gen_data))
sum(is.na(phn_data))
## Pre-Processing
# The split index below is computed from the phenotype table but applied to
# both tables, so they must at least have the same number of rows. Fail
# early rather than silently pairing predictors with the wrong traits.
stopifnot(nrow(gen_data) == nrow(phn_data))

# Set random seed for reproducibility.
set.seed(226)

# Perform a 80% / 20% split of the data.
# Get the index for the rows to be used in the training data.
train_index <- createDataPartition(phn_data$V1,
                                   p = 0.8,
                                   list = FALSE)
train_index

# With list = FALSE the index comes back as a one-column matrix. Flatten it
# to a plain vector so row selection and negative indexing do not depend on
# how the table class treats matrix indices.
train_rows <- as.vector(train_index)

# Now, retrieve the corresponding training and testing data.
# Rows listed in train_rows form the training set; all other rows form the
# test set.
x_train <- gen_data[train_rows, ]
x_test <- gen_data[-train_rows, ]
y_train <- phn_data[train_rows, ]
y_test <- phn_data[-train_rows, ]
## Model Training [glmnet]
# Define a custom tuning grid: five evenly spaced alpha values between
# 0.0001 and 1, each combined with a single fixed lambda of 5.
# `length.out` is spelled out in full instead of relying on partial
# argument matching of `length`.
tune_grid <- expand.grid(alpha = seq(0.0001, 1, length.out = 5),
                         lambda = 5)

# Parameter tuning: 2-fold cross-validation repeated 5 times, grid search,
# with progress printed for each resampling iteration.
param_tune <- trainControl(method = "repeatedcv",
                           number = 2,
                           repeats = 5,
                           trim = TRUE,
                           search = "grid",
                           verboseIter = TRUE)

# Train a model on the training split, comparing grid candidates by MAE.
glmnet_model <- train(x_train, y_train$V1,
                      method = "glmnet",
                      metric = "MAE",
                      tuneGrid = tune_grid,
                      trControl = param_tune)
glmnet_model

# Predict on the held-out split and check model accuracy against the
# observed trait values.
glmnet_prediction <- predict(glmnet_model, x_test)
postResample(pred = glmnet_prediction, obs = y_test$V1)
## Model Training [earth]
# Resampling setup: repeated cross-validation (2 folds, 5 repeats) over a
# grid of candidates, logging each iteration as it runs.
param_tune <- trainControl(
  method = "repeatedcv",
  number = 2,
  repeats = 5,
  search = "grid",
  trim = TRUE,
  verboseIter = TRUE
)

# Candidate grid: every pairing of nprune in 1..10 with degree in 1..10
# (100 rows in total).
tune_grid <- expand.grid(
  nprune = seq_len(10),
  degree = seq_len(10)
)

# Fit the earth model on the training split, ranking candidates by RMSE.
earth_model <- train(
  x = x_train,
  y = y_train$V1,
  method = "earth",
  metric = "RMSE",
  trControl = param_tune,
  tuneGrid = tune_grid
)

# Score the fitted model on the held-out split.
earth_prediction <- predict(earth_model, newdata = x_test)
postResample(pred = earth_prediction, obs = y_test$V1)
## Model Training [mlpKerasDropout]
# Resampling setup: random search over the tuning parameters; every other
# trainControl() option is left at its default.
param_tune <- trainControl(search = "random")

# Fit the network on the training split, ranking candidates by RMSE.
# tuneLength = 3 bounds how many candidate parameter sets are tried.
# `callbacks` and `epochs` are not train() arguments of their own; they are
# forwarded through `...`. The single callback stops a fit early when the
# monitored "loss" has not improved for 20 epochs and restores the best
# weights seen.
keras_model <- train(
  x = x_train,
  y = y_train$V1,
  method = "mlpKerasDropout",
  metric = "RMSE",
  trControl = param_tune,
  tuneLength = 3,
  callbacks = list(
    keras::callback_early_stopping(
      monitor = "loss",
      mode = "auto",
      patience = 20,
      restore_best_weights = TRUE
    )
  ),
  epochs = 50
)

# Score the fitted model on the held-out split.
keras_prediction <- predict(keras_model, newdata = x_test)
postResample(pred = keras_prediction, obs = y_test$V1)
## Model Training [xgbDART]
# Custom tuning grid. Only rate_drop and skip_drop vary: each takes 20
# evenly spaced values between 0.1 and 1, giving 20 x 20 = 400 candidates.
# All other parameters are held at a single value.
# `length.out` is spelled out in full instead of relying on partial
# argument matching of `length`.
# NOTE(review): nrounds = 0 asks for zero boosting rounds, which looks
# unintended for a model that is then evaluated -- confirm the value that
# was meant (left unchanged here because it is a modelling decision).
tune_grid <- expand.grid(nrounds = 0,
                         max_depth = 2,
                         eta = 0.0001,
                         gamma = 0,
                         subsample = 0.5,
                         colsample_bytree = 0.5,
                         rate_drop = seq(0.1, 1, length.out = 20),
                         skip_drop = seq(0.1, 1, length.out = 20),
                         min_child_weight = 9)

# Parameter tuning: 2-fold cross-validation repeated 5 times, with progress
# printed for each resampling iteration.
param_tune <- trainControl(method = "repeatedcv",
                           number = 2,
                           repeats = 5,
                           verboseIter = TRUE)

# Train a model on the training split, comparing grid candidates by RMSE.
# `verbosity = 0` is forwarded through `...` (presumably to silence the
# underlying xgboost logging -- confirm).
xgb_model <- train(x_train, y_train$V1,
                   # xgbTree may be faster.
                   method = "xgbDART",
                   metric = "RMSE",
                   tuneGrid = tune_grid,
                   trControl = param_tune,
                   verbosity = 0)

# Predict on the held-out split and check model accuracy against the
# observed trait values.
xgb_prediction <- predict(xgb_model, x_test)
postResample(pred = xgb_prediction, obs = y_test$V1)