155 lines
4.7 KiB
R
Executable File
155 lines
4.7 KiB
R
Executable File
library(caret)
|
|
library(data.table)
|
|
|
|
## Data Loading [Predictors]

# Genotype Data
# Both files are read with header = FALSE, so the first row is treated as
# data and the columns get fread()'s default names (V1, V2, ...); later
# steps rely on this (e.g. `phn_data$V1`).
# The path is passed via `file =` rather than positionally: it contains
# spaces, and fread()'s generic `input` argument may interpret such a string
# as a shell command when no file with that name exists.
gen_path <- "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Genotype_Data_Imputed.csv"
gen_data <- fread(file = gen_path, sep = ",", header = FALSE)

## Data Loading [Traits]

# Phenotype Data
# Same reading options as the genotype file; `file =` is explicit for the
# same reason.
phn_path <- "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Phenotype_Data.csv"
phn_data <- fread(file = phn_path, sep = ",", header = FALSE)
|
|
|
|
## Data Overview

# Genotype data: print a structure summary, then rows 1-10 of columns 1-10.
str(gen_data)
gen_data[1:10, 1:10]

# Phenotype data: print a structure summary, then rows 1-10 of column 1.
str(phn_data)
phn_data[1:10, 1]

# Total number of missing (NA) cells in each loaded table.
sum(is.na(gen_data))
sum(is.na(phn_data))
|
|
|
|
## Pre-Processing

# The split below applies the same row indices to the genotype and the
# phenotype table, so both must hold one row per sample. Only the row count
# can be verified here; stop early if it differs instead of silently
# producing misaligned predictors and responses.
stopifnot(nrow(gen_data) == nrow(phn_data))

# Set random seed for reproducibility.
set.seed(226)

# Perform a 80% / 20% split of the data.
# Get the index for the rows to be used in the training data.
# list = FALSE returns the indices as a matrix rather than a list.
train_index <- createDataPartition(phn_data$V1,
                                   p = 0.8,
                                   list = FALSE)
train_index

# Now, retrieve the corresponding training and testing data.
# Rows listed in train_index form the training set; all remaining rows
# (negative indexing) form the test set.
x_train <- gen_data[train_index, ]
x_test <- gen_data[-train_index, ]
y_train <- phn_data[train_index, ]
y_test <- phn_data[-train_index, ]
|
|
|
|
## Model Training [glmnet]

# Define a custom tuning grid.
#   alpha  - 5 evenly spaced values from 0.0001 to 1.
#   lambda - a single fixed value (5), so the grid has 5 rows in total.
# `length.out` is written in full instead of relying on partial matching of
# the abbreviated `length`.
tune_grid <- expand.grid(
  alpha = seq(0.0001, 1, length.out = 5),
  lambda = 5
)
|
|
|
|
# Resampling setup: 2-fold cross-validation repeated 5 times, searching over
# a grid, with progress printed for each iteration.
param_tune <- trainControl(
  method = "repeatedcv",
  number = 2,
  repeats = 5,
  search = "grid",
  trim = TRUE,
  verboseIter = TRUE
)

# Fit glmnet on the training split; grid candidates are compared by MAE.
glmnet_model <- train(
  x = x_train,
  y = y_train$V1,
  method = "glmnet",
  metric = "MAE",
  trControl = param_tune,
  tuneGrid = tune_grid
)

# Print the fitted caret object (resampling summary).
glmnet_model

# Predict on the held-out split and compare against the observed values.
glmnet_prediction <- predict(glmnet_model, newdata = x_test)
postResample(pred = glmnet_prediction, obs = y_test$V1)
|
|
|
|
## Model Training [earth]

# Tuning grid: every combination of nprune in 1..10 with degree in 1..10,
# giving 100 candidate settings.
tune_grid <- expand.grid(
  nprune = seq_len(10),
  degree = seq_len(10)
)
|
|
|
|
# Resampling setup: 2-fold cross-validation repeated 5 times, searching over
# a grid, with progress printed for each iteration.
param_tune <- trainControl(
  method = "repeatedcv",
  number = 2,
  repeats = 5,
  search = "grid",
  trim = TRUE,
  verboseIter = TRUE
)

# Fit the earth model on the training split; grid candidates are compared
# by RMSE.
earth_model <- train(
  x = x_train,
  y = y_train$V1,
  method = "earth",
  metric = "RMSE",
  trControl = param_tune,
  tuneGrid = tune_grid
)

# Predict on the held-out split and compare against the observed values.
earth_prediction <- predict(earth_model, newdata = x_test)
postResample(pred = earth_prediction, obs = y_test$V1)
|
|
|
|
## Model Training [mlpKerasDropout]

# Resampling setup: trainControl() defaults, except that tuning candidates
# are drawn by random search.
param_tune <- trainControl(search = "random")

# Fit the Keras model: 3 tuning candidates (tuneLength), compared by RMSE,
# each trained for up to 50 epochs. The early-stopping callback monitors
# "loss" with a patience of 20 and restores the best weights.
keras_model <- train(
  x = x_train,
  y = y_train$V1,
  method = "mlpKerasDropout",
  metric = "RMSE",
  trControl = param_tune,
  tuneLength = 3,
  epochs = 50,
  callbacks = list(
    keras::callback_early_stopping(
      monitor = "loss",
      mode = "auto",
      patience = 20,
      restore_best_weights = TRUE
    )
  )
)

# Predict on the held-out split and compare against the observed values.
keras_prediction <- predict(keras_model, newdata = x_test)
postResample(pred = keras_prediction, obs = y_test$V1)
|
|
|
|
## Model Training [xgbDART]

# Custom tuning grid.
# Only rate_drop and skip_drop vary (20 evenly spaced values each from 0.1
# to 1, i.e. 400 combinations); every other parameter is held fixed.
# nrounds is the number of boosting rounds and has to be at least 1 for any
# trees to be fitted, so it is set to a positive value here.
# NOTE(review): with eta = 0.0001 each round changes the fit very little;
# nrounds and eta probably deserve tuning together - confirm intended values.
# `length.out` is written in full instead of the abbreviated `length`.
tune_grid <- expand.grid(
  nrounds = 50,
  max_depth = 2,
  eta = 0.0001,
  gamma = 0,
  subsample = 0.5,
  colsample_bytree = 0.5,
  rate_drop = seq(0.1, 1, length.out = 20),
  skip_drop = seq(0.1, 1, length.out = 20),
  min_child_weight = 9
)
|
|
|
|
# Resampling setup: 2-fold cross-validation repeated 5 times, with progress
# printed for each iteration.
param_tune <- trainControl(
  method = "repeatedcv",
  number = 2,
  repeats = 5,
  verboseIter = TRUE
)

# Fit the boosted model on the training split; grid candidates are compared
# by RMSE. verbosity = 0 is handed on through train()'s `...`.
xgb_model <- train(
  x = x_train,
  y = y_train$V1,
  # xgbTree may be faster.
  method = "xgbDART",
  metric = "RMSE",
  trControl = param_tune,
  tuneGrid = tune_grid,
  verbosity = 0
)

# Predict on the held-out split and compare against the observed values.
xgb_prediction <- predict(xgb_model, newdata = x_test)
postResample(pred = xgb_prediction, obs = y_test$V1)
|