library(caret)
library(data.table)

## Data Loading [Predictors]

# Genotype data.
gen_path <- "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Genotype_Data_Imputed.csv"
gen_data <- fread(gen_path, sep = ",", header = FALSE)

## Data Loading [Traits]

# Phenotype data.
phn_path <- "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Phenotype_Data.csv"
phn_data <- fread(phn_path, sep = ",", header = FALSE)

## Data Overview

# Preview of the genotype data.
str(gen_data)
gen_data[1:10, 1:10]

# Preview of the phenotype data.
str(phn_data)
phn_data[1:10, 1]

# Count missing values (if any) in the loaded datasets.
sum(is.na(gen_data))
sum(is.na(phn_data))

## Pre-Processing

# Set the random seed for reproducibility.
set.seed(226)

# Perform an 80% / 20% split of the data.
# createDataPartition() returns a one-column matrix when list = FALSE; take
# the first column so data.table's `[` accepts it as a row index (indexing a
# data.table with a matrix is an error).
train_index <- createDataPartition(phn_data$V1, p = 0.8, list = FALSE)[, 1]
train_index

# Now, retrieve the corresponding training and testing data.
x_train <- gen_data[train_index, ]
x_test  <- gen_data[-train_index, ]
y_train <- phn_data[train_index, ]
y_test  <- phn_data[-train_index, ]

## Model Training [glmnet]

# Define a custom tuning grid: five values of the mixing parameter alpha,
# with the penalty fixed at lambda = 5.
tune_grid <- expand.grid(alpha  = seq(0.0001, 1, length = 5),
                         lambda = 5)

# Resampling setup: 2-fold cross-validation, repeated 5 times.
param_tune <- trainControl(method      = "repeatedcv",
                           number      = 2,
                           repeats     = 5,
                           trim        = TRUE,
                           search      = "grid",
                           verboseIter = TRUE)

# Train a model.
glmnet_model <- train(x_train, y_train$V1,
                      method    = "glmnet",
                      metric    = "MAE",
                      tuneGrid  = tune_grid,
                      trControl = param_tune)
glmnet_model

# Predict on the held-out test set and check model accuracy.
glmnet_prediction <- predict(glmnet_model, x_test)
postResample(pred = glmnet_prediction, obs = y_test$V1)

## Model Training [earth]

# Custom tuning grid: nprune (number of retained model terms) and degree
# (maximum interaction degree).
tune_grid <- expand.grid(nprune = 1:10,
                         degree = 1:10)

# Resampling setup: 2-fold cross-validation, repeated 5 times.
param_tune <- trainControl(method      = "repeatedcv",
                           number      = 2,
                           repeats     = 5,
                           trim        = TRUE,
                           search      = "grid",
                           verboseIter = TRUE)

# Train a model.
earth_model <- train(x_train, y_train$V1,
                     method    = "earth",
                     metric    = "RMSE",
                     tuneGrid  = tune_grid,
                     trControl = param_tune)

# Predict and check model accuracy.
earth_prediction <- predict(earth_model, x_test)
postResample(pred = earth_prediction, obs = y_test$V1)

## Model Training [mlpKerasDropout]

# Requires the keras package (and a working TensorFlow backend).
# Resampling setup: random search over the tuning space.
param_tune <- trainControl(search = "random")

# Train a model. Early stopping halts training once the loss stops improving
# and restores the best weights seen so far.
keras_model <- train(x_train, y_train$V1,
                     method = "mlpKerasDropout",
                     metric = "RMSE",
                     callbacks = list(
                       keras::callback_early_stopping(monitor = "loss",
                                                      mode = "auto",
                                                      patience = 20,
                                                      restore_best_weights = TRUE)
                     ),
                     trControl  = param_tune,
                     tuneLength = 3,
                     epochs     = 50)

# Predict and check model accuracy.
keras_prediction <- predict(keras_model, x_test)
postResample(pred = keras_prediction, obs = y_test$V1)

## Model Training [xgbDART]

# Custom tuning grid over the DART drop-out parameters. Note: the original
# grid had nrounds = 0, which fits no boosting rounds at all; nrounds must be
# at least 1, so a modest fixed value is used here.
tune_grid <- expand.grid(nrounds          = 100,
                         max_depth        = 2,
                         eta              = 0.0001,
                         gamma            = 0,
                         subsample        = 0.5,
                         colsample_bytree = 0.5,
                         rate_drop        = seq(0.1, 1, length = 20),
                         skip_drop        = seq(0.1, 1, length = 20),
                         min_child_weight = 9)

# Resampling setup: 2-fold cross-validation, repeated 5 times.
param_tune <- trainControl(method      = "repeatedcv",
                           number      = 2,
                           repeats     = 5,
                           verboseIter = TRUE)

# Train a model. (method = "xgbTree" may be faster if DART's drop-out is not
# needed.)
xgb_model <- train(x_train, y_train$V1,
                   method    = "xgbDART",
                   metric    = "RMSE",
                   tuneGrid  = tune_grid,
                   trControl = param_tune,
                   verbosity = 0)

# Predict and check model accuracy.
xgb_prediction <- predict(xgb_model, x_test)
postResample(pred = xgb_prediction, obs = y_test$V1)
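
## Model Comparison [Test Set]

# A minimal sketch, not part of the original workflow: collect the
# postResample() metrics of the four fitted models into a single table so the
# test-set results can be compared side by side. Assumes all four models
# above trained successfully.
test_metrics <- rbind(glmnet = postResample(pred = glmnet_prediction, obs = y_test$V1),
                      earth  = postResample(pred = earth_prediction,  obs = y_test$V1),
                      keras  = postResample(pred = keras_prediction,  obs = y_test$V1),
                      xgb    = postResample(pred = xgb_prediction,    obs = y_test$V1))
test_metrics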
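
## Tuning Diagnostics

# A sketch of an optional follow-up: plot() on a caret train object shows the
# resampled performance profile across the tuning grid, and $bestTune holds
# the selected hyper-parameters. Shown for the glmnet fit; the same calls
# work for the other models.
plot(glmnet_model)
glmnet_model$bestTune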
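
## Variable Importance

# A sketch, assuming the glmnet fit above: varImp() ranks the predictors (for
# glmnet, by absolute coefficient magnitude), which can help flag influential
# markers; plot() of the result shows the top-ranked ones.
glmnet_importance <- varImp(glmnet_model)
plot(glmnet_importance, top = 20)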