
Upload files

Emilio Soriano Chávez committed 2025-01-10 23:05:22 -05:00
commit aad558045f
Signed by: ami (SSH key fingerprint: SHA256:sWhVpdjbG+Qc9U4/7pEQbUHnVr3MSfIqu6Xq3YEaQ8I)
9 changed files with 4075 additions and 0 deletions

Models.png: new executable binary file, 84 KiB (binary content not shown).

R_Logo.png: new executable binary file, 242 KiB (binary content not shown).

Test_Genotype_Data.csv: new executable file, 1001 lines added (diff suppressed because one or more lines are too long).

Test_Genotype_Data_Imputed.csv: new executable file, 1000 lines added (diff suppressed because it is too large).

Test_Phenotype_Data.csv: new executable file, 1000 lines added (diff suppressed because it is too large).

bigstatsR_tutorial.R: new executable file, 78 lines added.

@@ -0,0 +1,78 @@
library(bigreadr)
library(bigstatsr)
library(data.table)
library(impute)
# Function for checking the number of missing values in the dataframe.
check_na <- function(X, ind) {
  sum(is.na(X[, ind]))
}
# Impute function to be used in data subsets.
impute_func <- function(X, ind) {
  round(t(impute::impute.knn(t(X[, ind]), k = 50, rowmax = 1, colmax = 1)$data), 0)
}
# Read only the first row of the file.
# This allows us to obtain the number of columns.
first_row <- fread2("/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Genotype_Data.csv",
                    nrows = 1)
col_num <- ncol(first_row)
# Read the entire file using big_read.
gen_data <- big_read("/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Genotype_Data.csv",
                     select = 1:col_num,
                     backingfile = "G2F_Genotype_Data",
                     progress = TRUE,
                     type = "double")
# Examine the data.
gen_data
gen_data[1:10, 1:10]
str(gen_data)
# Note: this naive check does not work on an FBM; use big_apply() below instead.
sum(is.na(gen_data))
# Check how many missing values we have.
big_apply(gen_data,
          a.FUN = check_na,
          a.combine = "+",
          ncores = 2)
# Subset the dataframe into sets of 100 columns.
sub_idx <- split(1:col_num, ceiling(seq_along(1:col_num) / 100))
for (i in seq_along(sub_idx)) {
  # Display current subset being evaluated.
  print(i)
  # Get the Genotype Data for the current subset.
  gen_subset <- gen_data[, sub_idx[[i]]]
  # Impute missing data using KNN clustering.
  gen_imputed <- round(t(impute::impute.knn(t(gen_subset), k = 50, rowmax = 1, colmax = 1)$data), 0)
  # Alternative: apply the imputing function to the current subset with big_apply.
  # gen_imputed <- big_apply(gen_subset,
  #                          a.FUN = impute_func,
  #                          a.combine = "c",
  #                          ncores = 2)
  # "Save" imputed data in the original dataframe.
  gen_data[, sub_idx[[i]]] <- gen_imputed
}
# Now, verify there are no missing values anymore.
big_apply(gen_data,
          a.FUN = check_na,
          a.combine = "+",
          ncores = 2)
# Write the newly-imputed data to a new file.
big_write(gen_data,
          "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Genotype_Data_Imputed.csv",
          every_nrow = 100,
          progress = interactive())
# Remove backing files.
system("rm *.bk")
system("rm *.rds")

caret_tutorial.R: new executable file, 154 lines added.

@@ -0,0 +1,154 @@
library(caret)
library(data.table)
## Data Loading [Predictors]
# Genotype Data
gen_path <- "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Genotype_Data_Imputed.csv"
gen_data <- fread(gen_path, sep = ",", header = FALSE)
## Data Loading [Traits]
# Phenotype Data
phn_path <- "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Phenotype_Data.csv"
phn_data <- fread(phn_path, sep = ",", header = FALSE)
## Data Overview
# Preview of Genotype Data.
str(gen_data)
gen_data[1:10, 1:10]
# Preview of Phenotype Data.
str(phn_data)
phn_data[1:10, 1]
# Find missing data (if any) in the loaded datasets.
sum(is.na(gen_data))
sum(is.na(phn_data))
## Pre-Processing
# Set random seed for reproducibility.
set.seed(226)
# Perform an 80% / 20% split of the data.
# Get the indices for the rows to be used in the training data.
train_index <- createDataPartition(phn_data$V1,
                                   p = 0.8,
                                   list = FALSE)
train_index
# Now, retrieve the corresponding training and testing data.
x_train <- gen_data[train_index,]
x_test <- gen_data[-train_index,]
y_train <- phn_data[train_index,]
y_test <- phn_data[-train_index,]
## Model Training [glmnet]
# Define a custom tuning grid.
tune_grid <- expand.grid(alpha = seq(0.0001, 1, length = 5),
                         lambda = 5)
# Parameter tuning.
param_tune <- trainControl(method = "repeatedcv",
                           number = 2,
                           repeats = 5,
                           trim = TRUE,
                           search = "grid",
                           verboseIter = TRUE)
# Train a model.
glmnet_model <- train(x_train, y_train$V1,
                      method = "glmnet",
                      metric = "MAE",
                      tuneGrid = tune_grid,
                      trControl = param_tune)
glmnet_model
# Predict and check model accuracy.
glmnet_prediction <- predict(glmnet_model, x_test)
postResample(pred = glmnet_prediction, obs = y_test$V1)
## Model Training [earth]
# Custom tuning grid.
tune_grid <- expand.grid(nprune = 1:10,
                         degree = 1:10)
# Parameter tuning.
param_tune <- trainControl(method = "repeatedcv",
                           number = 2,
                           repeats = 5,
                           trim = TRUE,
                           search = "grid",
                           verboseIter = TRUE)
# Train a model.
earth_model <- train(x_train, y_train$V1,
                     method = "earth",
                     metric = "RMSE",
                     tuneGrid = tune_grid,
                     trControl = param_tune)
# Predict and check model accuracy.
earth_prediction <- predict(earth_model, x_test)
postResample(pred = earth_prediction, obs = y_test$V1)
## Model Training [mlpKerasDropout]
# Parameter tuning.
param_tune <- trainControl(search = "random")
# Train a model.
keras_model <- train(x_train, y_train$V1,
                     method = "mlpKerasDropout",
                     metric = "RMSE",
                     callbacks = list(
                       keras::callback_early_stopping(monitor = "loss",
                                                      mode = "auto",
                                                      patience = 20,
                                                      restore_best_weights = TRUE)
                     ),
                     trControl = param_tune,
                     tuneLength = 3,
                     epochs = 50)
# Predict and check model accuracy.
keras_prediction <- predict(keras_model, x_test)
postResample(pred = keras_prediction, obs = y_test$V1)
## Model Training [xgbDART]
# Custom tuning grid.
tune_grid <- expand.grid(nrounds = 0,
                         max_depth = 2,
                         eta = 0.0001,
                         gamma = 0,
                         subsample = 0.5,
                         colsample_bytree = 0.5,
                         rate_drop = seq(0.1, 1, length = 20),
                         skip_drop = seq(0.1, 1, length = 20),
                         min_child_weight = 9)
# Parameter tuning.
param_tune <- trainControl(method = "repeatedcv",
                           number = 2,
                           repeats = 5,
                           verboseIter = TRUE)
# Train a model.
xgb_model <- train(x_train, y_train$V1,
                   # xgbTree may be faster.
                   method = "xgbDART",
                   metric = "RMSE",
                   tuneGrid = tune_grid,
                   trControl = param_tune,
                   verbosity = 0)
# Predict and check model accuracy.
xgb_prediction <- predict(xgb_model, x_test)
postResample(pred = xgb_prediction, obs = y_test$V1)

slides.html: new executable file, 408 lines added (diff suppressed because one or more lines are too long).

slides.md: new executable file, 434 lines added.

@@ -0,0 +1,434 @@
---
marp: true
---
<!-- theme: default -->
![width:150px](R_Logo.png)
# `caret` & `bigstatsR` Workshop
---
# Required Libraries
```r
# "bigstatsR" and related packages.
install.packages("bigstatsr")
install.packages("bigreadr")
install.packages("data.table")
# "impute" library.
install.packages("BiocManager")
BiocManager::install("impute")
# "caret" library and sample models.
install.packages("caret")
install.packages("glmnet")
```
---
# Getting Started
- All files used in the workshop can be downloaded here:
- https://cloud.ami.sc/s/gABPQGrrLa6cCiy
- Throughout the workshop, you will need to fill out the missing code in the scripts, marked as `(...)`.
---
![width:150px](R_Logo.png)
# `bigstatsR` Package
> Provides functions for fast statistical analysis of large-scale data encoded as matrices. The package can handle matrices that are too large to fit in memory thanks to memory-mapping to binary files on disk.
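- As a quick illustration (a minimal sketch, not part of the workshop files), an FBM is backed by a file on disk but is indexed like an ordinary matrix:
```r
library(bigstatsr)
# Create a small on-disk matrix and read a block of it back.
X <- FBM(nrow = 10, ncol = 5, init = rnorm(50))
X[1:3, 1:2]
```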
---
# Reading Files with `bigstatsR` [1]
```r
# Read only the first row of the file.
# This allows us to obtain the number of columns.
first_row <- fread2("Test_Genotype_Data.csv",
                    nrows = 1)
col_num <- ncol(first_row)
# Read the entire file using big_read.
gen_data <- big_read("Test_Genotype_Data.csv", # Our large data file.
                     select = 1:col_num,       # The columns we want to read.
                     backingfile = "Data",     # Path to the backing file.
                     progress = TRUE,          # Show progress.
                     type = "double")          # Data type of the contents.
```
---
# Reading Files with `bigstatsR` [2]
- Our file will be read as a Filebacked Big Matrix (FBM).
- After executing the code, we should see the following:
```r
> gen_data
A Filebacked Big Matrix of type 'double' with 1000 rows and 1000 columns.
```
- Be careful to remove the backing files before trying to re-read the same file.
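- A minimal sketch of that cleanup (assuming the backing file was named `Data`, as in the example above):
```r
# Delete stale backing files before calling big_read() on the same file again.
unlink(c("Data.bk", "Data.rds"))
```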
---
# Working with FBMs [1]
- We can interact with FBMs *in a similar manner* to normal dataframes.
```r
# Examine the data.
gen_data
gen_data[1:10, 1:10]
str(gen_data)
```
---
# Working with FBMs [2]
- Functions need to be applied differently.
- Say we wanted to find out the number of missing values in our data.
- The following approach does not work anymore:
```r
# Will not work!
sum(is.na(gen_data))
```
---
# Applying Functions [1]
- To apply a function to an FBM, we need to define it separately and use `big_apply`.
- Our function **must** take the following as arguments:
- An FBM object: `X`
- A vector of indices: `ind`
```r
# Function for checking the number of missing values in the dataframe.
check_na <- function(X, ind) {
  sum(is.na(X[, ind]))
}
```
- `big_apply` will take care of splitting our data into subsets, applying our function to each subset, and combining the results.
---
# Applying Functions [2]
- Now, we can find the number of missing values in our dataset.
```r
# Check how many missing values we have.
big_apply(gen_data,          # FBM to which we want to apply the function.
          a.FUN = check_na,  # The function that we want to apply.
          a.combine = "+",   # How the results of each subset should be combined.
          ncores = 2)        # Number of CPU cores to use.
```
---
# Applying Functions [3]
- If we *really* want to, we can manually create subsets and apply functions using a `for` loop.
- First, we must get the indices for each subset:
```r
# Subset the dataframe into sets of 100 columns.
sub_idx <- split(1:col_num, ceiling(seq_along(1:col_num) / 100))
```
---
# Applying Functions [4]
- Now, we can extract our subsets and apply our function:
```r
for (i in seq_along(sub_idx)) {
  # Display current subset being evaluated.
  print(i)
  # Get the Genotype Data for the current subset.
  gen_subset <- gen_data[, sub_idx[[i]]]
  # Impute missing data using KNN clustering.
  gen_imputed <- round(
    t(impute::impute.knn(t(gen_subset), k = 50, rowmax = 1, colmax = 1)$data), 0)
  # "Write back" imputed data to the FBM.
  gen_data[, sub_idx[[i]]] <- gen_imputed
}
```
---
# Applying Functions [5]
- Check that the `impute` function worked:
```r
# Now, verify there are no missing values anymore.
big_apply(gen_data,
          a.FUN = check_na,
          a.combine = "+",
          ncores = 2)
```
---
# Saving FBMs
- How do we save our modified data back to a `.csv` file? Easy, use `big_write`:
```r
# Write the newly-imputed data to a new file.
big_write(gen_data,                         # Our FBM object.
          "Test_Genotype_Data_Imputed.csv", # Name of the file.
          every_nrow = 100,                 # How many rows do we want to save at once.
          progress = interactive())         # Show a fancy progress bar.
```
---
# Removing Backing Files
- `big_read` creates backing files that are of no further use once we have saved our output.
- They can be safely removed:
```r
# Remove backing files.
system("rm *.bk")
system("rm *.rds")
```
---
![width:150px](R_Logo.png)
# `caret` Package
> Short for Classification And REgression Training. It is a set of functions that attempt to streamline the process for creating predictive models. The package started off as a way to provide a uniform interface for modeling functions, as well as a way to standardize common tasks (such as parameter tuning and variable importance).
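- For example (a minimal sketch using a built-in dataset, not the workshop data), the same `train()`, `predict()`, and `varImp()` calls work regardless of the underlying model:
```r
library(caret)
# Fit a linear model with 5-fold cross-validation on mtcars.
fit <- train(mpg ~ ., data = mtcars, method = "lm",
             trControl = trainControl(method = "cv", number = 5))
predict(fit, head(mtcars))  # Uniform prediction interface.
varImp(fit)                 # Uniform variable-importance interface.
```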
---
# Loading Data
- First, let's load our newly-imputed Genotype Data:
```r
## Data Loading [Predictors]
# Genotype Data
gen_path <- "Test_Genotype_Data_Imputed.csv"
gen_data <- fread(gen_path, sep = ",", header = FALSE)
```
- Let's load our Phenotype Data as well:
```r
## Data Loading [Traits]
# Phenotype Data
phn_path <- "Test_Phenotype_Data.csv"
phn_data <- fread(phn_path, sep = ",", header = FALSE)
```
---
# Data Overview
- Our Genotype Data is a large matrix of $0$s and $1$s:
```r
# Preview of Genotype Data.
str(gen_data)
gen_data[1:10, 1:10]
```
- Our Phenotype Data, which we want to predict, is a single column of `Plant Height (cm)` measurements:
```r
# Preview of Phenotype Data.
str(phn_data)
phn_data[1:10, 1]
```
---
# The `createDataPartition` Function [1]
- We can easily split our data into training and testing sets as follows:
```r
# We want 80% of our data for training and 20% for testing.
# Get the indices for the rows to be used in the training data.
train_index <- createDataPartition(
  phn_data$V1,  # Our Phenotype Data.
  p = 0.8,      # Percentage of data to be used for training.
  list = FALSE) # To return a matrix instead of a list.
# Check out which rows will be used for training.
train_index
```
---
# The `createDataPartition` Function [2]
- Now that we know which indices will be used for training, we can extract the corresponding data:
```r
# The values used as predictors [x].
x_train <- gen_data[train_index,]
x_test <- gen_data[-train_index,]
# The values to be predicted [y].
y_train <- phn_data[train_index,]
y_test <- phn_data[-train_index,]
```
---
# Model Training and Tuning [1]
> ![width:50px](R_Logo.png) `glmnet`
> ***
> Fits generalized linear and similar models via penalized maximum likelihood. The regularization path is computed for the lasso or elastic net penalty at a grid of values (on the log scale) for the regularization parameter `lambda`.
<br />
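- Used directly (a minimal sketch on toy data, not the workshop files), `glmnet` asks us to choose `alpha` and `lambda` ourselves:
```r
library(glmnet)
# Toy data: 100 observations, 20 predictors.
x <- matrix(rnorm(100 * 20), 100, 20)
y <- rnorm(100)
fit <- glmnet(x, y, alpha = 0.5)  # Elastic-net penalty.
coef(fit, s = 5)                  # Coefficients at lambda = 5.
```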
- We can perform this tuning automatically with `caret`. Let's first create a custom parameter grid:
```r
# Define a custom tuning grid.
tune_grid <- expand.grid(alpha = seq(0.0001, 1, length = 5), # Values to try for "alpha".
                         lambda = 5)                         # Values to try for "lambda".
```
---
# Model Training and Tuning [2]
- The `trainControl` function allows us to further customize how our model is made:
```r
# Parameter tuning.
param_tune <- trainControl(method = "repeatedcv", # Method to be used for resampling.
                           number = 2,            # Number of folds (subsets) for cross-validation.
                           repeats = 5,           # Number of iterations for cross-validation.
                           trim = TRUE,           # Reduces memory consumption.
                           search = "grid",       # How the tuning grid should be made.
                           verboseIter = TRUE)    # To display additional information.
```
- Other methods are available, such as a single cross-validation (`cv`), a leave-one-out cross-validation (`LOOCV`), or `none`, which will only train a single model.
- It is also possible to specify a `random` tuning grid.
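- A sketch of those alternatives (same `trainControl()` interface; the fold counts are only examples):
```r
# Single 5-fold cross-validation.
param_tune_cv <- trainControl(method = "cv", number = 5)
# Leave-one-out cross-validation.
param_tune_loo <- trainControl(method = "LOOCV")
# Random search over the tuning parameters instead of a fixed grid.
param_tune_rand <- trainControl(method = "cv", number = 5, search = "random")
```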
---
# Model Training and Tuning [3]
- Now, we can finally train our model:
```r
# Train a model.
glmnet_model <- train(x_train,                # Our predictors.
                      y_train$V1,             # What we want to predict.
                      method = "glmnet",      # The model we want to use.
                      metric = "MAE",         # The metric used to pick the best model.
                      tuneGrid = tune_grid,   # Our custom tuning grid.
                      trControl = param_tune) # Our custom tuning method.
```
---
# Model Evaluation [1]
```r
> glmnet_model
800 samples
1000 predictors
No pre-processing
Resampling: Cross-Validated (2 fold, repeated 5 times)
Summary of sample sizes: 400, 400, 400, 400, 400, 400, ...
Resampling results across tuning parameters:
  alpha     RMSE      Rsquared     MAE
  0.000100  61.23760  0.002033470  52.85568
  0.250075  61.83544  0.002543141  52.85747
  0.500050  60.02843  0.002118089  52.03982
  0.750025  59.16487  0.001651774  51.61048
  1.000000  58.81907  0.002781994  51.40225
Tuning parameter 'lambda' was held constant at a value of 5
MAE was used to select the optimal model using the smallest value.
The final values used for the model were alpha = 1 and lambda = 5.
---
# Model Evaluation [2]
- Now, let's predict on our testing data:
```r
# Predict on testing data.
glmnet_prediction <- predict(glmnet_model, x_test)
# Check prediction accuracy.
postResample(pred = glmnet_prediction, # Our predicted values.
             obs = y_test$V1)          # Our expected values.
```
- This gives us a nice summary of the prediction:
```text
RMSE Rsquared MAE
5.649786e+01 6.306793e-06 4.883650e+01
```
---
# More Models
- `caret` supports *a lot* of models (last time I checked, $238$).
- More examples are given in the file for you to try:
- `earth`
- `keras`
- `xgboost`
- You can also define your own models.
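- A quick way to browse what is registered (a small sketch, not part of the workshop scripts):
```r
library(caret)
length(getModelInfo())   # Number of registered model specifications.
modelLookup("glmnet")    # Tunable parameters for a given method.
```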
---
![width:650px](Models.png)
---
# References
- `bigstatsR` Package:
[1] https://github.com/privefl/bigstatsr
[2] https://privefl.github.io/bigstatsr/articles/read-FBM-from-file.html
- `caret` Package:
[1] https://topepo.github.io/caret/
[2] https://topepo.github.io/caret/available-models.html
---
# Thank You!