# 1
# 0
# caret-and-bigstatsr-workshop/bigstatsR_tutorial.R
# 2025-01-10 23:05:22 -05:00
#
# 79 lines
# 2.3 KiB
# R
# Executable File

library(bigreadr)
library(bigstatsr)
library(data.table)
library(impute)
# Count the missing (NA) entries in a set of columns.
#
# X:   a matrix-like object that supports `X[, ind]` indexing.
# ind: the column indices to inspect.
#
# Returns the total number of NA cells found in the selected columns.
check_na <- function(X, ind) {
  selected <- X[, ind]
  missing_mask <- is.na(selected)
  sum(missing_mask)
}
# Fill in missing values for a subset of columns using k-nearest-neighbour
# imputation (impute::impute.knn with k = 50).
#
# X:   a matrix-like object that supports `X[, ind]` indexing.
# ind: the column indices to impute.
#
# Returns the imputed block, in the same orientation as `X[, ind]`, with
# every value rounded to zero decimal places.
impute_func <- function(X, ind) {
  # The selected block is transposed before being handed to impute.knn.
  transposed <- t(X[, ind])
  # rowmax = 1 and colmax = 1 are passed so rows/columns are not excluded
  # from KNN imputation for having too many missing values
  # (NOTE(review): confirm against the impute.knn documentation).
  knn_result <- impute::impute.knn(transposed,
                                   k = 50,
                                   rowmax = 1,
                                   colmax = 1)
  # Transpose back to the original orientation and round to whole numbers.
  round(t(knn_result$data), digits = 0)
}
# ---- Configuration ----
# Paths and tuning constants are declared once so that the read, write and
# cleanup steps always refer to the same locations.
data_dir <- "/home/ami/Projects/P0004 - caret and bigstatsR Workshop"
input_csv <- file.path(data_dir, "Test_Genotype_Data.csv")
output_csv <- file.path(data_dir, "Test_Genotype_Data_Imputed.csv")
chunk_size <- 100 # Number of columns imputed per iteration.
n_cores <- 2      # Cores used by big_apply.

# ---- Read the data ----
# Read only the first row of the file.
# This allows us to obtain the number of columns.
first_row <- fread2(input_csv, nrows = 1)
col_num <- ncol(first_row)

# Read the entire file into a file-backed matrix (FBM) using big_read.
gen_data <- big_read(input_csv,
                     select = seq_len(col_num),
                     backingfile = "G2F_Genotype_Data",
                     progress = TRUE,
                     type = "double")

# ---- Examine the data ----
gen_data
gen_data[1:10, 1:10]
str(gen_data)

# Check how many missing values we have.
# NOTE: is.na() must be applied to the values read out of the FBM (as
# check_na does block by block), not to the FBM object itself, which is only
# a reference to the data on disk.
# "plus" (exported by bigstatsr) is used as the combiner because a bare "+"
# only accepts one or two operands, whereas big_apply may produce more
# partial results than that.
na_before <- big_apply(gen_data,
                       a.FUN = check_na,
                       a.combine = "plus",
                       ncores = n_cores)
print(na_before)

# ---- Impute ----
# Subset the columns into sets of `chunk_size` columns.
# seq_len()/seq_along() are used instead of 1:n so that an empty input gives
# an empty loop rather than iterating over c(1, 0).
sub_idx <- split(seq_len(col_num), ceiling(seq_len(col_num) / chunk_size))
for (i in seq_along(sub_idx)) {
  # Display current subset being evaluated.
  print(i)
  cols <- sub_idx[[i]]
  # Impute the current subset with the shared helper and "save" the imputed
  # data back into the original FBM.
  gen_data[, cols] <- impute_func(gen_data, cols)
}

# Now, verify there are no missing values anymore.
na_after <- big_apply(gen_data,
                      a.FUN = check_na,
                      a.combine = "plus",
                      ncores = n_cores)
print(na_after)

# ---- Write results ----
# Write the newly-imputed data to a new file.
big_write(gen_data,
          output_csv,
          every_nrow = 100,
          progress = interactive())

# ---- Clean up ----
# Remove only the backing files that belong to this FBM. A shell glob such as
# "rm *.bk" / "rm *.rds" would also delete unrelated files in the working
# directory and only works where a POSIX shell is available.
unlink(c(gen_data$backingfile, gen_data$rds))