library(bigreadr)
library(bigstatsr)
library(data.table)
library(impute)

# Settings ----

# Input / output locations and the number of columns imputed per pass.
data_dir <- "/home/ami/Projects/P0004 - caret and bigstatsR Workshop"
input_file <- file.path(data_dir, "Test_Genotype_Data.csv")
output_file <- file.path(data_dir, "Test_Genotype_Data_Imputed.csv")
chunk_size <- 100L

# Helpers ----

# Count the missing values in columns `ind` of `X`.
# Has the (X, ind) signature that big_apply() expects from `a.FUN`.
check_na <- function(X, ind) {
  sum(is.na(X[, ind]))
}

# Impute the missing values in columns `ind` of `X` with k-nearest
# neighbours and return the imputed block, rounded to whole numbers.
# impute.knn() is given the transposed block, so the columns of `X` are the
# rows that get imputed; the result is transposed back to the shape of the
# input block. rowmax = 1 and colmax = 1 mean no row or column is rejected
# for having too many missing values.
impute_func <- function(X, ind) {
  block <- X[, ind, drop = FALSE]
  imputed <- impute::impute.knn(t(block), k = 50, rowmax = 1, colmax = 1)$data
  round(t(imputed), 0)
}

# Total number of missing values in `X`, counted block by block.
# The per-block counts are combined with bigstatsr's plus(): big_apply()
# combines through do.call(), and `+` itself accepts at most two arguments,
# so it breaks as soon as the matrix is split into more than two blocks.
count_missing <- function(X) {
  big_apply(X, a.FUN = check_na, a.combine = plus, ncores = 2)
}

# Read the data ----

# Read only the first row of the file to obtain the number of columns.
first_row <- fread2(input_file, nrows = 1)
col_num <- ncol(first_row)

# Read the entire file into a file-backed matrix.
gen_data <- big_read(
  input_file,
  select = seq_len(col_num),
  backingfile = "G2F_Genotype_Data",
  progress = TRUE,
  type = "double"
)

# Examine the data.
gen_data
gen_data[1:10, 1:10]
str(gen_data)

# Check how many missing values we have. This is done block-wise on the
# stored values; is.na() on the matrix object itself does not look at them.
n_missing_before <- count_missing(gen_data)
print(n_missing_before)

# Impute ----

# Split the column indices into sets of `chunk_size` columns.
sub_idx <- split(
  seq_len(col_num),
  ceiling(seq_len(col_num) / chunk_size)
)

for (i in seq_along(sub_idx)) {
  # Display the subset currently being evaluated.
  print(i)

  # Impute the current subset and store it back in the original matrix.
  cols <- sub_idx[[i]]
  gen_data[, cols] <- impute_func(gen_data, cols)
}

# Verify that there are no missing values anymore.
n_missing_after <- count_missing(gen_data)
print(n_missing_after)
if (n_missing_after > 0) {
  warning(
    n_missing_after, " missing value(s) remain after imputation",
    call. = FALSE
  )
}

# Write the result ----

# Write the newly-imputed data to a new file.
big_write(
  gen_data,
  output_file,
  every_nrow = 100,
  progress = interactive()
)

# Clean up ----

# Remove only the backing files that belong to this matrix, instead of
# every *.bk / *.rds file in the working directory. The R object is dropped
# first so that it no longer refers to the file being deleted.
bk_path <- gen_data$backingfile
rds_path <- sub("\\.bk$", ".rds", bk_path)
rm(gen_data)
invisible(gc())
unlink(c(bk_path, rds_path))