# Source listing metadata: R script, executable file, 79 lines, 2.3 KiB.
# Packages used by this script.
library(bigreadr)
library(bigstatsr)
library(data.table)
library(impute)

# Count the missing values in a block of columns.
#
# The (X, ind) signature lets this function be passed as `a.FUN` to
# big_apply(), which calls it once per block of column indices.
#
# X:   matrix-like object that supports `X[, ind]` indexing.
# ind: indices of the columns to inspect.
#
# Returns the number of NA entries found in the selected columns.
check_na <- function(X, ind) {
  sum(is.na(X[, ind]))
}

# Impute missing values in a block of columns with k-nearest neighbours.
#
# The block is transposed before the call to impute.knn() and transposed
# back afterwards, so the columns of X are handed to impute.knn() as rows.
# The imputed values are rounded to whole numbers.
#
# X:   matrix-like object that supports `X[, ind]` indexing.
# ind: indices of the columns to impute.
# k:   number of neighbours passed to impute.knn() (default 50).
#
# rowmax = 1 and colmax = 1 are passed through to impute.knn().
# NOTE(review): presumably these lift its per-row / per-column limits on
# the share of missing values -- confirm against the impute documentation.
#
# Returns the rounded, imputed block with one column per selected column.
impute_func <- function(X, ind, k = 50) {
  block <- t(X[, ind])
  fit <- impute::impute.knn(block, k = k, rowmax = 1, colmax = 1)
  round(t(fit$data), 0)
}

# Read only the first row of the file.
# This allows us to obtain the number of columns without loading the data.
first_row <- fread2(
  "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Genotype_Data.csv",
  nrows = 1
)
col_num <- ncol(first_row)

# Read the entire file using big_read.
# All columns are selected and stored as doubles in the backing file
# "G2F_Genotype_Data" (relative to the working directory).
gen_data <- big_read(
  "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Genotype_Data.csv",
  select = seq_len(col_num),
  backingfile = "G2F_Genotype_Data",
  progress = TRUE,
  type = "double"
)

# Examine the data: the object itself, a top-left preview, and its structure.
gen_data
# Preview at most the first 10 rows/columns so smaller inputs do not error.
gen_data[
  seq_len(min(10, nrow(gen_data))),
  seq_len(min(10, ncol(gen_data)))
]
str(gen_data)

# NOTE(review): is.na() is applied to the gen_data object itself here, not
# to values extracted from it -- confirm this really counts missing entries.
sum(is.na(gen_data))

# Check how many missing values we have.
# The per-block counts are concatenated and then summed; unlike combining
# with a binary "+", this works for any number of blocks.
sum(
  big_apply(
    gen_data,
    a.FUN = check_na,
    a.combine = "c",
    ncores = 2
  )
)

# Split the column indices into consecutive sets of (at most) 100 columns.
# seq_len() keeps this correct when col_num is 0 (1:0 would yield c(1, 0)).
col_idx <- seq_len(col_num)
sub_idx <- split(col_idx, ceiling(col_idx / 100))

# Impute one set of columns at a time so that only that subset is held in
# memory, then store the result back into gen_data.
# NOTE(review): the last subset can contain far fewer than 100 columns;
# confirm impute.knn() with k = 50 behaves as intended on such a small block.
for (i in seq_along(sub_idx)) {
  # Display current subset being evaluated.
  print(i)

  # Columns belonging to the current subset.
  cols <- sub_idx[[i]]

  # KNN-impute the subset (transposing, imputing and rounding are handled
  # by impute_func).
  gen_imputed <- impute_func(gen_data, cols)

  # "Save" imputed data in the original matrix.
  gen_data[, cols] <- gen_imputed
}

# Now, verify there are no missing values anymore (expected result: 0).
# The per-block counts are concatenated and then summed; unlike combining
# with a binary "+", this works for any number of blocks.
sum(
  big_apply(
    gen_data,
    a.FUN = check_na,
    a.combine = "c",
    ncores = 2
  )
)

# Write the newly-imputed data to a new file, 100 rows at a time.
# The progress bar is only shown in interactive sessions.
big_write(
  gen_data,
  "/home/ami/Projects/P0004 - caret and bigstatsR Workshop/Test_Genotype_Data_Imputed.csv",
  every_nrow = 100,
  progress = interactive()
)

# Remove the backing files that belong to gen_data.
# Only this matrix's .bk file and its companion .rds file are deleted, so
# unrelated .bk/.rds files in the working directory are left untouched.
# unlink() is portable and does not fail if a file is already gone.
unlink(c(gen_data$backingfile, gen_data$rds))