# caret-and-bigstatsr-workshop/bigstatsR_tutorial.R

library(bigreadr)
library(bigstatsr)
library(data.table)
library(impute)

# Count the missing (NA) entries in columns `ind` of `X`.
# Takes the (X, ind) argument pair so it can be handed to `big_apply()` as
# `a.FUN`; returns a single count for the selected columns.
check_na <- function(X, ind) {
  block <- X[, ind]
  missing_mask <- is.na(block)
  sum(missing_mask)
}

# KNN-impute the missing entries in columns `ind` of `X`.
# The selected block is transposed before being passed to
# `impute::impute.knn()` (k = 50, rowmax = 1, colmax = 1) and transposed back
# afterwards; the imputed values are rounded to whole numbers.
impute_func <- function(X, ind) {
  block_t <- t(X[, ind])
  knn_fit <- impute::impute.knn(block_t, k = 50, rowmax = 1, colmax = 1)
  round(t(knn_fit$data))
}

# Peek at a single data row of the CSV: enough to learn how many columns
# the file has without reading all of it.
first_row <- fread2("Test_Genotype_Data.csv", nrows = 1)
col_num <- ncol(first_row)

# Read every column of the file with big_read(), storing the values as
# doubles under the backing file name "G2F_Genotype_Data".
gen_data <- big_read(
  "Test_Genotype_Data.csv",
  select = 1:col_num,
  type = "double",
  backingfile = "G2F_Genotype_Data",
  progress = TRUE
)

# Examine the data: object summary, top-left corner, and internal structure.
gen_data
gen_data[1:10, 1:10]
str(gen_data)

# Total number of missing values, computed on an in-memory copy.
# `gen_data[]` reads the whole matrix into RAM; calling `is.na()` on the
# `gen_data` object itself would not inspect the stored values. For data that
# does not fit in memory, rely on the block-wise count done with big_apply().
sum(is.na(gen_data[]))

# Check how many missing values we have, block by block.
# `check_na` returns one count per block of columns; the per-block counts are
# added together with bigstatsr's `plus()`, which accepts any number of
# arguments (the binary "+" operator breaks as soon as there are more than
# two blocks to combine).
big_apply(gen_data,
          a.FUN = check_na,
          a.combine = "plus",
          ncores = 2)

# Split the column indices into consecutive sets of (at most) 100 columns.
# seq_len()/seq_along() keep the split and the loop well-defined even when
# there are no columns at all (1:0 would yield c(1, 0)).
all_cols <- seq_len(col_num)
sub_idx <- split(all_cols, ceiling(all_cols / 100))
for (i in seq_along(sub_idx)) {

  # Display current subset being evaluated.
  print(i)

  # Impute the current subset with the shared helper (KNN imputation followed
  # by rounding). The subsets are processed sequentially, one after another.
  gen_imputed <- impute_func(gen_data, sub_idx[[i]])

  # "Save" imputed data in the original matrix.
  gen_data[, sub_idx[[i]]] <- gen_imputed
}

# Now, verify there are no missing values anymore (expected result: 0).
# Per-block counts from `check_na` are summed with bigstatsr's `plus()`,
# which accepts any number of arguments; the binary "+" operator breaks as
# soon as there are more than two blocks to combine.
big_apply(gen_data,
          a.FUN = check_na,
          a.combine = "plus",
          ncores = 2)

# Export the imputed matrix to a new CSV file, 100 rows per write.
# The progress bar is only shown in interactive sessions.
big_write(
  gen_data,
  file = "Test_Genotype_Data_Imputed.csv",
  every_nrow = 100,
  progress = interactive()
)

# Remove the backing files that belong to `gen_data` (its .bk file and the
# matching .rds file). unlink() is portable across platforms, ignores files
# that do not exist, and leaves unrelated *.bk / *.rds files in the working
# directory untouched.
unlink(c(gen_data$backingfile, gen_data$rds))