mirror of
https://github.com/ami-sc/AgAdapt.git
synced 2024-07-07 03:17:50 +02:00
Add XGBoost Model Training script.
This commit is contained in:
parent
5de251343d
commit
f93b6f7727
961
Code/Models/XGBoost_Model.py
Normal file
961
Code/Models/XGBoost_Model.py
Normal file
|
@ -0,0 +1,961 @@
|
|||
"""
|
||||
XGBoost Model
|
||||
|
||||
This script trains XGBoost models using the Leave-One-Field-Out (LOFO) approach,
|
||||
given a directory with master datasets and a master file with the model
|
||||
specifications.
|
||||
|
||||
**** AgAdapt Project ****
|
||||
"""
|
||||
|
||||
|
||||
import os
|
||||
import time
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
from sklearn.metrics import mean_absolute_error
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.model_selection import train_test_split
|
||||
from itertools import cycle
|
||||
from threading import Thread
|
||||
from colorama import Fore, Style
|
||||
|
||||
|
||||
status = False
|
||||
current = 0
|
||||
total = 1
|
||||
|
||||
|
||||
def tick_msg(text):
|
||||
"""
|
||||
Prints a message after a green '✔'.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
text : str
|
||||
Message to be printed.
|
||||
"""
|
||||
print(Fore.GREEN + "[ ✔ ]" + Style.RESET_ALL + " " + text)
|
||||
|
||||
|
||||
def info_msg(text):
|
||||
"""
|
||||
Prints a message after a blue 'i'.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
text : str
|
||||
Message to be printed.
|
||||
"""
|
||||
print(Fore.CYAN + "[ i ]" + Style.RESET_ALL + " " + text)
|
||||
|
||||
|
||||
def anim_loading(text):
|
||||
"""
|
||||
Displays a loading animation.
|
||||
|
||||
The animation will be displayed until the global variable "status" is set
|
||||
to false.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
text : str
|
||||
Message to be printed along with the animation.
|
||||
"""
|
||||
frames = ["●○○", "○●○", "○○●", "○●○"]
|
||||
|
||||
for i in cycle(frames):
|
||||
if status:
|
||||
print(Fore.MAGENTA + "[" + i + "] " + Style.RESET_ALL + text,
|
||||
end = "\r", flush = True)
|
||||
time.sleep(0.4)
|
||||
else:
|
||||
break
|
||||
|
||||
|
||||
def anim_process(text):
|
||||
"""
|
||||
Displays a loading animation and the current process being performed.
|
||||
|
||||
The current process is given by the global variable "current". The total
|
||||
number of processes are given by the global variable "total". The animation
|
||||
will be displayed until the global variable "status" is set to false.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
text : str
|
||||
Message to be printed along with the animation.
|
||||
"""
|
||||
frames = ["●○○", "○●○", "○○●", "○●○"]
|
||||
|
||||
for frame in cycle(frames):
|
||||
|
||||
if status:
|
||||
print(Fore.MAGENTA + "[" + frame + "]" + Style.RESET_ALL + " "
|
||||
+ str(current) + "/" + str(total) + " "
|
||||
+ "(" + format((current / total) * 100, ".2f") + "%)"
|
||||
+ " | " + text, end = "\r", flush = True)
|
||||
|
||||
time.sleep(0.4)
|
||||
|
||||
else:
|
||||
print(Fore.MAGENTA + "[●●●]" + Style.RESET_ALL + " "
|
||||
+ str(total) + "/" + str(total) + " "
|
||||
+ "(" + format(100, ".2f") + "%)"
|
||||
+ " | " + text, end = "\n", flush = True)
|
||||
|
||||
break
|
||||
|
||||
|
||||
def cross_validation(parameters, d_train, random_seed):
|
||||
"""
|
||||
Performs k-fold cross validation on an XGBoost Model. Validation will run
|
||||
for a maximum of 999 boosting rounds, or until no improvement is observed
|
||||
for at least 10 boosting rounds.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
parameters : dict
|
||||
Parameter dictionary for the XGBoost Model validation.
|
||||
d_train : xgb.DMatrix
|
||||
DMatrix containing the training data to be used in the validation.
|
||||
random_seed : int
|
||||
Random seed to be used for reproducibility of the validation.
|
||||
|
||||
Returns
|
||||
-------
|
||||
float
|
||||
The lowest Mean Absolute Error (MAE) obtained from the cross-validation.
|
||||
"""
|
||||
# Run cross-validation test using current combination.
|
||||
cv_model = xgb.cv(
|
||||
parameters,
|
||||
d_train,
|
||||
num_boost_round = 999,
|
||||
seed = random_seed,
|
||||
nfold = 5,
|
||||
metrics = "mae",
|
||||
early_stopping_rounds = 10
|
||||
)
|
||||
|
||||
# Retrieve the lowest MAE from the test.
|
||||
return cv_model["test-mae-mean"].min()
|
||||
|
||||
|
||||
def train_model(lofo_field, predictor_params, phn_trait, master_df, test_size,
|
||||
random_seed, max_range):
|
||||
"""
|
||||
Trains an XGBoost Model given a set of predictor features and a phenotype
|
||||
trait to be predicted. Model training will follow a Leave-One-Field-Out
|
||||
(LOFO) approach, where the data for a target field will be excluded from
|
||||
the training dataset and used exclusively for testing.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
lofo_field : str
|
||||
Target field to be used exclusively for testing.
|
||||
predictor_params : list
|
||||
Features to be used as predictors in the model.
|
||||
phn_trait : str
|
||||
Phenotype trait to be predicted.
|
||||
master_df : pd.DataFrame
|
||||
Master Dataset containing both the predictor parameters and the target
|
||||
phenotype trait, on a per-field arrangement.
|
||||
test_size : float
|
||||
Test size to be used for the train_test_split() function.
|
||||
random_seed : int
|
||||
Random seed to be used for reproducibility of model training.
|
||||
max_range : int
|
||||
Maximum range to be used when tuning the max_depth and min_child_weight
|
||||
parameters of the model.
|
||||
|
||||
Returns
|
||||
-------
|
||||
data_stats : list
|
||||
Calculated statistics for the training and testing datasets used in the
|
||||
model.
|
||||
performance_stats : list
|
||||
Calculated statistics about the performance of the model.
|
||||
cv_dfs : list
|
||||
List of DataFrames containing tested combinations for parameter tuning.
|
||||
tuned_params : list
|
||||
List containing the best values for each parameter of the model, as
|
||||
evaluated by parameter tuning.
|
||||
model_tuned : xgb.Booster
|
||||
Trained XGBoost Model.
|
||||
"""
|
||||
global status, total, current
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Data Selection
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
model_df = master_df[["Field", phn_trait] + predictor_params]
|
||||
|
||||
test_df = model_df.loc[model_df["Field"] == lofo_field].copy()
|
||||
model_df = model_df.loc[model_df["Field"] != lofo_field].copy()
|
||||
|
||||
test_df.reset_index(drop = True, inplace = True)
|
||||
model_df.reset_index(drop = True, inplace = True)
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Calculation of Data Statistics
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
model_mean = model_df[phn_trait].mean()
|
||||
data_stats = [test_df[phn_trait].mean(), test_df[phn_trait].std(),
|
||||
model_mean, model_df[phn_trait].std()]
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Baseline Model
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
performance_stats = []
|
||||
|
||||
mean_array = np.ones(test_df.shape[0])
|
||||
mean_array = mean_array.dot(model_mean)
|
||||
|
||||
performance_stats.append(mean_absolute_error(test_df[phn_trait],
|
||||
mean_array))
|
||||
performance_stats.append(mean_squared_error(test_df[phn_trait],
|
||||
mean_array, squared = False))
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# DMatrix Generation
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
d_test = xgb.DMatrix(test_df[predictor_params], label = test_df[phn_trait])
|
||||
|
||||
x_train, x_tune, y_train, y_tune = train_test_split(
|
||||
model_df[predictor_params],
|
||||
model_df[phn_trait],
|
||||
test_size = test_size,
|
||||
random_state = random_seed
|
||||
)
|
||||
|
||||
d_train = xgb.DMatrix(x_train, label = y_train)
|
||||
d_tune = xgb.DMatrix(x_tune, label = y_tune)
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Default Parameter Dictionary
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
parameters = {
|
||||
# Parameters for Tree Booster
|
||||
"max_depth": 6,
|
||||
"min_child_weight": 1,
|
||||
"eta": 0.3,
|
||||
"subsample": 1.0,
|
||||
"colsample_bytree": 1.0,
|
||||
|
||||
# Learning Task Parameters
|
||||
"objective": "reg:squarederror",
|
||||
"eval_metric": "mae"
|
||||
}
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# max_depth & min_child_weight Parameter Tuning
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
cv_dfs = []
|
||||
tuned_params = []
|
||||
|
||||
md_values = range(1, max_range + 1)
|
||||
mcw_values = range(0, max_range + 1)
|
||||
|
||||
best_mae = float("inf")
|
||||
best_md = None
|
||||
best_mcw = None
|
||||
|
||||
cv_df = pd.DataFrame(np.nan, index = md_values, columns = mcw_values)
|
||||
|
||||
current = 0
|
||||
total = len(md_values) * len(mcw_values)
|
||||
status = True
|
||||
loading = Thread(target = anim_process,
|
||||
args = ("max_depth & min_child_weight Parameter Tuning",))
|
||||
loading.start()
|
||||
|
||||
for md in md_values:
|
||||
for mcw in mcw_values:
|
||||
|
||||
parameters["max_depth"] = md
|
||||
parameters["min_child_weight"] = mcw
|
||||
|
||||
cv_mae = cross_validation(parameters, d_train, random_seed)
|
||||
cv_df.loc[md, mcw] = cv_mae
|
||||
|
||||
if cv_mae < best_mae:
|
||||
best_mae = cv_mae
|
||||
best_md = md
|
||||
best_mcw = mcw
|
||||
|
||||
current += 1
|
||||
|
||||
status = False
|
||||
loading.join()
|
||||
tick_msg("Successfully tuned max_depth & min_child_weight parameters.")
|
||||
|
||||
parameters["max_depth"] = best_md
|
||||
tuned_params.append(best_md)
|
||||
parameters["min_child_weight"] = best_mcw
|
||||
tuned_params.append(best_mcw)
|
||||
cv_dfs.append(cv_df)
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# subsample & colsample_bytree Parameter Tuning
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
ssmpl_values = [i / 10 for i in range(1, 11)]
|
||||
cb_values = [i / 10 for i in range(1, 11)]
|
||||
|
||||
best_mae = float("inf")
|
||||
best_ssmpl = None
|
||||
best_cb = None
|
||||
|
||||
cv_df = pd.DataFrame(np.nan, index = ssmpl_values, columns = cb_values)
|
||||
|
||||
current = 0
|
||||
total = len(ssmpl_values) * len(cb_values)
|
||||
status = True
|
||||
loading = Thread(target = anim_process,
|
||||
args = ("subsample & colsample_bytree Parameter Tuning",))
|
||||
loading.start()
|
||||
|
||||
for ssmpl in ssmpl_values:
|
||||
for cb in cb_values:
|
||||
|
||||
parameters["subsample"] = ssmpl
|
||||
parameters["colsample_bytree"] = cb
|
||||
|
||||
cv_mae = cross_validation(parameters, d_train, random_seed)
|
||||
cv_df.loc[ssmpl, cb] = cv_mae
|
||||
|
||||
if cv_mae < best_mae:
|
||||
best_mae = cv_mae
|
||||
best_ssmpl = ssmpl
|
||||
best_cb = cb
|
||||
|
||||
current += 1
|
||||
|
||||
status = False
|
||||
loading.join()
|
||||
tick_msg("Successfully tuned subsample & colsample_bytree parameters.")
|
||||
|
||||
parameters["subsample"] = best_ssmpl
|
||||
tuned_params.append(best_ssmpl)
|
||||
parameters["colsample_bytree"] = best_cb
|
||||
tuned_params.append(best_cb)
|
||||
cv_dfs.append(cv_df)
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# eta Parameter Tuning
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
eta_values = [0.5, 0.4, 0.3, 0.2, 0.1, 0.01, 0.001]
|
||||
|
||||
cv_df = pd.DataFrame(np.nan, index = ["MAE"], columns = eta_values)
|
||||
|
||||
best_mae = float("inf")
|
||||
best_eta = None
|
||||
|
||||
current = 0
|
||||
total = len(eta_values)
|
||||
status = True
|
||||
loading = Thread(target = anim_process,
|
||||
args = ("eta Parameter Tuning",))
|
||||
loading.start()
|
||||
|
||||
for eta in eta_values:
|
||||
|
||||
parameters["eta"] = eta
|
||||
|
||||
cv_mae = cross_validation(parameters, d_train, random_seed)
|
||||
cv_df.loc["MAE", eta] = cv_mae
|
||||
|
||||
if cv_mae < best_mae:
|
||||
best_mae = cv_mae
|
||||
best_eta = eta
|
||||
|
||||
current += 1
|
||||
|
||||
status = False
|
||||
loading.join()
|
||||
tick_msg("Successfully tuned eta parameter.")
|
||||
|
||||
parameters["eta"] = best_eta
|
||||
tuned_params.append(best_eta)
|
||||
cv_dfs.append(cv_df)
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Model Training
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
boosting_rounds = 999
|
||||
|
||||
status = True
|
||||
loading = Thread(target = anim_loading,
|
||||
args = ("Finding best number of boosting rounds.",))
|
||||
loading.start()
|
||||
|
||||
# Train a model using tuned parameters to find best number of rounds.
|
||||
tuned_model = xgb.train(
|
||||
parameters,
|
||||
d_train,
|
||||
num_boost_round = boosting_rounds,
|
||||
evals = [(d_tune, "Boosting_Rounds_Test")],
|
||||
early_stopping_rounds = 15,
|
||||
verbose_eval = False
|
||||
)
|
||||
|
||||
status = False
|
||||
loading.join()
|
||||
tick_msg("Successfully found best number of boosting rounds.")
|
||||
|
||||
# Update best number of rounds.
|
||||
boosting_rounds = tuned_model.best_iteration + 1
|
||||
tuned_params.append(boosting_rounds)
|
||||
|
||||
evaluation_result = {}
|
||||
|
||||
status = True
|
||||
loading = Thread(target = anim_loading,
|
||||
args = ("Training final model.",))
|
||||
loading.start()
|
||||
|
||||
# Re-train model with appropriate number of rounds.
|
||||
model_tuned = xgb.train(
|
||||
parameters,
|
||||
d_train,
|
||||
num_boost_round = boosting_rounds,
|
||||
evals = [(d_tune, "Final_Model_MAE")],
|
||||
evals_result = evaluation_result,
|
||||
verbose_eval = False
|
||||
)
|
||||
|
||||
train_mae = min(evaluation_result["Final_Model_MAE"]["mae"])
|
||||
performance_stats.append(train_mae)
|
||||
|
||||
parameters["eval_metric"] = "rmse"
|
||||
|
||||
evaluation_result = {}
|
||||
|
||||
xgb.train(
|
||||
parameters,
|
||||
d_train,
|
||||
num_boost_round = boosting_rounds,
|
||||
evals = [(d_tune, "Final_Model_RMSE")],
|
||||
evals_result = evaluation_result,
|
||||
verbose_eval = False)
|
||||
|
||||
train_rms = min(evaluation_result["Final_Model_RMSE"]["rmse"])
|
||||
performance_stats.append(train_rms)
|
||||
parameters["eval_metric"] = "mae"
|
||||
|
||||
status = False
|
||||
loading.join()
|
||||
tick_msg("Successfully trained final model.")
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Prediction and Testing
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
status = True
|
||||
loading = Thread(target = anim_loading,
|
||||
args = ("Predicting in " + lofo_field + " field.",))
|
||||
loading.start()
|
||||
|
||||
# Make a prediction using the tuned model and show the MAE and RMSE.
|
||||
prediction_mae = mean_absolute_error(model_tuned.predict(d_test),
|
||||
test_df[phn_trait])
|
||||
performance_stats.append(prediction_mae)
|
||||
prediction_rms = mean_squared_error(model_tuned.predict(d_test),
|
||||
test_df[phn_trait], squared = False)
|
||||
performance_stats.append(prediction_rms)
|
||||
|
||||
status = False
|
||||
loading.join()
|
||||
tick_msg("Successfully predicted target phenotype trait.")
|
||||
|
||||
return data_stats, performance_stats, cv_dfs, tuned_params, model_tuned
|
||||
|
||||
|
||||
def model_summary(field, data_stats, performance_stats, cv_dfs, tuned_params,
|
||||
save_path, model_code, model_name, units):
|
||||
"""
|
||||
Generates a summary of the trained XGBoost Model in markdown format.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
field : str
|
||||
LOFO field used in the model.
|
||||
data_stats : list
|
||||
Calculated statistics for the training and testing datasets used in the
|
||||
model.
|
||||
performance_stats : list
|
||||
Calculated statistics about the performance of the model.
|
||||
cv_dfs : list
|
||||
List of DataFrames containing tested combinations for parameter tuning.
|
||||
tuned_params : list
|
||||
List containing the best values for each parameter of the model, as
|
||||
evaluated by parameter tuning.
|
||||
save_path : str
|
||||
Path to .md file to save created summary.
|
||||
model_code : str
|
||||
Versioning code to be appended to the filename.
|
||||
model_name : str
|
||||
Given name of the XGBoost Model.
|
||||
units : str
|
||||
Units of the predicted phenotype trait.
|
||||
"""
|
||||
summary_path = save_path + '/' + "README.md"
|
||||
summary = open(summary_path, "w")
|
||||
|
||||
summary.write("# " + field + " - " + model_name + " Model Summary ["
|
||||
+ model_code + "]")
|
||||
summary.write("\n\n")
|
||||
|
||||
summary.write("***")
|
||||
summary.write("\n\n")
|
||||
summary.write("### Model Performance")
|
||||
summary.write("\n\n")
|
||||
|
||||
summary.write("- Baseline Model [MAE] = " +
|
||||
"{:.4f}".format(performance_stats[0]) + '\n')
|
||||
summary.write("- Baseline Model [RMSE] = " +
|
||||
"{:.4f}".format(performance_stats[1]) + '\n')
|
||||
summary.write("- Trained Model [MAE] = " +
|
||||
"{:.4f}".format(performance_stats[2]) + '\n')
|
||||
summary.write("- Trained Model [RMSE] = " +
|
||||
"{:.4f}".format(performance_stats[3]) + '\n')
|
||||
summary.write("- Prediction [MAE] = " +
|
||||
"{:.4f}".format(performance_stats[4]) + '\n')
|
||||
summary.write("- Prediction [RMSE] = " +
|
||||
"{:.4f}".format(performance_stats[5]) + '\n')
|
||||
|
||||
summary.write("***")
|
||||
summary.write("\n\n")
|
||||
summary.write("### Dataset Statistics")
|
||||
summary.write("\n\n")
|
||||
|
||||
summary.write("- LOFO Field [Mean] = " +
|
||||
"{:.4f} ".format(data_stats[0]) + units + '\n')
|
||||
summary.write("- LOFO Field [Standard Deviation] = " +
|
||||
"{:.4f} ".format(data_stats[1]) + units + '\n')
|
||||
summary.write("- Model Dataset [Mean] = " +
|
||||
"{:.4f} ".format(data_stats[2]) + units + '\n')
|
||||
summary.write("- Model Dataset [Standard Deviation] = " +
|
||||
"{:.4f} ".format(data_stats[3]) + units + '\n')
|
||||
|
||||
summary.write("***")
|
||||
summary.write("\n\n")
|
||||
summary.write("### max_depth & min_child_weight Grid Search")
|
||||
summary.write("\n\n")
|
||||
|
||||
cv_dfs[0].index.name = r"md \ mcw"
|
||||
summary.write(cv_dfs[0].to_markdown(tablefmt = "github"))
|
||||
summary.write("\n\n")
|
||||
|
||||
summary.write("***")
|
||||
summary.write("\n\n")
|
||||
summary.write("### subsample & colsample_bytree Grid Search")
|
||||
summary.write("\n\n")
|
||||
|
||||
cv_dfs[1].index.name = r"ssmpl \ cb"
|
||||
summary.write(cv_dfs[1].to_markdown(tablefmt = "github"))
|
||||
summary.write("\n\n")
|
||||
|
||||
summary.write("***")
|
||||
summary.write("\n\n")
|
||||
summary.write("### eta Grid Search")
|
||||
summary.write("\n\n")
|
||||
|
||||
cv_dfs[2].index.name = r"eta"
|
||||
summary.write(cv_dfs[2].to_markdown(tablefmt = "github"))
|
||||
summary.write("\n\n")
|
||||
|
||||
summary.write("***")
|
||||
summary.write("\n\n")
|
||||
summary.write("### Tuned Parameters")
|
||||
summary.write("\n\n")
|
||||
|
||||
summary.write("- max_depth = " + str(tuned_params[0]) + '\n')
|
||||
summary.write("- min_child_weight = " + str(tuned_params[1]) + '\n')
|
||||
summary.write("- subsample = " + str(tuned_params[2]) + '\n')
|
||||
summary.write("- colsample_bytree = " + str(tuned_params[3]) + '\n')
|
||||
summary.write("- eta = " + str(tuned_params[4]) + '\n')
|
||||
summary.write("- num_boost_round = " + str(tuned_params[5]) + '\n')
|
||||
|
||||
summary.close()
|
||||
|
||||
|
||||
def main():
|
||||
global status, current, total
|
||||
|
||||
parser = argparse.ArgumentParser(description = __doc__)
|
||||
parser.add_argument(
|
||||
"-mf", "--master_file",
|
||||
type = str,
|
||||
help = "Path to .txt file containing dataset specifications.",
|
||||
required = True)
|
||||
parser.add_argument(
|
||||
"-ts", "--test_size",
|
||||
type = float,
|
||||
help = "Percentage of given data to be used for cross validation.",
|
||||
required = True)
|
||||
parser.add_argument(
|
||||
"-s", "--seed",
|
||||
type = int,
|
||||
help = "Random seed to be used.",
|
||||
required = True)
|
||||
parser.add_argument(
|
||||
"-mx", "--max_range",
|
||||
type = int,
|
||||
help = "Maximum range (inclusive) for parameters used in hyperparameter"
|
||||
" tuning.",
|
||||
required = True)
|
||||
parser.add_argument(
|
||||
"-md", "--model_directory",
|
||||
type = str,
|
||||
help = "Path to directory that will store tuned models.",
|
||||
required = True)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print("\n")
|
||||
|
||||
field_list = []
|
||||
field_dfs = []
|
||||
phn_params = []
|
||||
model_names = []
|
||||
gen_predict = []
|
||||
tmp_predict = []
|
||||
model_code = ""
|
||||
|
||||
status = True
|
||||
loading = Thread(target = anim_loading,
|
||||
args = ("Loading master datasets.",))
|
||||
loading.start()
|
||||
|
||||
with open(args.master_file, "r") as master_file:
|
||||
|
||||
for line in master_file:
|
||||
if line.startswith("SRC"):
|
||||
master_directory = line.split(",")[1].split()[0]
|
||||
fields = os.listdir(master_directory)
|
||||
|
||||
for field in fields:
|
||||
if field.endswith(".h5"):
|
||||
field_list.append(field.split("_")[0])
|
||||
field_path = master_directory + "/" + field
|
||||
field_dfs.append(pd.read_hdf(field_path, "Master"))
|
||||
|
||||
elif line.startswith("PHN"):
|
||||
phn_params.append(line.split(",")[1])
|
||||
model_names.append(line.split(",")[2].split("\n")[0])
|
||||
|
||||
elif line.startswith("GEN"):
|
||||
gen_predict.append(line)
|
||||
|
||||
elif line.startswith("TMP"):
|
||||
tmp_predict.append(line)
|
||||
|
||||
elif line.startswith("MCD"):
|
||||
model_code = line.split(",")[1].split("\n")[0]
|
||||
|
||||
master_df = pd.concat(field_dfs, ignore_index = True, sort = True)
|
||||
|
||||
status = False
|
||||
loading.join()
|
||||
tick_msg("Successfully loaded master datasets.\n")
|
||||
|
||||
gen_params = []
|
||||
|
||||
status = True
|
||||
loading = Thread(target = anim_loading,
|
||||
args = ("Identifying target genotype parameters.",))
|
||||
loading.start()
|
||||
|
||||
for predictor in gen_predict:
|
||||
predictor_specs = predictor.split(",")
|
||||
|
||||
if predictor_specs[1] == "Latent Dimensions":
|
||||
|
||||
num_features = predictor_specs[2].split("\n")[0]
|
||||
|
||||
# Drop individuals with no latent dimension data.
|
||||
master_df.dropna(subset = ["LD_01"], inplace = True)
|
||||
|
||||
# Get column names.
|
||||
lat_dim = master_df.loc[:, master_df.columns.str.startswith("LD")]
|
||||
lat_dim = lat_dim.columns.to_list()
|
||||
|
||||
# Use maximum possible number of Latent Dimensions.
|
||||
if num_features == "MAX":
|
||||
gen_params = gen_params + lat_dim
|
||||
|
||||
# Use target number of Latent Dimensions.
|
||||
else:
|
||||
gen_params = gen_params + lat_dim[0:int(num_features)]
|
||||
|
||||
elif predictor_specs[1] == "Principal Components":
|
||||
|
||||
num_features = predictor_specs[2].split("\n")[0]
|
||||
|
||||
# Drop individuals with no Principal Components.
|
||||
master_df.dropna(subset = ["PC_001"], inplace = True)
|
||||
|
||||
# Get column names.
|
||||
prin_comp = master_df.loc[:, master_df.columns.str.startswith("PC")]
|
||||
prin_comp = prin_comp.columns.to_list()
|
||||
|
||||
# Use maximum possible number of Principal Components.
|
||||
if num_features == "MAX":
|
||||
gen_params = gen_params + prin_comp
|
||||
|
||||
# Use target number of Principal Components.
|
||||
else:
|
||||
gen_params = gen_params + prin_comp[0:int(num_features)]
|
||||
|
||||
status = False
|
||||
loading.join()
|
||||
tick_msg("Successfully identified target genotype parameters.\n")
|
||||
|
||||
tmp_params = []
|
||||
|
||||
status = True
|
||||
loading = Thread(target = anim_loading,
|
||||
args = ("Identifying target temperature features.",))
|
||||
loading.start()
|
||||
|
||||
for predictor in tmp_predict:
|
||||
predictor_specs = predictor.split(",")
|
||||
|
||||
if predictor_specs[1] == "Air Features":
|
||||
|
||||
num_features = predictor_specs[2].split("\n")[0]
|
||||
|
||||
# Drop individuals with no Air Temperature Features.
|
||||
master_df.dropna(subset = ["ATF_01"], inplace = True)
|
||||
|
||||
# Get column names.
|
||||
atf = \
|
||||
master_df.loc[:, master_df.columns.str.startswith("ATF")].copy()
|
||||
# Drop features that are not shared by all individuals.
|
||||
atf.dropna(axis = "columns", inplace = True)
|
||||
atf = atf.columns.to_list()
|
||||
|
||||
# Use maximum possible number of Air Temperature Features.
|
||||
if num_features == "MAX":
|
||||
tmp_params = tmp_params + atf
|
||||
|
||||
# Use target number of Air Temperature Features.
|
||||
else:
|
||||
tmp_params = tmp_params + atf[0:int(num_features)]
|
||||
|
||||
elif predictor_specs[1] == "Soil Features":
|
||||
|
||||
num_features = predictor_specs[2].split("\n")[0]
|
||||
|
||||
# Drop individuals with no Soil Temperature Features.
|
||||
master_df.dropna(subset = ["STF_01"], inplace = True)
|
||||
|
||||
# Get column names.
|
||||
stf = \
|
||||
master_df.loc[:, master_df.columns.str.startswith("STF")].copy()
|
||||
# Drop features that are not shared by all individuals.
|
||||
stf.dropna(axis = "columns", inplace = True)
|
||||
stf = stf.columns.to_list()
|
||||
|
||||
# Use maximum possible number of Soil Temperature Features.
|
||||
if num_features == "MAX":
|
||||
tmp_params = tmp_params + stf
|
||||
|
||||
# Use target number of Soil Temperature Features.
|
||||
else:
|
||||
tmp_params = tmp_params + stf[0:int(num_features)]
|
||||
|
||||
status = False
|
||||
loading.join()
|
||||
tick_msg("Successfully identified target temperature features.\n")
|
||||
|
||||
# Drop individuals that do not have any phenotype data.
|
||||
master_df.dropna(subset = phn_params, inplace = True)
|
||||
|
||||
# Recalculate field list.
|
||||
field_list = (master_df["Field"].unique())
|
||||
|
||||
master_df.reset_index(drop = True, inplace = True)
|
||||
|
||||
model_num = 0
|
||||
for phn_feature in phn_params:
|
||||
|
||||
col_names = [
|
||||
"Baseline Model [MAE]",
|
||||
"Baseline Model [RMSE]",
|
||||
"Trained Model [MAE]",
|
||||
"Trained Model [RMSE]",
|
||||
"Prediction [MAE]",
|
||||
"Prediction [RMSE]"
|
||||
]
|
||||
|
||||
gen_stats = pd.DataFrame(np.nan, index = field_list,
|
||||
columns = col_names)
|
||||
env_stats = pd.DataFrame(np.nan, index = field_list,
|
||||
columns = col_names)
|
||||
gen_env_stats = pd.DataFrame(np.nan, index = field_list,
|
||||
columns = col_names)
|
||||
|
||||
units = phn_feature.split(" ")[2].split("\n")[0]
|
||||
|
||||
# Directory to store files for the current model.
|
||||
model_name = model_names[model_num]
|
||||
directory_path = args.model_directory + "/" + model_name
|
||||
os.mkdir(directory_path)
|
||||
|
||||
for field in field_list:
|
||||
|
||||
# Directory to store files for the current field.
|
||||
field_path = directory_path + "/" + field
|
||||
os.mkdir(field_path)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Genome Model
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
info_msg(field + " " + model_name + " Genome Model Training")
|
||||
print("**************************************************")
|
||||
|
||||
# Directory to store Genome Model.
|
||||
gen_model_path = field_path + "/G__Genome_Model"
|
||||
os.mkdir(gen_model_path)
|
||||
|
||||
# Set parameters only to Genotype Data.
|
||||
predictor_params = gen_params
|
||||
|
||||
# Train a model with given parameters.
|
||||
data_stats, performance_stats, cv_dfs, tuned_params, tuned_model = \
|
||||
train_model(field, predictor_params, phn_feature, master_df,
|
||||
args.test_size, args.seed, args.max_range)
|
||||
|
||||
status = True
|
||||
loading = Thread(target = anim_loading,
|
||||
args = ("Saving model and generating summary.",))
|
||||
loading.start()
|
||||
|
||||
# Save the trained model to .json file.
|
||||
model_path = gen_model_path + "/" + field + "_XGB_"\
|
||||
+ model_name + "_Model__G.json"
|
||||
tuned_model.save_model(model_path)
|
||||
|
||||
# Generate a summary of the hyperparameter tuning performance.
|
||||
model_summary(field, data_stats, performance_stats, cv_dfs,
|
||||
tuned_params, gen_model_path, model_code,
|
||||
model_name, units)
|
||||
|
||||
gen_stats.loc[field] = performance_stats
|
||||
|
||||
status = False
|
||||
loading.join()
|
||||
tick_msg("Successfully saved model and generated summary.")
|
||||
print("**************************************************\n\n")
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Environment Model
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
info_msg(field + " " + model_name + " Environment Model Training")
|
||||
print("**************************************************")
|
||||
|
||||
# Directory to store Environment Model.
|
||||
env_model_path = field_path + "/E__Environment_Model"
|
||||
os.mkdir(env_model_path)
|
||||
|
||||
# Set parameters only to Environmental Data.
|
||||
predictor_params = tmp_params
|
||||
|
||||
# Train a model with given parameters.
|
||||
data_stats, performance_stats, cv_dfs, tuned_params, tuned_model = \
|
||||
train_model(field, predictor_params, phn_feature, master_df,
|
||||
args.test_size, args.seed, args.max_range)
|
||||
|
||||
status = True
|
||||
loading = Thread(target = anim_loading,
|
||||
args = ("Saving model and generating summary.",))
|
||||
loading.start()
|
||||
|
||||
# Save the trained model to .json file.
|
||||
model_path = env_model_path + "/" + field + "_XGB_" \
|
||||
+ model_name + "_Model__E.json"
|
||||
tuned_model.save_model(model_path)
|
||||
|
||||
# Generate a summary of the hyperparameter tuning performance.
|
||||
model_summary(field, data_stats, performance_stats, cv_dfs,
|
||||
tuned_params, env_model_path, model_code,
|
||||
model_name, units)
|
||||
|
||||
env_stats.loc[field] = performance_stats
|
||||
|
||||
status = False
|
||||
loading.join()
|
||||
tick_msg("Successfully saved model and generated summary.")
|
||||
print("**************************************************\n\n")
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Genome + Environment Model
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
info_msg(field + " " + model_name
|
||||
+ " Genome + Environment Model Training")
|
||||
print("**************************************************")
|
||||
|
||||
# Directory to store Environment Model.
|
||||
gen_env_model_path = field_path + "/GE__Genome_Environment_Model"
|
||||
os.mkdir(gen_env_model_path)
|
||||
|
||||
# Set parameters only to Environmental Data.
|
||||
predictor_params = gen_params + tmp_params
|
||||
|
||||
# Train a model with given parameters.
|
||||
data_stats, performance_stats, cv_dfs, tuned_params, tuned_model = \
|
||||
train_model(field, predictor_params, phn_feature, master_df,
|
||||
args.test_size, args.seed, args.max_range)
|
||||
|
||||
status = True
|
||||
loading = Thread(target = anim_loading,
|
||||
args = ("Saving model and generating summary.",))
|
||||
loading.start()
|
||||
|
||||
# Save the trained model to .json file.
|
||||
model_path = gen_env_model_path + "/" + field + "_XGB_" \
|
||||
+ model_name + "_Model__GE.json"
|
||||
tuned_model.save_model(model_path)
|
||||
|
||||
# Generate a summary of the hyperparameter tuning performance.
|
||||
model_summary(field, data_stats, performance_stats, cv_dfs,
|
||||
tuned_params, gen_env_model_path, model_code,
|
||||
model_name, units)
|
||||
|
||||
gen_env_stats.loc[field] = performance_stats
|
||||
|
||||
status = False
|
||||
loading.join()
|
||||
tick_msg("Successfully saved model and generated summary.")
|
||||
print("**************************************************\n\n")
|
||||
|
||||
gen_stats.to_csv(directory_path +
|
||||
"/G__Genome_Model_Performance.csv")
|
||||
env_stats.to_csv(directory_path +
|
||||
"/E__Environment_Model_Performance.csv")
|
||||
gen_env_stats.to_csv(directory_path +
|
||||
"/GE__Genome_Environment_Model_Performance.csv")
|
||||
|
||||
model_num += 1
|
||||
|
||||
tick_msg("Done!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
Reference in New Issue
Block a user