mirror of
https://github.com/ami-sc/AgAdapt.git
synced 2024-07-07 03:17:50 +02:00
Revert "SNP feature selection"
This commit is contained in:
parent
c7c3ccf85c
commit
f967c80c16
|
@ -1,49 +0,0 @@
|
|||
---
|
||||
title: "Feature Selection: Genotype Data"
|
||||
format: pdf
|
||||
editor: visual
|
||||
bibliography: references.bib
|
||||
editor_options:
|
||||
markdown:
|
||||
wrap: 72
|
||||
---
|
||||
|
||||
The maize GxE [competition
|
||||
data](https://drive.google.com/drive/folders/1leYJY4bA3341S-JxjBIgmmAWMwVDHYRb)
|
||||
includes genotype data for 4,928 individuals at 437,214 variant sites.
|
||||
To reduce the number of genetic features in this dataset, we leveraged
|
||||
knowledge of maize evolutionary history to identify a subset of SNPs
|
||||
important for adaptation to diverse environments. Specifically, we
|
||||
selected a subset of 2,513 SNPs that were previously implicated in
|
||||
adaptation of 4,471 maize landraces from across Mexico, Central America,
|
||||
and South America to altitude [@romeronavarro2017]. The majority of
|
||||
these SNPs (61%) were also associated with variation in flowering time
|
||||
in experiments [@romeronavarro2017]. Early flowering is an important
|
||||
adaptation to high latitudes and high elevations, where there is a
|
||||
shorter growing season. SNPs predictive of flowering time variation
|
||||
could be important for explaining variation in performance of different
|
||||
individuals across the G2F locations.
|
||||
|
||||
Genotype data for the landraces studied by @romeronavarro2017 were
|
||||
downloaded from the CIMMYT Seeds of Discovery project [@hearne2012].
|
||||
Hapmap files were subset to SNPs present in Supplemental Table 6 from
|
||||
@romeronavarro2017, combined, and converted to VCF using TASSEL version
|
||||
5.2.64 [@bradbury2007].
|
||||
|
||||
```{bash}
|
||||
# filter
|
||||
~/tools/tassel-5-standalone/run_pipeline.pl -Xmx30g -fork1 -h ~/scratch/maize/CIMMYT/GBS/AllZeaGBSv2.7_SEED_Beagle4_chr9.hmp.txt -includeSiteNamesInFile Romero_altitude.txt -export chr9.alt.flt
|
||||
|
||||
# combine
|
||||
~/tools/tassel-5-standalone/run_pipeline.pl -Xmx30g -fork1 -h alt.hmp.txt -fork2 -h alt2.hmp.txt -fork3 -h alt3.hmp.txt -fork4 -h chr10.alt.flt.hmp.txt -combine5 -input1 -input2 -input3 -input4 -mergeGenotypeTables -export romero_alt_zea27 -runfork1 -runfork2 -runfork3 -runfork4
|
||||
|
||||
# convert to VCF
|
||||
~/tools/tassel-5-standalone/run_pipeline.pl -Xmx30g -fork1 -h romero_alt_zea27.hmp.txt -export romero_alt_zea27 -exportType VCF
|
||||
```
|
||||
|
||||
Coordinates based on the AGPv2 reference genome were then converted to
|
||||
Zm-B73-REFERENCE-NAM-5.0 using the [Assembly Converter online
|
||||
tool](https://plants.ensembl.org/Zea_mays/Tools/AssemblyConverter?db=core)
|
||||
from EnsemblPlants, which is based on CrossMap [@zhao2014].
|
||||
|
||||
## References
|
|
@ -11,6 +11,3 @@
|
|||
|
||||
- `04__Hierarchical_Clustering`:
|
||||
- Code for performing and visualizing hierarchical clustering on the Offspring Latent Dimensions.
|
||||
|
||||
- `05__SNP_Feature_Selection`:
|
||||
- Example code, rationale, and citations for evolutionarily informed SNP feature selection.
|
||||
|
|
|
@ -1,55 +0,0 @@
|
|||
|
||||
@article{romeronavarro2017,
|
||||
title = {A study of allelic diversity underlying flowering-time adaptation in maize landraces},
|
||||
author = {Romero Navarro, J Alberto and Willcox, Martha and {Burgueño}, Juan and Romay, Cinta and Swarts, Kelly and Trachsel, Samuel and Preciado, Ernesto and Terron, Arturo and Delgado, Humberto Vallejo and Vidal, Victor and Ortega, Alejandro and Banda, Armando Espinoza and Montiel, {Noel Orlando Gómez} and Ortiz-Monasterio, Ivan and Vicente, {Félix San} and Espinoza, Armando Guadarrama and Atlin, Gary and Wenzl, Peter and Hearne, Sarah and Buckler, Edward S},
|
||||
year = {2017},
|
||||
month = {03},
|
||||
date = {2017-03-01},
|
||||
journal = {Nature Genetics},
|
||||
pages = {476--480},
|
||||
volume = {49},
|
||||
number = {3},
|
||||
doi = {10.1038/ng.3784},
|
||||
url = {https://doi.org/10.1038/ng.3784}
|
||||
}
|
||||
|
||||
@article{hearne2012,
|
||||
title = {Imputed GbS derived SNPs for maize landrace accessions represented in the SeeD-maize GWAS panel: Imputation using Beagle v.4},
|
||||
author = {Hearne, Sarah and Chen, Charles and Buckler, Ed and Mitchell, Sharon and Romero, Alberto and Swarts, Kelly and Li, Huihui},
|
||||
editor = {{International Maize and Wheat Improvement Center}},
|
||||
year = {2012},
|
||||
date = {2012},
|
||||
url = {https://hdl.handle.net/11529/10035},
|
||||
note = {Edition: DRAFT VERSION
|
||||
Section: 2014-12-19 16:14:55.881}
|
||||
}
|
||||
|
||||
@article{bradbury2007,
|
||||
title = {TASSEL: software for association mapping of complex traits in diverse samples},
|
||||
author = {Bradbury, P. J. and Zhang, Z. and Kroon, D. E. and Casstevens, T. M. and Ramdoss, Y. and Buckler, E. S.},
|
||||
year = {2007},
|
||||
month = {10},
|
||||
date = {2007-10-01},
|
||||
journal = {Bioinformatics},
|
||||
pages = {2633--2635},
|
||||
volume = {23},
|
||||
number = {19},
|
||||
doi = {10.1093/bioinformatics/btm308},
|
||||
url = {https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/btm308},
|
||||
langid = {en}
|
||||
}
|
||||
|
||||
@article{zhao2014,
|
||||
title = {CrossMap: a versatile tool for coordinate conversion between genome assemblies},
|
||||
author = {Zhao, Hao and Sun, Zhifu and Wang, Jing and Huang, Haojie and Kocher, Jean-Pierre and Wang, Liguo},
|
||||
year = {2014},
|
||||
month = {04},
|
||||
date = {2014-04-01},
|
||||
journal = {Bioinformatics},
|
||||
pages = {1006--1007},
|
||||
volume = {30},
|
||||
number = {7},
|
||||
doi = {10.1093/bioinformatics/btt730},
|
||||
url = {https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/btt730},
|
||||
langid = {en}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,10 +1,7 @@
|
|||
`/Results`: This directory contains 3 directories:
|
||||
`/Results`: This directory contains 2 directories:
|
||||
|
||||
- `/Models`:
|
||||
- XGBoost models and corresponding performance data.
|
||||
|
||||
- `/Plots`:
|
||||
- Figures generated from data analysis.
|
||||
|
||||
- `/FeatureEngineering`:
|
||||
- Subsets of features for competition
|
||||
|
|
Loading…
Reference in New Issue
Block a user