ISSRseq R Analyses
The purpose of this script is to provide a guide to the simple population genetic analyses used in our manuscript "ISSRseq: an extensible method for reduced representation sequencing". More specifically result examples displayed here are from our analysis of the C. striata dataset. Packages utilized here are capable of of a large variety of analyses not presented in our work. Additionally, figures in this document were finalized in Adobe Illustrator for publication but can be edited to increase quality within R by the user.
Package for data processing from VCF file
library(vcfR)
Packages for population genetic analyses (AMOVA, bootstrapping ect.)
library(poppr); library(adegenet); library(ape); library(mmod);
library(magrittr); library(hierfstat); library(pegas); library(qvalue)
Package for local adaptation analysis.
library(pcadapt)
Package for admixture analysis
library(LEA)
Packages for figures and plots
library(ggplot2); library(plot3D); library(plotly); library(igraph);
library(RColorBrewer); library(grid); library(R.devices)
Package for Wes Anderson themed color palettes (Range 4-5 colors).
Choices include: BottleRocket1, BottleRocket2, Rushmore1, Royal1, Royal2, Zissou1, Darjeeling1, Darjeeling2, Chevalier1,
FantasticFox1, Moonrise1, Moonrise2, Moonrise3, Cavalcanti1, GrandBudapest1, GrandBudapest2, IsleofDogs1, IsleofDogs2
library(wesanderson)
- Import GATK filtered VCF file.
filtered_vcf <- read.vcfR("striata_vcf_thinned.recode.vcf")
- VCF can be stored as different object types including genlight (binary variants), genid (allelic counts), and genclone (multilocus genotype for clonal populations).
my_genlight <- vcfR2genlight(filtered_vcf)
my_genind <- vcfR2genind(filtered_vcf)
- Import sample ID and population map.
pop_data <- read.table("Striata_allsamples.txt", sep = "\t", header = FALSE)
- Example of population sample file. V1 = Individual; V2 = Sampling locality; V3 = Region; V4 = Combination of V1, V2, and V3.
head(pop_data)
V1 V2 V3 V4
1 103a_NM Otero-NM var-vreelandii-SW-USA 103a_Otero-NM_var-vreelandii-SW-USA
2 103b_NM Otero-NM var-vreelandii-SW-USA 103b_Otero-NM_var-vreelandii-SW-USA
3 103d_NM Otero-NM var-vreelandii-SW-USA 103d_Otero-NM_var-vreelandii-SW-USA
4 103e_NM Otero-NM var-vreelandii-SW-USA 103e_Otero-NM_var-vreelandii-SW-USA
5 110a_AZ Graham-AZ var-vreelandii-SW-USA 110a_Graham-AZ_var-vreelandii-SW-USA
6 110b_AZ Graham-AZ var-vreelandii-SW-USA 110b_Graham-AZ_var-vreelandii-SW-USA
- Make sure to double check that the order of individuals between your population sample file and variant matrix match.
my_genlight@ind.names
[1] "103a_NM" "103b_NM" "103d_NM" "103e_NM" "110a_AZ" "110b_AZ"
[7] "110c_AZ" "112b_AZ" "114a_UT" "114b_UT" "114c_UT" "114d_UT"
[13] "116a_UT" "116b_UT" "116c_UT" "116d_UT" "116e_UT" "120a_UT"
[19] "120b_UT" "120c_UT" "120d_UT" "120e_UT" "135a_WY" "135b_WY"
[25] "135c_WY" "135d_WY" "13b_OR" "13c_OR" "163a_CO" "163b_CO"
[31] "163c_CO" "163d_CO" "163e_CO" "177_4_ID" "177_5_ID" "177a_ID"
[37] "177b_ID" "187_1_MT" "187_2_MT" "187_3_MT" "187_4_MT" "187_5_MT"
[43] "219b_BC" "219c_BC" "242a_CA" "242b_CA" "242c_CA" "242d_CA"
[49] "242e_CA" "246b_CA" "246f_CA" "246g_CA" "253a_CA" "253b_CA"
[55] "253c_CA" "253d_CA" "253e_CA" "29a_OR" "29b_OR" "312a_CA"
[61] "312b_CA" "312c_CA" "350a_OR" "350b_OR" "374c_CA" "374d_CA"
[67] "374e_CA" "374f_CA" "374g_CA" "39a_WA" "39b_WA" "427a_CA"
[73] "427b_CA" "428b_CA" "432a_CA" "432b_CA" "432c_CA" "432d_CA"
[79] "48a_WA" "48b_WA" "9a_CA" "HeshA_Man" "HeshB_Man"
- Set the ploidy of your individuals and assign the population groups you want to use for PCA. Here we have used C. striata region from the population sample file.
ploidy(my_genlight) <- 2;
ploidy(my_genind) <- 2;
pop(my_genlight) <- pop_data$V3;
pop(my_genind) <- pop_data$V3
- Next assign strata to your variant matrix for use in building hierarchies in AMOVA.
samplingstrata <- as.data.frame(pop_data$V4);
strata(my_genlight) <- samplingstrata;
strata(my_genind) <- samplingstrata;
splitStrata(my_genlight) <- ~Individual/Locality/Region;
splitStrata(my_genind) <- ~Individual/Locality/Region
- Finally, you can create datasets from your genlight and genind objects that are helpful in analyzing clonal populations.
my_snpclone <- poppr::as.snpclone(my_genlight)
my_genclone <- poppr::as.genclone(my_genind)
For more information about the poppr package: https://grunwaldlab.github.io/poppr/reference/poppr-package.html
AMOVA with hierarchy of Sampling locality within Region
striata_AMOVA <- poppr.amova(my_genclone, ~Region/Locality, within = TRUE)
striata_AMOVA$results
Variable | Df | Sum Sq | Mean Sq |
---|---|---|---|
Between Region | 4 | 2219.0865 | 554.77162 |
Between samples Within Region | 20 | 919.4987 | 45.97493 |
Within samples | 58 | 1634.8983 | 28.1879 |
Total | 82 | 4773.4835 | 58.21321 |
striata_AMOVA$componentsofcovariance
Variable | Sigma | % |
---|---|---|
Variations Between Region | 32.607068 | 49.215815 |
Variations Between samples Within Region | 5.458261 | 8.238483 |
Variations Within samples | 28.187902 | 42.545702 |
Total variations | 66.253231 | 100.000000 |
striata_AMOVA$statphi
Variable | Phi |
---|---|
Phi-samples-total | 0.574543 |
Phi-samples-Region | 0.1622254 |
Phi-Region-total | 0.4921581 |
Permutational significance test for AMOVA
striatasignif <- randtest(striata_AMOVA, nrepet = 999)
class: krandtest lightkrandtest
Monte-Carlo tests
Call: randtest.amova(xtest = striata_AMOVA, nrepet = 999)
Number of tests: 3
Adjustment method for multiple comparisons: none
Permutation number: 999
Test Obs Std.Obs Alter Pvalue
1 Variations within samples 28.187902 -18.897416 less 0.001
2 Variations between samples 5.458261 9.983612 greater 0.001
3 Variations between Region 32.607068 12.168321 greater 0.001
plot(striatasignif)
Convert genind objects from adegenet into a hierfstat data and calculate population statistics. Here we have used Region as our defined population.
my_genindFst <- genind2hierfstat(my_genind);
basicstat <- basic.stats(my_genindFst, diploid = TRUE, digits = 4)
basicstat$overall
Ho | Hs | Ht | Dst | Htp | Dstp | Fst | Fstp | Fis | Dest |
---|---|---|---|---|---|---|---|---|---|
0.0765 | 0.0955 | 0.1191 | 0.0236 | 0.1274 | 0.032 | 0.1983 | 0.2509 | 0.1992 | 0.0353 |
Calculate pairwise Fst with method options: "Dch","Da","Ds","Fst","Dm","Dr","Cp", "WC84, "Nei87", or "X2"
pairwiseFst<-genet.dist(my_genind, method = "WC84")
var-vreelandii-SW-USA var-striata-N-USA Sierra-Nevada
var-striata-N-USA 0.31484853
Sierra-Nevada 0.25325256 0.23539583
Coast-Ranges-Cascades 0.29789541 0.27089574 0.09334809
Run PCA and after evaluating eigenvalues select appropriate number of axes. Here we have chosen 4 axes for our analysis.
aPCA <- glPca(my_genlight)
Determine percent variance explained by PCA axes.
eig.perc <- 100*aPCA$eig/sum(aPCA$eig)
Output
[1] 12.7551258 10.5229880 4.9075508 2.9516855 2.1872124 1.8943962 1.8237295 1.8147981 1.6610328 1.5865176 1.5297921
[12] 1.4929874 1.4284810 1.3926084 1.3787864 1.3478963 1.3056970 1.2782287 1.2640752 1.2381259 1.2002892 1.1772555
[23] 1.1664989 1.1529655 1.1119130 1.0835264 1.0725911 1.0583647 1.0469517 1.0139171 0.9782371 0.9637621 0.9448887
[34] 0.9330783 0.9137121 0.8833411 0.8606239 0.8544544 0.8453463 0.8270547 0.8178980 0.8020338 0.7815581 0.7718478
[45] 0.7626070 0.7362362 0.7335280 0.7184904 0.7027461 0.6818578 0.6781932 0.6739189 0.6680140 0.6479063 0.6473984
[56] 0.6386501 0.6286747 0.6137138 0.6083952 0.5976042 0.5881836 0.5680544 0.5604242 0.5404736 0.5261534 0.5233942
[67] 0.5154044 0.5036466 0.4968981 0.4854849 0.4774669 0.4674792 0.4582305 0.4420337 0.4292758 0.4233475 0.4077924
[78] 0.3891733 0.3853907 0.3687565 0.3521972 0.3289799
Plot 3D PCA results
aPCApoints<-as.data.frame(aPCA$scores, row.names = TRUE)
colors <- wes_palette(n = 4, name = "Moonrise2")
colors <- colors[as.numeric(my_genlight@pop)]
scatter3D(aPCApoints$PC1, aPCApoints$PC2, aPCApoints$PC3, theta = 290, phi = 40, bty = "g",pch = 20, cex = 2, ticktype = "detailed", colvar = NULL, col = colors)
For more information about DAPC in the ADEgenet package: https://adegenet.r-forge.r-project.org/files/tutorial-dapc.pdf
Before starting analysis determine an appropriate number of axes and clusters.
grp <- find.clusters(my_snpclone, max.n.clust = 40)
Run DAPC to determine membership probabilities.
vcf_dapc <- dapc(my_snpclone, n.pca = 4, n.da = 5)
Create a dot plot of DAPC results
scatter(vcf_dapc, col = wes_palette(n = 4, name = "Moonrise2"), xax = 1, yax = 3, cex = 2, scree.da=FALSE, legend = TRUE, grp = my_snpclone@pop)
Create a component plot of membership probabilities (current figure displays every other individual in dataset).
barplot(t(DPACmatrix[,1:4]), col = wes_palette(n = 4, name = "Moonrise2"), border = NA, space = 0,
ylab = "Membership probability", name = DPACmatrix$individual, las=2, legend = TRUE, cex.axis = 0.5, cex.names = 0.3)