The lab GWAS exercise examined the association between each locus and the phenotype one locus at a time. For genomic prediction, we want to account for differences at all loci simultaneously. The basic approach is the same; the difference is that we now fit one multiple regression on all of the genotypes instead of a separate simple regression for each one. We do, however, throw in one complication: in one analysis we include the population from which an individual was collected as an additional covariate to explain the phenotype, and in the other we don't.
To keep the output manageable, the summaries below report estimates only for the 20 loci whose effects are largest in magnitude (judged by the posterior mean).
options(tidyverse.quiet = TRUE)
library(tidyverse)
library(rstan)
library(brms)
library(popkin)
rm(list = ls())
options(mc.cores = parallel::detectCores())
## standardize observations to a mean of zero and a standard deviation of one
##
standardize <- function(x) {
  return((x - mean(x, na.rm = TRUE))/sd(x, na.rm = TRUE))
}
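As a quick sanity check, here is what standardize() does to a few made-up numbers (an illustration only; these are not data from the analysis):
## toy illustration: standardized values have mean 0 and standard deviation 1
standardize(c(2, 4, 6, 8))
## roughly -1.16 -0.39 0.39 1.16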
## report the posterior mean and quantiles for the n_report loci with the
## largest absolute posterior-mean effects
summarize_analysis <- function(results_brms, header, n_report = 20) {
  results_dat <- as.data.frame(results_brms) %>%
    select(starts_with("b_X"))
  if (n_report > ncol(results_dat)) {
    n_report <- ncol(results_dat)
  }
  cat(header, "\n")
  cat(" Mean: ( 2.5%, 10%, 50%, 90%, 97.5%)\n")
  locus_means <- apply(results_dat, 2, mean)
  ordered_names <- names(sort(abs(locus_means), decreasing = TRUE))
  for (i in 1:n_report) {
    ci <- quantile(results_dat[[ordered_names[i]]],
                   c(0.025, 0.1, 0.5, 0.9, 0.975))
    output <- sprintf("%8s: %8.5f (%8.5f, %8.5f, %8.5f, %8.5f, %8.5f)\n",
                      substring(ordered_names[i], 3),
                      locus_means[[ordered_names[i]]],
                      ci[1], ci[2], ci[3], ci[4], ci[5])
    cat(output)
  }
}
## read the data
##
dat <- read_csv("gypsymoth.csv",
                show_col_types = FALSE)
## get the relatedness matrix
##
genos <- dat %>% select(starts_with("X"))
A <- popkin(t(as.matrix(genos)),
            subpops = dat$pops)
## identify the rows with the sample names
##
rownames(A) <- dat$sample
n_loci <- ncol(genos)
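Before fitting anything, it can be worth a quick look at the relatedness matrix. The checks below are an optional sketch, not part of the original analysis; plot_popkin() is popkin's own heatmap-style plotting function.
## optional sanity checks: A should be square and symmetric, with one row
## and one column per individual
dim(A)
isSymmetric(unname(A))
## visualize the estimated relatedness among individuals
plot_popkin(A)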
## construct the model formula for Mass
##
formula_string <- paste("standardize(Mass)", paste(colnames(genos), collapse = " + "), sep = " ~ ")
formula_string <- paste(formula_string, "+ (1|gr(sample, cov = A))")
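Since the formula is assembled by pasting strings together, it may help to see what the result looks like. Here is a minimal sketch using three hypothetical locus columns (X1, X2, X3) in place of the thousands of real ones:
## illustration only: the pasted formula for three hypothetical loci
toy_formula <- paste("standardize(Mass)",
                     paste(c("X1", "X2", "X3"), collapse = " + "),
                     sep = " ~ ")
paste(toy_formula, "+ (1|gr(sample, cov = A))")
## "standardize(Mass) ~ X1 + X2 + X3 + (1|gr(sample, cov = A))"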
## run the model
##
mass_wo_pop <- brm(formula_string,
                   data = dat,
                   data2 = list(A = A),
                   family = gaussian(),
                   prior = set_prior(horseshoe(par_ratio = 20/n_loci)),
                   refresh = 0)
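A note on the prior: par_ratio in horseshoe() is the expected ratio of non-zero to zero coefficients, so 20/n_loci roughly encodes the expectation that only about 20 loci have appreciable effects, with the rest shrunk strongly toward zero. If you want to see exactly which priors were applied, prior_summary() lists them (an optional check, not part of the original workflow):
## optional: inspect the priors used in the fitted model
prior_summary(mass_wo_pop)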
## construct the model formula for Mass with population as a fixed effect
##
formula_string <- paste("standardize(Mass)", paste("pops + ", colnames(genos), collapse = " + "), sep = " ~ ")
formula_string <- paste(formula_string, "+ (1|gr(sample, cov = A))")
## run the model
##
mass_w_pop <- brm(formula_string,
                  data = dat,
                  data2 = list(A = A),
                  family = gaussian(),
                  prior = set_prior(horseshoe(par_ratio = 20/n_loci)),
                  refresh = 0)
summarize_analysis(mass_wo_pop, " Without population effect")
Without population effect
Mean: ( 2.5%, 10%, 50%, 90%, 97.5%)
X34862: 0.02470 (-0.00831, -0.00226, 0.00008, 0.06000, 0.31456)
X65887: -0.02407 (-0.29400, -0.07090, -0.00012, 0.00230, 0.01018)
X44522: -0.01177 (-0.18916, -0.01420, -0.00006, 0.00287, 0.01026)
X40856: 0.01093 (-0.01057, -0.00284, 0.00004, 0.01298, 0.17299)
X68674: 0.00807 (-0.01097, -0.00256, 0.00004, 0.01087, 0.12414)
X29507: -0.00759 (-0.12539, -0.00824, -0.00002, 0.00334, 0.01245)
X71254: -0.00696 (-0.10894, -0.00897, -0.00002, 0.00328, 0.01228)
X36978: 0.00662 (-0.01117, -0.00302, 0.00004, 0.00882, 0.09413)
X4293: 0.00638 (-0.01156, -0.00295, 0.00002, 0.00918, 0.09334)
X53601: 0.00632 (-0.00984, -0.00306, 0.00003, 0.00831, 0.10153)
X28982: 0.00546 (-0.01414, -0.00350, 0.00002, 0.00902, 0.07618)
X65931: 0.00532 (-0.01306, -0.00303, 0.00002, 0.00705, 0.07795)
X28981: 0.00489 (-0.01371, -0.00339, 0.00002, 0.00841, 0.06885)
X38743: -0.00486 (-0.06550, -0.00792, -0.00003, 0.00318, 0.01243)
X37952: 0.00482 (-0.01259, -0.00303, 0.00004, 0.00767, 0.07170)
X65932: 0.00453 (-0.01261, -0.00311, 0.00003, 0.00705, 0.06292)
X59849: -0.00443 (-0.05809, -0.00694, -0.00001, 0.00356, 0.01376)
X86695: -0.00423 (-0.05088, -0.00729, -0.00002, 0.00338, 0.01205)
X25831: -0.00410 (-0.05654, -0.00697, -0.00003, 0.00270, 0.01201)
X17757: -0.00400 (-0.05978, -0.00624, -0.00002, 0.00332, 0.01290)
summarize_analysis(mass_w_pop, "\n\n With population effect")
With population effect
Mean: ( 2.5%, 10%, 50%, 90%, 97.5%)
X34862: 0.03316 (-0.00831, -0.00212, 0.00014, 0.13146, 0.34870)
X65887: -0.03270 (-0.32256, -0.15828, -0.00020, 0.00238, 0.00971)
X44522: -0.01267 (-0.18647, -0.02072, -0.00008, 0.00265, 0.01037)
X40856: 0.01254 (-0.01119, -0.00299, 0.00007, 0.01956, 0.20033)
X68674: 0.00908 (-0.01230, -0.00325, 0.00006, 0.01158, 0.13172)
X53601: 0.00769 (-0.01134, -0.00322, 0.00004, 0.01034, 0.12343)
X71254: -0.00720 (-0.11691, -0.01004, -0.00004, 0.00321, 0.01350)
X59849: -0.00704 (-0.12261, -0.00907, -0.00003, 0.00349, 0.01319)
X36978: 0.00645 (-0.01273, -0.00305, 0.00003, 0.00882, 0.10430)
X25831: -0.00624 (-0.09950, -0.00970, -0.00003, 0.00324, 0.01230)
X4293: 0.00593 (-0.01167, -0.00315, 0.00006, 0.00900, 0.08140)
X65932: 0.00583 (-0.01343, -0.00341, 0.00003, 0.00939, 0.09117)
X29507: -0.00565 (-0.08421, -0.00906, -0.00003, 0.00370, 0.01220)
X65930: 0.00531 (-0.01383, -0.00328, 0.00003, 0.00820, 0.07696)
X28982: 0.00521 (-0.01122, -0.00337, 0.00002, 0.00811, 0.05955)
X65931: 0.00517 (-0.01375, -0.00347, 0.00003, 0.00775, 0.07923)
X58326: -0.00473 (-0.07266, -0.00921, -0.00004, 0.00320, 0.01257)
X28981: 0.00455 (-0.01325, -0.00340, 0.00001, 0.00865, 0.06230)
X17757: -0.00424 (-0.05360, -0.00676, -0.00002, 0.00296, 0.01066)
X38743: -0.00399 (-0.06360, -0.00742, -0.00001, 0.00345, 0.01299)
Notice that although the first four loci are the same in the two analyses, different loci appear in the output after that. How can we tell which of these models is better for prediction? There are two ways we can compare them: (1) the proportion of variance they explain, \(R^2\), and (2) their ability to predict data that weren't included in the analysis.
\(R^2\) is the easiest to get.
R2_wo <- bayes_R2(mass_wo_pop)
R2_w <- bayes_R2(mass_w_pop)
cat("R^2 (without pop): ", round(R2_wo[1], 3), "\n",
"R^2 (with pop): ", round(R2_w[1], 3), "\n", sep = "")
R^2 (without pop): 0.191
R^2 (with pop): 0.19
The model including population has a slightly lower \(R^2\), but the difference is tiny.
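The point estimates from bayes_R2() hide the posterior uncertainty. If you want to see how much the two \(R^2\) distributions overlap, you can pull the raw posterior draws (summary = FALSE) and look at the distribution of the difference. This is an optional sketch, not part of the original comparison:
## optional: posterior distribution of the difference in R^2
r2_draws_wo <- bayes_R2(mass_wo_pop, summary = FALSE)
r2_draws_w <- bayes_R2(mass_w_pop, summary = FALSE)
quantile(r2_draws_wo - r2_draws_w, c(0.025, 0.5, 0.975))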
The best way to see how well a model predicts data is to have it predict data. We'll do that by leaving one data point out, fitting the model to the rest, and predicting the left-out point from that fit. We'll repeat this for every data point we have and see how much difference there is between the predictions and the observations. The model for which the difference is smaller is preferred. Fortunately, there's a sneaky way to do this “leave-one-out cross-validation” that doesn't require us to re-run the analysis 141 times with 141 different data sets.
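To make the idea concrete, here is leave-one-out cross-validation in miniature on a toy linear model. This is purely an illustration with made-up data; the brms/loo machinery used below gets an equivalent answer by approximation rather than by refitting.
## leave-one-out cross-validation by brute force on a toy data set
set.seed(123)
toy <- data.frame(x = 1:10, y = 1:10 + rnorm(10))
loo_errors <- sapply(seq_len(nrow(toy)), function(i) {
  fit_i <- lm(y ~ x, data = toy[-i, ])            # fit without observation i
  toy$y[i] - predict(fit_i, newdata = toy[i, ])   # predict the held-out point
})
mean(loo_errors^2)                                # average squared prediction error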
mass_wo_pop <- add_criterion(mass_wo_pop, "loo")
Warning: Found 1 observations with a pareto_k > 0.7 in model 'mass_wo_pop'. It is recommended to set 'moment_match = TRUE' in order to perform moment matching for problematic observations.
mass_w_pop <- add_criterion(mass_w_pop, "loo")
Warning: Found 2 observations with a pareto_k > 0.7 in model 'mass_w_pop'. It is recommended to set 'moment_match = TRUE' in order to perform moment matching for problematic observations.
Those warning messages mean that one or two observations are outlying enough that the fast leave-one-out approximation may be unreliable for them. If we follow the advice and set moment_match = TRUE, the warning messages disappear.
mass_wo_pop <- add_criterion(mass_wo_pop, "loo", moment_match = TRUE)
mass_w_pop <- add_criterion(mass_w_pop, "loo", moment_match = TRUE)
Now we can compare the two models.
loo_compare(mass_wo_pop, mass_w_pop)
elpd_diff se_diff
mass_wo_pop 0.0 0.0
mass_w_pop -0.2 0.4
Again, the model that doesn't include population effects performs slightly better, but the magnitude of the difference is less than twice its standard error, so we don't have a good reason to prefer one model over the other. We'll proceed without including population in the predictions. Feel free to run through the examples below with population included to see if it makes a difference.
Let’s see how well we can predict phenotypes simply by knowing their genotype.
plot_obs_vs_pred <- function(y, result) {
  ## posterior predictive draws: one row per draw, one column per individual
  y_rep <- posterior_predict(result)
  ## posterior-mean prediction for each individual
  y_rep_mean <- apply(y_rep, 2, mean)
  for_plot <- tibble(Observed = y, Predicted = y_rep_mean)
  p <- ggplot(for_plot, aes(x = Predicted, y = Observed)) +
    geom_point() +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    theme_bw()
  p
}
plot_obs_vs_pred(standardize(dat$Mass), mass_wo_pop)
The dashed line is the 1:1 line corresponding to a perfect fit. The predictions look fairly reasonable.
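If you'd like a number to go with the picture, one simple summary is the correlation between the observed values and the posterior-mean predictions. This is a sketch that assumes Mass has no missing values, so the two vectors line up one-to-one with the individuals:
## correlation between observed and posterior-mean predicted Mass
obs_mass <- standardize(dat$Mass)
pred_mass <- apply(posterior_predict(mass_wo_pop), 2, mean)
cor(obs_mass, pred_mass)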
## construct the model formula for PD
##
formula_string <- paste("standardize(PD)", paste(colnames(genos), collapse = " + "), sep = " ~ ")
formula_string <- paste(formula_string, "+ (1|gr(sample, cov = A))")
## run the model
##
pd <- brm(formula_string,
          data = dat,
          data2 = list(A = A),
          family = gaussian(),
          prior = set_prior(horseshoe(par_ratio = 20/n_loci)),
          refresh = 0)
summarize_analysis(pd, "")
Mean: ( 2.5%, 10%, 50%, 90%, 97.5%)
X10472: 0.05728 (-0.00726, -0.00159, 0.00033, 0.27752, 0.45797)
X66820: -0.02863 (-0.34438, -0.09914, -0.00007, 0.00219, 0.00820)
X13105: -0.01416 (-0.21150, -0.01486, -0.00008, 0.00248, 0.00811)
X21382: 0.01236 (-0.01118, -0.00278, 0.00003, 0.01718, 0.18275)
X85712: -0.00991 (-0.18155, -0.01044, -0.00004, 0.00316, 0.01384)
X71305: 0.00646 (-0.01328, -0.00290, 0.00004, 0.01031, 0.10142)
X86543: 0.00642 (-0.01140, -0.00267, 0.00006, 0.01513, 0.07262)
X86542: 0.00529 (-0.01357, -0.00277, 0.00003, 0.01021, 0.07095)
X31591: -0.00516 (-0.11673, -0.00952, -0.00002, 0.00340, 0.01566)
X54861: 0.00475 (-0.01345, -0.00263, 0.00008, 0.01285, 0.05384)
X72788: 0.00471 (-0.01361, -0.00295, 0.00004, 0.01011, 0.05693)
X70551: -0.00467 (-0.06586, -0.00644, -0.00001, 0.00281, 0.01302)
X40182: 0.00449 (-0.01043, -0.00280, 0.00003, 0.01242, 0.05288)
X71254: -0.00434 (-0.10383, -0.00939, -0.00004, 0.00393, 0.02542)
X86539: 0.00430 (-0.00999, -0.00264, 0.00004, 0.00638, 0.04848)
X27546: 0.00415 (-0.01099, -0.00267, 0.00003, 0.00665, 0.04727)
X10914: -0.00386 (-0.03973, -0.01117, -0.00005, 0.00255, 0.01112)
X86806: 0.00384 (-0.01107, -0.00278, 0.00001, 0.00490, 0.04743)
X86541: 0.00382 (-0.01006, -0.00300, -0.00000, 0.00600, 0.04879)
X4680: 0.00375 (-0.01072, -0.00359, 0.00000, 0.00588, 0.05273)
plot_obs_vs_pred(standardize(dat$PD), pd)
## construct the model formula for TDT
##
formula_string <- paste("standardize(TDT)", paste(colnames(genos), collapse = " + "), sep = " ~ ")
formula_string <- paste(formula_string, "+ (1|gr(sample, cov = A))")
## run the model
##
tdt <- brm(formula_string,
           data = dat,
           data2 = list(A = A),
           family = gaussian(),
           prior = set_prior(horseshoe(par_ratio = 20/n_loci)),
           refresh = 0)
summarize_analysis(tdt, "")
Mean: ( 2.5%, 10%, 50%, 90%, 97.5%)
X16239: -0.01639 (-0.24355, -0.02754, -0.00008, 0.00277, 0.01086)
X44522: -0.01625 (-0.24763, -0.02057, -0.00006, 0.00307, 0.01130)
X65887: -0.01582 (-0.22369, -0.02816, -0.00009, 0.00278, 0.01160)
X71727: -0.01457 (-0.23450, -0.01625, -0.00004, 0.00323, 0.01213)
X16238: -0.01271 (-0.20599, -0.01502, -0.00004, 0.00320, 0.01358)
X36978: 0.01231 (-0.01145, -0.00322, 0.00007, 0.01517, 0.18926)
X27546: -0.01017 (-0.15769, -0.01638, -0.00005, 0.00310, 0.01344)
X75548: -0.00880 (-0.14268, -0.01322, -0.00004, 0.00339, 0.01236)
X4612: 0.00859 (-0.01309, -0.00325, 0.00003, 0.01007, 0.12053)
X86539: -0.00825 (-0.12206, -0.00933, -0.00005, 0.00366, 0.01537)
X86541: -0.00822 (-0.11857, -0.01078, -0.00003, 0.00357, 0.01263)
X29505: -0.00634 (-0.08264, -0.01053, -0.00005, 0.00346, 0.01313)
X29507: -0.00633 (-0.08188, -0.00962, -0.00004, 0.00332, 0.01379)
X10914: 0.00594 (-0.01399, -0.00345, 0.00003, 0.00912, 0.08638)
X6913: -0.00576 (-0.08061, -0.01029, -0.00004, 0.00312, 0.01130)
X86695: -0.00565 (-0.07921, -0.00887, -0.00004, 0.00326, 0.01350)
X15183: 0.00540 (-0.01540, -0.00373, 0.00003, 0.00939, 0.06860)
X38743: -0.00536 (-0.07641, -0.00853, -0.00003, 0.00391, 0.01358)
X58326: -0.00535 (-0.09226, -0.00765, -0.00002, 0.00330, 0.01378)
X54640: 0.00528 (-0.01246, -0.00312, 0.00002, 0.00933, 0.07188)
plot_obs_vs_pred(standardize(dat$TDT), tdt)
Now that we have all of the predictions we can see whether the individuals with similar genomic predictions for one trait are also similar for another.
library(cowplot)
mass_pred <- apply(posterior_predict(mass_wo_pop), 2, mean)
pd_pred <- apply(posterior_predict(pd), 2, mean)
tdt_pred <- apply(posterior_predict(tdt), 2, mean)
panel <- function(trait_1, trait_2, pred_1, pred_2) {
  for_plot <- tibble(Trait_1 = pred_1,
                     Trait_2 = pred_2)
  p <- ggplot(for_plot, aes(x = Trait_1, y = Trait_2)) +
    geom_point() +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    xlab(trait_1) +
    ylab(trait_2) +
    theme_bw()
  return(p)
}
mass_pd <- panel("Mass", "PD", mass_pred, pd_pred)
mass_tdt <- panel("Mass", "TDT", mass_pred, tdt_pred)
tdt_pd <- panel("TDT", "PD", tdt_pred, pd_pred)
plot_grid(mass_pd, mass_tdt, tdt_pd,
          nrow = 2)
The associations among the genomic values are fairly weak. Just eyeballing the relationships, it appears that selection on body mass (Mass) might lead to a change in total development time (TDT), because of the positive association between their genomic values.
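To put numbers on those eyeballed relationships, the pairwise correlations among the posterior-mean genomic predictions can be computed directly from the prediction vectors defined above (an optional sketch):
## pairwise correlations among the posterior-mean genomic predictions
cor(cbind(Mass = mass_pred, PD = pd_pred, TDT = tdt_pred))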