27 juin 2015

Exercise of genomic evaluation

This is an example of G-BLUP versus pedigree BLUP. The former uses a marker-based additive relationship matrix (G), while the latter uses the well known pedigree-based additive relationship matrix (A). The former (G-BLUP) is the simplest and one of the most used genomic evaluation method worldwide.

Steps of the exercise

This exercise has the following steps

  • read the genotypic and phenotypic data
  • run a genetic model based on pedigree (A matrix)
  • calculate from genotypes the marker-based additive relationship matrix (G)
  • calculate the incidence matrix
  • run a genetic model based on given incidence and G matrices
  • calculate the correlations between true BV and predicted BV
  • plot a heatmap of A and G
  • try two different datasets (different traits)
  • try decreasing subsamples of markers to degrade G signal

Libraries

library(breedR) # no way without it!
## Loading required package: sp

Libraries for pedigrees

library(pedantics)
## Loading required package: MasterBayes
## Loading required package: coda
## Loading required package: genetics
## Loading required package: combinat
## 
## Attaching package: 'combinat'
## 
## The following object is masked from 'package:utils':
## 
##     combn
## 
## Loading required package: gdata
## gdata: Unable to locate valid perl interpreter
## gdata: 
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata: 
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.
## 
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.
## 
## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.
## 
## Attaching package: 'gdata'
## 
## The following object is masked from 'package:stats':
## 
##     nobs
## 
## The following object is masked from 'package:utils':
## 
##     object.size
## 
## Loading required package: gtools
## Loading required package: MASS
## Loading required package: mvtnorm
## 
## 
## NOTE: THIS PACKAGE IS NOW OBSOLETE.
## 
## 
## 
##   The R-Genetics project has developed an set of enhanced genetics
## 
##   packages to replace 'genetics'. Please visit the project homepage
## 
##   at http://rgenetics.org for informtion.
## 
## 
## 
## 
## Attaching package: 'genetics'
## 
## The following objects are masked from 'package:base':
## 
##     %in%, as.factor, order
## 
## Loading required package: kinship2
## Loading required package: Matrix
## Loading required package: quadprog
## Loading required package: MCMCglmm
## Loading required package: ape
## Loading required package: grid
library(pedigree) # pedigree functions
## Loading required package: HaploSim
## Loading required package: reshape
## 
## Attaching package: 'reshape'
## 
## The following object is masked from 'package:Matrix':
## 
##     expand
## 
## 
## Attaching package: 'pedigree'
## 
## The following object is masked from 'package:MasterBayes':
## 
##     orderPed
library(pedigreemm) # pedigree functions
## Loading required package: lme4
## Warning: package 'lme4' was built under R version 3.2.1
## 
## Attaching package: 'pedigreemm'
## 
## The following object is masked from 'package:kinship2':
## 
##     pedigree

Libraries for graphics

library(car) # graphic functions
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:gtools':
## 
##     logit
library(hexbin) # graphic functions
## Warning: package 'hexbin' was built under R version 3.2.1

Import pedigree file already including founders

pheno_ped <- read.table("pheno_ped_case2.txt", header = T)
names(pheno_ped) <- c("self", "dad", "mum", "gen", "BV_X", "phe_X")
pheno_ped4 <- subset(pheno_ped, gen==4)
pedig <- build_pedigree(c('self', 'dad', 'mum'), data = pheno_ped)
## Warning in build_pedigree(c("self", "dad", "mum"), data = pheno_ped): The
## pedigree has been recoded. Check attr(ped, 'map').
num_tot <- as.numeric(dim(pheno_ped)[1])
num_rec <- as.numeric(dim(pheno_ped4)[1])
founders <- num_tot-num_rec

check variables in data.frame

summary(pheno_ped)
##       self           dad            mum            gen          BV_X      
##  Min.   :   2   Min.   :   0   Min.   :   0   Min.   :4    Min.   : 5.00  
##  1st Qu.:1826   1st Qu.:1188   1st Qu.:1219   1st Qu.:4    1st Qu.:16.00  
##  Median :1993   Median :1291   Median :1366   Median :4    Median :19.00  
##  Mean   :1844   Mean   :1252   Mean   :1262   Mean   :4    Mean   :18.99  
##  3rd Qu.:2160   3rd Qu.:1534   3rd Qu.:1554   3rd Qu.:4    3rd Qu.:22.00  
##  Max.   :2328   Max.   :1705   Max.   :1689   Max.   :4    Max.   :30.00  
##                                               NA's   :95   NA's   :95     
##      phe_X        
##  Min.   :-0.0668  
##  1st Qu.:14.5234  
##  Median :18.6149  
##  Mean   :18.8215  
##  3rd Qu.:23.2682  
##  Max.   :37.2241  
##  NA's   :95

plot trait distribution

run genetic model based on pedigree

pedmod_phe_X <- remlf90(fixed  = phe_X ~ 1,
                      genetic = list(model    = 'add_animal',
                                     pedigree = pedig,
                                     id       = 'self'),
                      data   = pheno_ped4)
## No specification of initial variances.
##      Using default value of 1 for all variance components.
##      See ?breedR.getOption.

variance components and effects

summary(pedmod_phe_X)
## Linear Mixed Model with pedigree and spatial effects fit by AI-REMLF90 ver. 1.110 
##    Data: pheno_ped4 
##   AIC  BIC logLik
##  3783 3791  -1889
## 
## Parameters of special components:
## 
## 
## Variance components:
##          Estimated variances  S.E.
## genetic                7.383 4.156
## Residual              36.560 3.226
## 
## Fixed effects:
##            value   s.e.
## Intercept 18.631 1.2513
hh_phe_X <- pedmod_phe_X$var[1,1]/(pedmod_phe_X$var[1,1]+pedmod_phe_X$var[2,1])
hh_phe_X
## [1] 0.1680207

imports genotypic data and creates X matrix

# import genotypic data
genot <- read.table("genotypes_case2.txt", sep = " ", header = F)
# removes self from file, follows same order as production file
genot_no_self <- genot[c(-1)]
# number of markers
n_SNP <- as.numeric(dim.data.frame(genot_no_self)[2])
# matrix of genotypes
X <- as.matrix(genot_no_self,nrow=num_rec,ncol=n_SNP,dimnames=NULL)

calculates G matrix

# calculates frequency of favourable allele per marker
Pi <- apply(X,2,sum)/(2*num_rec)
mat_Pi <- matrix(rep(Pi,num_rec),ncol=n_SNP,byrow=T)
W <- matrix(0,nrow=num_rec,ncol=n_SNP)
W <- X - (2*mat_Pi)
het <- 2*sum(Pi*(1-Pi))
G <- W%*%t(W) / het
# inverse is not needed
G_inv <- solve(G+diag(num_rec)*0.01)

creates incidence and structure matrices

# Make it index matrix, stored as 1-based integer index vectors.
# An index matrix is a matrix with exactly one non-zero entry per row.
# Index matrices are useful for mapping observations.
gen.inc.alt0 <- matrix(0,num_rec,1)
gen.inc.alt1 <- diag(1,num_rec,num_rec)
gen.inc.alt <- as(cbind(gen.inc.alt1),"indMatrix")
# Makes G a symmetric & sparse numeric matrix in compressed
# column-oriented format (dsCMatrix).
gen.cov <- as(G, "dsCMatrix")

runs a generic model with given incidence and G matrices

gsmod_phe_X <- remlf90(fixed   = phe_X ~ 1,
                     generic = list(gen = list(gen.inc.alt,gen.cov)),
                     data    = pheno_ped4)
## No specification of initial variances.
##      Using default value of 1 for all variance components.
##      See ?breedR.getOption.

summary generic model with given incidence and G matrices

summary(gsmod_phe_X)
## Linear Mixed Model with pedigree and spatial effects fit by AI-REMLF90 ver. 1.110 
##    Data: pheno_ped4 
##   AIC  BIC logLik
##  3715 3724  -1856
## 
## Parameters of special components:
## 
## 
## Variance components:
##          Estimated variances  S.E.
## gen                    13.14 3.325
## Residual               31.28 2.194
## 
## Fixed effects:
##            value   s.e.
## Intercept 18.836 0.2354
hh_gs_phe_X <- gsmod_phe_X$var[1,1]/(gsmod_phe_X$var[1,1]+gsmod_phe_X$var[2,1])
hh_gs_phe_X
## [1] 0.2957968

calculates correlations with true BV

PBV.full.gen <- ranef(gsmod_phe_X)$gen
PBV.gen <- model.matrix(gsmod_phe_X)$gen %*% PBV.full.gen

PBV.full.ped <- ranef(pedmod_phe_X)$genetic
PBV.ped <- model.matrix(pedmod_phe_X)$genetic %*% PBV.full.ped

cor_gen <- cor(pheno_ped4$BV_X,PBV.gen)
cor_ped <- cor(pheno_ped4$BV_X,PBV.ped)
# calculates min and max for graphical purposes
max_BV <- max(pheno_ped4$BV_X)
min_BV <- min(pheno_ped4$BV_X)
max_EBV <- max(PBV.gen,PBV.ped)
min_EBV <- min(PBV.gen,PBV.ped)

scatterplots between True and predicted BVs for G-BLUP

## scatterplots between True and predicted BVs for pedigree-BLUP ## hitmap of G matrix ## hitmap of G matrix