###BreedR workshop (Jaca, Spain, 2015) #R script: prepared by Dr. Vincent Segura #R script: to perform spatial and genetic analysis of forest genetic trials using breedR #the data file "DW.predicted.practice.txt" represents the predicted total dry biomass yield of black poplar experimental design in Orléans (France) in 2011 ###Part1 #set working directory setwd("F:/mngebreselas/Doc-INRA/Workshops/BreedR_Jaca_Spain_2015/Practical_session") #import data into R my_data<-read.table("DW.predicted.practice.txt", header=T, sep="\t") #check the phenotype data summary(my_data$DW.predicted.model1) #there are NA's. Total number of NA's = 1704 out of 6745 observations of one variable. #remove the NA's data_ok <- subset(my_data, !is.na(DW.predicted.model1)) rm(my_data) #After removing the NA's, the levels of the variable "Ident" is not changing accordingly. So correct the levels of "Ident". #so re-set the factor "Ident" data_ok$Ident <- as.factor(as.character(data_ok$Ident)) #conversion of coordinates into meters (tree spacing: 1m*2m) data_ok$X_ok <- data_ok$X * 2 data_ok$Y_ok <- data_ok$Y * 1 #2D map of phenotypic data library(ggplot2) p <- qplot(Y_ok, X_ok, fill = DW.predicted.model1, geom = "tile", data = data_ok, main = "DW.predicted.model1", xlab = "Y (m)", ylab = "X (m)") + scale_fill_gradient(low = "green", high = "red") bloc_limits <- cbind(aggregate(data_ok$Y_ok, list(data_ok$Bloc), min)$x, aggregate(data_ok$Y_ok, list(data_ok$Bloc), max)$x, aggregate(data_ok$X_ok, list(data_ok$Bloc), min)$x, aggregate(data_ok$X_ok, list(data_ok$Bloc), max)$x) rownames(bloc_limits) <- paste("Bloc", 1:6, sep = "") colnames(bloc_limits) <- c("Y_min", "Y_max", "X_min", "X_max") p + annotate("rect", xmin = bloc_limits[, 1] - 0.5, xmax = bloc_limits[, 2] + 0.5, ymin = bloc_limits[, 3] - 1, ymax = bloc_limits[, 4] + 1, col = "black", alpha = .1) #histogram:to show distribution of phenotypic data hist(data_ok$DW.predicted.model1, col = "grey", xlab = "DW.predicted.model1", main = "") #data are skewed towards lower values. ###Part2 #fixed effect ANOVA linmod <- lm(DW.predicted.model1 ~ as.factor(Bloc) + Ident, data = data_ok) anova(linmod) #model QC: #boxplot per block boxplot(DW.predicted.model1 ~ as.factor(Bloc), data = data_ok, col = "grey", xlab = "Block", ylab = "DW.predicted.model1") #suggests within block variance heterogeneity #look for distribution of residuals op <- par(mfrow = c(2, 2)) plot(linmod) par(op) #model QC shows that residuals are not iid #it suggests that data should be transformed #box-cox transformation library(MASS) boxcox_transf <- boxcox(linmod) lambda <- boxcox_transf$x[which.max(boxcox_transf$y)] lambda #boxcox procedure suggests the need for data transformation #boxcox procedure suggests that the best lambda value = 0.3434343 #box-cox transformation: Y_trsf = (Y^lambda -1)/lambda data_ok$DW.predicted.model1.trsf <- (data_ok$DW.predicted.model1^lambda-1)/lambda #check distribution of phenotype data hist(data_ok$DW.predicted.model1.trsf, col = "grey", xlab = "DW.predicted.model1.trsf", main = "") #check if the anova model is improved after data transformation: linmod_trsf <- lm(DW.predicted.model1.trsf ~ as.factor(Bloc) + Ident, data = data_ok) anova(linmod_trsf) #boxplot per block for phenotypic data after transformation boxplot(DW.predicted.model1.trsf ~ as.factor(Bloc), data = data_ok, col = "grey", xlab = "Block", ylab = "DW.predicted.model1.trsf") #now within block variances seem more homogeneous #check distribution of residuals op <- par(mfrow = c(2, 2)) plot(linmod_trsf) par(op) #it looks better! ###Part3 #Linear mixed model analysis without a spatial effect using breedR (classical model) library(breedR) mixmod_breedR <- remlf90(fixed = DW.predicted.model1.trsf ~ 1 + as.factor(Bloc), random = ~ Ident, data = data_ok, method = "ai") summary(mixmod_breedR) #create a new data frame "resid" that includes the data frame "data_ok" and a new variable "resid_breedR" resid <- data.frame(data_ok, "resid_breedR" = residuals(mixmod_breedR)) ##spatial effect diagnosis #2D plot of residuals from the classical model "mixmod_breedR" r <- qplot(Y_ok, X_ok, fill = resid_breedR, geom = "tile", data = resid, main = "mixmod_breedR_residuals", xlab = "Y (m)", ylab = "X (m)") + scale_fill_gradient(low = "green", high = "red") r + annotate("rect", xmin = bloc_limits[, 1] - 0.5, xmax = bloc_limits[, 2] + 0.5, ymin = bloc_limits[, 3] - 1, ymax = bloc_limits[, 4] + 1, col = "black", alpha = .1) #variograms of residuals from the classical model "mixmod_breedR" variogram(mixmod_breedR, coord = data_ok[, colnames(data_ok) %in% c("X_ok", "Y_ok")], R = 60) #it confirms the existence of spatial effects. #AIC from the mixed model w/o spatial effects extractAIC(mixmod_breedR) ###Part 4 #Linear mixed model with a spatial effect: autoregressive with Block effects. mixmod_breedR_AR1_bloc_grid0 <- remlf90(fixed = DW.predicted.model1.trsf ~ 1 + as.factor(Bloc), random = ~ Ident, spatial = list(model = "AR", coordinates = data_ok[, c("X_ok", "Y_ok")]), data = data_ok, method = "ai") save(mixmod_breedR_AR1_bloc_grid0, file = "mixmod_breedR_AR1_bloc_grid0_DW.predicted_model1_trsf.Rdata") #Once the 2 previous commands are executed, one can skip them and replace them by the next one to save time... load("mixmod_breedR_AR1_bloc_grid0_DW.predicted_model1_trsf.Rdata") qplot(rho_r, rho_c, fill = loglik, geom = "tile", data = mixmod_breedR_AR1_bloc_grid0$rho) extractAIC(mixmod_breedR_AR1_bloc_grid0) summary(mixmod_breedR_AR1_bloc_grid0) #let's try to refine the grid of rho to improve model fit and thus decrease the AIC #grid1 rho.grid <- expand.grid(rho_r = seq(0.5, 0.99, length = 4), rho_c = seq(0.5, 0.99, length = 4)) mixmod_breedR_AR1_bloc_grid1 <- remlf90(fixed = DW.predicted.model1.trsf ~ 1 + as.factor(Bloc), random = ~ Ident, spatial = list(model = "AR", coordinates = data_ok[, c("X_ok", "Y_ok")], rho = rho.grid), data = data_ok, method = "ai") save(mixmod_breedR_AR1_bloc_grid1, file = "mixmod_breedR_AR1_bloc_grid1_DW.predicted_model1_trsf.Rdata") # load("mixmod_breedR_AR1_bloc_grid1_DW.predicted_model1_trsf.Rdata") qplot(rho_r, rho_c, fill = loglik, geom = "tile", data = mixmod_breedR_AR1_bloc_grid1$rho) extractAIC(mixmod_breedR_AR1_bloc_grid1) summary(mixmod_breedR_AR1_bloc_grid1) #grid2:refine further rho.grid <- expand.grid(rho_r = seq(0.81, 0.84, length = 4), rho_c = seq(0.98, 0.99, length = 4)) mixmod_breedR_AR1_bloc_grid2 <- remlf90(fixed = DW.predicted.model1.trsf ~ 1 + as.factor(Bloc), random = ~ Ident, spatial = list(model = "AR", coordinates = data_ok[, c("X_ok", "Y_ok")], rho = rho.grid), data = data_ok, method = "ai") save(mixmod_breedR_AR1_bloc_grid2, file = "mixmod_breedR_AR1_bloc_grid2_DW.predicted_model1_trsf.Rdata") # load("mixmod_breedR_AR1_bloc_grid2_DW.predicted_model1_trsf.Rdata") qplot(rho_r, rho_c, fill = loglik, geom = "tile", data = mixmod_breedR_AR1_bloc_grid2$rho) extractAIC(mixmod_breedR_AR1_bloc_grid2) summary(mixmod_breedR_AR1_bloc_grid2) #grid3 rho.grid <- expand.grid(rho_r = seq(0.83, 0.85, length = 4), rho_c = seq(0.97, 0.99, length = 4)) mixmod_breedR_AR1_bloc_grid3 <- remlf90(fixed = DW.predicted.model1.trsf ~ 1 + as.factor(Bloc), random = ~ Ident, spatial = list(model = "AR", coordinates = data_ok[, c("X_ok", "Y_ok")], rho = rho.grid), data = data_ok, method = "ai") save(mixmod_breedR_AR1_bloc_grid3, file = "mixmod_breedR_AR1_bloc_grid3_DW.predicted_model1_trsf.Rdata") # load("mixmod_breedR_AR1_bloc_grid3_DW.predicted_model1_trsf.Rdata") qplot(rho_r, rho_c, fill = loglik, geom = "tile", data = mixmod_breedR_AR1_bloc_grid3$rho) extractAIC(mixmod_breedR_AR1_bloc_grid3) summary(mixmod_breedR_AR1_bloc_grid3) #grid4 rho.grid <- expand.grid(rho_r = seq(0.84, 0.88, length = 4), rho_c = seq(0.96, 0.99, length = 4)) mixmod_breedR_AR1_bloc_grid4 <- remlf90(fixed = DW.predicted.model1.trsf ~ 1 + as.factor(Bloc), random = ~ Ident, spatial = list(model = "AR", coordinates = data_ok[, c("X_ok", "Y_ok")], rho = rho.grid), data = data_ok, method = "ai") save(mixmod_breedR_AR1_bloc_grid4, file = "mixmod_breedR_AR1_bloc_grid4_DW.predicted_model1_trsf.Rdata") # load("mixmod_breedR_AR1_bloc_grid4_DW.predicted_model1_trsf.Rdata") qplot(rho_r, rho_c, fill = loglik, geom = "tile", data = mixmod_breedR_AR1_bloc_grid4$rho) extractAIC(mixmod_breedR_AR1_bloc_grid4) summary(mixmod_breedR_AR1_bloc_grid4) #grid5: rho.grid <- expand.grid(rho_r = seq(0.83, 0.88, length = 4), rho_c = seq(0.95, 0.99, length = 4)) mixmod_breedR_AR1_bloc_grid5 <- remlf90(fixed = DW.predicted.model1.trsf ~ 1 + as.factor(Bloc), random = ~ Ident, spatial = list(model = "AR", coordinates = data_ok[, c("X_ok", "Y_ok")], rho = rho.grid), data = data_ok, method = "ai") save(mixmod_breedR_AR1_bloc_grid5, file = "mixmod_breedR_AR1_bloc_grid5_DW.predicted_model1_trsf.Rdata") # load("mixmod_breedR_AR1_bloc_grid5_DW.predicted_model1_trsf.Rdata") qplot(rho_r, rho_c, fill = loglik, geom = "tile", data = mixmod_breedR_AR1_bloc_grid5$rho) extractAIC(mixmod_breedR_AR1_bloc_grid5) summary(mixmod_breedR_AR1_bloc_grid5) #compare model AICs extractAIC(mixmod_breedR) extractAIC(mixmod_breedR_AR1_bloc_grid0) extractAIC(mixmod_breedR_AR1_bloc_grid1) extractAIC(mixmod_breedR_AR1_bloc_grid2) extractAIC(mixmod_breedR_AR1_bloc_grid3) extractAIC(mixmod_breedR_AR1_bloc_grid4) extractAIC(mixmod_breedR_AR1_bloc_grid5) #mixmod_breedR_AR1_bloc_grid5:is the best model(minimum AIC) #best autoregressive parameters for rows and columns: (0.846, 0.976) ##mixed model with a spatial effect with the best autoregressive parameters for rows and columns: autoregressive with block effect selmod <- remlf90(fixed = DW.predicted.model1.trsf ~ 1 + as.factor(Bloc), random = ~ Ident, spatial = list(model = "AR", coordinates = data_ok[, c("X_ok", "Y_ok")], rho = c(rho_r= 0.846, rho_c = 0.976)), data = data_ok, method = "ai") save(selmod, file = "selmod_DW.predicted_model1_trsf.Rdata") # load("selmod_DW.predicted_model1_trsf.Rdata") summary(selmod) ##Part 5: Focus on selected spatial model "selmod" #extraction of predicted values for fixed and random effects BLUP_genot <- data.frame("Ident" = rownames(selmod$ranef$Ident), GBLUP = selmod$ranef$Ident$value) Output <- data.frame(na.omit(data_ok, "Coord" = paste(data_ok$X_ok, data_ok$Y_ok, sep = "-"))) Output$Coord <- paste(Output$X_ok, Output$Y_ok, sep = "-") Block_effect <- as.vector(sapply(selmod$fixed, function(x){x$value})) Output$Block_effect <- Block_effect[1] for (i in 2:length(Block_effect)){ Output$Block_effect[Output$Bloc == i] <- Block_effect[i] } Output <- merge(Output, BLUP_genot, by = "Ident") spatial <- data.frame(selmod$effects$spatial$effects$ar$coordinates, Spatial = selmod$effects$spatial$effects$ar$incidence.matrix %*% ranef(selmod)$spatial) Output <- merge(Output, data.frame("Coord" = paste(spatial$X_ok, spatial$Y_ok, sep = "-"), "Spatial" = spatial$Spatial), by = "Coord") Output$Fitted <- Output$Block_effect + Output$GBLUP + Output$Spatial resid$Selmod <- na.omit(residuals(selmod)) Output <- merge(Output, data.frame("Coord" = paste(resid$X_ok, resid$Y_ok, sep = "-"), "Resid" = resid$Selmod), by = "Coord") Output$Phen_Adj <- mean(Output$DW.predicted.model1.trsf) + Output$GBLUP + Output$Resid ##Save the predicted values for further analysis save(list = c("Output", "BLUP_genot"), file = "Output_BreedR_Orl_DW.predicted.model1.trsf.Rdata") write.table(Output, file = "Output_BreedR_Orl_DW.predicted.model1.trsf.txt", sep = "\t", row.names = FALSE) #2D map of spatial effects s <- qplot(Output$Y_ok, Output$X_ok, fill = Output$Spatial, geom = "tile", main ="Spatial effects", xlab = "Y (m)", ylab = "X (m)") + scale_fill_gradient(low = "green", high = "red") s + annotate("rect", xmin = bloc_limits[, 1] - 0.5, xmax = bloc_limits[, 2] + 0.5, ymin = bloc_limits[, 3] - 1, ymax = bloc_limits[, 4] + 1, col = "black", alpha = .1) #2D map of residuals from selected model "selmod" t <- qplot(Output$Y_ok, Output$X_ok, fill = Output$Resid, geom = "tile", data = resid, main ="residuals", xlab = "Y (m)", ylab = "X (m)") + scale_fill_gradient(low = "green", high = "red") t + annotate("rect", xmin = bloc_limits[, 1] - 0.5, xmax = bloc_limits[, 2] + 0.5, ymin = bloc_limits[, 3] - 1, ymax = bloc_limits[, 4] + 1, col = "black", alpha = .1) #2D map of genotype BLUPs from selected model "selmod" u <- qplot(Output$Y_ok, Output$X_ok, fill = Output$GBLUP, geom = "tile", main ="Genotype BLUPs", xlab = "Y (m)", ylab = "X (m)") + scale_fill_gradient(low = "green", high = "red") u + annotate("rect", xmin = bloc_limits[, 1] - 0.5, xmax = bloc_limits[, 2] + 0.5, ymin = bloc_limits[, 3] - 1, ymax = bloc_limits[, 4] + 1, col = "black", alpha = .1) #check distribution of genotype BLUPs per Bloc boxplot(GBLUP ~ Bloc, data = Output, col = "grey", xlab = "Block", ylab = "Genotype BLUP") #compare variograms of residuals from classical model (mixmod_breedR) & selected spatial model(selmod) #plot the 2 variograms on the same scale using the variogram function of breedR variogram_null <- variogram(mixmod_breedR, coord = data_ok[, colnames(data_ok) %in% c("X_ok", "Y_ok")], R = 60) variogram_selmod <- variogram(selmod, R = 60) # Extract both isotropic variograms to be compared isotropic_variograms <- rbind(cbind(model = "Block", variogram_null[["isotropic"]]), cbind(model = "AR1_Block", variogram_selmod[["isotropic"]])) # Then, you can plot the 2 variograms together under the same scale ggplot(isotropic_variograms, aes(distance, variogram)) + geom_point() + geom_line() + stat_smooth(se = FALSE, method = 'auto') + facet_wrap(~ model) #histogram of random effect blups from selected spatial model op <- par(mfrow = c(2, 2), mar = c(4.5, 4.5, 2, 1)) hist(BLUP_genot$GBLUP, main = "Genotype", xlab = "BLUPs", col = "grey") hist(Output$Spatial, main = "Spatial effect", xlab = "BLUPs", col = "grey") hist(Output$Resid, col = "grey", "xlab" = "Resid", main = "residuals") hist(Output$Phen_Adj, col = "grey", "xlab" = "Phenotype", main = "Adjusted phenotype") par(op) ##Part 6: Genetic analysis based on the selected model "selmod" #extraction of the average information (AI) matrix from breedR output object "selmod" aimat <- which(selmod$reml$output == " inverse of AI matrix (Sampling Variance)") varcov_mat_breedR <- matrix(na.omit(as.numeric(unlist(apply(data.frame(selmod$reml$output[ (aimat + 1):(aimat + 3)]), 1, function(x){strsplit(x, " ")})))), 3, 3) colnames(varcov_mat_breedR) <- c("Ident", "spatial", "Residual") rownames(varcov_mat_breedR) <- c("Ident", "spatial", "Residual") #delta method for H²ind library(msm) H2 <- selmod$var["Ident", "Estimated variances"] / (selmod$var["Ident", "Estimated variances"] + selmod$var["Residual", "Estimated variances"]) se_H2 <- deltamethod(~ x1 / (x1 + x2), + c(selmod$var["Ident", "Estimated variances"], + selmod$var["Residual", "Estimated variances"]), + varcov_mat_breedR[c("Ident", "Residual"), c("Ident", "Residual")]) round(c(H2, 1.96*se_H2), 2) #H²mean with se nb_obs_genot <- length(na.omit(selmod$mf$DW.predicted.model1.trsf)) / length(levels(selmod$mf$Ident)) H2_mean <- selmod$var["Ident", "Estimated variances"] / (selmod$var["Ident", "Estimated variances"] + selmod$var["Residual", "Estimated variances"] / nb_obs_genot) se_H2_mean <- deltamethod(~ x1 / (x1 + x3 / nb_obs_genot), selmod$var[, "Estimated variances"], varcov_mat_breedR) round(c(H2_mean, 1.96*se_H2_mean), 2) #coefficient of genetic variation (CV) CVg <- sqrt(selmod$var["Ident", "Estimated variances"]) / mean(selmod$mf$DW.predicted.model1.trsf, na.rm = TRUE) * 100 round(CVg, 2) ########## ########### ############## #Tips! #stacking the AICs into a vector mixmods<-list(mixmod_breedR, selmod) names(mixmods)<-c("Block", "AR1_Block") AICs<-sapply(mixmods,extractAIC) #classical model vs selected spatial model: estimation of individual broad-sense heritability with each model herit <- sapply(mixmods, function(x){x$var["Ident", "Estimated variances"] / (x$var["Ident", "Estimated variances"] + x$var["Residual", "Estimated variances"])}) ##############Hope you will enjoy breedR!!!