New.GWAS.Scoring

Last update: 2018-01-18

Code version: 5996c19850dc5fc65723f30bc33ab1c8d083a7e6

Setting important directories. Also loading important libraries and custom functions for analysis.

seq_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/data"
file_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/output"
Rdata_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/data"
Script_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/code"
source(file.path(Script_dir,'init.R'))
source(file.path(Script_dir,"tools_R.r"))

# Loading extra libraries
library(DT)

# Loading a CDS so gene names can be retrieved
dat <- readRDS(file.path(Rdata_dir,"P7.Mb.dat.filter.final.Rds"))

Loading genesets needed for scoring

In order to score the PD GWAS loci genes, we need to load: 1) SN DA neuron expressed genes from our data 2) SN DA neuron expressed genes from La Manno, et al. 3) P7 MB differentially expressed genes from our data 4) SN DA neuron specific genes 5) Genes in WGCNA modules enriched for ‘Parkinson’s disease’ genesets

# SNC expressed genes in our data
SN.expressed.genes <- readRDS(file = file.path(Rdata_dir, "SN.expressed.genes.rds"))
length(SN.expressed.genes)

## [1] 6126

# SNC expresed genes in Linnarsson Data
lin.snc.exprs <- readRDS(file = file.path(Rdata_dir,"lin.snc.expressed.rds"))
length(lin.snc.exprs)

## [1] 5406

# P7 differentially expressed genes (between all clusters)
P7.Mb.diff.genes <- readRDS(file = file.path(Rdata_dir,"P7.diff.genes.rds"))
length(P7.Mb.diff.genes)

## [1] 3971

# SN Specific genes
specific.genes <- read.delim(file.path(file_dir,"P7.specific.genes.txt"))
SN.specific.genes <- specific.genes$P7.MB.4
length(SN.specific.genes)

## [1] 110

# Co-regulated with PD
WGCNA.brown <- readRDS(file = file.path(file_dir, "WGCNA.ParkGeneSet.Brown.rds"))
WGCNA.green <- readRDS(file = file.path(file_dir, "WGCNA.ParkGeneSet.Green.rds"))
co.reg.genes <- lookupGeneName(dat,unique(c(WGCNA.green, WGCNA.brown)))
length(co.reg.genes)

## [1] 215

Loading the PD GWAS loci gene table created

PD.GWAS.gene.table <- read.delim(file = file.path(file_dir,"PD.loci.genes.final-new.txt"), header = T)

Scoring the PD GWAS loci genes

# Copy over the table
scoring.table <- PD.GWAS.gene.table

# Score based on expression in our data
scoring.table$expression <- if_else(scoring.table$MouseSymbol %in% SN.expressed.genes, 1, 0)

# Scoring based on expression in La Manno data
scoring.table$lin.exprs <- if_else(scoring.table$MouseSymbol %in% lin.snc.exprs, 1, 0)

# Score based on being a SN DA specific gene
scoring.table$specific <- if_else(scoring.table$MouseSymbol %in% SN.specific.genes, 1, 0)

# Score based on being in a WGCNA PD module
scoring.table$co.reg <- if_else(scoring.table$MouseSymbol %in% co.reg.genes, 1, 0)

# Score on being a differentially regulated gene (differentially expressed)
scoring.table$diff <- if_else(scoring.table$MouseSymbol %in% P7.Mb.diff.genes, 1, 0)

# Adding up the scores
scoring.table$score <- scoring.table$co.reg + scoring.table$specific + scoring.table$diff + scoring.table$lin.exprs + scoring.table$expression

# Add pLI scores
pli.table <- read.delim(file = file.path(file_dir,"fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt")) %>% dplyr::select(gene, pLI)

# Merge the scoring and pLI tables
scoring.table <- merge(x = scoring.table, y = pli.table, by.x = "HumanSymbol", by.y = "gene", all.x = T)

# If a gene does not have a pLI, give it a score of 0
scoring.table$pLI[is.na(scoring.table$pLI)] <- 0

# Adding pLI to the total score
score.pLI <- scoring.table$pLI + scoring.table$score
scoring.table$score.pLI <- round(score.pLI,3)

Writing out the table

We now want to write the scoring table out. There is code contained in here which will write out each locus separately.

# Writing out each individual loci ordered by "score"
scoring.table$locus <- as.factor(scoring.table$locus)
locus.levels <- levels(scoring.table$locus)

#Results.folder <- "/Volumes/PAULHOOK/Revision/New.scoring"
#for(i in 1:length(locus.levels)){
  #x <- scoring.table[scoring.table$locus == locus.levels[i],]
  #y <- x[!(duplicated(x$HumanSymbol)),]
  #z <- y[order(y$score, decreasing = T),]
  #write.table(z, file = paste0(Results.folder,".",locus.levels[i],".locus-new.txt"), row.names = F, quote = F, sep = "\t")
#}

# Writing out the entire table ordered by score
write.table(scoring.table[order(scoring.table$score.pLI, decreasing = T),],
            file = file.path(file_dir,"PD.GWAS.Score.Final.txt"),
            row.names = F, quote = F, sep = "\t")

Writing out a locus summary table

We now want to write out a summary table that summarizes each locus.

# Group by locus and count how many genes are in each locus and how many are expressed in either
sum.table <- scoring.table %>% 
  group_by(locus) %>% 
  dplyr::summarize(num_genes = length(locus),
                   num_expressed_either = sum(lin.exprs == 1 | expression == 1),
                   num_expressed_both = sum(lin.exprs == 1 & expression == 1),
                   num_homolog = sum(!is.na(MouseSymbol)),
                   num_no_homolog = sum(is.na(MouseSymbol)))

## Warning: package 'bindrcpp' was built under R version 3.3.2

# Writing out summary table
write.table(sum.table, file = file.path(file_dir, "PD-GWAS-Summary-Table.txt"),
            row.names = F, quote = F, sep = "\t")


# Getting summary stats
#sum(sum.table$num_genes)
#sum(sum.table$num_homolog)
#sum(sum.table$num_no_homolog)

#sum(sum.table$num_expressed_both)
#sum(sum.table$num_expressed_either)

#nrow(scoring.table[is.na(scoring.table$MouseSymbol),]) #462
#nrow(scoring.table[is.na(scoring.table$MouseSymbol) & scoring.table$gene_biotype == "protein_coding",]) #87

View scoring table

datatable(scoring.table, rownames = F, caption = "PD GWAS gene scoring")

View locus summary table

datatable(sum.table, rownames = F, caption = "PD GWAS summary of scoring table")

Session Info

sessionInfo()

## R version 3.3.0 (2016-05-03)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.11.6 (El Capitan)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      splines   stats4    parallel  stats     graphics  grDevices
##  [8] utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2        DT_0.2              ggbiplot_0.55      
##  [4] scales_0.5.0        SC3_1.1.4           ROCR_1.0-7         
##  [7] jackstraw_1.1.1     lfa_1.2.2           tsne_0.1-3         
## [10] gridExtra_2.3       slackr_1.4.2        vegan_2.4-4        
## [13] permute_0.9-4       MASS_7.3-47         gplots_3.0.1       
## [16] RColorBrewer_1.1-2  Hmisc_4.0-3         Formula_1.2-2      
## [19] survival_2.41-3     lattice_0.20-35     Heatplus_2.18.0    
## [22] Rtsne_0.13          pheatmap_1.0.8      tidyr_0.7.1        
## [25] dplyr_0.7.4         plyr_1.8.4          heatmap.plus_1.3   
## [28] stringr_1.2.0       marray_1.50.0       limma_3.28.21      
## [31] reshape2_1.4.3      monocle_2.2.0       DDRTree_0.1.5      
## [34] irlba_2.2.1         VGAM_1.0-2          ggplot2_2.2.1      
## [37] Biobase_2.32.0      BiocGenerics_0.18.0 Matrix_1.2-11      
## 
## loaded via a namespace (and not attached):
##  [1] RSelenium_1.7.1        colorspace_1.3-2       class_7.3-14          
##  [4] rprojroot_1.2          htmlTable_1.9          corpcor_1.6.9         
##  [7] base64enc_0.1-3        mvtnorm_1.0-6          codetools_0.2-15      
## [10] doParallel_1.0.11      robustbase_0.92-7      knitr_1.17            
## [13] jsonlite_1.5           cluster_2.0.6          semver_0.2.0          
## [16] shiny_1.0.5            rrcov_1.4-3            httr_1.3.1            
## [19] backports_1.1.1        assertthat_0.2.0       lazyeval_0.2.1        
## [22] acepack_1.4.1          htmltools_0.3.6        tools_3.3.0           
## [25] igraph_1.1.2           gtable_0.2.0           glue_1.1.1            
## [28] binman_0.1.0           doRNG_1.6.6            Rcpp_0.12.14          
## [31] slam_0.1-37            gdata_2.18.0           nlme_3.1-131          
## [34] iterators_1.0.8        mime_0.5               rngtools_1.2.4        
## [37] gtools_3.5.0           WriteXLS_4.0.0         XML_3.98-1.9          
## [40] DEoptimR_1.0-8         yaml_2.1.15            pkgmaker_0.22         
## [43] rpart_4.1-11           fastICA_1.2-1          latticeExtra_0.6-28   
## [46] stringi_1.1.5          pcaPP_1.9-72           foreach_1.4.3         
## [49] e1071_1.6-8            checkmate_1.8.4        caTools_1.17.1        
## [52] rlang_0.1.6            pkgconfig_2.0.1        matrixStats_0.52.2    
## [55] bitops_1.0-6           qlcMatrix_0.9.5        evaluate_0.10.1       
## [58] purrr_0.2.4            bindr_0.1              htmlwidgets_0.9       
## [61] magrittr_1.5           R6_2.2.2               combinat_0.0-8        
## [64] wdman_0.2.2            foreign_0.8-69         mgcv_1.8-22           
## [67] nnet_7.3-12            tibble_1.3.4           KernSmooth_2.23-15    
## [70] rmarkdown_1.8          data.table_1.10.4      HSMMSingleCell_0.106.2
## [73] digest_0.6.12          xtable_1.8-2           httpuv_1.3.5          
## [76] openssl_0.9.7          munsell_0.4.3          registry_0.3

```

This R Markdown site was created with workflowr