Last update: 2018-01-18
Code version: 5996c19850dc5fc65723f30bc33ab1c8d083a7e6
seq_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/data"
file_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/output"
Rdata_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/data"
Script_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/code"
source(file.path(Script_dir,'init.R'))
source(file.path(Script_dir,"tools_R.r"))
# Loading extra libraries
library(DT)
# Loading a CDS so gene names can be retrieved
dat <- readRDS(file.path(Rdata_dir,"P7.Mb.dat.filter.final.Rds"))
In order to score the PD GWAS loci genes, we need to load: 1) SN DA neuron expressed genes from our data 2) SN DA neuron expressed genes from La Manno, et al. 3) P7 MB differentially expressed genes from our data 4) SN DA neuron specific genes 5) Genes in WGCNA modules enriched for ‘Parkinson’s disease’ genesets
# SNC expressed genes in our data
SN.expressed.genes <- readRDS(file = file.path(Rdata_dir, "SN.expressed.genes.rds"))
length(SN.expressed.genes)
## [1] 6126
# SNC expresed genes in Linnarsson Data
lin.snc.exprs <- readRDS(file = file.path(Rdata_dir,"lin.snc.expressed.rds"))
length(lin.snc.exprs)
## [1] 5406
# P7 differentially expressed genes (between all clusters)
P7.Mb.diff.genes <- readRDS(file = file.path(Rdata_dir,"P7.diff.genes.rds"))
length(P7.Mb.diff.genes)
## [1] 3971
# SN Specific genes
specific.genes <- read.delim(file.path(file_dir,"P7.specific.genes.txt"))
SN.specific.genes <- specific.genes$P7.MB.4
length(SN.specific.genes)
## [1] 110
# Co-regulated with PD
WGCNA.brown <- readRDS(file = file.path(file_dir, "WGCNA.ParkGeneSet.Brown.rds"))
WGCNA.green <- readRDS(file = file.path(file_dir, "WGCNA.ParkGeneSet.Green.rds"))
co.reg.genes <- lookupGeneName(dat,unique(c(WGCNA.green, WGCNA.brown)))
length(co.reg.genes)
## [1] 215
PD.GWAS.gene.table <- read.delim(file = file.path(file_dir,"PD.loci.genes.final-new.txt"), header = T)
# Copy over the table
scoring.table <- PD.GWAS.gene.table
# Score based on expression in our data
scoring.table$expression <- if_else(scoring.table$MouseSymbol %in% SN.expressed.genes, 1, 0)
# Scoring based on expression in La Manno data
scoring.table$lin.exprs <- if_else(scoring.table$MouseSymbol %in% lin.snc.exprs, 1, 0)
# Score based on being a SN DA specific gene
scoring.table$specific <- if_else(scoring.table$MouseSymbol %in% SN.specific.genes, 1, 0)
# Score based on being in a WGCNA PD module
scoring.table$co.reg <- if_else(scoring.table$MouseSymbol %in% co.reg.genes, 1, 0)
# Score on being a differentially regulated gene (differentially expressed)
scoring.table$diff <- if_else(scoring.table$MouseSymbol %in% P7.Mb.diff.genes, 1, 0)
# Adding up the scores
scoring.table$score <- scoring.table$co.reg + scoring.table$specific + scoring.table$diff + scoring.table$lin.exprs + scoring.table$expression
# Add pLI scores
pli.table <- read.delim(file = file.path(file_dir,"fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt")) %>% dplyr::select(gene, pLI)
# Merge the scoring and pLI tables
scoring.table <- merge(x = scoring.table, y = pli.table, by.x = "HumanSymbol", by.y = "gene", all.x = T)
# If a gene does not have a pLI, give it a score of 0
scoring.table$pLI[is.na(scoring.table$pLI)] <- 0
# Adding pLI to the total score
score.pLI <- scoring.table$pLI + scoring.table$score
scoring.table$score.pLI <- round(score.pLI,3)
We now want to write the scoring table out. There is code contained in here which will write out each locus separately.
# Writing out each individual loci ordered by "score"
scoring.table$locus <- as.factor(scoring.table$locus)
locus.levels <- levels(scoring.table$locus)
#Results.folder <- "/Volumes/PAULHOOK/Revision/New.scoring"
#for(i in 1:length(locus.levels)){
#x <- scoring.table[scoring.table$locus == locus.levels[i],]
#y <- x[!(duplicated(x$HumanSymbol)),]
#z <- y[order(y$score, decreasing = T),]
#write.table(z, file = paste0(Results.folder,".",locus.levels[i],".locus-new.txt"), row.names = F, quote = F, sep = "\t")
#}
# Writing out the entire table ordered by score
write.table(scoring.table[order(scoring.table$score.pLI, decreasing = T),],
file = file.path(file_dir,"PD.GWAS.Score.Final.txt"),
row.names = F, quote = F, sep = "\t")
We now want to write out a summary table that summarizes each locus.
# Group by locus and count how many genes are in each locus and how many are expressed in either
sum.table <- scoring.table %>%
group_by(locus) %>%
dplyr::summarize(num_genes = length(locus),
num_expressed_either = sum(lin.exprs == 1 | expression == 1),
num_expressed_both = sum(lin.exprs == 1 & expression == 1),
num_homolog = sum(!is.na(MouseSymbol)),
num_no_homolog = sum(is.na(MouseSymbol)))
## Warning: package 'bindrcpp' was built under R version 3.3.2
# Writing out summary table
write.table(sum.table, file = file.path(file_dir, "PD-GWAS-Summary-Table.txt"),
row.names = F, quote = F, sep = "\t")
# Getting summary stats
#sum(sum.table$num_genes)
#sum(sum.table$num_homolog)
#sum(sum.table$num_no_homolog)
#sum(sum.table$num_expressed_both)
#sum(sum.table$num_expressed_either)
#nrow(scoring.table[is.na(scoring.table$MouseSymbol),]) #462
#nrow(scoring.table[is.na(scoring.table$MouseSymbol) & scoring.table$gene_biotype == "protein_coding",]) #87
datatable(scoring.table, rownames = F, caption = "PD GWAS gene scoring")
datatable(sum.table, rownames = F, caption = "PD GWAS summary of scoring table")
sessionInfo()
## R version 3.3.0 (2016-05-03)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.11.6 (El Capitan)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid splines stats4 parallel stats graphics grDevices
## [8] utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2 DT_0.2 ggbiplot_0.55
## [4] scales_0.5.0 SC3_1.1.4 ROCR_1.0-7
## [7] jackstraw_1.1.1 lfa_1.2.2 tsne_0.1-3
## [10] gridExtra_2.3 slackr_1.4.2 vegan_2.4-4
## [13] permute_0.9-4 MASS_7.3-47 gplots_3.0.1
## [16] RColorBrewer_1.1-2 Hmisc_4.0-3 Formula_1.2-2
## [19] survival_2.41-3 lattice_0.20-35 Heatplus_2.18.0
## [22] Rtsne_0.13 pheatmap_1.0.8 tidyr_0.7.1
## [25] dplyr_0.7.4 plyr_1.8.4 heatmap.plus_1.3
## [28] stringr_1.2.0 marray_1.50.0 limma_3.28.21
## [31] reshape2_1.4.3 monocle_2.2.0 DDRTree_0.1.5
## [34] irlba_2.2.1 VGAM_1.0-2 ggplot2_2.2.1
## [37] Biobase_2.32.0 BiocGenerics_0.18.0 Matrix_1.2-11
##
## loaded via a namespace (and not attached):
## [1] RSelenium_1.7.1 colorspace_1.3-2 class_7.3-14
## [4] rprojroot_1.2 htmlTable_1.9 corpcor_1.6.9
## [7] base64enc_0.1-3 mvtnorm_1.0-6 codetools_0.2-15
## [10] doParallel_1.0.11 robustbase_0.92-7 knitr_1.17
## [13] jsonlite_1.5 cluster_2.0.6 semver_0.2.0
## [16] shiny_1.0.5 rrcov_1.4-3 httr_1.3.1
## [19] backports_1.1.1 assertthat_0.2.0 lazyeval_0.2.1
## [22] acepack_1.4.1 htmltools_0.3.6 tools_3.3.0
## [25] igraph_1.1.2 gtable_0.2.0 glue_1.1.1
## [28] binman_0.1.0 doRNG_1.6.6 Rcpp_0.12.14
## [31] slam_0.1-37 gdata_2.18.0 nlme_3.1-131
## [34] iterators_1.0.8 mime_0.5 rngtools_1.2.4
## [37] gtools_3.5.0 WriteXLS_4.0.0 XML_3.98-1.9
## [40] DEoptimR_1.0-8 yaml_2.1.15 pkgmaker_0.22
## [43] rpart_4.1-11 fastICA_1.2-1 latticeExtra_0.6-28
## [46] stringi_1.1.5 pcaPP_1.9-72 foreach_1.4.3
## [49] e1071_1.6-8 checkmate_1.8.4 caTools_1.17.1
## [52] rlang_0.1.6 pkgconfig_2.0.1 matrixStats_0.52.2
## [55] bitops_1.0-6 qlcMatrix_0.9.5 evaluate_0.10.1
## [58] purrr_0.2.4 bindr_0.1 htmlwidgets_0.9
## [61] magrittr_1.5 R6_2.2.2 combinat_0.0-8
## [64] wdman_0.2.2 foreign_0.8-69 mgcv_1.8-22
## [67] nnet_7.3-12 tibble_1.3.4 KernSmooth_2.23-15
## [70] rmarkdown_1.8 data.table_1.10.4 HSMMSingleCell_0.106.2
## [73] digest_0.6.12 xtable_1.8-2 httpuv_1.3.5
## [76] openssl_0.9.7 munsell_0.4.3 registry_0.3
```
This R Markdown site was created with workflowr