Last update: 2018-01-18
Code version: 5996c19850dc5fc65723f30bc33ab1c8d083a7e6
seq_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/data"
file_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/output"
Rdata_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/data"
Script_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/code"
source(file.path(Script_dir,'init.R'))
source(file.path(Script_dir,"tools_R.r"))
# Loading additional libraries. This may be redundant
library(biomaRt)
library(GenomicRanges)
library(stringr)
library(dplyr)
library(tidyr)
library(data.table)
library(DT)
In order to find all the genes in our specified search regions, we first need to make sure we have all the PD GWAS SNP coordinates from Nalls (2014) and Chang (2017) in the same genome. In our case the human genome version we are using is GRCh38.
# Loading a file that contains all the lead SNPs
dat <- read.delim(file = file.path(Rdata_dir,"new.PD.loci.txt"), header = T)
# Using GRCh38 - Ensembl v87
snpmart<-useEnsembl(biomart = "snp",dataset = 'hsapiens_snp', version = 87)
#Obtaining all the SNP attributes we want
snpPos<-getBM(attributes = c('refsnp_id','chr_name','chrom_start','chrom_end'),
filters = c('snp_filter'),
values = dat$Lead.SNP,
mart = snpmart)
# Removing duplicated SNPs
snpPos<-snpPos[!duplicated(snpPos$refsnp_id),]
# Changing column names and order
names(snpPos) <- c('snp','chr','start','end')
snpPos <- snpPos[,c(2,3,4,1)]
# Writing out a bed file and making an interactive table
write.table(snpPos, file = file.path(file_dir,"all.PD.snps.bed"), sep = "\t", row.names = F, col.names = F, quote = F)
datatable(snpPos,rownames = F, caption = "PD GWAS SNPs")
Now that we have a BED file containing PD GWAS SNP locations, we can now start searching for genes. The first “range” that we wanted to extract genes from was +/- 1 Mb. This represents a conservative genomic range that would encompass the longest reported enhancer-promoter interaction
# Copying over the data
mega.snp <- snpPos
# Adding +/- 1 Mb
mega.snp$upstream <- mega.snp$start-1e+06
mega.snp$downstream <- mega.snp$end+1e+06
# Adding chromosome coordinates
mega.snp$coordinates <- paste0(mega.snp$chr,":",mega.snp$upstream,":", mega.snp$downstream)
# Using biomart to find genes
ensembl<-useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", version = 87)
# From https://www.biostars.org/p/167818/
mega.coords.list <- as.list(mega.snp$coordinates)
# Finding all genes in the intervials in GRCh38, ensembl version 87
mega.results <- data.frame()
for(i in 1:length(mega.coords.list)){
results<-getBM(attributes = c("hgnc_symbol", "chromosome_name", "start_position",
"end_position","gene_biotype"),
filters = c("chromosomal_region"),
values = list(chromosomal_region=mega.coords.list[i]),
mart = ensembl)
results$snp <- mega.snp$snp[i]
mega.results <- rbind(mega.results,results)
}
# Removing all human genes with no official symbol
mega.results <- mega.results[!(mega.results$hgnc_symbol == ""),]
# Checking that all the gene names are unique to a locus
summary(duplicated(mega.results[,c(1,6)]))
## Mode FALSE TRUE NA's
## logical 1692 2 0
# Some are duplicated by that may be becuase there are overlaps between the loci
# Removing duplicated gene-locus
mega.results.final <- mega.results[!(duplicated(mega.results[,c(1,6)])),]
summary(duplicated(mega.results.final[,c(1,6)]))
## Mode FALSE NA's
## logical 1692 0
head(mega.results.final)
## hgnc_symbol chromosome_name start_position end_position
## 1 RNU5A-5P 1 231670635 231670750
## 4 MAP10 1 232804892 232808407
## 6 SIPA1L2 1 232397965 232561558
## 7 RN7SL299P 1 232222866 232223145
## 8 NTPCR 1 232950605 232983882
## 10 RPS7P3 1 233288868 233289447
## gene_biotype snp
## 1 snRNA rs10797576
## 4 protein_coding rs10797576
## 6 protein_coding rs10797576
## 7 misc_RNA rs10797576
## 8 protein_coding rs10797576
## 10 processed_pseudogene rs10797576
The second “range” that we wanted to extract genes from was from within TADs. This range was chosen becasue it has been shown that enhancer-promoter interactions preferentially occur in TADs. We used TADs from human ESCs (Dixon, 2012). These TADs are in hg18, so the coordinates were lifted over to GRCh38.
# Load data and change column names
hESC.GRCh38.domains <-read.delim(file = file.path(Rdata_dir,"Dixon.hESC.combined.domains.hg38.sorted.bed"), header = F)
colnames(hESC.GRCh38.domains) <- c("chr","start","end")
# Make both the domain bed and the snp bed in to GRanges objects so they can be intersected
domains.bed <- with(hESC.GRCh38.domains, GRanges(chr, IRanges(start+1, end)))
snp.bed <- with(snpPos, GRanges(paste0("chr",chr), IRanges(start,end, names = snp)), snp = snp)
# Finding overlaps between the two
overlaps <- findOverlaps(snp.bed, domains.bed, ignore.strand = T, type = "any")
# Matching SNPs in GRCh38 TADs
match_hit <- data.frame(snp.bed[queryHits(overlaps),],
domains.bed[subjectHits(overlaps),],
names(snp.bed)[queryHits(overlaps)],
stringsAsFactors=F
)
# Looking at which SNPs are not in TADs in GRCh38
names(snp.bed)[!(names(snp.bed) %in% names(snp.bed)[queryHits(overlaps)])]
## [1] "rs11060180" "rs12637471" "rs143918452" "rs199347" "rs2694528"
## [6] "rs2740594" "rs601999" "rs76904798"
# Making a bedfile for TADs containing SNPs
snp.tads <- match_hit %>%
dplyr::select(seqnames.1, start.1, end.1, width.1, names.snp.bed..queryHits.overlaps..) %>%
rename(seqnames.1 = "chr",
start.1 = "start",
end.1 = "end",
width.1 = "tad.length",
names.snp.bed..queryHits.overlaps.. = "snp")
# Writing out the table
write.table(snp.tads[,c(1,2,3,5)], file = file.path(file_dir,"pd.gwas.snp.tads.bed"),sep = "\t", row.names = F, col.names = F, quote = F)
head(snp.tads[,c(1,2,3,5)])
## chr start end snp
## 1 chr1 232117632 232637631 rs10797576
## 2 chr10 15397996 15717995 rs10906923
## 3 chr14 54503533 54903532 rs11158026
## 4 chr16 18761178 19521177 rs11343
## 5 chr3 87268161 88068160 rs115185635
## 6 chr4 15669280 16269279 rs11724635
Now that we have all the TAD boundaries surrounding SNPs, we can now extract all the genes falling within those boundaries.
# Rearranging the data and modifying it
pd.tads <- snp.tads[,c(1,2,3,5)]
names(pd.tads) <- c("chr","start","end","snp")
pd.tads$chr <- str_extract(pd.tads$chr,"[0-9]+")
# Creating coordinates
pd.tads$coordinates <- paste0(pd.tads$chr, ":",pd.tads$start,":", pd.tads$end)
# From https://www.biostars.org/p/167818/
tad.coords.list <- as.list(pd.tads$coordinates)
# Finding all genes in the intervials in CRCh38, ensembl version 87
ensembl<-useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", version = 87)
tad.results <- data.frame()
for(i in 1:length(tad.coords.list)){
results<-getBM(attributes = c("hgnc_symbol", "chromosome_name", "start_position",
"end_position","gene_biotype"),
filters = c("chromosomal_region"),
values = list(chromosomal_region=tad.coords.list[i]),
mart = ensembl)
results$snp <- pd.tads$snp[i]
tad.results <- rbind(tad.results,results)
}
# Removing all human genes with no official symbol
tad.results <- tad.results[!(tad.results$hgnc_symbol == ""),]
# Checking that all the gene names and snps are unique
summary(duplicated(tad.results[,c(1,6)]))
## Mode FALSE NA's
## logical 634 0
# no duplicates
Now that we have genes within Mb and TAD boundaries, we can now merge those and only keep the unique gene-loci
# Combine and only keep unique rows
all.results <- unique(rbind(mega.results, tad.results))
# Check for duplicated genes/snps
summary(duplicated(all.results[,c(1,6)])) #2 for whatever reason
## Mode FALSE TRUE NA's
## logical 1751 2 0
all.results.dedup <- all.results[!(duplicated(all.results[,c(1,6)])),]
summary(duplicated(all.results.dedup[,c(1,6)]))
## Mode FALSE NA's
## logical 1751 0
# Merge with locus names so that each gene is easy to track
PD.loci <- read.delim(file = file.path(Rdata_dir,"new.PD.loci.txt"))
all.results.final <- merge(x = all.results.dedup, y = PD.loci, by.x = "snp", by.y = "Lead.SNP") %>%
dplyr::select(hgnc_symbol, chromosome_name, snp, Locus, gene_biotype)
# Rename columns
names(all.results.final) <- c("HumanSymbol","chr", 'snp', 'locus', 'gene_biotype')
# Check for duplicates
summary(duplicated(all.results.final[,c(1,4)]))
## Mode FALSE NA's
## logical 1751 0
We have a dataframe that contains all the human genes within PD GWAS loci. We now need to match those human genes their mouse homologs, so we can use mouse data to score them. While homolog data is not perfect, we had to choose a database to use. We decided to the MGI database. This database was downloaded on 07-07-2017.
# Loading data
homolog.table <- read.delim(file = file.path(Rdata_dir,"HOM_MouseHumanSequence.rpt")) %>%
dplyr::select(HomoloGene.ID, NCBI.Taxon.ID,Symbol)
# Extracting just human data and just mouse data
human <- homolog.table[homolog.table$NCBI.Taxon.ID == "9606",]
mice <- homolog.table[homolog.table$NCBI.Taxon.ID == "10090",]
# Merging the human and mouse data based on the Homologene.ID
homolog.table.final <- merge(human,mice,by = "HomoloGene.ID",all.x = T) %>%
dplyr::select(Symbol.x, Symbol.y) %>%
dplyr::rename(HumanSymbol = Symbol.x, MouseSymbol = Symbol.y)
# check for duplicate rows
summary(duplicated(homolog.table.final))
## Mode FALSE TRUE NA's
## logical 19607 1 0
homolog.table.final <- homolog.table.final[!(duplicated(homolog.table.final)),]
# check for duplicate human genes (only want 1 to 1 homologs)
summary(duplicated(homolog.table.final$HumanSymbol))
## Mode FALSE TRUE NA's
## logical 19123 484 0
# There are duplicated human genes. We want to get rid of genes that don't have 1-to-1 mouse homologs.
idx.all <- duplicated(homolog.table.final$HumanSymbol) | duplicated(homolog.table.final$HumanSymbol, fromLast = TRUE)
small.df <- homolog.table.final[idx.all,]
nrow(small.df[order(small.df$HumanSymbol),])
## [1] 723
# We want to get rid of genes with without 1-to-1 mouse homologs so duplicated human genes should be thrown out
good.genes <- homolog.table.final[!(idx.all),]
summary(duplicated(good.genes))
## Mode FALSE NA's
## logical 18884 0
summary(duplicated(good.genes$HumanSymbol))
## Mode FALSE NA's
## logical 18884 0
# Now we want to see if there are any duplicated mouse genes
summary(duplicated(good.genes$MouseSymbol))
## Mode FALSE TRUE NA's
## logical 16530 2354 0
idx.mouse <- duplicated(good.genes$MouseSymbol) | duplicated(good.genes$MouseSymbol, fromLast = TRUE)
small.df <- good.genes[idx.mouse,]
nrow(small.df[order(small.df$HumanSymbol),])
## [1] 2415
# We want to remove these genes becasue they do not have 1-to-1 mouse-human homologs
one.to.one.genes <- good.genes[!(idx.mouse),]
# Checking for one to one matchups
summary(duplicated(one.to.one.genes))
## Mode FALSE NA's
## logical 16469 0
summary(duplicated(one.to.one.genes$MouseSymbol))
## Mode FALSE NA's
## logical 16469 0
summary(duplicated(one.to.one.genes$HumanSymbol))
## Mode FALSE NA's
## logical 16469 0
# Writing out the table
write.table(one.to.one.genes, file = file.path(file_dir,"MGI_one-to-one.bed"),sep = "\t", row.names = F, col.names = F, quote = F)
head(one.to.one.genes)
## HumanSymbol MouseSymbol
## 1 ACADM Acadm
## 2 ACADVL Acadvl
## 3 ACAT1 Acat1
## 4 ACVR1 Acvr1
## 5 SGCA Sgca
## 6 ADSL Adsl
We now have 1) the genes contained within our designated ranges and 2) a table with all the one-to-one mouse-human homologs from MGI. We will merge these tables.
# Merging based on the human symbol of the gene
final.table <- unique(merge(all.results.final, one.to.one.genes, by = "HumanSymbol", all.x = T))
# Checking to look for duplicated rows
summary(duplicated(final.table))
## Mode FALSE NA's
## logical 1751 0
# Looking at all the genes that are designated as "protein_coding" but do not have a mouse homolog
head(final.table[final.table$gene_biotype == "protein_coding" & is.na(final.table$MouseSymbol),]$HumanSymbol)
## [1] "ABHD14A-ACY1" "AHSP" "ANK2" "AQP10"
## [5] "ARIH2OS" "ARL17A"
# Protein coding genes with no apparent mouse homolog were manually curated using MGI website
manual.table <- read.delim(file = file.path(Rdata_dir,"no-homologs-manual-new.txt"))
manual.table$MouseSymbol <- gsub('\\s+', '', manual.table$MouseSymbol)
manual.table$HumanSymbol <- gsub('\\s+', '', manual.table$HumanSymbol)
# no.homolog table
no.homolog.protein <- final.table[final.table$gene_biotype == "protein_coding" & is.na(final.table$MouseSymbol),]
# homolog table
homolog.table <- final.table[!(final.table$gene_biotype == "protein_coding" & is.na(final.table$MouseSymbol)),]
# merging no.homolog table with maunally curated table
no.homolog.table <- merge(x = no.homolog.protein, y = manual.table, by = "HumanSymbol", all.x = T) %>%
dplyr::rename(MouseSymbol = MouseSymbol.y) %>%
dplyr::select(HumanSymbol, chr, snp, locus, gene_biotype, MouseSymbol)
# Remaking the final table
all.final.table <- rbind(homolog.table, no.homolog.table)
nrow(all.final.table) == nrow(final.table) # TRUE
## [1] TRUE
# Checking for duplicates
summary(duplicated(all.final.table[,c(1,4)]))
## Mode FALSE NA's
## logical 1751 0
# writing out table
write.table(all.final.table, file = file.path(file_dir,"PD.loci.genes.final-new.txt"), quote = F, sep = "\t", row.names = F)
This is an interactive table that displays all the genes in our PD GWAS loci with their associated mouse homologs.
datatable(all.final.table, rownames = F, caption = "PD GWAS loci genes")
sessionInfo()
## R version 3.3.0 (2016-05-03)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.11.6 (El Capitan)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid splines stats4 parallel stats graphics grDevices
## [8] utils datasets methods base
##
## other attached packages:
## [1] DT_0.2 data.table_1.10.4 GenomicRanges_1.24.3
## [4] GenomeInfoDb_1.8.7 IRanges_2.6.1 S4Vectors_0.10.3
## [7] biomaRt_2.28.0 ggbiplot_0.55 scales_0.5.0
## [10] SC3_1.1.4 ROCR_1.0-7 jackstraw_1.1.1
## [13] lfa_1.2.2 tsne_0.1-3 gridExtra_2.3
## [16] slackr_1.4.2 vegan_2.4-4 permute_0.9-4
## [19] MASS_7.3-47 gplots_3.0.1 RColorBrewer_1.1-2
## [22] Hmisc_4.0-3 Formula_1.2-2 survival_2.41-3
## [25] lattice_0.20-35 Heatplus_2.18.0 Rtsne_0.13
## [28] pheatmap_1.0.8 tidyr_0.7.1 dplyr_0.7.4
## [31] plyr_1.8.4 heatmap.plus_1.3 stringr_1.2.0
## [34] marray_1.50.0 limma_3.28.21 reshape2_1.4.3
## [37] monocle_2.2.0 DDRTree_0.1.5 irlba_2.2.1
## [40] VGAM_1.0-2 ggplot2_2.2.1 Biobase_2.32.0
## [43] BiocGenerics_0.18.0 Matrix_1.2-11
##
## loaded via a namespace (and not attached):
## [1] RSelenium_1.7.1 colorspace_1.3-2 class_7.3-14
## [4] rprojroot_1.2 htmlTable_1.9 XVector_0.12.1
## [7] corpcor_1.6.9 base64enc_0.1-3 bit64_0.9-7
## [10] AnnotationDbi_1.34.4 mvtnorm_1.0-6 codetools_0.2-15
## [13] doParallel_1.0.11 robustbase_0.92-7 knitr_1.17
## [16] jsonlite_1.5 cluster_2.0.6 semver_0.2.0
## [19] shiny_1.0.5 rrcov_1.4-3 httr_1.3.1
## [22] backports_1.1.1 assertthat_0.2.0 lazyeval_0.2.1
## [25] acepack_1.4.1 htmltools_0.3.6 tools_3.3.0
## [28] bindrcpp_0.2 igraph_1.1.2 gtable_0.2.0
## [31] glue_1.1.1 binman_0.1.0 doRNG_1.6.6
## [34] Rcpp_0.12.14 slam_0.1-37 gdata_2.18.0
## [37] nlme_3.1-131 iterators_1.0.8 mime_0.5
## [40] rngtools_1.2.4 gtools_3.5.0 WriteXLS_4.0.0
## [43] XML_3.98-1.9 DEoptimR_1.0-8 zlibbioc_1.18.0
## [46] yaml_2.1.15 memoise_1.1.0 pkgmaker_0.22
## [49] rpart_4.1-11 RSQLite_2.0 fastICA_1.2-1
## [52] latticeExtra_0.6-28 stringi_1.1.5 pcaPP_1.9-72
## [55] foreach_1.4.3 e1071_1.6-8 checkmate_1.8.4
## [58] caTools_1.17.1 rlang_0.1.6 pkgconfig_2.0.1
## [61] matrixStats_0.52.2 bitops_1.0-6 qlcMatrix_0.9.5
## [64] evaluate_0.10.1 purrr_0.2.4 bindr_0.1
## [67] htmlwidgets_0.9 bit_1.1-12 magrittr_1.5
## [70] R6_2.2.2 combinat_0.0-8 DBI_0.7
## [73] wdman_0.2.2 foreign_0.8-69 mgcv_1.8-22
## [76] RCurl_1.95-4.9 nnet_7.3-12 tibble_1.3.4
## [79] KernSmooth_2.23-15 rmarkdown_1.8 blob_1.1.0
## [82] HSMMSingleCell_0.106.2 digest_0.6.12 xtable_1.8-2
## [85] httpuv_1.3.5 openssl_0.9.7 munsell_0.4.3
## [88] registry_0.3
This R Markdown site was created with workflowr