Finding all genes within PD loci

Last update: 2018-01-18

Code version: 5996c19850dc5fc65723f30bc33ab1c8d083a7e6

Setting important directories. Also loading important libraries and custom functions for analysis.

seq_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/data"
file_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/output"
Rdata_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/data"
Script_dir <- "/Volumes/PAULHOOK/sc-da-parkinsons/code"
source(file.path(Script_dir,'init.R'))
source(file.path(Script_dir,"tools_R.r"))

# Loading additional libraries. This may be redundant
library(biomaRt)
library(GenomicRanges)
library(stringr)
library(dplyr)
library(tidyr)
library(data.table)
library(DT)

Obtaining SNP coordinates

In order to find all the genes in our specified search regions, we first need to make sure we have all the PD GWAS SNP coordinates from Nalls (2014) and Chang (2017) in the same genome. In our case the human genome version we are using is GRCh38.

# Loading a file that contains all the lead SNPs
dat <- read.delim(file = file.path(Rdata_dir,"new.PD.loci.txt"), header = T)

# Using GRCh38 - Ensembl v87
snpmart<-useEnsembl(biomart = "snp",dataset = 'hsapiens_snp', version = 87)

#Obtaining all the SNP attributes we want
snpPos<-getBM(attributes = c('refsnp_id','chr_name','chrom_start','chrom_end'), 
      filters = c('snp_filter'), 
      values = dat$Lead.SNP,
      mart = snpmart)

# Removing duplicated SNPs
snpPos<-snpPos[!duplicated(snpPos$refsnp_id),]

# Changing column names and order
names(snpPos) <- c('snp','chr','start','end')
snpPos <- snpPos[,c(2,3,4,1)]

# Writing out a bed file and making an interactive table
write.table(snpPos, file = file.path(file_dir,"all.PD.snps.bed"), sep = "\t", row.names = F, col.names = F, quote = F)

datatable(snpPos,rownames = F, caption = "PD GWAS SNPs")

Finding all genes within +/- 1 Mb

Now that we have a BED file containing PD GWAS SNP locations, we can now start searching for genes. The first “range” that we wanted to extract genes from was +/- 1 Mb. This represents a conservative genomic range that would encompass the longest reported enhancer-promoter interaction

# Copying over the data
mega.snp <- snpPos

# Adding +/- 1 Mb
mega.snp$upstream <- mega.snp$start-1e+06
mega.snp$downstream <- mega.snp$end+1e+06

# Adding chromosome coordinates
mega.snp$coordinates <- paste0(mega.snp$chr,":",mega.snp$upstream,":", mega.snp$downstream)

# Using biomart to find genes
ensembl<-useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", version = 87)

# From https://www.biostars.org/p/167818/
mega.coords.list <- as.list(mega.snp$coordinates)

# Finding all genes in the intervials in GRCh38, ensembl version 87
mega.results <- data.frame()
for(i in 1:length(mega.coords.list)){
  results<-getBM(attributes = c("hgnc_symbol", "chromosome_name", "start_position",
                                "end_position","gene_biotype"),
                 filters = c("chromosomal_region"),
                 values = list(chromosomal_region=mega.coords.list[i]), 
                 mart = ensembl)
  results$snp <- mega.snp$snp[i]
  mega.results <- rbind(mega.results,results)
}

# Removing all human genes with no official symbol
mega.results <- mega.results[!(mega.results$hgnc_symbol == ""),]

# Checking  that all the gene names are unique to a locus
summary(duplicated(mega.results[,c(1,6)]))

##    Mode   FALSE    TRUE    NA's 
## logical    1692       2       0

# Some are duplicated by that may be becuase there are overlaps between the loci

# Removing duplicated gene-locus
mega.results.final <- mega.results[!(duplicated(mega.results[,c(1,6)])),] 
summary(duplicated(mega.results.final[,c(1,6)]))

##    Mode   FALSE    NA's 
## logical    1692       0

head(mega.results.final)

##    hgnc_symbol chromosome_name start_position end_position
## 1     RNU5A-5P               1      231670635    231670750
## 4        MAP10               1      232804892    232808407
## 6      SIPA1L2               1      232397965    232561558
## 7    RN7SL299P               1      232222866    232223145
## 8        NTPCR               1      232950605    232983882
## 10      RPS7P3               1      233288868    233289447
##            gene_biotype        snp
## 1                 snRNA rs10797576
## 4        protein_coding rs10797576
## 6        protein_coding rs10797576
## 7              misc_RNA rs10797576
## 8        protein_coding rs10797576
## 10 processed_pseudogene rs10797576

Intersecting SNPs with TADs

The second “range” that we wanted to extract genes from was from within TADs. This range was chosen becasue it has been shown that enhancer-promoter interactions preferentially occur in TADs. We used TADs from human ESCs (Dixon, 2012). These TADs are in hg18, so the coordinates were lifted over to GRCh38.

# Load data and change column names
hESC.GRCh38.domains <-read.delim(file = file.path(Rdata_dir,"Dixon.hESC.combined.domains.hg38.sorted.bed"), header = F)
colnames(hESC.GRCh38.domains) <- c("chr","start","end")

# Make both the domain bed and the snp bed in to GRanges objects so they can be intersected
domains.bed <- with(hESC.GRCh38.domains, GRanges(chr, IRanges(start+1, end)))
snp.bed <- with(snpPos, GRanges(paste0("chr",chr), IRanges(start,end, names = snp)), snp = snp)

# Finding overlaps between the two
overlaps <- findOverlaps(snp.bed, domains.bed, ignore.strand = T, type = "any")

# Matching SNPs in GRCh38 TADs
match_hit <- data.frame(snp.bed[queryHits(overlaps),],
                        domains.bed[subjectHits(overlaps),],
                        names(snp.bed)[queryHits(overlaps)],
                        stringsAsFactors=F
                        )

# Looking at which SNPs are not in TADs in GRCh38
names(snp.bed)[!(names(snp.bed) %in% names(snp.bed)[queryHits(overlaps)])]

## [1] "rs11060180"  "rs12637471"  "rs143918452" "rs199347"    "rs2694528"  
## [6] "rs2740594"   "rs601999"    "rs76904798"

# Making a bedfile for TADs containing SNPs
snp.tads <- match_hit %>%
  dplyr::select(seqnames.1, start.1, end.1, width.1, names.snp.bed..queryHits.overlaps..) %>%
  rename(seqnames.1 = "chr",
         start.1 = "start",
         end.1 = "end",
         width.1 = "tad.length",
         names.snp.bed..queryHits.overlaps.. = "snp")

# Writing out the table
write.table(snp.tads[,c(1,2,3,5)], file = file.path(file_dir,"pd.gwas.snp.tads.bed"),sep = "\t", row.names = F, col.names = F, quote = F)

head(snp.tads[,c(1,2,3,5)])

##     chr     start       end         snp
## 1  chr1 232117632 232637631  rs10797576
## 2 chr10  15397996  15717995  rs10906923
## 3 chr14  54503533  54903532  rs11158026
## 4 chr16  18761178  19521177     rs11343
## 5  chr3  87268161  88068160 rs115185635
## 6  chr4  15669280  16269279  rs11724635

Finding all genes within TADs

Now that we have all the TAD boundaries surrounding SNPs, we can now extract all the genes falling within those boundaries.

# Rearranging the data and modifying it
pd.tads <- snp.tads[,c(1,2,3,5)]
names(pd.tads) <- c("chr","start","end","snp")
pd.tads$chr <- str_extract(pd.tads$chr,"[0-9]+")

# Creating coordinates
pd.tads$coordinates <- paste0(pd.tads$chr, ":",pd.tads$start,":", pd.tads$end)

# From https://www.biostars.org/p/167818/
tad.coords.list <- as.list(pd.tads$coordinates)

# Finding all genes in the intervials in CRCh38, ensembl version 87
ensembl<-useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", version = 87)
tad.results <- data.frame()
for(i in 1:length(tad.coords.list)){
  results<-getBM(attributes = c("hgnc_symbol", "chromosome_name", "start_position",
                                "end_position","gene_biotype"),
                 filters = c("chromosomal_region"),
                 values = list(chromosomal_region=tad.coords.list[i]), 
                 mart = ensembl)
  results$snp <- pd.tads$snp[i]
  tad.results <- rbind(tad.results,results)
}

# Removing all human genes with no official symbol
tad.results <- tad.results[!(tad.results$hgnc_symbol == ""),]

# Checking  that all the gene names and snps are unique
summary(duplicated(tad.results[,c(1,6)]))

##    Mode   FALSE    NA's 
## logical     634       0

# no duplicates

Merging Mb and TAD data

Now that we have genes within Mb and TAD boundaries, we can now merge those and only keep the unique gene-loci

# Combine and only keep unique rows
all.results <- unique(rbind(mega.results, tad.results))

# Check for duplicated genes/snps
summary(duplicated(all.results[,c(1,6)])) #2 for whatever reason

##    Mode   FALSE    TRUE    NA's 
## logical    1751       2       0

all.results.dedup <- all.results[!(duplicated(all.results[,c(1,6)])),]
summary(duplicated(all.results.dedup[,c(1,6)]))

##    Mode   FALSE    NA's 
## logical    1751       0

# Merge with locus names so that each gene is easy to track
PD.loci <- read.delim(file = file.path(Rdata_dir,"new.PD.loci.txt"))
all.results.final <- merge(x = all.results.dedup, y = PD.loci, by.x = "snp", by.y = "Lead.SNP") %>%
  dplyr::select(hgnc_symbol, chromosome_name, snp, Locus, gene_biotype)

# Rename columns
names(all.results.final) <- c("HumanSymbol","chr", 'snp', 'locus', 'gene_biotype')

# Check for duplicates
summary(duplicated(all.results.final[,c(1,4)]))

##    Mode   FALSE    NA's 
## logical    1751       0

Finding all mouse homologs

We have a dataframe that contains all the human genes within PD GWAS loci. We now need to match those human genes their mouse homologs, so we can use mouse data to score them. While homolog data is not perfect, we had to choose a database to use. We decided to the MGI database. This database was downloaded on 07-07-2017.

# Loading data
homolog.table <- read.delim(file = file.path(Rdata_dir,"HOM_MouseHumanSequence.rpt")) %>%
  dplyr::select(HomoloGene.ID, NCBI.Taxon.ID,Symbol)

# Extracting just human data and just mouse data  
human <- homolog.table[homolog.table$NCBI.Taxon.ID == "9606",]
mice <- homolog.table[homolog.table$NCBI.Taxon.ID == "10090",]

# Merging the human and mouse data based on the Homologene.ID
homolog.table.final <- merge(human,mice,by = "HomoloGene.ID",all.x = T) %>%
  dplyr::select(Symbol.x, Symbol.y) %>%
  dplyr::rename(HumanSymbol = Symbol.x, MouseSymbol = Symbol.y)

# check for duplicate rows
summary(duplicated(homolog.table.final))

##    Mode   FALSE    TRUE    NA's 
## logical   19607       1       0

homolog.table.final <- homolog.table.final[!(duplicated(homolog.table.final)),]

# check for duplicate human genes (only want 1 to 1 homologs)
summary(duplicated(homolog.table.final$HumanSymbol))

##    Mode   FALSE    TRUE    NA's 
## logical   19123     484       0

# There are duplicated human genes. We want to get rid of genes that don't have 1-to-1 mouse homologs.
idx.all <- duplicated(homolog.table.final$HumanSymbol) | duplicated(homolog.table.final$HumanSymbol, fromLast = TRUE) 
small.df <- homolog.table.final[idx.all,]
nrow(small.df[order(small.df$HumanSymbol),])

## [1] 723

# We want to get rid of genes with without 1-to-1 mouse homologs so duplicated human genes should be thrown out
good.genes <- homolog.table.final[!(idx.all),]
summary(duplicated(good.genes))

##    Mode   FALSE    NA's 
## logical   18884       0

summary(duplicated(good.genes$HumanSymbol))

##    Mode   FALSE    NA's 
## logical   18884       0

# Now we want to see if there are any duplicated mouse genes
summary(duplicated(good.genes$MouseSymbol))

##    Mode   FALSE    TRUE    NA's 
## logical   16530    2354       0

idx.mouse <- duplicated(good.genes$MouseSymbol) | duplicated(good.genes$MouseSymbol, fromLast = TRUE) 
small.df <- good.genes[idx.mouse,]
nrow(small.df[order(small.df$HumanSymbol),])

## [1] 2415

# We want to remove these genes becasue they do not have 1-to-1 mouse-human homologs
one.to.one.genes <- good.genes[!(idx.mouse),]

# Checking for one to one matchups
summary(duplicated(one.to.one.genes))

##    Mode   FALSE    NA's 
## logical   16469       0

summary(duplicated(one.to.one.genes$MouseSymbol))

##    Mode   FALSE    NA's 
## logical   16469       0

summary(duplicated(one.to.one.genes$HumanSymbol))

##    Mode   FALSE    NA's 
## logical   16469       0

# Writing out the table
write.table(one.to.one.genes, file = file.path(file_dir,"MGI_one-to-one.bed"),sep = "\t", row.names = F, col.names = F, quote = F)

head(one.to.one.genes)

##   HumanSymbol MouseSymbol
## 1       ACADM       Acadm
## 2      ACADVL      Acadvl
## 3       ACAT1       Acat1
## 4       ACVR1       Acvr1
## 5        SGCA        Sgca
## 6        ADSL        Adsl

Merging the tables

We now have 1) the genes contained within our designated ranges and 2) a table with all the one-to-one mouse-human homologs from MGI. We will merge these tables.

# Merging based on the human symbol of the gene
final.table <- unique(merge(all.results.final, one.to.one.genes, by = "HumanSymbol", all.x = T))

# Checking to look for duplicated rows
summary(duplicated(final.table))

##    Mode   FALSE    NA's 
## logical    1751       0

# Looking at all the genes that are designated as "protein_coding" but do not have a mouse homolog
head(final.table[final.table$gene_biotype == "protein_coding" & is.na(final.table$MouseSymbol),]$HumanSymbol)

## [1] "ABHD14A-ACY1" "AHSP"         "ANK2"         "AQP10"       
## [5] "ARIH2OS"      "ARL17A"

# Protein coding genes with no apparent mouse homolog were manually curated using MGI website
manual.table <- read.delim(file = file.path(Rdata_dir,"no-homologs-manual-new.txt"))
manual.table$MouseSymbol <- gsub('\\s+', '', manual.table$MouseSymbol)
manual.table$HumanSymbol <- gsub('\\s+', '', manual.table$HumanSymbol)

# no.homolog table
no.homolog.protein <- final.table[final.table$gene_biotype == "protein_coding" & is.na(final.table$MouseSymbol),]

# homolog table
homolog.table <- final.table[!(final.table$gene_biotype == "protein_coding" & is.na(final.table$MouseSymbol)),]

# merging no.homolog table with maunally curated table
no.homolog.table <- merge(x = no.homolog.protein, y = manual.table, by = "HumanSymbol", all.x = T) %>%
  dplyr::rename(MouseSymbol = MouseSymbol.y) %>%
  dplyr::select(HumanSymbol, chr, snp, locus, gene_biotype, MouseSymbol)

# Remaking the final table
all.final.table <- rbind(homolog.table, no.homolog.table)
nrow(all.final.table) == nrow(final.table) # TRUE

## [1] TRUE

# Checking for duplicates
summary(duplicated(all.final.table[,c(1,4)]))

##    Mode   FALSE    NA's 
## logical    1751       0

# writing out table
write.table(all.final.table, file = file.path(file_dir,"PD.loci.genes.final-new.txt"), quote = F, sep = "\t", row.names = F)

Making an interactive table for all PD GWAS loci genes

This is an interactive table that displays all the genes in our PD GWAS loci with their associated mouse homologs.

datatable(all.final.table, rownames = F, caption = "PD GWAS loci genes")

Session Info

sessionInfo()

## R version 3.3.0 (2016-05-03)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.11.6 (El Capitan)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      splines   stats4    parallel  stats     graphics  grDevices
##  [8] utils     datasets  methods   base     
## 
## other attached packages:
##  [1] DT_0.2               data.table_1.10.4    GenomicRanges_1.24.3
##  [4] GenomeInfoDb_1.8.7   IRanges_2.6.1        S4Vectors_0.10.3    
##  [7] biomaRt_2.28.0       ggbiplot_0.55        scales_0.5.0        
## [10] SC3_1.1.4            ROCR_1.0-7           jackstraw_1.1.1     
## [13] lfa_1.2.2            tsne_0.1-3           gridExtra_2.3       
## [16] slackr_1.4.2         vegan_2.4-4          permute_0.9-4       
## [19] MASS_7.3-47          gplots_3.0.1         RColorBrewer_1.1-2  
## [22] Hmisc_4.0-3          Formula_1.2-2        survival_2.41-3     
## [25] lattice_0.20-35      Heatplus_2.18.0      Rtsne_0.13          
## [28] pheatmap_1.0.8       tidyr_0.7.1          dplyr_0.7.4         
## [31] plyr_1.8.4           heatmap.plus_1.3     stringr_1.2.0       
## [34] marray_1.50.0        limma_3.28.21        reshape2_1.4.3      
## [37] monocle_2.2.0        DDRTree_0.1.5        irlba_2.2.1         
## [40] VGAM_1.0-2           ggplot2_2.2.1        Biobase_2.32.0      
## [43] BiocGenerics_0.18.0  Matrix_1.2-11       
## 
## loaded via a namespace (and not attached):
##  [1] RSelenium_1.7.1        colorspace_1.3-2       class_7.3-14          
##  [4] rprojroot_1.2          htmlTable_1.9          XVector_0.12.1        
##  [7] corpcor_1.6.9          base64enc_0.1-3        bit64_0.9-7           
## [10] AnnotationDbi_1.34.4   mvtnorm_1.0-6          codetools_0.2-15      
## [13] doParallel_1.0.11      robustbase_0.92-7      knitr_1.17            
## [16] jsonlite_1.5           cluster_2.0.6          semver_0.2.0          
## [19] shiny_1.0.5            rrcov_1.4-3            httr_1.3.1            
## [22] backports_1.1.1        assertthat_0.2.0       lazyeval_0.2.1        
## [25] acepack_1.4.1          htmltools_0.3.6        tools_3.3.0           
## [28] bindrcpp_0.2           igraph_1.1.2           gtable_0.2.0          
## [31] glue_1.1.1             binman_0.1.0           doRNG_1.6.6           
## [34] Rcpp_0.12.14           slam_0.1-37            gdata_2.18.0          
## [37] nlme_3.1-131           iterators_1.0.8        mime_0.5              
## [40] rngtools_1.2.4         gtools_3.5.0           WriteXLS_4.0.0        
## [43] XML_3.98-1.9           DEoptimR_1.0-8         zlibbioc_1.18.0       
## [46] yaml_2.1.15            memoise_1.1.0          pkgmaker_0.22         
## [49] rpart_4.1-11           RSQLite_2.0            fastICA_1.2-1         
## [52] latticeExtra_0.6-28    stringi_1.1.5          pcaPP_1.9-72          
## [55] foreach_1.4.3          e1071_1.6-8            checkmate_1.8.4       
## [58] caTools_1.17.1         rlang_0.1.6            pkgconfig_2.0.1       
## [61] matrixStats_0.52.2     bitops_1.0-6           qlcMatrix_0.9.5       
## [64] evaluate_0.10.1        purrr_0.2.4            bindr_0.1             
## [67] htmlwidgets_0.9        bit_1.1-12             magrittr_1.5          
## [70] R6_2.2.2               combinat_0.0-8         DBI_0.7               
## [73] wdman_0.2.2            foreign_0.8-69         mgcv_1.8-22           
## [76] RCurl_1.95-4.9         nnet_7.3-12            tibble_1.3.4          
## [79] KernSmooth_2.23-15     rmarkdown_1.8          blob_1.1.0            
## [82] HSMMSingleCell_0.106.2 digest_0.6.12          xtable_1.8-2          
## [85] httpuv_1.3.5           openssl_0.9.7          munsell_0.4.3         
## [88] registry_0.3

This R Markdown site was created with workflowr