Correlation with Histone Marks

In this analysis we are going to calculate correlations of the PARP1 reads with a variety of histone mark ChIP-seq data from UCSC/ENCODE. The PARP1 reads and histone mark data has already been processed and is made available as part of the fmdatabreastcaparp1 package.

library(GenomicRanges)
library(ggplot2)
library(fmcorrelationbreastcaparp1)
library(fmdatabreastcaparp1)
library(BSgenome.Hsapiens.UCSC.hg19)

TSS Windows

data(parp1_ln4_unique)
data(parp1_ln5_unique)
data(histone_marks)
data(tss_windows)

Calculate the weighted coverage of the PARP1 mcf7 and mdamb231 sample reads, and then sum the reads in each TSS window.

mcf7_cov <- coverage(parp1_ln4_unique, weight = "n_count")
mdamb231_cov <- coverage(parp1_ln5_unique, weight = "n_count")

tss_windows <- binned_function(tss_windows, mcf7_cov, "sum", "parp1_mcf7")
tss_windows <- binned_function(tss_windows, mdamb231_cov, "sum", "parp1_mdamb231")

Get the averaged ChIP-seq peak intensity in each tss window.

for (i_name in names(histone_marks)){
  histone_cov <- coverage(histone_marks[[i_name]], weight = "mcols.signal")
  tss_windows <- binned_function(tss_windows, histone_cov, "mean_nozero", i_name)
}

Now with the Parp1 reads and histone mark signal added to the TSS's, we can start doing some correlations.

non_zero <- "both"
h3k4me3k_v_mcf7 <- subsample_nonzeros(mcols(tss_windows), c("H3k4me3_r1", "parp1_mcf7"), non_zero = non_zero, n_points = 10000)
ggplot(h3k4me3k_v_mcf7, aes(x = H3k4me3_r1, y = parp1_mcf7)) + geom_point() + scale_y_log10() + scale_x_log10()

plot of chunk graphit

cor(log10(h3k4me3k_v_mcf7[,1]+1), log10(h3k4me3k_v_mcf7[,2]+1))
## [1] 0.3960252
h3k27ac_v_mcf7 <- subsample_nonzeros(mcols(tss_windows), c("H3k27ac", "parp1_mcf7"), non_zero = non_zero, n_points = 10000)
ggplot(h3k27ac_v_mcf7, aes(x = H3k27ac, y = parp1_mcf7)) + geom_point() + scale_y_log10() + scale_x_log10()

plot of chunk graphit

cor(log10(h3k27ac_v_mcf7[,1]+1), log10(h3k27ac_v_mcf7[,2]+1))
## [1] -0.2865006

Cool. Now we are showing some promise. Let's do them all.

all_comb <- expand.grid(names(histone_marks), c("parp1_mcf7", "parp1_mdamb231"), stringsAsFactors = FALSE)
out_cor <- lapply(seq(1, nrow(all_comb)), function(i_row){
  #print(i_row)
  correlate_non_zero(mcols(tss_windows), as.character(all_comb[i_row,]), log_transform = TRUE, non_zero = non_zero, test = TRUE)
})
all_comb_names <- paste(all_comb[,1], all_comb[,2], sep = "_v_")
out_cor <- do.call(rbind, out_cor)
rownames(out_cor) <- all_comb_names

TSS correlations:

corr_value p_value
H3k09me3_v_parp1_mcf7 -0.2667278 0
H3k27ac_v_parp1_mcf7 -0.3056719 0
H3k27me3_v_parp1_mcf7 -0.3466788 0
H3k36me3_v_parp1_mcf7 -0.3065474 0
H3k4me3_r1_v_parp1_mcf7 0.3869864 0
H3k4me3_r2_v_parp1_mcf7 0.3607083 0
H3k09me3_v_parp1_mdamb231 -0.1674508 0
H3k27ac_v_parp1_mdamb231 -0.1899501 0
H3k27me3_v_parp1_mdamb231 -0.1133071 0
H3k36me3_v_parp1_mdamb231 -0.2365975 0
H3k4me3_r1_v_parp1_mdamb231 0.3623120 0
H3k4me3_r2_v_parp1_mdamb231 0.3497517 0
out_graphs <- lapply(seq(1, nrow(all_comb)), function(i_row){
  use_vars <- as.character(all_comb[i_row,])
  subpoints <- subsample_nonzeros(mcols(tss_windows), use_vars, non_zero = non_zero, n_points = 10000)
  ggplot(subpoints, aes_string(x = use_vars[1], y = use_vars[2])) + geom_point() + scale_y_log10() + scale_x_log10()
})
out_graphs
## [[1]]

plot of chunk graphs

## 
## [[2]]

plot of chunk graphs

## 
## [[3]]

plot of chunk graphs

## 
## [[4]]

plot of chunk graphs

## 
## [[5]]

plot of chunk graphs

## 
## [[6]]

plot of chunk graphs

## 
## [[7]]

plot of chunk graphs

## 
## [[8]]

plot of chunk graphs

## 
## [[9]]

plot of chunk graphs

## 
## [[10]]

plot of chunk graphs

## 
## [[11]]

plot of chunk graphs

## 
## [[12]]

plot of chunk graphs

Genome Wide Comparison

Are these correlations a result of association with the TSS's? One way to test this is to set up a calculation genome-wide.

genome_tiles <- tileGenome(seqinfo(Hsapiens), tilewidth = 2000, cut.last.tile.in.chrom = TRUE)

genome_tiles <- binned_function(genome_tiles, mcf7_cov, "sum", "parp1_mcf7")
genome_tiles <- binned_function(genome_tiles, mdamb231_cov, "sum", "parp1_mdamb231")
for (i_name in names(histone_marks)){
  histone_cov <- coverage(histone_marks[[i_name]], weight = "mcols.signal")
  genome_tiles <- binned_function(genome_tiles, histone_cov, "mean_nozero", i_name)
}
genome_h3k4me3k_v_mcf7 <- subsample_nonzeros(mcols(genome_tiles), c("H3k4me3_r1", "parp1_mcf7"), non_zero = non_zero, n_points = 10000)
ggplot(genome_h3k4me3k_v_mcf7, aes(x = H3k4me3_r1, y = parp1_mcf7)) + scale_x_log10() + scale_y_log10() + geom_point()

plot of chunk genome_graph_test

cor(log(genome_h3k4me3k_v_mcf7[,1]+1), log(genome_h3k4me3k_v_mcf7[,2]+1))
## [1] 0.1492715
genome_cor <- lapply(seq(1, nrow(all_comb)), function(i_row){
  #print(i_row)
  correlate_non_zero(mcols(genome_tiles), as.character(all_comb[i_row,]), log_transform = TRUE, non_zero = non_zero, test = TRUE)
})
all_comb_names <- paste(all_comb[,1], all_comb[,2], sep = "_v_")
genome_cor <- do.call(rbind, genome_cor)
rownames(genome_cor) <- all_comb_names

Genome wide correlations:

corr_value p_value
H3k09me3_v_parp1_mcf7 -0.4098493 0
H3k27ac_v_parp1_mcf7 -0.4011359 0
H3k27me3_v_parp1_mcf7 -0.4200107 0
H3k36me3_v_parp1_mcf7 -0.3882289 0
H3k4me3_r1_v_parp1_mcf7 0.1566817 0
H3k4me3_r2_v_parp1_mcf7 0.1180702 0
H3k09me3_v_parp1_mdamb231 -0.2499175 0
H3k27ac_v_parp1_mdamb231 -0.2573392 0
H3k27me3_v_parp1_mdamb231 -0.1890674 0
H3k36me3_v_parp1_mdamb231 -0.2859072 0
H3k4me3_r1_v_parp1_mdamb231 0.2524751 0
H3k4me3_r2_v_parp1_mdamb231 0.2344856 0

We will save the correlation results in some plain text files.

saveloc <- "../inst/correlation_tables"
write.table(out_cor, file = file.path(saveloc, "histone_marks_tss.txt"), sep = "\t")
write.table(genome_cor, file = file.path(saveloc, "histone_marks_genome.txt"), sep = "\t")

Session Info

## [1] "2014-12-11 11:08:09 EST"
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-unknown-linux-gnu (64-bit)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] hgu133plus2.db_2.14.0               
##  [2] org.Hs.eg.db_2.14.0                 
##  [3] RSQLite_0.11.4                      
##  [4] DBI_0.3.1                           
##  [5] AnnotationDbi_1.26.1                
##  [6] Biobase_2.24.0                      
##  [7] fmdatabreastcaparp1_0.0.1           
##  [8] fmcorrelationbreastcaparp1_0.0.1    
##  [9] pracma_1.7.9                        
## [10] BSgenome.Hsapiens.UCSC.hg19_1.3.1000
## [11] BSgenome_1.32.0                     
## [12] Biostrings_2.32.1                   
## [13] XVector_0.4.0                       
## [14] ggplot2_1.0.0                       
## [15] GenomicRanges_1.16.4                
## [16] GenomeInfoDb_1.0.2                  
## [17] IRanges_1.22.10                     
## [18] BiocGenerics_0.10.0                 
## [19] devtools_1.6.1                      
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6     colorspace_1.2-4 digest_0.6.4     evaluate_0.5.5  
##  [5] formatR_1.0      grid_3.1.1       gtable_0.1.2     knitr_1.7       
##  [9] labeling_0.3     magrittr_1.0.1   markdown_0.7.4   MASS_7.3-35     
## [13] mime_0.2         munsell_0.4.2    plyr_1.8.1       proto_0.3-10    
## [17] Rcpp_0.11.3      reshape2_1.4     Rsamtools_1.16.1 scales_0.2.4    
## [21] stats4_3.1.1     stringr_0.6.2    tools_3.1.1      zlibbioc_1.10.0