In this analysis we are going to calculate correlations of the PARP1 reads with the methylated reads from UCSC/ENCODE. The PARP1 reads and methylation read data has already been processed and is made available as part of the fmdatabreastcaparp1 package.
library(GenomicRanges)
library(ggplot2)
library(fmcorrelationbreastcaparp1)
library(fmdatabreastcaparp1)
library(BSgenome.Hsapiens.UCSC.hg19)
data(parp1_ln4_unique)
data(parp1_ln5_unique)
data(methyl_rep1)
data(methyl_rep2)
data(tss_windows)
Calculate the weighted coverage of the ln4 and ln5 sample reads, and then sum the reads in each TSS window.
mcf7_cov <- coverage(parp1_ln4_unique, weight = "n_count")
mdamb231_cov <- coverage(parp1_ln5_unique, weight = "n_count")
tss_windows <- binned_function(tss_windows, mcf7_cov, "sum", "parp1_mcf7")
tss_windows <- binned_function(tss_windows, mdamb231_cov, "sum", "parp1_mdamb231")
With the methylation data we want to generate a true value for the amount of methylation covering a read, by multiplying the readCount
by the percentMeth
, we should generate the methylation reads only, which is what we are interested in.
mcols(methyl_rep1)$methyl_read <- mcols(methyl_rep1)$mcols.readCount * mcols(methyl_rep1)$mcols.percentMeth / 100
mcols(methyl_rep2)$methyl_read <- mcols(methyl_rep2)$mcols.readCount * mcols(methyl_rep2)$mcols.percentMeth / 100
methyl_r1_cov <- coverage(methyl_rep1, weight = "methyl_read")
methyl_r2_cov <- coverage(methyl_rep2, weight = "methyl_read")
tss_windows <- binned_function(tss_windows, methyl_r1_cov, "sum", "methyl_r1")
tss_windows <- binned_function(tss_windows, methyl_r2_cov, "sum", "methyl_r2")
Now with the Parp1 reads and methylation reads, we can start doing some correlations.
non_zero <- "both"
Here we do a subsampling visualization and correlation to see how things look:
r1_v_mcf7 <- subsample_nonzeros(mcols(tss_windows), c("methyl_r1", "parp1_mcf7"), non_zero = non_zero, n_points = 10000)
ggplot(r1_v_mcf7, aes(x = methyl_r1, y = parp1_mcf7)) + geom_point() + scale_y_log10() + scale_x_log10()
cor(log10(r1_v_mcf7[,1]+1), log10(r1_v_mcf7[,2]+1))
## [1] -0.1657706
r2_v_mcf7 <- subsample_nonzeros(mcols(tss_windows), c("methyl_r2", "parp1_mcf7"), non_zero = non_zero, n_points = 10000)
ggplot(r2_v_mcf7, aes(x = methyl_r2, y = parp1_mcf7)) + geom_point() + scale_y_log10() + scale_x_log10()
cor(log10(r2_v_mcf7[,1]+1), log10(r2_v_mcf7[,2]+1))
## [1] -0.1523763
And now use all of the non-zero points in both:
all_comb <- expand.grid(c("methyl_r1", "methyl_r2"), c("parp1_mcf7", "parp1_mdamb231"), stringsAsFactors = FALSE)
out_cor <- lapply(seq(1, nrow(all_comb)), function(i_row){
#print(i_row)
correlate_non_zero(mcols(tss_windows), as.character(all_comb[i_row,]), log_transform = TRUE, non_zero = non_zero, test = TRUE)
})
all_comb_names <- paste(all_comb[,1], all_comb[,2], sep = "_v_")
out_cor <- do.call(rbind, out_cor)
rownames(out_cor) <- all_comb_names
TSS correlations:
corr_value | p_value | |
---|---|---|
methyl_r1_v_parp1_mcf7 | -0.1473731 | 0 |
methyl_r2_v_parp1_mcf7 | -0.1549252 | 0 |
methyl_r1_v_parp1_mdamb231 | -0.2433957 | 0 |
methyl_r2_v_parp1_mdamb231 | -0.2442113 | 0 |
And generate all the graphs.
out_graphs <- lapply(seq(1, nrow(all_comb)), function(i_row){
use_vars <- as.character(all_comb[i_row,])
subpoints <- subsample_nonzeros(mcols(tss_windows), use_vars, non_zero = non_zero, n_points = 10000)
ggplot(subpoints, aes_string(x = use_vars[1], y = use_vars[2])) + geom_point() + scale_y_log10() + scale_x_log10()
})
out_graphs
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
Are these correlations a result of association with the TSS's? One way to test this is to set up a calculation genome-wide.
genome_tiles <- tileGenome(seqinfo(Hsapiens), tilewidth = 2000, cut.last.tile.in.chrom = TRUE)
genome_tiles <- binned_function(genome_tiles, mcf7_cov, "sum", "parp1_mcf7")
genome_tiles <- binned_function(genome_tiles, mdamb231_cov, "sum", "parp1_mdamb231")
genome_tiles <- binned_function(genome_tiles, methyl_r1_cov, "sum", "methyl_r1")
genome_tiles <- binned_function(genome_tiles, methyl_r2_cov, "sum", "methyl_r2")
genome_r1_v_mcf7 <- subsample_nonzeros(mcols(genome_tiles), c("methyl_r1", "parp1_mcf7"), non_zero = non_zero, n_points = 10000)
ggplot(genome_r1_v_mcf7, aes(x = methyl_r1, y = parp1_mcf7)) + scale_x_log10() + scale_y_log10() + geom_point()
cor(log(genome_r1_v_mcf7[,1]+1), log(genome_r1_v_mcf7[,2]+1))
## [1] 0.05393331
genome_cor <- lapply(seq(1, nrow(all_comb)), function(i_row){
#print(i_row)
correlate_non_zero(mcols(genome_tiles), as.character(all_comb[i_row,]), log_transform = TRUE, non_zero = non_zero, test = TRUE)
})
all_comb_names <- paste(all_comb[,1], all_comb[,2], sep = "_v_")
genome_cor <- do.call(rbind, genome_cor)
rownames(genome_cor) <- all_comb_names
Genome wide correlations:
corr_value | p_value | |
---|---|---|
methyl_r1_v_parp1_mcf7 | 0.0482872 | 0.000000 |
methyl_r2_v_parp1_mcf7 | 0.0281699 | 0.000000 |
methyl_r1_v_parp1_mdamb231 | 0.0179345 | 0.000000 |
methyl_r2_v_parp1_mdamb231 | 0.0018917 | 0.542315 |
We will save the correlation results in some plain text files.
saveloc <- "../inst/correlation_tables"
write.table(out_cor, file = file.path(saveloc, "methylation_tss.txt"), sep = "\t")
write.table(genome_cor, file = file.path(saveloc, "methylation_genome.txt"), sep = "\t")
## [1] "2014-12-11 11:09:52 EST"
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-unknown-linux-gnu (64-bit)
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] hgu133plus2.db_2.14.0
## [2] org.Hs.eg.db_2.14.0
## [3] RSQLite_0.11.4
## [4] DBI_0.3.1
## [5] AnnotationDbi_1.26.1
## [6] Biobase_2.24.0
## [7] fmdatabreastcaparp1_0.0.1
## [8] fmcorrelationbreastcaparp1_0.0.1
## [9] pracma_1.7.9
## [10] BSgenome.Hsapiens.UCSC.hg19_1.3.1000
## [11] BSgenome_1.32.0
## [12] Biostrings_2.32.1
## [13] XVector_0.4.0
## [14] ggplot2_1.0.0
## [15] GenomicRanges_1.16.4
## [16] GenomeInfoDb_1.0.2
## [17] IRanges_1.22.10
## [18] BiocGenerics_0.10.0
## [19] devtools_1.6.1
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 colorspace_1.2-4 digest_0.6.4 evaluate_0.5.5
## [5] formatR_1.0 grid_3.1.1 gtable_0.1.2 knitr_1.7
## [9] labeling_0.3 magrittr_1.0.1 markdown_0.7.4 MASS_7.3-35
## [13] mime_0.2 munsell_0.4.2 plyr_1.8.1 proto_0.3-10
## [17] Rcpp_0.11.3 reshape2_1.4 Rsamtools_1.16.1 scales_0.2.4
## [21] stats4_3.1.1 stringr_0.6.2 tools_3.1.1 zlibbioc_1.10.0