Note that running this analysis will require at least 4 GB of RAM unless objects are deleted when they are no longer required.
To demonstrate the utility of the general categoryCompare
approach, we will construct an artificial data set based on the Gene Ontology. We will attempt to demonstrate that differences in the set or list based comparisons are also dependent on the number of items annotated to a particular term.
Set based: calculations performed independently on each set of features, and then the results are combined.
List based: calculations performed on the intersected list of two sets of features.
library(ggplot2)
useTheme <- theme_bw() + theme(legend.key = element_blank(), legend.title = element_text(size = 25),
axis.title = element_text(size = 25), axis.text = element_text(size = 15),
legend.text = element_text(size = 20), strip.text.y = element_text(size = 20),
strip.text.x = element_text(size = 20))
savePlot <- function(plotObj, plotFile, figPath = "/mlab/data/rmflight/Documents/projects/work/ccPaper/savedPlots") {
if (file.exists(figPath)) {
plotObj <- plotObj + ggtitle(NULL)
png(filename = file.path(figPath, plotFile), width = 854, height = 628)
print(plotObj)
dev.off()
}
}
We will make use of three sets of GO terms from H. sapiens. For each set, we want differing numbers of genes annotated to them.
Let's first look at the distribution of number of genes annotated to the GO terms in biological process
library(org.Hs.eg.db)
library(GO.db)
library(ccPaper)
hsGO <- as.list(org.Hs.egGO2ALLEGS)
goOntology <- Ontology(names(hsGO))
goBP <- names(goOntology)[goOntology == "BP"]
hsGO <- hsGO[goBP]
hsGO <- lapply(hsGO, unique)
universeGenes <- unique(unlist(hsGO))
hsGO_count <- sapply(hsGO, length)
hsGO_count <- data.frame(size = hsGO_count)
p <- ggplot(hsGO_count, aes(x = size)) + geom_bar(binwidth = 10) + xlim(0, 2000) +
ylim(0, 500) + useTheme + ggtitle("Number of genes annotated to GO terms") +
xlab("Num. of Genes") + ylab("Count")
p
savePlot(p, "go2gene_distribution.png")
## pdf
## 2
Based on the distribution in the previous figure, we will set up 3 different groups of GO terms, with a different number from each group:
This gives us a total of 100 GO terms, with a good sample from each group. We will generate random samples that are defined by these limits, and use this set of GO terms going forward.
grpLow <- c(10, 100)
grpMed <- c(250, 500)
grpHi <- c(500, 1500)
set.seed(271113) # for reproducibility
nGO <- c(50, 30, 20)
names(nGO) <- c("low", "med", "hi")
GO_low <- limitedRandomSample(hsGO_count, grpLow, nGO["low"])
GO_med <- limitedRandomSample(hsGO_count, grpMed, nGO["med"])
GO_hi <- limitedRandomSample(hsGO_count, grpHi, nGO["hi"])
useGO <- c(GO_low, GO_med, GO_hi)
useGOSizeClass <- c(rep("low", nGO["low"]), rep("med", nGO["med"]), rep("hi",
nGO["hi"]))
names(useGOSizeClass) <- useGO
Before we go and generate our samples, we should actually process some real experimental data and examine the fraction of annotated genes in the experiment compared to the total number of genes annotated. We will use the ALL
data set from Bioconductor
and a paired lung cancer dataset available from GEO (available as part of this package).
The ALL
data set has two classes of tumor, B and T-cell lymphoblastoma. We will do a simple differential expression between the two cell types.
library(ALL)
library(limma)
data(ALL)
grpALL <- factor(strtrim(pData(ALL)$BT, 1))
designALL <- model.matrix(~0 + grpALL)
colnames(designALL) <- c("B", "T")
fitALL <- lmFit(ALL, designALL)
contMatrixALL <- makeContrasts(BvT = B - T, levels = designALL)
fitALL2 <- contrasts.fit(fitALL, contMatrixALL)
fitALL2 <- eBayes(fitALL2)
deALL <- rownames(topTable(fitALL2, adjust = "BH", p.value = 0.05, number = Inf))
Now for all the GO terms that map to the probes in the ALL
differentially expressed genes, how does the proportion of differentially expressed genes change as a function of the number of genes annotated to the GO term?
library(hgu95av2.db)
##
ALLgo <- as.list(hgu95av2GO2ALLPROBES)
ALL_sizeFrac <- trimGOFrac(ALLgo, deALL)
ggplot(ALL_sizeFrac, aes(x = size, y = frac)) + geom_point() + xlim(0, 1500) +
useTheme + ggtitle("DE ALL fraction as a function of size") + xlab("Number of Genes Annotated to Term") +
ylab("Fraction of Term Genes in DE Sample")
## Warning: Removed 192 rows containing missing values (geom_point).
The lung cancer data is a paired sample design from GEO.
data(lung)
sampleStat <- strsplit2(pData(lung)$title, " ")
notNP <- grep("^NP", sampleStat[, 2], invert = T)
lung <- lung[, notNP]
sampleStat <- sampleStat[notNP, ]
sampleID <- factor(sampleStat[, 2])
cancerID <- factor(sampleStat[, 1])
lungDesign <- model.matrix(~sampleID + cancerID)
fitLung <- lmFit(lung, lungDesign)
fitLung <- eBayes(fitLung)
deLung <- rownames(topTable(fitLung, coef = "cancerIDTumor", number = 2000,
adjust.method = "BY", p.value = 1e-05))
library(hgu133plus2.db)
##
lung2go <- as.list(hgu133plus2GO2ALLPROBES)
lung_sizeFrac <- trimGOFrac(lung2go, deLung)
lung_sizeFrac$genelist <- "lung"
ggplot(lung_sizeFrac, aes(x = size, y = frac)) + geom_point() + xlim(0, 1500) +
useTheme + ggtitle("DE Lung fraction as a function of size") + xlab("Num. of Genes") +
ylab("Fraction")
## Warning: Removed 443 rows containing missing values (geom_point).
Now, for all of these we need to generate two samples of genes from them to comprise our differentially expressed (DE) sets from the genome, representing two different expression experiments.
How do we select these two sets of genes?
What we expect from this is that in both datasets, the GO terms with lower numbers of genes annotated will have large fractions of their annotations present, while GO terms with more and more genes will have lower fractions, and there will be less overlap between the two datasets.
These expectations should be checked.
goList <- list(low = GO_low, med = GO_med, hi = GO_hi)
nGene <- 1000
sample1_org <- sampleTerms(useGO, hsGO, 1000, 4)[1:nGene]
sample2_org <- sampleTerms(useGO, hsGO, 1000, 4)[1:nGene]
# check our expectations
goFractions <- calcFraction(hsGO[useGO], list(sample1 = sample1_org, sample2 = sample2_org))
ggplot(goFractions, aes(x = size, y = frac, color = genelist)) + geom_point() +
useTheme + ggtitle("Sample GO fractions") + xlab("Num. of Genes") + ylab("Fraction")
Lets actually compare this to the distribution of fractions vs sizes from the experimental data sets. Because those used all the GO terms, we will redo them with all GO terms as well, not just those from our sample.
sampleGOFracs <- calcFraction(hsGO, list(sample1 = sample1_org, sample2 = sample2_org))
ALL_sizeFrac$genelist <- "all"
ALL_tmp <- rbind(ALL_sizeFrac, lung_sizeFrac, sampleGOFracs[, c("frac", "genelist",
"size")])
ggplot(ALL_tmp, aes(x = size, y = frac)) + geom_point() + xlim(0, 1500) + facet_grid(genelist ~
.) + useTheme + ggtitle("Sample GO fractions for all") + xlab("Num. of Genes") +
ylab("Fraction")
## Warning: Removed 192 rows containing missing values (geom_point).
## Warning: Removed 443 rows containing missing values (geom_point).
## Warning: Removed 125 rows containing missing values (geom_point).
## Warning: Removed 125 rows containing missing values (geom_point).
keepItems <- c("goFractions", "sample1_org", "sample2_org", "hsGO", "useGO",
"useGOSizeClass", "GO_hi", "GO_low", "GO_med", "grpHi", "grpLow", "grpMed",
"minGO", "nGO", "nGene", "universeGenes", "savePlot", "useTheme", "fitALL2")
allItems <- ls()
delItems <- allItems[!(allItems %in% keepItems)]
rm(list = delItems)
For each geneList
, we will do the GO enrichment calculations, and then get the results. We also do the intersection of all the geneList
's, and do the calculations again.
samples_noNoise <- list(sample1 = sample1_org, sample2 = sample2_org)
go_noNoise <- hyperGOMultiEnrichment(samples_noNoise, universe = universeGenes)
Now get the p-values out (-1 * log10(pvalue)
). For each of our test GO terms, we take the minimum p-value from the results of the set calculations, and also the values from the list calculations.
noNoise_results <- pvaluesMultiEnrich(c("sample1", "sample2"), useGO, go_noNoise)
noNoise_pvalues <- pvalueDiffSig(noNoise_results$values, pCutoff = 0.05, log = TRUE)
Finally, lets add in our values of the fraction, total genes, and the class (low, med, hi) so we can do some fancy plotting stuff with it.
noNoise_pvalues$sizeClass <- useGOSizeClass
noNoise_pvalues$size <- goFractions[1:length(useGOSizeClass), 4]
noNoise_pvalues$frac <- goFractions$frac[1:length(useGOSizeClass)]
Now we will plot the difference in log-p-values (where we have set - list), so that positive values imply that set had lower p-values than list, based on the fraction of genes annotated.
noNoise_pvalues$sizeClass <- factor(noNoise_pvalues$sizeClass, levels = c("low",
"med", "hi"), ordered = TRUE)
p <- ggplot(noNoise_pvalues, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~
sizeClass, scales = "free_x") + xlab("fraction") + useTheme + ggtitle("Sample - Combined Differences") +
xlab("Fraction") + ylab("Diff. log p-values")
p
savePlot(p, "nonoise_sample_combineddiff.png")
## pdf
## 2
ggplot(noNoise_pvalues, aes(x = sizeClass, y = diff)) + geom_boxplot() + geom_point() +
ggtitle("Sample - Combined Differences") + xlab("Size Class") + ylab("Diff. log p-values")
We can see that it appears in general that set has better p-values than list, especially in the med and hi groups. Also, both methods get pretty much the same GO terms as significant.
How many of the GO terms have better p-values in set compared to list?
tapply(noNoise_pvalues$diff, noNoise_pvalues$sizeClass, function(x) {
sum(x > 0)
})
## low med hi
## 23 16 14
What is the average difference among each group?
tapply(noNoise_pvalues$diff, noNoise_pvalues$sizeClass, mean)
## low med hi
## -0.8243 -0.1824 4.4454
So ideally, the mean of the full set of differences, combined with how many pass in each case, should adequately quantify how well each of the methods is doing.
In addition to the genes that are annotated to the GO terms of interest, we will also add in genes that have no relation to the GO terms under consideration, these would be considered noise genes. To make the model simple, we will have the same genes as noise in both samples.
nNoise <- 500
noiseGenes <- possibleNoise(hsGO, useGO)
noiseGenes <- sample(noiseGenes, nNoise)
# check that we did this right, the fraction should not change after adding
# noise genes
sample1 <- c(sample1_org, noiseGenes)
sample2 <- c(sample2_org, noiseGenes)
samplesNoise <- list(sample1 = sample1, sample2 = sample2)
goFracNoise <- calcFraction(hsGO[useGO], samplesNoise)
plot(goFracNoise$frac, goFractions$frac)
go_noise <- hyperGOMultiEnrichment(samplesNoise, universeGenes)
noise_results <- pvaluesMultiEnrich(c("sample1", "sample2"), useGO, go_noise)
noise_pvalues <- pvalueDiffSig(noise_results$values, pCutoff = 0.05, log = TRUE)
noise_pvalues$sizeClass <- useGOSizeClass
noise_pvalues$size <- goFracNoise[1:length(useGOSizeClass), 4]
noise_pvalues$frac <- goFracNoise$frac[1:length(useGOSizeClass)]
noise_pvalues$sizeClass <- factor(noise_pvalues$sizeClass, c("low", "med", "hi"),
ordered = TRUE)
ggplot(noise_pvalues, aes(x = frac, y = diff, color = sigState)) + geom_point() +
facet_grid(. ~ sizeClass, scales = "free_x") + useTheme + ggtitle("Sample - Combined with Noise") +
xlab("Fraction") + ylab("Diff. log p-values")
tapply(noise_pvalues$diff, noise_pvalues$sizeClass, function(x) {
sum(x > 0)
})
## low med hi
## 32 24 18
tapply(noise_pvalues$diff, noise_pvalues$sizeClass, mean)
## low med hi
## 0.4028 4.0175 7.3028
Although using gene lists that were perfect (i.e. no noise) everything in the list method was pretty much still significant, on average sets gave much better p-values. However, as we add noise to the system in the form of genes that are not annotated to our test GO terms, more terms are significant only in the sets method (i.e. show up in both sets).
Is there any difference in the difference of p-values between noise vs noNoise (comparing set vs list)?
diff_pvalues <- noise_pvalues
diff_pvalues$diff <- noise_pvalues$diff - noNoise_pvalues$diff
ggplot(diff_pvalues, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~
sizeClass, scales = "free_x") + ggtitle("difference of differences noise vs noNoise") +
useTheme + xlab("Fraction") + ylab("Diff. noise / noNoise")
tapply(diff_pvalues$diff, diff_pvalues$sizeClass, mean)
## low med hi
## 1.227 4.200 2.857
From this plot, it appears that not only do the set values return lower p-values, but return lower p-values when noise is added overall.
To make sure that the results are consistent, we will do several iterations of the above calculations.
nTimes <- 100
nGene <- 1000
testSamples <- lapply(seq(1, nTimes), function(x) {
list(s1 = sampleTerms(useGO, hsGO, 1000, 4)[1:nGene], s2 = sampleTerms(useGO,
hsGO, 1000, 4)[1:nGene], noise = sample(noiseGenes, nNoise))
})
save(useGO, testSamples, universeGenes, useGOSizeClass, file = "inst/data/100GeneSamples.RData")
data("100GeneSamples")
library(snowfall)
# load('inst/data/100GeneSamples.RData')
testFun <- function(x) {
nnList <- x[c("s1", "s2")]
nnRes <- hyperGOMultiEnrichment(nnList, universe = universeGenes)
nn_res <- pvaluesMultiEnrich(c("s1", "s2"), useGO, nnRes)
nnP <- pvalueDiffSig(nn_res$values, pCutoff = 0.05, log = TRUE)
nnP$class <- useGOSizeClass
noiList <- list(s1 = c(x$s1, x$noise), s2 = c(x$s2, x$noise))
noiRes <- hyperGOMultiEnrichment(noiList, universe = universeGenes)
noi_res <- pvaluesMultiEnrich(c("s1", "s2"), useGO, noiRes)
noiP <- pvalueDiffSig(noi_res$values, pCutoff = 0.05, log = TRUE)
noiP$class <- useGOSizeClass
return(list(clean = nnP, noise = noiP))
}
sfInit(parallel = TRUE, cpus = 10)
sfExport("useGOSizeClass", "universeGenes", "useGO")
sfLibrary(ccPaper)
sfLibrary(GO.db)
sfLibrary(org.Hs.eg.db)
testRes <- sfLapply(testSamples, testFun)
sfStop()
save(testRes, file = "inst/data/100GeneResults.RData")
For each of the GO terms, we want to calculate the mean, standard deviation, and then a 95% confidence interval to represent graphically.
data("100GeneResults")
cleanRes <- lapply(testRes, function(x) {
x$clean
})
testStats_clean <- sameGOStats(cleanRes)
testStats_clean$class <- useGOSizeClass
testStats_clean$frac <- goFractions$frac[1:length(useGOSizeClass)]
testStats_clean$class <- factor(testStats_clean$class, levels = c("low", "med",
"hi"), ordered = TRUE)
p <- ggplot(testStats_clean, aes(x = frac, y = mean, ymax = max, ymin = min)) +
geom_point() + geom_errorbar() + facet_grid(. ~ class, scales = "free_x") +
useTheme + ggtitle("Sample - List Differences, 100 Gene Samples") + xlab("Fraction") +
ylab("Mean diff. log p-values")
p
savePlot(p, "genesamples100_sampleListDiffs.png")
## pdf
## 2
noiseRes <- lapply(testRes, function(x) {
x$noise
})
testStats_noise <- sameGOStats(noiseRes)
testStats_noise$class <- useGOSizeClass
testStats_noise$frac <- goFracNoise$frac[1:length(useGOSizeClass)]
testStats_noise$class <- factor(testStats_noise$class, levels = c("low", "med",
"hi"), ordered = TRUE)
ggplot(testStats_noise, aes(x = frac, y = mean, ymax = max, ymin = min)) + geom_point() +
geom_errorbar() + facet_grid(. ~ class, scales = "free_x") + ggtitle("Sample - List Differences, 100 Gene Samples, with noise") +
xlab("Fraction") + ylab("Mean diff. log p-values") + useTheme
These show that if there is a difference in the p-values from set and list, they are real.
What about differences between noise and clean?
noiseCleanDiff <- diffGOStats(testRes)
noiseCleanDiff$class <- useGOSizeClass
noiseCleanDiff$frac <- goFracNoise$frac[1:length(useGOSizeClass)]
noiseCleanDiff$class <- factor(noiseCleanDiff$class, levels = c("low", "med",
"hi"), ordered = TRUE)
ggplot(noiseCleanDiff, aes(x = frac, y = mean, ymax = max, ymin = min)) + geom_point() +
geom_errorbar() + facet_grid(. ~ class, scales = "free_x") + ggtitle("Difference of Differences, 100 Gene Samples") +
useTheme + xlab("Fraction") + ylab("Diff noise / noNoise")
Here we see that the noisy case also tends to generate higher differences in p-values between set and list than the clean case.
Caveats: Currently the fraction of genes annotated to GO terms is assumed to be the same for each sample. This may not actually be the case, and ideally this should be repeated with independent fraction estimates for each sampling.
OK, the error bars are not too bad. How about sampling multiple sets of GO terms??
goLimits <- list(low = c(10, 100), med = c(250, 500), hi = c(500, 1500))
nSample <- 100
sfInit(parallel = TRUE, cpus = 10)
sfLibrary(ccPaper)
sfExport("hsGO", "nGO", "goLimits")
testGOSample <- sfLapply(seq(1, nSample), function(x) {
t1 <- goSample(hsGO, nGO, goLimits, nSample = 2, nGene = 1000, nNoise = 500)
return(t1)
})
sfStop()
save(testGOSample, file = "inst/data/100GOSamples.RData")
Running those samples:
data("100GOSamples")
runGOSamples <- function(x) {
cleanSample <- x$geneSample
cleanRes <- hyperGOMultiEnrichment(cleanSample, universe = universeGenes)
cleanP <- pvaluesMultiEnrich(names(cleanSample), x$goData$id, cleanRes)
cleanP_dif <- pvalueDiffSig(cleanP$values, pCutoff = 0.05, log = TRUE)
cleanP_dif <- cbind(cleanP_dif, x$goData)
noiseSample <- lapply(names(cleanSample), function(inSample) {
c(x$geneSample[[inSample]], x$noiseSample[[inSample]])
})
names(noiseSample) <- names(cleanSample)
noiseRes <- hyperGOMultiEnrichment(noiseSample, universe = universeGenes)
noiseP <- pvaluesMultiEnrich(names(cleanSample), x$goData$id, noiseRes)
noiseP_dif <- pvalueDiffSig(noiseP$values, pCutoff = 0.05, log = TRUE)
noiseP_dif <- cbind(noiseP_dif, x$goData)
return(list(clean = cleanP_dif, noise = noiseP_dif))
}
library(snowfall)
sfInit(parallel = TRUE, cpus = 10)
sfLibrary(ccPaper)
sfLibrary(org.Hs.eg.db)
sfLibrary(GO.db)
sfExport("universeGenes")
testGORes <- sfLapply(testGOSample, runGOSamples)
sfStop()
save(testGORes, file = "inst/data/100GOResults.RData")
data("100GOResults")
grab_noNoise <- function(x) {
x$clean[, c("diff", "class", "frac")]
}
go_noNoise <- lapply(testGORes, grab_noNoise)
go_noNoise <- do.call(rbind, go_noNoise)
go_noNoise$class <- factor(go_noNoise$class, levels = c("low", "med", "hi"),
ordered = TRUE)
go_noNoiseT <- go_noNoise[(!is.nan(go_noNoise$diff)), ]
p <- ggplot(go_noNoiseT, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~
class, scales = "free_x") + useTheme + stat_density2d(aes(fill = ..level..),
geom = "polygon") + ggtitle("Sample - List Differences, 100 GO Samples, noNoise") +
xlab("Fraction") + ylab("Diff. log p-values") + labs(fill = "density")
p
## Warning: Removed 3 rows containing non-finite values (stat_density2d).
## Warning: Removed 2 rows containing non-finite values (stat_density2d).
## Warning: Removed 1 rows containing non-finite values (stat_density2d).
savePlot(p, "gosamples100_samplelistDiff.png")
## Warning: Removed 3 rows containing non-finite values (stat_density2d).
## Warning: Removed 2 rows containing non-finite values (stat_density2d).
## Warning: Removed 1 rows containing non-finite values (stat_density2d).
## pdf
## 2
grab_noise <- function(x) {
x$noise[, c("diff", "class", "frac")]
}
go_noise <- lapply(testGORes, grab_noise)
go_noise <- do.call(rbind, go_noise)
go_noise$class <- factor(go_noise$class, levels = c("low", "med", "hi"), ordered = TRUE)
ggplot(go_noise, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class,
scales = "free_x") + ggtitle("Sample - List Differences, 100 GO Samples, Noise") +
xlab("Fraction") + ylab("Diff. log p-values") + useTheme
grab_noiseDiff <- function(x) {
outRes <- x$clean[, c("diff", "class", "frac")]
outRes$diff <- x$noise$diff - x$clean$diff
outRes
}
go_diff <- lapply(testGORes, grab_noiseDiff)
go_diff <- do.call(rbind, go_diff)
go_diff$class <- factor(go_diff$class, levels = c("low", "med", "hi"), ordered = TRUE)
ggplot(go_diff, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class,
scales = "free_x") + ggtitle("Difference of Differences, 100 GO Samples") +
xlab("Fraction") + ylab("Diff. log p-values") + useTheme
## Warning: Removed 2 rows containing missing values (geom_point).
ggplot(go_diff, aes(x = frac, y = diff, fill = class)) + geom_boxplot(position = "identity",
alpha = 0.4) + ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") +
ylab("Diff. noise / noNoise") + useTheme
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
How about we compare the distribution of noise and no-noise using violin plots?
go_noise$noise <- "noise"
go_noNoise$noise <- "none"
go_all <- rbind(go_noise, go_noNoise)
rownames(go_all) <- NULL
ggplot(go_all, aes(x = frac, y = diff, fill = noise)) + geom_violin(trim = TRUE,
alpha = 0.5, position = "identity") + facet_grid(. ~ class, scales = "free_x") +
ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") +
ylab("Diff. log p-values") + useTheme
## Warning: Removed 5 rows containing non-finite values (stat_ydensity).
## Warning: Removed 2 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
What if we aggregate things into bins first, based on their class and fraction?
go_diff$frac10 <- round(go_diff$frac * 10)/10
ggplot(go_diff, aes(x = factor(frac10), y = diff)) + geom_boxplot() + facet_grid(. ~
class, scales = "free_x") + ggtitle("Difference of Differences, 100 GO Samples") +
xlab("Fraction") + ylab("Diff. log p-values") + useTheme
## Warning: Removed 5 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
probPoint <- is.infinite(go_diff$diff) | is.na(go_diff$diff) | is.nan(go_diff$diff)
go_diff <- go_diff[!probPoint, ]
ggplot(go_diff, aes(x = frac, y = diff)) + stat_smooth() + facet_grid(. ~ class,
scales = "free_x") + ggtitle("Difference of Differences, 100 GO Samples") +
xlab("Fraction") + ylab("Diff. log p-values") + useTheme
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
Although the improvement of noise over clean data dips below 0 at medium fractions, it does tend to improve rather significantly as the fraction of genes annotated to a GO term increases.
For a given set of genes and GO terms, lets now keep adding more and more noise genes, and for the set amount of noise genes, go from 0% to 100% shared noise genes. And then see how things change. For this, we are going to use our initial set of GO terms (useGO
), and a single original set of genes, simply sampling noise genes as we go along. Note that in this case noise genes are defined as genes that are not annotated to any of the GO terms that the DE genes are annotated to.
As in the previous examples, we will generate the samples first, and then run the calculations on them. This is because it is easier to test sample generation and enrichment separately.
noiseGenes <- possibleNoise(hsGO, useGO)
sizeNoise <- seq(0, 1000, 10)
fracShared <- seq(0, 1, 0.01)
noiseSamples <- sweepNoiseSample(noiseGenes, nSamples = 2, sizeNoise, fracShared)
nGene <- 1000
cleanSample <- list(s1 = sampleTerms(useGO, hsGO, nGene, 4)[1:nGene], s2 = sampleTerms(useGO,
hsGO, nGene, 4)[1:nGene])
save(noiseSamples, cleanSample, sizeNoise, fracShared, universeGenes, useGO,
useGOSizeClass, file = file.path(useDir, "inst/data/noiseSamples.RData"))
data(noiseSamples)
useNoise <- unlist(noiseSamples, recursive = FALSE)
sizeNoiseAll <- rep(sizeNoise, each = length(fracShared))
fracSharedAll <- rep(fracShared, length(sizeNoise))
runSweepNoise <- function(noiseGenes) {
inNoise <- lapply(seq(1, length(cleanSample)), function(x) {
c(cleanSample[[x]], noiseGenes[[x]])
})
names(inNoise) <- names(cleanSample)
noiseRes <- hyperGOMultiEnrichment(inNoise, universe = universeGenes)
noiseP <- pvaluesMultiEnrich(names(inNoise), useGO, noiseRes)
noisePComp <- pvalueDiffSig(noiseP$values, pCutoff = 0.05, log = TRUE)
noisePComp$class <- useGOSizeClass
return(noisePComp)
}
library(snowfall)
sfInit(parallel = TRUE, cpus = 10)
sfLibrary(ccPaper)
sfLibrary(org.Hs.eg.db)
sfLibrary(GO.db)
sfExport("universeGenes", "useGO", "useGOSizeClass", "cleanSample")
noiseSweepRes <- sfLapply(useNoise, runSweepNoise)
sfStop()
save(noiseSweepRes, file.path(useDir, "inst/data/noiseSweepRes.RData"))
Examine the noise results.
data(noiseSamples)
data(noiseSweepRes)
noiseFrac <- data.frame(size = rep(sizeNoise, each = length(fracShared)), shared = rep(fracShared,
length(sizeNoise)))
outMedians <- lapply(noiseSweepRes, function(inRes) {
tapply(inRes$diff, inRes$class, median)
})
outMedians <- do.call(rbind, outMedians)
outMedians <- cbind(outMedians, noiseFrac)
ggplot(outMedians, aes(x = size, y = hi, color = shared)) + geom_point() + useTheme +
ggtitle("Hi Median Difference") + xlab("Noise Added") + ylab("Median Diff.")
ggplot(outMedians, aes(x = size, y = med, color = shared)) + geom_point() +
useTheme + ggtitle("Medium Median Difference") + xlab("Noise Added") + ylab("Median Diff.")
ggplot(outMedians, aes(x = size, y = low, color = shared)) + geom_point() +
useTheme + ggtitle("Low Median Difference") + xlab("Noise Added") + ylab("Median Diff.")
outMedians2 <- data.frame(diff = c(outMedians$low, outMedians$med, outMedians$hi),
class = c(rep("low", nrow(outMedians)), rep("med", nrow(outMedians)), rep("hi",
nrow(outMedians))), size = rep(outMedians$size, 3), shared = rep(outMedians$shared,
3))
outMedians2$class <- factor(outMedians2$class, c("low", "med", "hi"), ordered = TRUE)
p <- ggplot(outMedians2, aes(x = size, y = diff, color = shared)) + geom_point() +
useTheme + facet_grid(class ~ ., scales = "free_y") + ggtitle("Sample - Combined as a function of size and shared fraction")
p
savePlot(p, "samplecombined_functionSizeFraction.png")
## pdf
## 2
Another way to do the analysis is using a Gene Set Enrichment Analysis, or GSEA. In contrast to over-representation analysis, where we count if a group is more highly represented in a group vs the background; GSEA tests if a set of genes are more highly ranked than random sets of genes.
For a hypothetical example, we are going to use the geneSetTest
from limma
, because it allows us to test on a set of gene statistics themselves.
The sets will be our same GO terms, and we will change the proportion of genes from each GO term that are enhanced above a base value. All genes will be assigned a value from a uniform distribution with a value of 0.5. For each GO term set, some genes will be selected (with an exponential decaying probability for more genes), and assigned p-values from a uniform distribution with a value of 0.01.
For the list version, the p-values of genes will be combined using Fishers method, see the function fishersMethod
provided in this package.
We will use the t-values that limma
has generated for the ALL data set, and just sample from those.
tALL <- topTable(fitALL2, number = Inf)
tALL <- tALL$t[(tALL$t < 10) & (tALL$t > -10)]
tValues <- runif(length(universeGenes), -6, 6)
gseaSamples <- list(s1 = sampleTerms(useGO, hsGO, 1000, 4), s2 = sampleTerms(useGO,
hsGO, 1000, 4))
gseaGOFracS1 <- calcFraction(hsGO[useGO], list(s1 = gseaSamples[[1]]))
gseaTValues <- sampletValueGenGSEA(gseaSamples, universeGenes, tValues)
Check that we did what we thought. If done right, all of the sample genes should be on one side of the a histogram.
tmpDat <- data.frame(x = gseaTValues[[1]], type = "all", stringsAsFactors = FALSE)
tmpDat$type[(universeGenes %in% gseaSamples[[1]])] <- "sample"
ggplot(tmpDat, aes(x = x, fill = type)) + geom_histogram(binwidth = 0.1, position = "identity") +
ggtitle("location of samples") + useTheme
tmpComb <- data.frame(x = gseaTValues[["comb"]], type = "all", stringsAsFactors = FALSE)
tmpComb$type[(universeGenes %in% gseaSamples[[1]])] <- "sample"
ggplot(tmpComb, aes(x = x, fill = type)) + geom_histogram(binwidth = 0.1, position = "identity") +
ggtitle("location of samples") + useTheme
Now lets actually do some calculations.
go2index <- symbols2indices(hsGO[useGO], universeGenes)
gseaSetResults <- lapply(gseaTValues, function(x) {
mmGeneSetTest(go2index, x)
})
And figure out what is doing better.
cleanDiff <- gseaDiffs(gseaSetResults)
cleanDiff$sizeClass <- useGOSizeClass
cleanDiff$sizeClass <- factor(cleanDiff$sizeClass, c("low", "med", "hi"), ordered = TRUE)
cleanDiff$frac <- gseaGOFracS1$frac
ggplot(cleanDiff, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ sizeClass) +
ggtitle("Sample - List for GSEA, using GO") + useTheme + xlab("Fraction") +
ylab("Diff. log p-values")
So this appears to be a different trend than for the hypergeometric case, in that a meta sample (achieved by averaging the t-statistics) actually does no worse, and in many cases much better than the worst p-value. This effect is less-pronounced as the number of genes in a class become larger, and as the fraction covered increases.
So what happens if we add some noisy genes into the top 1000? Stuff that isn't annotated at all to the GO terms of interest?
nNoise <- 2000
noiseGenes <- possibleNoise(hsGO, useGO)
noiseGenes <- sample(noiseGenes, nNoise)
gseaNoise <- list(s1 = c(gseaSamples[["s1"]], noiseGenes), s2 = c(gseaSamples[["s2"]],
noiseGenes))
gseaTValues_noise <- sampletValueGenGSEA(gseaNoise, universeGenes, tValues)
tmpDat <- data.frame(x = gseaTValues_noise[[1]], type = "all", stringsAsFactors = FALSE)
tmpDat$type[(universeGenes %in% gseaSamples[[1]])] <- "sample"
ggplot(tmpDat, aes(x = x, fill = type)) + geom_histogram(binwidth = 0.1, position = "identity",
alpha = 0.75) + ggtitle("Sample - List GSEA GO, adding noise genes") + useTheme
gseaSetResults_noise <- lapply(gseaTValues_noise, function(x) {
mmGeneSetTest(go2index, x)
})
noiseDiff <- gseaDiffs(gseaSetResults_noise)
noiseDiff$sizeClass <- useGOSizeClass
noiseDiff$sizeClass <- factor(noiseDiff$sizeClass, c("low", "med", "hi"), ordered = TRUE)
noiseDiff$frac <- gseaGOFracS1$frac
ggplot(noiseDiff, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ sizeClass) +
ggtitle("Sample - List, GSEA GO with Noise") + useTheme + xlab("Fraction") +
ylab("Diff. log p-values")
diffDiff <- data.frame(diff = (noiseDiff$diff - cleanDiff$diff), sizeClass = noiseDiff$sizeClass,
frac = noiseDiff$frac)
ggplot(diffDiff, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ sizeClass) +
ggtitle("Difference of Differences, GSEA GO") + useTheme + xlab("Fraction") +
ylab("Diff. noise / noNoise")
So, it appears that having a uniform distribution does not make things behave the way we might expect. What if it is more an issue of independence, that GO is a bad fit for GSEA type analysis because of the issue that many of the terms are heavily overlapped.
What if we force independence on our objects? i.e. each object has independent sets of genes, and treat them appropriately? i.e. to count as “diff” we will put genes (features) into the top 50% of ranked t-values (therefore reducing coverage means pushing annotated genes down into the bottom 50% of ranked values), and adding noise means to push un-annotated (i.e. random) above in rank.
Lets try this for a single example.
nGene <- 10000
indepTValues <- sort(runif(nGene, -6, 6))
names(indepTValues) <- as.character(seq(1, nGene))
gseaMap <- list(t1 = sample(seq(1, nGene/2), 100))
tmpDat <- data.frame(tVal = indepTValues, status = "all", stringsAsFactors = F)
tmpDat$status[gseaMap[["t1"]]] <- "sample"
ggplot(tmpDat, aes(x = tVal, fill = status)) + geom_histogram(binwidt = 0.1,
position = "identity") + ggtitle("Check independent distribution, 100%") +
useTheme + xlab("t-statistic")
geneSetTest(gseaMap$t1, indepTValues, alternative = "down", ranks.only = FALSE)
## [1] 1e-04
gseaMap80 <- list(t1 = c(sample(seq(1, nGene/2), 80), sample(seq((nGene/2) +
1, nGene), 20)))
geneSetTest(gseaMap80$t1, indepTValues, alternative = "down", ranks.only = FALSE)
## [1] 1e-04
tmpDat <- data.frame(tVal = indepTValues, status = "all", stringsAsFactors = F)
tmpDat$status[gseaMap80[["t1"]]] <- "sample"
ggplot(tmpDat, aes(x = tVal, fill = status)) + geom_histogram(binwidt = 0.1,
position = "identity") + ggtitle("check independent distribution, 80%") +
useTheme + xlab("t-statistic")
gseaMap50 <- list(t1 = c(sample(seq(1, nGene/2), 50), sample(seq((nGene/2) +
1, nGene), 50)))
geneSetTest(gseaMap50$t1, indepTValues, alternative = "down", ranks.only = FALSE)
## [1] 0.4535
gseaMapUnif <- list(t1 = sample(nGene, 100))
geneSetTest(gseaMapUnif$t1, indepTValues, alternative = "down", ranks.only = FALSE)
## [1] 0.9735
OK, so at least we verify that as the percentage of genes in the top 50% decreases, then the p-values decrease as well, as we expect they should. What about as we increase the number of genes above above the set, while keeping the number in the top 50 the same?
gseaAbove100 <- list(t1 = sample(seq(100, nGene/2), 100))
geneSetTest(gseaAbove100$t1, indepTValues, alternative = "down", ranks.only = FALSE)
## [1] 1e-04
gseaAbove500 <- list(t1 = sample(seq(501, nGene/2), 100))
geneSetTest(gseaAbove500$t1, indepTValues, alternative = "down", ranks.only = FALSE)
## [1] 1e-04
gseaAbove1000 <- list(t1 = sample(seq(1001, nGene/2), 100))
geneSetTest(gseaAbove1000$t1, indepTValues, alternative = "down", ranks.only = FALSE)
## [1] 1e-04
gseaAbove2000 <- list(t1 = sample(seq(2001, nGene/2), 100))
geneSetTest(gseaAbove2000$t1, indepTValues, alternative = "down", ranks.only = FALSE)
## [1] 1e-04
gseaAbove4000 <- list(t1 = sample(seq(4001, nGene/2), 100))
geneSetTest(gseaAbove4000$t1, indepTValues, alternative = "down", ranks.only = FALSE)
## [1] 0.0525
OK, so it takes a rather large proportion above to change the p-value any amount.
So, what should we do. We need a function that will let us vary the size and proportion of genes that are in the bottom 50% of values.
`?`(gseaSizeRange)
`?`(gseaProportion)
Lets verify this function is doing what we think it is.
testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, -6, 6)
useSig <- runif(50, 0.1, 1)
testStats <- gseaProportion(testSize, useSig, useStatistics)
fullIndex <- seq(1, length(useStatistics))
useCut <- 5000
realRanks <- lapply(testStats, function(inStat) {
apply(inStat$stats, 2, function(inCol) {
outRank <- rank(inCol)[inStat$index]
sum(outRank < useCut)/length(inStat$index)
})
})
realRanks <- do.call(rbind, realRanks)
plot(realRanks[, 1], useSig)
plot(realRanks[, 2], useSig)
plot(realRanks[, 3], useSig)
OK, it looks like it is working. That's good.
testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, -6, 6)
useSig <- runif(50, 0.1, 1)
testStats <- gseaProportion(testSize, useSig, useStatistics)
outP <- lapply(testStats, function(inStat) {
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
geneSetTest(inStat$index, inStat$stats[, useColumn], "down", ranks.only = FALSE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)
tmpP <- data.frame(set = apply(outP2[, c("s1", "s2")], 1, min), list = outP2[,
"comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- useSig
p <- ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + useTheme + ggtitle("Sample - Combined average t-statistics") +
xlab("Fraction") + ylab("Diff. log p-values")
p
savePlot(p, "samplecombined_tstatistics.png")
## pdf
## 2
Yeah, that doesn't seem to really work.
What if GSEA is dependent on the shared proportion that is in the up or down state? i.e. if we allow the proportion “down” to vary between the two samples, and plot the difference in set and list values based on how close the value of the proportion in the top is??
`?`(gseaProportionVary)
Check that we generate different significant proportions for independent samples, and what is the combined.
testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, -6, 6)
useSig <- list(s1 = runif(50, 0.1, 1), s2 = runif(50, 0.1, 1))
testStats <- gseaProportionVary(testSize, useSig, useStatistics)
fullIndex <- seq(1, length(useStatistics))
useCut <- 5000
realRanks <- lapply(testStats, function(inStat) {
apply(inStat$stats, 2, function(inCol) {
outRank <- rank(inCol)[inStat$index]
sum(outRank < useCut)/length(inStat$index)
})
})
realRanks <- do.call(rbind, realRanks)
plot(realRanks[, 1], useSig[[1]])
plot(realRanks[, 2], useSig[[2]])
plot(realRanks[, 3], useSig[[1]])
plot(realRanks[, 3], useSig[[2]])
OK, this seems to be working still.
Now, lets try this on some fake data.
testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, -6, 6)
useSig <- list(s1 = runif(50, 0.1, 1), s2 = runif(50, 0.1, 1))
testStats <- gseaProportionVary(testSize, useSig, useStatistics)
outP <- lapply(testStats, function(inStat) {
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
geneSetTest(inStat$index, inStat$stats[, useColumn], "down", ranks.only = FALSE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)
tmpP <- data.frame(set = apply(outP2[, c("s1", "s2")], 1, min), list = outP2[,
"comb"])
tmpP$diff <- tmpP$set - tmpP$list
realRanks <- lapply(testStats, function(inStat) {
apply(inStat$stats, 2, function(inCol) {
outRank <- rank(inCol)[inStat$index]
sum(outRank < useCut)/length(inStat$index)
})
})
realRanks <- do.call(rbind, realRanks)
tmpP$frac <- realRanks[, 3]
ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + ggtitle("Sample - Combined vary fraction") +
useTheme + xlab("Fraction") + ylab("Diff. log p-values")
So, none of this has really worked, at least not in the way that was hoped.
For our hypothetical comparison, we need to think of how one would combine two studies. In this case, one might combine two studies by combining the p-values. Therefore, we will use ranked p-values as our metric in the geneSetTest
.
For this to work with geneSetTest
, we need to make the range 0, 0.5, 1 conform to -1, 0, 1, because geneSetTest
works nicer with signed values. We will use a linear regression to transform the p-values to a signed range.
regData <- data.frame(y = c(-1, 0, 1), x = c(0, 0.5, 1))
regModel <- lm(y ~ x, regData)
testData <- runif(10000, 0, 1)
outData <- regModel$coefficients[1] + regModel$coefficients[2] * testData
hist(outData, 100)
So our transformation works. Lets make sure it will work like the t-statistic.
nGene <- 10000
indepTValues <- sort(runif(nGene, 0, 1))
names(indepTValues) <- as.character(seq(1, nGene))
gseaMap <- list(t1 = sample(seq(1, nGene/4), 100))
tmpDat <- data.frame(tVal = indepTValues, status = "all", stringsAsFactors = F)
tmpDat$status[gseaMap[["t1"]]] <- "sample"
ggplot(tmpDat, aes(x = tVal, fill = status)) + geom_histogram(binwidt = 0.01,
position = "identity") + ggtitle("check p-values, 100%") + useTheme + xlab("P-values")
geneSetTest(gseaMap$t1, p2signed(regModel, indepTValues), alternative = "down",
ranks.only = FALSE)
## [1] 1e-04
gseaMap80 <- list(t1 = c(sample(seq(1, nGene/2), 80), sample(seq((nGene/2) +
1, nGene), 20)))
geneSetTest(gseaMap80$t1, p2signed(regModel, indepTValues), alternative = "down",
ranks.only = FALSE)
## [1] 1e-04
tmpDat <- data.frame(tVal = indepTValues, status = "all", stringsAsFactors = F)
tmpDat$status[gseaMap80[["t1"]]] <- "sample"
ggplot(tmpDat, aes(x = tVal, fill = status)) + geom_histogram(binwidt = 0.01,
position = "identity") + ggtitle("check p-values, 80%") + useTheme + xlab("P-values")
gseaMap50 <- list(t1 = c(sample(seq(1, nGene/2), 50), sample(seq((nGene/2) +
1, nGene), 50)))
geneSetTest(gseaMap50$t1, p2signed(regModel, indepTValues), alternative = "down",
ranks.only = FALSE)
## [1] 0.6527
Now, lets maintain proportions between samples as we did before, but for the list method, combine the p-values using Fisher's method.
`?`(gseaProportionFisher)
testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, 0, 1)
useSig <- runif(50, 0.1, 1)
testStats <- gseaProportionFisher(testSize, useSig, useStatistics)
fullIndex <- seq(1, length(useStatistics))
useCut <- 5000
realRanks <- lapply(testStats, function(inStat) {
apply(inStat$stats, 2, function(inCol) {
outRank <- rank(inCol)[inStat$index]
sum(outRank < useCut)/length(inStat$index)
})
})
realRanks <- do.call(rbind, realRanks)
plot(realRanks[, 1], useSig)
plot(realRanks[, 2], useSig)
plot(realRanks[, 3], useSig)
testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, 0, 1)
useSig <- runif(50, 0.1, 1)
testStats <- gseaProportionFisher(testSize, useSig, useStatistics)
outP <- lapply(testStats, function(inStat) {
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
geneSetTest(inStat$index, p2signed(regModel, inStat$stats[, useColumn]),
"down", ranks.only = FALSE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)
tmpP <- data.frame(set = apply(outP2[, c("s1", "s2")], 1, min), list = outP2[,
"comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- useSig
ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + ggtitle("Sample - Combined Fishers") +
useTheme + xlab("Fraction") + ylab("Diff. log p-values")
# try also using ranks only
outPRank <- lapply(testStats, function(inStat) {
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
geneSetTest(inStat$index, inStat$stats, ranks.only = TRUE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2Rank <- do.call(rbind, outPRank)
outP2Rank <- -1 * log(outP2Rank)
tmpP <- data.frame(set = apply(outP2Rank[, c("s1", "s2")], 1, min), list = outP2Rank[,
"comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- useSig
ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + ggtitle("Sample - Combined Fishers Ranks") +
useTheme + xlab("Fraction") + ylab("Diff. log p-values")
Finally, we note that Shen & Tseng from 2010 combined independent experiments by taking the maximum p-value between samples.
`?`(gseaProportionMax)
testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, 0, 1)
useSig <- runif(50, 0.1, 1)
testStats <- gseaProportionMax(testSize, useSig, useStatistics)
fullIndex <- seq(1, length(useStatistics))
useCut <- 5000
realRanks <- lapply(testStats, function(inStat) {
apply(inStat$stats, 2, function(inCol) {
outRank <- rank(inCol)[inStat$index]
sum(outRank < useCut)/length(inStat$index)
})
})
realRanks <- do.call(rbind, realRanks)
plot(realRanks[, 1], useSig)
plot(realRanks[, 2], useSig)
plot(realRanks[, 3], useSig)
testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, 0, 1)
useSig <- runif(50, 0.1, 1)
testStats <- gseaProportionMax(testSize, useSig, useStatistics)
outP <- lapply(testStats, function(inStat) {
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
geneSetTest(inStat$index, p2signed(regModel, inStat$stats[, useColumn]),
"down", ranks.only = FALSE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)
tmpP <- data.frame(set = apply(outP2[, c("s1", "s2")], 1, min), list = outP2[,
"comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- useSig
ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + ggtitle("Sample - Combined Max-P") +
useTheme + xlab("Fraction") + ylab("Diff. log p-values")
And wow! It appears to work!
Lets test using different size ranges and the proportions.
gseaLo <- gseaSizeRange(c(20, 250), 50)
gseaMed <- gseaSizeRange(c(250, 500), 30)
gseaHi <- gseaSizeRange(c(500, 1000), 20)
gseaSizes <- c(gseaLo, gseaMed, gseaHi)
gseaSizeClass <- c(rep("low", 50), rep("med", 30), rep("hi", 20))
gseaFrac <- runif(100, 0.3, 1)
gseaStatistics <- runif(10000, 0, 1)
testStats <- gseaProportionMax(gseaSizes, gseaFrac, gseaStatistics)
outP <- lapply(testStats, function(inStat) {
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
geneSetTest(inStat$index, p2signed(regModel, inStat$stats[, useColumn]),
"down", ranks.only = FALSE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)
tmpP <- data.frame(set = apply(outP2[, c("s1", "s2")], 1, min), list = outP2[,
"comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- gseaFrac
tmpP$class <- gseaSizeClass
tmpP$class <- factor(tmpP$class, c("low", "med", "hi"), ordered = TRUE)
ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class) +
useTheme
p <- ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + useTheme + ggtitle("Sample - Combined Max-P") +
xlab("Fraction") + ylab("Diff. log p-values")
p
savePlot(p, "samplecombined_maxp.png")
## pdf
## 2
So, this is very important. The method of combining p-values for the list method is extremely important.
Does this hold using ranks only for the calculation?
outPRank <- lapply(testStats, function(inStat) {
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
geneSetTest(inStat$index, inStat$stats[, useColumn], ranks.only = TRUE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2Rank <- do.call(rbind, outPRank)
outP2Rank <- -1 * log(outP2Rank)
tmpPRank <- data.frame(set = apply(outP2Rank[, c("s1", "s2")], 1, min), list = outP2Rank[,
"comb"])
tmpPRank$diff <- tmpPRank$set - tmpPRank$list
tmpPRank$frac <- gseaFrac
tmpPRank$class <- gseaSizeClass
tmpPRank$class <- factor(tmpPRank$class, c("low", "med", "hi"), ordered = TRUE)
ggplot(tmpPRank, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class) +
ggtitle("Sample - Combined Max-P Ranks") + xlab("Fraction") + ylab("Diff. log p-values")
No, it doesn't because the ranks.only=TRUE
will only check for mixed values, not at one end or the other.
Sys.time()
## [1] "2014-02-19 10:31:06 EST"
sessionInfo()
## R version 3.0.1 (2013-05-16)
## Platform: x86_64-unknown-linux-gnu (64-bit)
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=C LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] mgcv_1.7-27 nlme_3.1-113
## [3] hgu133plus2.db_2.10.1 hgu95av2.db_2.10.1
## [5] ALL_1.4.16 GO.db_2.10.1
## [7] org.Hs.eg.db_2.10.1 devtools_1.4.1
## [9] ccPaper_0.0.19 ggplot2_0.9.3.1
## [11] limma_3.18.7 categoryComparePaper_1.6.15
## [13] AnnotationDbi_1.24.0 RSQLite_0.11.4
## [15] DBI_0.2-7 Biobase_2.22.0
## [17] BiocGenerics_0.8.0 roxygen2_2.2.2
## [19] digest_0.6.4
##
## loaded via a namespace (and not attached):
## [1] annotate_1.40.0 AnnotationForge_1.4.4 brew_1.0-6
## [4] Category_2.28.0 colorspace_1.2-4 dichromat_2.0-0
## [7] evaluate_0.5.1 formatR_0.10 genefilter_1.44.0
## [10] GOstats_2.28.0 graph_1.40.1 grid_3.0.1
## [13] GSEABase_1.24.0 gtable_0.1.2 httr_0.2
## [16] hwriter_1.3 IRanges_1.20.6 knitr_1.5
## [19] labeling_0.2 lattice_0.20-24 markdown_0.6.4
## [22] MASS_7.3-29 Matrix_1.1-1.1 memoise_0.1
## [25] munsell_0.4.2 plyr_1.8 proto_0.3-10
## [28] RBGL_1.38.0 RColorBrewer_1.0-5 RCurl_1.95-4.1
## [31] RCytoscape_1.12.0 reshape2_1.2.2 scales_0.2.3
## [34] splines_3.0.1 stats4_3.0.1 stringr_0.6.2
## [37] survival_2.37-4 tools_3.0.1 whisker_0.3-2
## [40] XML_3.98-1.1 XMLRPC_0.3-0 xtable_1.7-1