Note that running this analysis will require at least 4 GB of RAM unless objects are deleted when they are no longer required.

Hypothetical Example

To demonstrate the utility of the general categoryCompare approach, we will construct an artificial data set based on the Gene Ontology. We will attempt to demonstrate that differences in the set or list based comparisons are also dependent on the number of items annotated to a particular term.

Definitions

Set based: calculations performed independently on each set of features, and then the results are combined.

List based: calculations performed on the intersected list of two sets of features.

GO Terms

library(ggplot2)
useTheme <- theme_bw() + theme(legend.key = element_blank(), legend.title = element_text(size = 25), 
    axis.title = element_text(size = 25), axis.text = element_text(size = 15), 
    legend.text = element_text(size = 20), strip.text.y = element_text(size = 20), 
    strip.text.x = element_text(size = 20))


savePlot <- function(plotObj, plotFile, figPath = "/mlab/data/rmflight/Documents/projects/work/ccPaper/savedPlots") {
    if (file.exists(figPath)) {
        plotObj <- plotObj + ggtitle(NULL)
        png(filename = file.path(figPath, plotFile), width = 854, height = 628)
        print(plotObj)
        dev.off()
    }
}

We will make use of three sets of GO terms from H. sapiens. For each set, we want differing numbers of genes annotated to them.

Number of Genes Annotated Distribution

Let's first look at the distribution of number of genes annotated to the GO terms in biological process

library(org.Hs.eg.db)
library(GO.db)
library(ccPaper)
hsGO <- as.list(org.Hs.egGO2ALLEGS)

goOntology <- Ontology(names(hsGO))
goBP <- names(goOntology)[goOntology == "BP"]
hsGO <- hsGO[goBP]
hsGO <- lapply(hsGO, unique)
universeGenes <- unique(unlist(hsGO))

hsGO_count <- sapply(hsGO, length)
hsGO_count <- data.frame(size = hsGO_count)
p <- ggplot(hsGO_count, aes(x = size)) + geom_bar(binwidth = 10) + xlim(0, 2000) + 
    ylim(0, 500) + useTheme + ggtitle("Number of genes annotated to GO terms") + 
    xlab("Num. of Genes") + ylab("Count")
p

plot of chunk goCountDist


savePlot(p, "go2gene_distribution.png")

## pdf 
##   2

Set up groups

Based on the distribution in the previous figure, we will set up 3 different groups of GO terms, with a different number from each group:

low: 10-100, 50 terms
med: 250-500, 30 terms
hi: 500-1500, 20 terms

This gives us a total of 100 GO terms, with a good sample from each group. We will generate random samples that are defined by these limits, and use this set of GO terms going forward.

grpLow <- c(10, 100)
grpMed <- c(250, 500)
grpHi <- c(500, 1500)

set.seed(271113)  # for reproducibility
nGO <- c(50, 30, 20)
names(nGO) <- c("low", "med", "hi")

GO_low <- limitedRandomSample(hsGO_count, grpLow, nGO["low"])
GO_med <- limitedRandomSample(hsGO_count, grpMed, nGO["med"])
GO_hi <- limitedRandomSample(hsGO_count, grpHi, nGO["hi"])

useGO <- c(GO_low, GO_med, GO_hi)
useGOSizeClass <- c(rep("low", nGO["low"]), rep("med", nGO["med"]), rep("hi", 
    nGO["hi"]))
names(useGOSizeClass) <- useGO

Examine Gene-GO Distribution for Real Data

Before we go and generate our samples, we should actually process some real experimental data and examine the fraction of annotated genes in the experiment compared to the total number of genes annotated. We will use the ALL data set from Bioconductor and a paired lung cancer dataset available from GEO (available as part of this package).

ALL

The ALL data set has two classes of tumor, B and T-cell lymphoblastoma. We will do a simple differential expression between the two cell types.

library(ALL)
library(limma)
data(ALL)

grpALL <- factor(strtrim(pData(ALL)$BT, 1))
designALL <- model.matrix(~0 + grpALL)
colnames(designALL) <- c("B", "T")
fitALL <- lmFit(ALL, designALL)
contMatrixALL <- makeContrasts(BvT = B - T, levels = designALL)
fitALL2 <- contrasts.fit(fitALL, contMatrixALL)
fitALL2 <- eBayes(fitALL2)
deALL <- rownames(topTable(fitALL2, adjust = "BH", p.value = 0.05, number = Inf))

Now for all the GO terms that map to the probes in the ALL differentially expressed genes, how does the proportion of differentially expressed genes change as a function of the number of genes annotated to the GO term?

library(hgu95av2.db)

##

ALLgo <- as.list(hgu95av2GO2ALLPROBES)
ALL_sizeFrac <- trimGOFrac(ALLgo, deALL)
ggplot(ALL_sizeFrac, aes(x = size, y = frac)) + geom_point() + xlim(0, 1500) + 
    useTheme + ggtitle("DE ALL fraction as a function of size") + xlab("Number of Genes Annotated to Term") + 
    ylab("Fraction of Term Genes in DE Sample")

## Warning: Removed 192 rows containing missing values (geom_point).

plot of chunk goDistALL

Lung Cancer

The lung cancer data is a paired sample design from GEO.

data(lung)
sampleStat <- strsplit2(pData(lung)$title, " ")

notNP <- grep("^NP", sampleStat[, 2], invert = T)
lung <- lung[, notNP]
sampleStat <- sampleStat[notNP, ]

sampleID <- factor(sampleStat[, 2])
cancerID <- factor(sampleStat[, 1])

lungDesign <- model.matrix(~sampleID + cancerID)
fitLung <- lmFit(lung, lungDesign)
fitLung <- eBayes(fitLung)
deLung <- rownames(topTable(fitLung, coef = "cancerIDTumor", number = 2000, 
    adjust.method = "BY", p.value = 1e-05))

library(hgu133plus2.db)

##

lung2go <- as.list(hgu133plus2GO2ALLPROBES)
lung_sizeFrac <- trimGOFrac(lung2go, deLung)
lung_sizeFrac$genelist <- "lung"
ggplot(lung_sizeFrac, aes(x = size, y = frac)) + geom_point() + xlim(0, 1500) + 
    useTheme + ggtitle("DE Lung fraction as a function of size") + xlab("Num. of Genes") + 
    ylab("Fraction")

## Warning: Removed 443 rows containing missing values (geom_point).

plot of chunk goDistLung

Get Genes

Now, for all of these we need to generate two samples of genes from them to comprise our differentially expressed (DE) sets from the genome, representing two different expression experiments.

How do we select these two sets of genes?

define the maximum number of diff genes for the set
Randomly sort the GO terms
For each GO term in turn (i.e. iterating over the GO terms):
- take a random fraction of genes for that GO term (sampling from an exponentially decaying distribution of possibilites), adding the requisite number of genes
- If all the genes for a GO term get taken, remove that GO term from future consideration
Continue iterating through until the total number of genes is reached

What we expect from this is that in both datasets, the GO terms with lower numbers of genes annotated will have large fractions of their annotations present, while GO terms with more and more genes will have lower fractions, and there will be less overlap between the two datasets.

These expectations should be checked.

goList <- list(low = GO_low, med = GO_med, hi = GO_hi)
nGene <- 1000
sample1_org <- sampleTerms(useGO, hsGO, 1000, 4)[1:nGene]
sample2_org <- sampleTerms(useGO, hsGO, 1000, 4)[1:nGene]

# check our expectations
goFractions <- calcFraction(hsGO[useGO], list(sample1 = sample1_org, sample2 = sample2_org))
ggplot(goFractions, aes(x = size, y = frac, color = genelist)) + geom_point() + 
    useTheme + ggtitle("Sample GO fractions") + xlab("Num. of Genes") + ylab("Fraction")

plot of chunk generateSamples

Compare to experimental

Lets actually compare this to the distribution of fractions vs sizes from the experimental data sets. Because those used all the GO terms, we will redo them with all GO terms as well, not just those from our sample.

sampleGOFracs <- calcFraction(hsGO, list(sample1 = sample1_org, sample2 = sample2_org))
ALL_sizeFrac$genelist <- "all"
ALL_tmp <- rbind(ALL_sizeFrac, lung_sizeFrac, sampleGOFracs[, c("frac", "genelist", 
    "size")])
ggplot(ALL_tmp, aes(x = size, y = frac)) + geom_point() + xlim(0, 1500) + facet_grid(genelist ~ 
    .) + useTheme + ggtitle("Sample GO fractions for all") + xlab("Num. of Genes") + 
    ylab("Fraction")

## Warning: Removed 192 rows containing missing values (geom_point).
## Warning: Removed 443 rows containing missing values (geom_point).
## Warning: Removed 125 rows containing missing values (geom_point).
## Warning: Removed 125 rows containing missing values (geom_point).

plot of chunk compareFracs

keepItems <- c("goFractions", "sample1_org", "sample2_org", "hsGO", "useGO", 
    "useGOSizeClass", "GO_hi", "GO_low", "GO_med", "grpHi", "grpLow", "grpMed", 
    "minGO", "nGO", "nGene", "universeGenes", "savePlot", "useTheme", "fitALL2")
allItems <- ls()
delItems <- allItems[!(allItems %in% keepItems)]
rm(list = delItems)

Do calculations

For each geneList, we will do the GO enrichment calculations, and then get the results. We also do the intersection of all the geneList's, and do the calculations again.

samples_noNoise <- list(sample1 = sample1_org, sample2 = sample2_org)
go_noNoise <- hyperGOMultiEnrichment(samples_noNoise, universe = universeGenes)

Now get the p-values out (-1 * log10(pvalue)). For each of our test GO terms, we take the minimum p-value from the results of the set calculations, and also the values from the list calculations.

noNoise_results <- pvaluesMultiEnrich(c("sample1", "sample2"), useGO, go_noNoise)

noNoise_pvalues <- pvalueDiffSig(noNoise_results$values, pCutoff = 0.05, log = TRUE)

Finally, lets add in our values of the fraction, total genes, and the class (low, med, hi) so we can do some fancy plotting stuff with it.

noNoise_pvalues$sizeClass <- useGOSizeClass
noNoise_pvalues$size <- goFractions[1:length(useGOSizeClass), 4]
noNoise_pvalues$frac <- goFractions$frac[1:length(useGOSizeClass)]

Now we will plot the difference in log-p-values (where we have set - list), so that positive values imply that set had lower p-values than list, based on the fraction of genes annotated.

noNoise_pvalues$sizeClass <- factor(noNoise_pvalues$sizeClass, levels = c("low", 
    "med", "hi"), ordered = TRUE)
p <- ggplot(noNoise_pvalues, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ 
    sizeClass, scales = "free_x") + xlab("fraction") + useTheme + ggtitle("Sample - Combined Differences") + 
    xlab("Fraction") + ylab("Diff. log p-values")
p

plot of chunk noNoisePlotStuff

savePlot(p, "nonoise_sample_combineddiff.png")

## pdf 
##   2

ggplot(noNoise_pvalues, aes(x = sizeClass, y = diff)) + geom_boxplot() + geom_point() + 
    ggtitle("Sample - Combined Differences") + xlab("Size Class") + ylab("Diff. log p-values")

plot of chunk noNoisePlotStuff

We can see that it appears in general that set has better p-values than list, especially in the med and hi groups. Also, both methods get pretty much the same GO terms as significant.

How many of the GO terms have better p-values in set compared to list?

tapply(noNoise_pvalues$diff, noNoise_pvalues$sizeClass, function(x) {
    sum(x > 0)
})

## low med  hi 
##  23  16  14

What is the average difference among each group?

tapply(noNoise_pvalues$diff, noNoise_pvalues$sizeClass, mean)

##     low     med      hi 
## -0.8243 -0.1824  4.4454

So ideally, the mean of the full set of differences, combined with how many pass in each case, should adequately quantify how well each of the methods is doing.

Noise

In addition to the genes that are annotated to the GO terms of interest, we will also add in genes that have no relation to the GO terms under consideration, these would be considered noise genes. To make the model simple, we will have the same genes as noise in both samples.

nNoise <- 500
noiseGenes <- possibleNoise(hsGO, useGO)
noiseGenes <- sample(noiseGenes, nNoise)

# check that we did this right, the fraction should not change after adding
# noise genes
sample1 <- c(sample1_org, noiseGenes)
sample2 <- c(sample2_org, noiseGenes)

samplesNoise <- list(sample1 = sample1, sample2 = sample2)

goFracNoise <- calcFraction(hsGO[useGO], samplesNoise)
plot(goFracNoise$frac, goFractions$frac)

plot of chunk noiseGenes

go_noise <- hyperGOMultiEnrichment(samplesNoise, universeGenes)

noise_results <- pvaluesMultiEnrich(c("sample1", "sample2"), useGO, go_noise)

noise_pvalues <- pvalueDiffSig(noise_results$values, pCutoff = 0.05, log = TRUE)

noise_pvalues$sizeClass <- useGOSizeClass
noise_pvalues$size <- goFracNoise[1:length(useGOSizeClass), 4]
noise_pvalues$frac <- goFracNoise$frac[1:length(useGOSizeClass)]

noise_pvalues$sizeClass <- factor(noise_pvalues$sizeClass, c("low", "med", "hi"), 
    ordered = TRUE)
ggplot(noise_pvalues, aes(x = frac, y = diff, color = sigState)) + geom_point() + 
    facet_grid(. ~ sizeClass, scales = "free_x") + useTheme + ggtitle("Sample - Combined with Noise") + 
    xlab("Fraction") + ylab("Diff. log p-values")

plot of chunk plotSummarizeNoise

tapply(noise_pvalues$diff, noise_pvalues$sizeClass, function(x) {
    sum(x > 0)
})

## low med  hi 
##  32  24  18

tapply(noise_pvalues$diff, noise_pvalues$sizeClass, mean)

##    low    med     hi 
## 0.4028 4.0175 7.3028

Although using gene lists that were perfect (i.e. no noise) everything in the list method was pretty much still significant, on average sets gave much better p-values. However, as we add noise to the system in the form of genes that are not annotated to our test GO terms, more terms are significant only in the sets method (i.e. show up in both sets).

Difference of Differences

Is there any difference in the difference of p-values between noise vs noNoise (comparing set vs list)?

diff_pvalues <- noise_pvalues
diff_pvalues$diff <- noise_pvalues$diff - noNoise_pvalues$diff
ggplot(diff_pvalues, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ 
    sizeClass, scales = "free_x") + ggtitle("difference of differences noise vs noNoise") + 
    useTheme + xlab("Fraction") + ylab("Diff. noise / noNoise")

plot of chunk diffDiffs

tapply(diff_pvalues$diff, diff_pvalues$sizeClass, mean)

##   low   med    hi 
## 1.227 4.200 2.857

From this plot, it appears that not only do the set values return lower p-values, but return lower p-values when noise is added overall.

Check Consistency

To make sure that the results are consistent, we will do several iterations of the above calculations.

For this set of GO terms, do the above calculations 100 times
Generate 100 different sets of GO terms, and run calculations again.

Gene Sample Level Consistency

nTimes <- 100
nGene <- 1000

testSamples <- lapply(seq(1, nTimes), function(x) {
    list(s1 = sampleTerms(useGO, hsGO, 1000, 4)[1:nGene], s2 = sampleTerms(useGO, 
        hsGO, 1000, 4)[1:nGene], noise = sample(noiseGenes, nNoise))
})

save(useGO, testSamples, universeGenes, useGOSizeClass, file = "inst/data/100GeneSamples.RData")

data("100GeneSamples")
library(snowfall)
# load('inst/data/100GeneSamples.RData')
testFun <- function(x) {
    nnList <- x[c("s1", "s2")]
    nnRes <- hyperGOMultiEnrichment(nnList, universe = universeGenes)
    nn_res <- pvaluesMultiEnrich(c("s1", "s2"), useGO, nnRes)
    nnP <- pvalueDiffSig(nn_res$values, pCutoff = 0.05, log = TRUE)
    nnP$class <- useGOSizeClass

    noiList <- list(s1 = c(x$s1, x$noise), s2 = c(x$s2, x$noise))
    noiRes <- hyperGOMultiEnrichment(noiList, universe = universeGenes)
    noi_res <- pvaluesMultiEnrich(c("s1", "s2"), useGO, noiRes)
    noiP <- pvalueDiffSig(noi_res$values, pCutoff = 0.05, log = TRUE)
    noiP$class <- useGOSizeClass

    return(list(clean = nnP, noise = noiP))
}
sfInit(parallel = TRUE, cpus = 10)
sfExport("useGOSizeClass", "universeGenes", "useGO")
sfLibrary(ccPaper)
sfLibrary(GO.db)
sfLibrary(org.Hs.eg.db)
testRes <- sfLapply(testSamples, testFun)
sfStop()
save(testRes, file = "inst/data/100GeneResults.RData")

Errors from 100 Samples of the initial set of GO terms

For each of the GO terms, we want to calculate the mean, standard deviation, and then a 95% confidence interval to represent graphically.

data("100GeneResults")
cleanRes <- lapply(testRes, function(x) {
    x$clean
})
testStats_clean <- sameGOStats(cleanRes)
testStats_clean$class <- useGOSizeClass
testStats_clean$frac <- goFractions$frac[1:length(useGOSizeClass)]
testStats_clean$class <- factor(testStats_clean$class, levels = c("low", "med", 
    "hi"), ordered = TRUE)
p <- ggplot(testStats_clean, aes(x = frac, y = mean, ymax = max, ymin = min)) + 
    geom_point() + geom_errorbar() + facet_grid(. ~ class, scales = "free_x") + 
    useTheme + ggtitle("Sample - List Differences, 100 Gene Samples") + xlab("Fraction") + 
    ylab("Mean diff. log p-values")
p

plot of chunk noNoiseStats

savePlot(p, "genesamples100_sampleListDiffs.png")

## pdf 
##   2

noiseRes <- lapply(testRes, function(x) {
    x$noise
})
testStats_noise <- sameGOStats(noiseRes)
testStats_noise$class <- useGOSizeClass
testStats_noise$frac <- goFracNoise$frac[1:length(useGOSizeClass)]
testStats_noise$class <- factor(testStats_noise$class, levels = c("low", "med", 
    "hi"), ordered = TRUE)
ggplot(testStats_noise, aes(x = frac, y = mean, ymax = max, ymin = min)) + geom_point() + 
    geom_errorbar() + facet_grid(. ~ class, scales = "free_x") + ggtitle("Sample - List Differences, 100 Gene Samples, with noise") + 
    xlab("Fraction") + ylab("Mean diff. log p-values") + useTheme

plot of chunk noiseStats

These show that if there is a difference in the p-values from set and list, they are real.

What about differences between noise and clean?

noiseCleanDiff <- diffGOStats(testRes)
noiseCleanDiff$class <- useGOSizeClass
noiseCleanDiff$frac <- goFracNoise$frac[1:length(useGOSizeClass)]
noiseCleanDiff$class <- factor(noiseCleanDiff$class, levels = c("low", "med", 
    "hi"), ordered = TRUE)
ggplot(noiseCleanDiff, aes(x = frac, y = mean, ymax = max, ymin = min)) + geom_point() + 
    geom_errorbar() + facet_grid(. ~ class, scales = "free_x") + ggtitle("Difference of Differences, 100 Gene Samples") + 
    useTheme + xlab("Fraction") + ylab("Diff noise / noNoise")

plot of chunk 100GeneNoiseClean

Here we see that the noisy case also tends to generate higher differences in p-values between set and list than the clean case.

Caveats: Currently the fraction of genes annotated to GO terms is assumed to be the same for each sample. This may not actually be the case, and ideally this should be repeated with independent fraction estimates for each sampling.

GO Samples

OK, the error bars are not too bad. How about sampling multiple sets of GO terms??

goLimits <- list(low = c(10, 100), med = c(250, 500), hi = c(500, 1500))

nSample <- 100

sfInit(parallel = TRUE, cpus = 10)
sfLibrary(ccPaper)
sfExport("hsGO", "nGO", "goLimits")

testGOSample <- sfLapply(seq(1, nSample), function(x) {
    t1 <- goSample(hsGO, nGO, goLimits, nSample = 2, nGene = 1000, nNoise = 500)
    return(t1)
})

sfStop()
save(testGOSample, file = "inst/data/100GOSamples.RData")

Running those samples:

data("100GOSamples")
runGOSamples <- function(x) {
    cleanSample <- x$geneSample
    cleanRes <- hyperGOMultiEnrichment(cleanSample, universe = universeGenes)

    cleanP <- pvaluesMultiEnrich(names(cleanSample), x$goData$id, cleanRes)
    cleanP_dif <- pvalueDiffSig(cleanP$values, pCutoff = 0.05, log = TRUE)
    cleanP_dif <- cbind(cleanP_dif, x$goData)

    noiseSample <- lapply(names(cleanSample), function(inSample) {
        c(x$geneSample[[inSample]], x$noiseSample[[inSample]])
    })
    names(noiseSample) <- names(cleanSample)
    noiseRes <- hyperGOMultiEnrichment(noiseSample, universe = universeGenes)
    noiseP <- pvaluesMultiEnrich(names(cleanSample), x$goData$id, noiseRes)
    noiseP_dif <- pvalueDiffSig(noiseP$values, pCutoff = 0.05, log = TRUE)
    noiseP_dif <- cbind(noiseP_dif, x$goData)
    return(list(clean = cleanP_dif, noise = noiseP_dif))
}

library(snowfall)
sfInit(parallel = TRUE, cpus = 10)
sfLibrary(ccPaper)
sfLibrary(org.Hs.eg.db)
sfLibrary(GO.db)
sfExport("universeGenes")

testGORes <- sfLapply(testGOSample, runGOSamples)
sfStop()
save(testGORes, file = "inst/data/100GOResults.RData")

data("100GOResults")

grab_noNoise <- function(x) {
    x$clean[, c("diff", "class", "frac")]
}

go_noNoise <- lapply(testGORes, grab_noNoise)
go_noNoise <- do.call(rbind, go_noNoise)
go_noNoise$class <- factor(go_noNoise$class, levels = c("low", "med", "hi"), 
    ordered = TRUE)
go_noNoiseT <- go_noNoise[(!is.nan(go_noNoise$diff)), ]
p <- ggplot(go_noNoiseT, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ 
    class, scales = "free_x") + useTheme + stat_density2d(aes(fill = ..level..), 
    geom = "polygon") + ggtitle("Sample - List Differences, 100 GO Samples, noNoise") + 
    xlab("Fraction") + ylab("Diff. log p-values") + labs(fill = "density")
p

## Warning: Removed 3 rows containing non-finite values (stat_density2d).
## Warning: Removed 2 rows containing non-finite values (stat_density2d).
## Warning: Removed 1 rows containing non-finite values (stat_density2d).

plot of chunk lookDiffs

savePlot(p, "gosamples100_samplelistDiff.png")

## Warning: Removed 3 rows containing non-finite values (stat_density2d).
## Warning: Removed 2 rows containing non-finite values (stat_density2d).
## Warning: Removed 1 rows containing non-finite values (stat_density2d).

## pdf 
##   2


grab_noise <- function(x) {
    x$noise[, c("diff", "class", "frac")]
}
go_noise <- lapply(testGORes, grab_noise)
go_noise <- do.call(rbind, go_noise)
go_noise$class <- factor(go_noise$class, levels = c("low", "med", "hi"), ordered = TRUE)
ggplot(go_noise, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class, 
    scales = "free_x") + ggtitle("Sample - List Differences, 100 GO Samples, Noise") + 
    xlab("Fraction") + ylab("Diff. log p-values") + useTheme

plot of chunk lookDiffs


grab_noiseDiff <- function(x) {
    outRes <- x$clean[, c("diff", "class", "frac")]
    outRes$diff <- x$noise$diff - x$clean$diff
    outRes
}
go_diff <- lapply(testGORes, grab_noiseDiff)
go_diff <- do.call(rbind, go_diff)
go_diff$class <- factor(go_diff$class, levels = c("low", "med", "hi"), ordered = TRUE)
ggplot(go_diff, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class, 
    scales = "free_x") + ggtitle("Difference of Differences, 100 GO Samples") + 
    xlab("Fraction") + ylab("Diff. log p-values") + useTheme

## Warning: Removed 2 rows containing missing values (geom_point).

plot of chunk lookDiffs

ggplot(go_diff, aes(x = frac, y = diff, fill = class)) + geom_boxplot(position = "identity", 
    alpha = 0.4) + ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") + 
    ylab("Diff. noise / noNoise") + useTheme

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

plot of chunk lookDiffs

How about we compare the distribution of noise and no-noise using violin plots?

go_noise$noise <- "noise"
go_noNoise$noise <- "none"

go_all <- rbind(go_noise, go_noNoise)
rownames(go_all) <- NULL
ggplot(go_all, aes(x = frac, y = diff, fill = noise)) + geom_violin(trim = TRUE, 
    alpha = 0.5, position = "identity") + facet_grid(. ~ class, scales = "free_x") + 
    ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") + 
    ylab("Diff. log p-values") + useTheme

## Warning: Removed 5 rows containing non-finite values (stat_ydensity).
## Warning: Removed 2 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).

plot of chunk compNoiseNoNoise

What if we aggregate things into bins first, based on their class and fraction?

go_diff$frac10 <- round(go_diff$frac * 10)/10
ggplot(go_diff, aes(x = factor(frac10), y = diff)) + geom_boxplot() + facet_grid(. ~ 
    class, scales = "free_x") + ggtitle("Difference of Differences, 100 GO Samples") + 
    xlab("Fraction") + ylab("Diff. log p-values") + useTheme

## Warning: Removed 5 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

plot of chunk aggregate


probPoint <- is.infinite(go_diff$diff) | is.na(go_diff$diff) | is.nan(go_diff$diff)
go_diff <- go_diff[!probPoint, ]
ggplot(go_diff, aes(x = frac, y = diff)) + stat_smooth() + facet_grid(. ~ class, 
    scales = "free_x") + ggtitle("Difference of Differences, 100 GO Samples") + 
    xlab("Fraction") + ylab("Diff. log p-values") + useTheme

## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.

plot of chunk aggregate

Although the improvement of noise over clean data dips below 0 at medium fractions, it does tend to improve rather significantly as the fraction of genes annotated to a GO term increases.

As a function of noisy and shared noise genes

For a given set of genes and GO terms, lets now keep adding more and more noise genes, and for the set amount of noise genes, go from 0% to 100% shared noise genes. And then see how things change. For this, we are going to use our initial set of GO terms (useGO), and a single original set of genes, simply sampling noise genes as we go along. Note that in this case noise genes are defined as genes that are not annotated to any of the GO terms that the DE genes are annotated to.

As in the previous examples, we will generate the samples first, and then run the calculations on them. This is because it is easier to test sample generation and enrichment separately.

noiseGenes <- possibleNoise(hsGO, useGO)
sizeNoise <- seq(0, 1000, 10)
fracShared <- seq(0, 1, 0.01)
noiseSamples <- sweepNoiseSample(noiseGenes, nSamples = 2, sizeNoise, fracShared)

nGene <- 1000
cleanSample <- list(s1 = sampleTerms(useGO, hsGO, nGene, 4)[1:nGene], s2 = sampleTerms(useGO, 
    hsGO, nGene, 4)[1:nGene])

save(noiseSamples, cleanSample, sizeNoise, fracShared, universeGenes, useGO, 
    useGOSizeClass, file = file.path(useDir, "inst/data/noiseSamples.RData"))

data(noiseSamples)

useNoise <- unlist(noiseSamples, recursive = FALSE)
sizeNoiseAll <- rep(sizeNoise, each = length(fracShared))
fracSharedAll <- rep(fracShared, length(sizeNoise))

runSweepNoise <- function(noiseGenes) {
    inNoise <- lapply(seq(1, length(cleanSample)), function(x) {
        c(cleanSample[[x]], noiseGenes[[x]])
    })
    names(inNoise) <- names(cleanSample)

    noiseRes <- hyperGOMultiEnrichment(inNoise, universe = universeGenes)
    noiseP <- pvaluesMultiEnrich(names(inNoise), useGO, noiseRes)
    noisePComp <- pvalueDiffSig(noiseP$values, pCutoff = 0.05, log = TRUE)
    noisePComp$class <- useGOSizeClass
    return(noisePComp)
}

library(snowfall)
sfInit(parallel = TRUE, cpus = 10)
sfLibrary(ccPaper)
sfLibrary(org.Hs.eg.db)
sfLibrary(GO.db)
sfExport("universeGenes", "useGO", "useGOSizeClass", "cleanSample")
noiseSweepRes <- sfLapply(useNoise, runSweepNoise)
sfStop()
save(noiseSweepRes, file.path(useDir, "inst/data/noiseSweepRes.RData"))

Examine the noise results.

data(noiseSamples)
data(noiseSweepRes)

noiseFrac <- data.frame(size = rep(sizeNoise, each = length(fracShared)), shared = rep(fracShared, 
    length(sizeNoise)))

outMedians <- lapply(noiseSweepRes, function(inRes) {
    tapply(inRes$diff, inRes$class, median)
})
outMedians <- do.call(rbind, outMedians)
outMedians <- cbind(outMedians, noiseFrac)

ggplot(outMedians, aes(x = size, y = hi, color = shared)) + geom_point() + useTheme + 
    ggtitle("Hi Median Difference") + xlab("Noise Added") + ylab("Median Diff.")

plot of chunk examineSweptNoise


ggplot(outMedians, aes(x = size, y = med, color = shared)) + geom_point() + 
    useTheme + ggtitle("Medium Median Difference") + xlab("Noise Added") + ylab("Median Diff.")

plot of chunk examineSweptNoise


ggplot(outMedians, aes(x = size, y = low, color = shared)) + geom_point() + 
    useTheme + ggtitle("Low Median Difference") + xlab("Noise Added") + ylab("Median Diff.")

plot of chunk examineSweptNoise

outMedians2 <- data.frame(diff = c(outMedians$low, outMedians$med, outMedians$hi), 
    class = c(rep("low", nrow(outMedians)), rep("med", nrow(outMedians)), rep("hi", 
        nrow(outMedians))), size = rep(outMedians$size, 3), shared = rep(outMedians$shared, 
        3))
outMedians2$class <- factor(outMedians2$class, c("low", "med", "hi"), ordered = TRUE)

p <- ggplot(outMedians2, aes(x = size, y = diff, color = shared)) + geom_point() + 
    useTheme + facet_grid(class ~ ., scales = "free_y") + ggtitle("Sample - Combined as a function of size and shared fraction")
p

plot of chunk examine2

savePlot(p, "samplecombined_functionSizeFraction.png")

## pdf 
##   2

GSEA

Another way to do the analysis is using a Gene Set Enrichment Analysis, or GSEA. In contrast to over-representation analysis, where we count if a group is more highly represented in a group vs the background; GSEA tests if a set of genes are more highly ranked than random sets of genes.

For a hypothetical example, we are going to use the geneSetTest from limma, because it allows us to test on a set of gene statistics themselves.

Data set up

The sets will be our same GO terms, and we will change the proportion of genes from each GO term that are enhanced above a base value. All genes will be assigned a value from a uniform distribution with a value of 0.5. For each GO term set, some genes will be selected (with an exponential decaying probability for more genes), and assigned p-values from a uniform distribution with a value of 0.01.

For the list version, the p-values of genes will be combined using Fishers method, see the function fishersMethod provided in this package.

Generate Data

We will use the t-values that limma has generated for the ALL data set, and just sample from those.

tALL <- topTable(fitALL2, number = Inf)
tALL <- tALL$t[(tALL$t < 10) & (tALL$t > -10)]
tValues <- runif(length(universeGenes), -6, 6)

gseaSamples <- list(s1 = sampleTerms(useGO, hsGO, 1000, 4), s2 = sampleTerms(useGO, 
    hsGO, 1000, 4))

gseaGOFracS1 <- calcFraction(hsGO[useGO], list(s1 = gseaSamples[[1]]))

gseaTValues <- sampletValueGenGSEA(gseaSamples, universeGenes, tValues)

Check that we did what we thought. If done right, all of the sample genes should be on one side of the a histogram.

tmpDat <- data.frame(x = gseaTValues[[1]], type = "all", stringsAsFactors = FALSE)
tmpDat$type[(universeGenes %in% gseaSamples[[1]])] <- "sample"
ggplot(tmpDat, aes(x = x, fill = type)) + geom_histogram(binwidth = 0.1, position = "identity") + 
    ggtitle("location of samples") + useTheme

plot of chunk checkValues


tmpComb <- data.frame(x = gseaTValues[["comb"]], type = "all", stringsAsFactors = FALSE)
tmpComb$type[(universeGenes %in% gseaSamples[[1]])] <- "sample"
ggplot(tmpComb, aes(x = x, fill = type)) + geom_histogram(binwidth = 0.1, position = "identity") + 
    ggtitle("location of samples") + useTheme

plot of chunk checkValues

Now lets actually do some calculations.

go2index <- symbols2indices(hsGO[useGO], universeGenes)
gseaSetResults <- lapply(gseaTValues, function(x) {
    mmGeneSetTest(go2index, x)
})

And figure out what is doing better.

cleanDiff <- gseaDiffs(gseaSetResults)
cleanDiff$sizeClass <- useGOSizeClass
cleanDiff$sizeClass <- factor(cleanDiff$sizeClass, c("low", "med", "hi"), ordered = TRUE)
cleanDiff$frac <- gseaGOFracS1$frac

ggplot(cleanDiff, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ sizeClass) + 
    ggtitle("Sample - List for GSEA, using GO") + useTheme + xlab("Fraction") + 
    ylab("Diff. log p-values")

plot of chunk gseaBetterGO

So this appears to be a different trend than for the hypergeometric case, in that a meta sample (achieved by averaging the t-statistics) actually does no worse, and in many cases much better than the worst p-value. This effect is less-pronounced as the number of genes in a class become larger, and as the fraction covered increases.

So what happens if we add some noisy genes into the top 1000? Stuff that isn't annotated at all to the GO terms of interest?

nNoise <- 2000
noiseGenes <- possibleNoise(hsGO, useGO)
noiseGenes <- sample(noiseGenes, nNoise)

gseaNoise <- list(s1 = c(gseaSamples[["s1"]], noiseGenes), s2 = c(gseaSamples[["s2"]], 
    noiseGenes))

gseaTValues_noise <- sampletValueGenGSEA(gseaNoise, universeGenes, tValues)

tmpDat <- data.frame(x = gseaTValues_noise[[1]], type = "all", stringsAsFactors = FALSE)
tmpDat$type[(universeGenes %in% gseaSamples[[1]])] <- "sample"
ggplot(tmpDat, aes(x = x, fill = type)) + geom_histogram(binwidth = 0.1, position = "identity", 
    alpha = 0.75) + ggtitle("Sample - List GSEA GO, adding noise genes") + useTheme

plot of chunk gseaNoise

gseaSetResults_noise <- lapply(gseaTValues_noise, function(x) {
    mmGeneSetTest(go2index, x)
})

noiseDiff <- gseaDiffs(gseaSetResults_noise)
noiseDiff$sizeClass <- useGOSizeClass
noiseDiff$sizeClass <- factor(noiseDiff$sizeClass, c("low", "med", "hi"), ordered = TRUE)
noiseDiff$frac <- gseaGOFracS1$frac

ggplot(noiseDiff, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ sizeClass) + 
    ggtitle("Sample - List, GSEA GO with Noise") + useTheme + xlab("Fraction") + 
    ylab("Diff. log p-values")

plot of chunk gseaBetter


diffDiff <- data.frame(diff = (noiseDiff$diff - cleanDiff$diff), sizeClass = noiseDiff$sizeClass, 
    frac = noiseDiff$frac)
ggplot(diffDiff, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ sizeClass) + 
    ggtitle("Difference of Differences, GSEA GO") + useTheme + xlab("Fraction") + 
    ylab("Diff. noise / noNoise")

plot of chunk gseaBetter

So, it appears that having a uniform distribution does not make things behave the way we might expect. What if it is more an issue of independence, that GO is a bad fit for GSEA type analysis because of the issue that many of the terms are heavily overlapped.

Independence

What if we force independence on our objects? i.e. each object has independent sets of genes, and treat them appropriately? i.e. to count as “diff” we will put genes (features) into the top 50% of ranked t-values (therefore reducing coverage means pushing annotated genes down into the bottom 50% of ranked values), and adding noise means to push un-annotated (i.e. random) above in rank.

Lets try this for a single example.

nGene <- 10000
indepTValues <- sort(runif(nGene, -6, 6))
names(indepTValues) <- as.character(seq(1, nGene))

gseaMap <- list(t1 = sample(seq(1, nGene/2), 100))

tmpDat <- data.frame(tVal = indepTValues, status = "all", stringsAsFactors = F)
tmpDat$status[gseaMap[["t1"]]] <- "sample"
ggplot(tmpDat, aes(x = tVal, fill = status)) + geom_histogram(binwidt = 0.1, 
    position = "identity") + ggtitle("Check independent distribution, 100%") + 
    useTheme + xlab("t-statistic")

plot of chunk independentGSEA


geneSetTest(gseaMap$t1, indepTValues, alternative = "down", ranks.only = FALSE)

## [1] 1e-04


gseaMap80 <- list(t1 = c(sample(seq(1, nGene/2), 80), sample(seq((nGene/2) + 
    1, nGene), 20)))

geneSetTest(gseaMap80$t1, indepTValues, alternative = "down", ranks.only = FALSE)

## [1] 1e-04


tmpDat <- data.frame(tVal = indepTValues, status = "all", stringsAsFactors = F)
tmpDat$status[gseaMap80[["t1"]]] <- "sample"
ggplot(tmpDat, aes(x = tVal, fill = status)) + geom_histogram(binwidt = 0.1, 
    position = "identity") + ggtitle("check independent distribution, 80%") + 
    useTheme + xlab("t-statistic")

plot of chunk independentGSEA



gseaMap50 <- list(t1 = c(sample(seq(1, nGene/2), 50), sample(seq((nGene/2) + 
    1, nGene), 50)))

geneSetTest(gseaMap50$t1, indepTValues, alternative = "down", ranks.only = FALSE)

## [1] 0.4535


gseaMapUnif <- list(t1 = sample(nGene, 100))
geneSetTest(gseaMapUnif$t1, indepTValues, alternative = "down", ranks.only = FALSE)

## [1] 0.9735

OK, so at least we verify that as the percentage of genes in the top 50% decreases, then the p-values decrease as well, as we expect they should. What about as we increase the number of genes above above the set, while keeping the number in the top 50 the same?

gseaAbove100 <- list(t1 = sample(seq(100, nGene/2), 100))
geneSetTest(gseaAbove100$t1, indepTValues, alternative = "down", ranks.only = FALSE)

## [1] 1e-04


gseaAbove500 <- list(t1 = sample(seq(501, nGene/2), 100))
geneSetTest(gseaAbove500$t1, indepTValues, alternative = "down", ranks.only = FALSE)

## [1] 1e-04


gseaAbove1000 <- list(t1 = sample(seq(1001, nGene/2), 100))
geneSetTest(gseaAbove1000$t1, indepTValues, alternative = "down", ranks.only = FALSE)

## [1] 1e-04


gseaAbove2000 <- list(t1 = sample(seq(2001, nGene/2), 100))
geneSetTest(gseaAbove2000$t1, indepTValues, alternative = "down", ranks.only = FALSE)

## [1] 1e-04


gseaAbove4000 <- list(t1 = sample(seq(4001, nGene/2), 100))
geneSetTest(gseaAbove4000$t1, indepTValues, alternative = "down", ranks.only = FALSE)

## [1] 0.0525

OK, so it takes a rather large proportion above to change the p-value any amount.

So, what should we do. We need a function that will let us vary the size and proportion of genes that are in the bottom 50% of values.

`?`(gseaSizeRange)
`?`(gseaProportion)

Lets verify this function is doing what we think it is.

testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, -6, 6)
useSig <- runif(50, 0.1, 1)
testStats <- gseaProportion(testSize, useSig, useStatistics)

fullIndex <- seq(1, length(useStatistics))
useCut <- 5000

realRanks <- lapply(testStats, function(inStat) {

    apply(inStat$stats, 2, function(inCol) {
        outRank <- rank(inCol)[inStat$index]
        sum(outRank < useCut)/length(inStat$index)
    })

})

realRanks <- do.call(rbind, realRanks)
plot(realRanks[, 1], useSig)

plot of chunk checkGSEAProp

plot(realRanks[, 2], useSig)

plot of chunk checkGSEAProp

plot(realRanks[, 3], useSig)

plot of chunk checkGSEAProp

OK, it looks like it is working. That's good.

testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, -6, 6)
useSig <- runif(50, 0.1, 1)
testStats <- gseaProportion(testSize, useSig, useStatistics)

outP <- lapply(testStats, function(inStat) {
    tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
        geneSetTest(inStat$index, inStat$stats[, useColumn], "down", ranks.only = FALSE)
    })
    tmpP <- do.call(cbind, tmpP)
    colnames(tmpP) <- colnames(inStat$stats)
    return(tmpP)
})

outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)

tmpP <- data.frame(set = apply(outP2[, c("s1", "s2")], 1, min), list = outP2[, 
    "comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- useSig
p <- ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + useTheme + ggtitle("Sample - Combined average t-statistics") + 
    xlab("Fraction") + ylab("Diff. log p-values")
p

plot of chunk testFunctions

savePlot(p, "samplecombined_tstatistics.png")

## pdf 
##   2

Yeah, that doesn't seem to really work.

What if GSEA is dependent on the shared proportion that is in the up or down state? i.e. if we allow the proportion “down” to vary between the two samples, and plot the difference in set and list values based on how close the value of the proportion in the top is??

`?`(gseaProportionVary)

Check that we generate different significant proportions for independent samples, and what is the combined.

testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, -6, 6)
useSig <- list(s1 = runif(50, 0.1, 1), s2 = runif(50, 0.1, 1))
testStats <- gseaProportionVary(testSize, useSig, useStatistics)

fullIndex <- seq(1, length(useStatistics))
useCut <- 5000

realRanks <- lapply(testStats, function(inStat) {

    apply(inStat$stats, 2, function(inCol) {
        outRank <- rank(inCol)[inStat$index]
        sum(outRank < useCut)/length(inStat$index)
    })

})

realRanks <- do.call(rbind, realRanks)
plot(realRanks[, 1], useSig[[1]])

plot of chunk checkgseaProportionsVary

plot(realRanks[, 2], useSig[[2]])

plot of chunk checkgseaProportionsVary

plot(realRanks[, 3], useSig[[1]])

plot of chunk checkgseaProportionsVary

plot(realRanks[, 3], useSig[[2]])

plot of chunk checkgseaProportionsVary

OK, this seems to be working still.

Now, lets try this on some fake data.

testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, -6, 6)
useSig <- list(s1 = runif(50, 0.1, 1), s2 = runif(50, 0.1, 1))
testStats <- gseaProportionVary(testSize, useSig, useStatistics)

outP <- lapply(testStats, function(inStat) {
    tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
        geneSetTest(inStat$index, inStat$stats[, useColumn], "down", ranks.only = FALSE)
    })
    tmpP <- do.call(cbind, tmpP)
    colnames(tmpP) <- colnames(inStat$stats)
    return(tmpP)
})

outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)

tmpP <- data.frame(set = apply(outP2[, c("s1", "s2")], 1, min), list = outP2[, 
    "comb"])
tmpP$diff <- tmpP$set - tmpP$list

realRanks <- lapply(testStats, function(inStat) {

    apply(inStat$stats, 2, function(inCol) {
        outRank <- rank(inCol)[inStat$index]
        sum(outRank < useCut)/length(inStat$index)
    })

})

realRanks <- do.call(rbind, realRanks)

tmpP$frac <- realRanks[, 3]
ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + ggtitle("Sample - Combined vary fraction") + 
    useTheme + xlab("Fraction") + ylab("Diff. log p-values")

plot of chunk gseaProportions2Data

So, none of this has really worked, at least not in the way that was hoped.

GSEA using p-values instead of t-statistics

For our hypothetical comparison, we need to think of how one would combine two studies. In this case, one might combine two studies by combining the p-values. Therefore, we will use ranked p-values as our metric in the geneSetTest.

For this to work with geneSetTest, we need to make the range 0, 0.5, 1 conform to -1, 0, 1, because geneSetTest works nicer with signed values. We will use a linear regression to transform the p-values to a signed range.

regData <- data.frame(y = c(-1, 0, 1), x = c(0, 0.5, 1))
regModel <- lm(y ~ x, regData)

testData <- runif(10000, 0, 1)
outData <- regModel$coefficients[1] + regModel$coefficients[2] * testData
hist(outData, 100)

plot of chunk generateRegression

So our transformation works. Lets make sure it will work like the t-statistic.

nGene <- 10000
indepTValues <- sort(runif(nGene, 0, 1))
names(indepTValues) <- as.character(seq(1, nGene))

gseaMap <- list(t1 = sample(seq(1, nGene/4), 100))

tmpDat <- data.frame(tVal = indepTValues, status = "all", stringsAsFactors = F)
tmpDat$status[gseaMap[["t1"]]] <- "sample"
ggplot(tmpDat, aes(x = tVal, fill = status)) + geom_histogram(binwidt = 0.01, 
    position = "identity") + ggtitle("check p-values, 100%") + useTheme + xlab("P-values")

plot of chunk pvalRanks


geneSetTest(gseaMap$t1, p2signed(regModel, indepTValues), alternative = "down", 
    ranks.only = FALSE)

## [1] 1e-04


gseaMap80 <- list(t1 = c(sample(seq(1, nGene/2), 80), sample(seq((nGene/2) + 
    1, nGene), 20)))

geneSetTest(gseaMap80$t1, p2signed(regModel, indepTValues), alternative = "down", 
    ranks.only = FALSE)

## [1] 1e-04


tmpDat <- data.frame(tVal = indepTValues, status = "all", stringsAsFactors = F)
tmpDat$status[gseaMap80[["t1"]]] <- "sample"
ggplot(tmpDat, aes(x = tVal, fill = status)) + geom_histogram(binwidt = 0.01, 
    position = "identity") + ggtitle("check p-values, 80%") + useTheme + xlab("P-values")

plot of chunk pvalRanks



gseaMap50 <- list(t1 = c(sample(seq(1, nGene/2), 50), sample(seq((nGene/2) + 
    1, nGene), 50)))

geneSetTest(gseaMap50$t1, p2signed(regModel, indepTValues), alternative = "down", 
    ranks.only = FALSE)

## [1] 0.6527

Now, lets maintain proportions between samples as we did before, but for the list method, combine the p-values using Fisher's method.

`?`(gseaProportionFisher)

testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, 0, 1)
useSig <- runif(50, 0.1, 1)
testStats <- gseaProportionFisher(testSize, useSig, useStatistics)

fullIndex <- seq(1, length(useStatistics))
useCut <- 5000

realRanks <- lapply(testStats, function(inStat) {

    apply(inStat$stats, 2, function(inCol) {
        outRank <- rank(inCol)[inStat$index]
        sum(outRank < useCut)/length(inStat$index)
    })

})

realRanks <- do.call(rbind, realRanks)
plot(realRanks[, 1], useSig)

plot of chunk checkGSEAPropFisher

plot(realRanks[, 2], useSig)

plot of chunk checkGSEAPropFisher

plot(realRanks[, 3], useSig)

plot of chunk checkGSEAPropFisher

testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, 0, 1)
useSig <- runif(50, 0.1, 1)
testStats <- gseaProportionFisher(testSize, useSig, useStatistics)

outP <- lapply(testStats, function(inStat) {
    tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
        geneSetTest(inStat$index, p2signed(regModel, inStat$stats[, useColumn]), 
            "down", ranks.only = FALSE)
    })
    tmpP <- do.call(cbind, tmpP)
    colnames(tmpP) <- colnames(inStat$stats)
    return(tmpP)
})

outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)

tmpP <- data.frame(set = apply(outP2[, c("s1", "s2")], 1, min), list = outP2[, 
    "comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- useSig
ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + ggtitle("Sample - Combined Fishers") + 
    useTheme + xlab("Fraction") + ylab("Diff. log p-values")

plot of chunk testFishers


# try also using ranks only
outPRank <- lapply(testStats, function(inStat) {
    tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
        geneSetTest(inStat$index, inStat$stats, ranks.only = TRUE)
    })
    tmpP <- do.call(cbind, tmpP)
    colnames(tmpP) <- colnames(inStat$stats)
    return(tmpP)
})

outP2Rank <- do.call(rbind, outPRank)
outP2Rank <- -1 * log(outP2Rank)

tmpP <- data.frame(set = apply(outP2Rank[, c("s1", "s2")], 1, min), list = outP2Rank[, 
    "comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- useSig
ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + ggtitle("Sample - Combined Fishers Ranks") + 
    useTheme + xlab("Fraction") + ylab("Diff. log p-values")

plot of chunk testFishers

Maximum p-value

Finally, we note that Shen & Tseng from 2010 combined independent experiments by taking the maximum p-value between samples.

`?`(gseaProportionMax)

testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, 0, 1)
useSig <- runif(50, 0.1, 1)
testStats <- gseaProportionMax(testSize, useSig, useStatistics)

fullIndex <- seq(1, length(useStatistics))
useCut <- 5000

realRanks <- lapply(testStats, function(inStat) {

    apply(inStat$stats, 2, function(inCol) {
        outRank <- rank(inCol)[inStat$index]
        sum(outRank < useCut)/length(inStat$index)
    })

})

realRanks <- do.call(rbind, realRanks)
plot(realRanks[, 1], useSig)

plot of chunk checkGSEAMax

plot(realRanks[, 2], useSig)

plot of chunk checkGSEAMax

plot(realRanks[, 3], useSig)

plot of chunk checkGSEAMax

testSize <- gseaSizeRange(c(500, 500), 50)
useStatistics <- runif(10000, 0, 1)
useSig <- runif(50, 0.1, 1)
testStats <- gseaProportionMax(testSize, useSig, useStatistics)

outP <- lapply(testStats, function(inStat) {
    tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
        geneSetTest(inStat$index, p2signed(regModel, inStat$stats[, useColumn]), 
            "down", ranks.only = FALSE)
    })
    tmpP <- do.call(cbind, tmpP)
    colnames(tmpP) <- colnames(inStat$stats)
    return(tmpP)
})

outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)

tmpP <- data.frame(set = apply(outP2[, c("s1", "s2")], 1, min), list = outP2[, 
    "comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- useSig
ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + ggtitle("Sample - Combined Max-P") + 
    useTheme + xlab("Fraction") + ylab("Diff. log p-values")

plot of chunk runGSEAMax

And wow! It appears to work!

Lets test using different size ranges and the proportions.

gseaLo <- gseaSizeRange(c(20, 250), 50)
gseaMed <- gseaSizeRange(c(250, 500), 30)
gseaHi <- gseaSizeRange(c(500, 1000), 20)

gseaSizes <- c(gseaLo, gseaMed, gseaHi)
gseaSizeClass <- c(rep("low", 50), rep("med", 30), rep("hi", 20))

gseaFrac <- runif(100, 0.3, 1)
gseaStatistics <- runif(10000, 0, 1)

testStats <- gseaProportionMax(gseaSizes, gseaFrac, gseaStatistics)

outP <- lapply(testStats, function(inStat) {
    tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
        geneSetTest(inStat$index, p2signed(regModel, inStat$stats[, useColumn]), 
            "down", ranks.only = FALSE)
    })
    tmpP <- do.call(cbind, tmpP)
    colnames(tmpP) <- colnames(inStat$stats)
    return(tmpP)
})

outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)

tmpP <- data.frame(set = apply(outP2[, c("s1", "s2")], 1, min), list = outP2[, 
    "comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- gseaFrac
tmpP$class <- gseaSizeClass
tmpP$class <- factor(tmpP$class, c("low", "med", "hi"), ordered = TRUE)
ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class) + 
    useTheme

plot of chunk varySizesAndProps


p <- ggplot(tmpP, aes(x = frac, y = diff)) + geom_point() + useTheme + ggtitle("Sample - Combined Max-P") + 
    xlab("Fraction") + ylab("Diff. log p-values")
p

plot of chunk varySizesAndProps

savePlot(p, "samplecombined_maxp.png")

## pdf 
##   2

So, this is very important. The method of combining p-values for the list method is extremely important.

Does this hold using ranks only for the calculation?

outPRank <- lapply(testStats, function(inStat) {
    tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn) {
        geneSetTest(inStat$index, inStat$stats[, useColumn], ranks.only = TRUE)
    })
    tmpP <- do.call(cbind, tmpP)
    colnames(tmpP) <- colnames(inStat$stats)
    return(tmpP)
})

outP2Rank <- do.call(rbind, outPRank)
outP2Rank <- -1 * log(outP2Rank)

tmpPRank <- data.frame(set = apply(outP2Rank[, c("s1", "s2")], 1, min), list = outP2Rank[, 
    "comb"])
tmpPRank$diff <- tmpPRank$set - tmpPRank$list
tmpPRank$frac <- gseaFrac
tmpPRank$class <- gseaSizeClass
tmpPRank$class <- factor(tmpPRank$class, c("low", "med", "hi"), ordered = TRUE)
ggplot(tmpPRank, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class) + 
    ggtitle("Sample - Combined Max-P Ranks") + xlab("Fraction") + ylab("Diff. log p-values")

plot of chunk maxPRankOnly

No, it doesn't because the ranks.only=TRUE will only check for mixed values, not at one end or the other.

Date and System information

Sys.time()

## [1] "2014-02-19 10:31:06 EST"

sessionInfo()

## R version 3.0.1 (2013-05-16)
## Platform: x86_64-unknown-linux-gnu (64-bit)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=C                 LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] mgcv_1.7-27                 nlme_3.1-113               
##  [3] hgu133plus2.db_2.10.1       hgu95av2.db_2.10.1         
##  [5] ALL_1.4.16                  GO.db_2.10.1               
##  [7] org.Hs.eg.db_2.10.1         devtools_1.4.1             
##  [9] ccPaper_0.0.19              ggplot2_0.9.3.1            
## [11] limma_3.18.7                categoryComparePaper_1.6.15
## [13] AnnotationDbi_1.24.0        RSQLite_0.11.4             
## [15] DBI_0.2-7                   Biobase_2.22.0             
## [17] BiocGenerics_0.8.0          roxygen2_2.2.2             
## [19] digest_0.6.4               
## 
## loaded via a namespace (and not attached):
##  [1] annotate_1.40.0       AnnotationForge_1.4.4 brew_1.0-6           
##  [4] Category_2.28.0       colorspace_1.2-4      dichromat_2.0-0      
##  [7] evaluate_0.5.1        formatR_0.10          genefilter_1.44.0    
## [10] GOstats_2.28.0        graph_1.40.1          grid_3.0.1           
## [13] GSEABase_1.24.0       gtable_0.1.2          httr_0.2             
## [16] hwriter_1.3           IRanges_1.20.6        knitr_1.5            
## [19] labeling_0.2          lattice_0.20-24       markdown_0.6.4       
## [22] MASS_7.3-29           Matrix_1.1-1.1        memoise_0.1          
## [25] munsell_0.4.2         plyr_1.8              proto_0.3-10         
## [28] RBGL_1.38.0           RColorBrewer_1.0-5    RCurl_1.95-4.1       
## [31] RCytoscape_1.12.0     reshape2_1.2.2        scales_0.2.3         
## [34] splines_3.0.1         stats4_3.0.1          stringr_0.6.2        
## [37] survival_2.37-4       tools_3.0.1           whisker_0.3-2        
## [40] XML_3.98-1.1          XMLRPC_0.3-0          xtable_1.7-1