The sim2 data set consists of synthetic human, paired-end, 100bp reads from two conditions, each with three samples. The basis for the simulation is the human chromosome 1 from Ensembl GRCh37.71. We introduce differential gene expression, differential transcript expression and differential isoform usage.
basedir <- "/home/charlotte/gene_vs_tx_quantification"
suppressPackageStartupMessages(library(limma))
suppressPackageStartupMessages(library(edgeR))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(ggplot2,
lib.loc = "/home/charlotte/R/x86_64-pc-linux-gnu-library/3.2"))
suppressPackageStartupMessages(library(Hmisc))
suppressPackageStartupMessages(library(iCOBRA,
lib.loc = "/home/charlotte/R/x86_64-pc-linux-gnu-library/3.2"))
suppressPackageStartupMessages(library(BiocParallel))
suppressPackageStartupMessages(library(DESeq2, lib.loc = "/home/charlotte/R/x86_64-pc-linux-gnu-library/3.2"))
## Warning: replacing previous import by 'ggplot2::Position' when loading 'DESeq2'
suppressPackageStartupMessages(library(DEXSeq))
suppressPackageStartupMessages(library(biomaRt))
sumna <- function(w) {
if (all(is.na(w))) NA
else sum(w, na.rm = TRUE)
}
muted <- c("#DC050C","#E8601C","#7BAFDE","#1965B0","#B17BA6",
"#882E72","#F1932D","#F6C141","#F7EE55","#4EB265",
"#90C987","#CAEDAB","#777777")
Since the simulation is based on chromosome 1 only, we use the following code to subset annotations (downloaded from Ensembl, GRCh37.71) to only this region, as well as to prepare index files for RSEM and STAR. We also generate an incomplete reference catalog, by removing 20% (randomly selected) of the transcripts from the annotation.
## Extract only transcripts from chromosome 1 from the transcript fasta file
## Also, generate a transcript-to-gene mapping
library(Biostrings)
cdna <- readDNAStringSet("wholegenome/Homo_sapiens.GRCh37.71.cdna.all.fa",
format = "fasta")
cdna.chr1 <- cdna[grep("GRCh37:1:", names(cdna))]
writeXStringSet(cdna.chr1, filepath = "chr1/Homo_sapiens.GRCh37.71.cdna.chr1.fa",
format = "fasta")
tx.to.gene <- t(sapply(names(cdna), function(s) {
s <- strsplit(s, " ")[[1]]
tx <- s[grep("^ENST", s)]
gn <- gsub("gene:", "", s[grep("gene:ENSG", s)])
c(gn, tx)
}))
rownames(tx.to.gene) <- NULL
write.table(tx.to.gene, file = "wholegenome/tx_to_gene.txt",
col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
tx.to.gene.chr1 <- t(sapply(names(cdna.chr1), function(s) {
s <- strsplit(s, " ")[[1]]
tx <- s[grep("^ENST", s)]
gn <- gsub("gene:", "", s[grep("gene:ENSG", s)])
c(gn, tx)
}))
rownames(tx.to.gene.chr1) <- NULL
write.table(tx.to.gene.chr1, file = "chr1/tx_to_gene_chr1.txt",
col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
## Subset GTF file
library(rtracklayer)
gtff <- import("wholegenome/Homo_sapiens.GRCh37.71.gtf")
gtff2 <- subset(gtff, seqnames == "1")
export(gtff2, "chr1/Homo_sapiens.GRCh37.71.chr1.gtf", format = "gtf")
## Determine which transcripts to exclude
library(Biostrings)
cdna <- readDNAStringSet("chr1/Homo_sapiens.GRCh37.71.cdna.chr1.fa")
tx <- sapply(strsplit(names(cdna), " "), .subset, 1)
set.seed(1)
keep_tx <- sample(tx, size = 0.8*length(tx))
excl_tx <- setdiff(tx, keep_tx)
## Generate new cDNA fasta
writeXStringSet(cdna[match(keep_tx, tx)],
file = "chr1_incomplete/Homo_sapiens.GRCh37.71.cdna.chr1_incomplete80.fa")
## Read and generate new GTF file
library(rtracklayer)
gtf <- import("chr1/Homo_sapiens.GRCh37.71.chr1.gtf")
gtf <- subset(gtf, transcript_id %in% keep_tx)
export(gtf, con = "chr1_incomplete/Homo_sapiens.GRCh37.71.chr1_incomplete80.gtf")
## Generate kallisto index
if (!file.exists("chr1_incomplete/kallisto_index/Homo_sapiens.GRCh37.71.cdna.chr1_incomplete80.idx")) {
cmd <- "kallisto index -i chr1_incomplete/kallisto_index/Homo_sapiens.GRCh37.71.cdna.chr1_incomplete80.idx chr1_incomplete/Homo_sapiens.GRCh37.71.cdna.chr1_incomplete80.fa"
cat(cmd, "\n")
system(cmd)
}
if (!file.exists("chr1_incomplete/STAR_index/SA")) {
cmd <- "STAR --runMode genomeGenerate --runThreadN 10 --genomeDir chr1_incomplete/STAR_index --genomeFastaFiles chr1/Homo_sapiens.GRCh37.71.dna.chromosome.1.fa --sjdbGTFfile chr1_incomplete/Homo_sapiens.GRCh37.71.chr1_incomplete80.gtf --sjdbOverhang 99"
message(cmd)
system(cmd)
}
RSEM=/home/Shared/data/alt_splicing_simulations/Simulation5_Charlotte/software/rsem-1.2.21
## Prepare the reference files for RSEM
$RSEM/rsem-prepare-reference --bowtie2 \
--transcript-to-gene-map chr1/tx_to_gene_chr1.txt \
chr1/Homo_sapiens.GRCh37.71.cdna.chr1.fa \
chr1/RSEM_index/Homo_sapiens.GRCh37.71.cdna.chr1
## Estimate the model file with RSEM
$RSEM/rsem-calculate-expression \
--paired-end --bowtie2 --seed 123 \
/home/charlotte/dte_simulation/exp_data/E_MTAB_1733/FASTQ/ERS326990_1.fastq.gz \
/home/charlotte/dte_simulation/exp_data/E_MTAB_1733/FASTQ/ERS326990_2.fastq.gz \
chr1/RSEM_index/Homo_sapiens.GRCh37.71.cdna.chr1 \
/home/charlotte/dte_simulation/exp_data/E_MTAB_1733/RSEM/ERS326990/chr1/ERS326990_chr1
STAR --runMode genomeGenerate --runThreadN 10 --genomeDir /home/Shared/data/alt_splicing_simulations/dte_simulation/annotations/chr1/STAR_index --genomeFastaFiles /home/Shared/data/alt_splicing_simulations/dte_simulation/annotations/chr1/Homo_sapiens.GRCh37.71.dna.chromosome.1.fa --sjdbGTFfile /home/Shared/data/alt_splicing_simulations/dte_simulation/annotations/chr1/Homo_sapiens.GRCh37.71.chr1.gtf --sjdbOverhang 99
Here, we reproduce the code used for generation of the simulated data, alignment, counting and isoform quantification.
library(dplyr)
library(Hmisc)
###############################################################################
## Preparation
###############################################################################
## Get sample to use as basis
s0 <- read.delim("../exp_data/E_MTAB_1733/kallisto/ERS326990/abundance.tsv",
header = TRUE, as.is = TRUE)
## Add gene information
tx_to_gene <- read.delim("../annotations/wholegenome/tx_to_gene.txt",
header = FALSE, as.is = TRUE)
colnames(tx_to_gene) <- c("gene", "transcript")
s0$gene <- tx_to_gene$gene[match(s0$target_id, tx_to_gene$transcript)]
## Calculate the "conversion factor" between TPM and counts. To get the
## count, take TPM * eff_length * conv_factor
conv_factor <- mean(s0$est_counts/s0$tpm/s0$eff_length, na.rm = TRUE)
## Calculate isoform percentages
s0 <- as.data.frame(s0 %>% group_by(gene) %>%
mutate(isopct = tpm/sum(tpm)) %>% ungroup())
s0$isopct[is.na(s0$isopct)] <- 0
## Generate collection of isoform percentages for genes with
## given number of isoforms
tmp <- split(s0, s0$gene)
isopcts <- lapply(tmp, function(x) x$isopct)
sums <- sapply(isopcts, sum)
isopcts <- isopcts[sums > 0]
L <- sapply(isopcts, length)
isopcts2 <- split(isopcts, L)
isopcts2 <- lapply(isopcts2, function(y) do.call(rbind, y))
## Summarise data per gene (expression, number of isoforms)
s0.gene <- as.data.frame(s0 %>% group_by(gene) %>%
summarise(tpm = sum(tpm),
nbr_isoforms = length(target_id)))
s0.gene$eligible <- 1
s0.gene$eligible[s0.gene$tpm < 0.1] <- 0
s0.gene$eligible[s0.gene$tpm > 1000] <- 0
## Only change genes on chromosome 1
chr1.genes <- unique(read.delim("../annotations/chr1/tx_to_gene_chr1.txt",
header = FALSE, as.is = TRUE)[, 1])
s0.gene$eligible[!s0.gene$gene %in% chr1.genes] <- 0
n.genes <- length(chr1.genes)
###############################################################################
## Generate second basis sample (condition 2)
###############################################################################
set.seed(42)
## Generate basis sample for second condition by modifying tpms
s1 <- s0
## Select genes to modify
## DTU: choose among all eligible genes with at least 2 isoforms
dtu <- sample(s0.gene$gene[s0.gene$eligible == 1 & s0.gene$nbr_isoforms > 1],
0.109*n.genes)
s0.gene[match(dtu, s0.gene$gene), "eligible"] <- 0
## DTE: choose among all eligible genes with at least 2 isoforms
dte <- sample(s0.gene$gene[s0.gene$eligible == 1 & s0.gene$nbr_isoforms > 1],
0.1095*n.genes)
s0.gene[match(dte, s0.gene$gene), "eligible"] <- 0
## DGE: choose among all eligible genes
dge <- sample(s0.gene$gene[s0.gene$eligible == 1], 0.109*n.genes)
s0.gene[match(dge, s0.gene$gene), "eligible"] <- 0
s1l <- split(s1, s1$gene)
## Get all TPMs of genes with DGE and shuffle them
## Shuffling will be done only between neighbouring "bins", to avoid too
## large fold changes
tpms_dge <- s0.gene[match(dge, s0.gene$gene), "tpm"]
names(tpms_dge) <- dge
tpms_dge <- sort(tpms_dge)
tmp_names <- names(tpms_dge)
tpms_dge1 <- tpms_dge
tpms_dge01 <- tpms_dge[1:(length(tpms_dge)/4)]
tpms_dge02 <- tpms_dge[(length(tpms_dge)/4 + 1):(2*length(tpms_dge)/4)]
tpms_dge03 <- tpms_dge[(2*length(tpms_dge)/4 + 1):(3*length(tpms_dge)/4)]
tpms_dge04 <- tpms_dge[(3*length(tpms_dge)/4 + 1):(4*length(tpms_dge)/4)]
while(max(abs(log2(tpms_dge/tpms_dge1))) > log2(256) |
min(abs(log2(tpms_dge/tpms_dge1))) < 0.55) {
tpms_dge <- c(sample(tpms_dge02), sample(tpms_dge01),
sample(tpms_dge04), sample(tpms_dge03))
}
names(tpms_dge) <- tmp_names
## Generate list of all transcripts with DTE
dte_tx <- unlist(lapply(dte, function(g) {
a <- s1l[[g]]
i <- which(a$tpm > 0)[sample(length(which(a$tpm > 0)),
min(ceiling(0.1*nrow(a)),
length(which(a$tpm > 0))))]
a[i, "target_id"]
}))
## Get all TPMs of transcripts with DTE and shuffle them
nbins <- 4
npbin <- length(dte_tx)/nbins
stopifnot(npbin == round(npbin))
idx <- (rep(c((npbin + 1):(2*npbin), 1:npbin), nbins/2)) +
rep(2*npbin*(0:(nbins/2 - 1)), each = 2*npbin)
tpms_dte <- s0[match(dte_tx, s0$target_id), "tpm"]
names(tpms_dte) <- dte_tx
tpms_dte <- sort(tpms_dte)
tmp_names <- names(tpms_dte)
tpms_dte <- tpms_dte[idx]
names(tpms_dte) <- tmp_names
## Modify TPM and count info
## DTU
for (g in dtu) {
a <- s1l[[g]]
n <- nrow(a)
isfd <- isopcts2[[as.character(n)]]
isfd <- sample(isfd[sample(1:nrow(isfd), 1), ])
tpm <- sum(a$tpm)
a$isopct <- isfd
a$tpm <- tpm*a$isopct
a$est_counts <- a$tpm * a$eff_length * conv_factor
s1l[[g]] <- a
}
## DTE
for (g in dte) {
a <- s1l[[g]]
kp <- intersect(a$target_id, names(tpms_dte))
a[match(kp, a$target_id), "tpm"] <- tpms_dte[kp]
a$isopct <- a$tpm/sum(a$tpm)
a$est_counts <- a$tpm * a$eff_length * conv_factor
s1l[[g]] <- a
}
## DGE
for (g in dge) {
a <- s1l[[g]]
a$tpm <- tpms_dge[g] * a$isopct
a$est_counts <- a$tpm * a$eff_length * conv_factor
s1l[[g]] <- a
}
s1 <- unsplit(s1l, s1$gene)
###############################################################################
## Generate individual samples (add variability)
###############################################################################
n_per_group <- 3
stopifnot(all(s0$target_id == s1$target_id))
mp <- readRDS("../annotations/Pickrell.Cheung.Mu.Phi.Estimates.rds")
gr1 <- t(sapply(s0$target_id, function(i) {
m <- s0$est_counts[match(i, s0$target_id)]
phi <- mp$pickrell.cheung.phi[which.min(abs(mp$pickrell.cheung.mu - m))]
rnbinom(n = n_per_group, size = 1/phi, mu = m)
}))
colnames(gr1) <- paste0("A", 1:ncol(gr1))
gr2 <- t(sapply(s1$target_id, function(i) {
m <- s1$est_counts[match(i, s1$target_id)]
phi <- mp$pickrell.cheung.phi[which.min(abs(mp$pickrell.cheung.mu - m))]
rnbinom(n = n_per_group, size = 1/phi, mu = m)
}))
colnames(gr2) <- paste0("B", 1:ncol(gr2))
gr1_tpm <- sweep(gr1, 1, s0$eff_length * conv_factor, "/")
gr2_tpm <- sweep(gr2, 1, s1$eff_length * conv_factor, "/")
gr1_tpm <- sweep(gr1_tpm, 2, colSums(gr1_tpm)/1e6, "/")
gr2_tpm <- sweep(gr2_tpm, 2, colSums(gr2_tpm)/1e6, "/")
all_tpm <- cbind(gr1_tpm, gr2_tpm)
###############################################################################
## Write files for RSEM
###############################################################################
rsem_template_chr1 <-
read.delim("../exp_data/E_MTAB_1733/RSEM/ERS326990/chr1/ERS326990_chr1.isoforms.results",
header = TRUE, as.is = TRUE)
for (i in 1:ncol(all_tpm)) {
tmp <- rsem_template_chr1
tmp$TPM <- all_tpm[match(tmp$transcript_id, rownames(all_tpm)), i]
tmp$TPM <- tmp$TPM/sum(tmp$TPM)*1e6
tmp$TPM <- round(tmp$TPM, digits = 2)
tmp$TPM <- as.character(tmp$TPM)
tmp$TPM[tmp$TPM == "0"] <- "0.00"
tmp$expected_count <- as.character(tmp$expected_count)
tmp$expected_count[tmp$expected_count == "0"] <- "0.00"
tmp$FPKM <- as.character(tmp$FPKM)
tmp$FPKM[tmp$FPKM == "0"] <- "0.00"
tmp$IsoPct <- as.character(tmp$IsoPct)
tmp$IsoPct[tmp$IsoPct == "0"] <- "0.00"
write.table(tmp, file = paste0("sim2/chr1/RSEM/sample",
colnames(all_tpm)[i], "_rsem.txt"),
row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")
}
###############################################################################
## Generate truth on gene and transcript level
###############################################################################
## Transcripts
stopifnot(all(s0$target_id == s1$target_id))
truth_tx <- s0
truth_tx$isopct1 <- truth_tx$isopct
truth_tx$isopct <- NULL
truth_tx$isopct2 <- s1$isopct
truth_tx$tpm1 <- truth_tx$tpm
truth_tx$tpm <- NULL
truth_tx$tpm2 <- s1$tpm
truth_tx$status <- 0
truth_tx$status[which(truth_tx$tpm1 != truth_tx$tpm2)] <- 1
truth_tx$logFC <- 0
truth_tx$logFC[which(truth_tx$status == 1)] <-
abs(log2(truth_tx$tpm1[which(truth_tx$status == 1)]/
truth_tx$tpm2[which(truth_tx$status == 1)]))
truth_tx$logFC_cat <- cut2(truth_tx$logFC,
c(0, quantile(truth_tx$logFC[truth_tx$logFC > 0 & is.finite(truth_tx$logFC)],
probs = c(0.25, 0.5, 0.75, 1)), Inf))
truth_tx$avetpm <- apply(truth_tx[, c("tpm1", "tpm2")], 1, mean)
t3 <- truth_tx[, c("target_id", "status", "logFC_cat", "logFC", "avetpm",
"length", "eff_length", "tpm1", "tpm2", "isopct1", "isopct2", "gene")]
colnames(t3) <- c("transcript", "status", "logFC_cat", "logFC", "avetpm",
"length", "eff_length", "tpm1", "tpm2", "isopct1", "isopct2", "gene")
## Subset to chr1
chr1_tx <- rsem_template_chr1$transcript_id
t3t <- t3[match(chr1_tx, t3$transcript), ]
t3t$avetpm_cat <- cut2(t3t$avetpm,
c(0, quantile(t3t$avetpm[t3t$avetpm > 0],
probs = c(0.25, 0.5, 0.75, 1))))
write.table(t3t, file = "sim2/chr1/truth_transcript.txt", row.names = FALSE,
col.names = TRUE, quote = FALSE, sep = "\t")
## Genes
t3g <- as.data.frame(t3t %>% group_by(gene) %>%
summarise(tpm1 = sum(tpm1),
tpm2 = sum(tpm2),
status_any = as.numeric(any(status == 1)),
n_isoforms = length(transcript),
diffisouse = as.numeric(any(isopct1 != isopct2))))
t3g$status_overall <- 0
t3g$status_overall[match(c(dge, dte), t3g$gene)] <- 1
t3g$logFC_overall <- 0
t3g$logFC_overall[match(c(dge, dte), t3g$gene)] <-
abs(log2(t3g$tpm1[match(c(dge, dte), t3g$gene)]/
t3g$tpm2[match(c(dge, dte), t3g$gene)]))
t3g$dtu <- 0
t3g$dte <- 0
t3g$dge <- 0
t3g$dtu[match(dtu, t3g$gene)] <- 1
t3g$dte[match(dte, t3g$gene)] <- 1
t3g$dge[match(dge, t3g$gene)] <- 1
t3g$logFC_overall_cat <-
cut2(t3g$logFC_overall,
c(0, quantile(t3g$logFC_overall[t3g$logFC_overall > 0],
probs = c(0.25, 0.5, 0.75, 1))))
t3g$avetpm <- apply(t3g[, c("tpm1", "tpm2")], 1, mean)
t3g$avetpm_cat <-
cut2(t3g$avetpm,
c(0, quantile(t3g$avetpm[t3g$avetpm > 0],
probs = c(0.25, 0.5, 0.75, 1))))
write.table(t3g, file = "sim2/chr1/truth_gene.txt", row.names = FALSE,
col.names = TRUE, quote = FALSE, sep = "\t")
###############################################################################
## Simulate FASTQ files
###############################################################################
## chr1
for (i in c(paste0("A", 1:n_per_group), paste0("B", 1:n_per_group))) {
cmd <- paste0("/home/Shared/data/alt_splicing_simulations/Simulation5_Charlotte/software/rsem-1.2.21/rsem-simulate-reads /home/Shared/data/alt_splicing_simulations/dte_simulation/annotations/chr1/RSEM_index/Homo_sapiens.GRCh37.71.cdna.chr1 ../exp_data/E_MTAB_1733/RSEM/ERS326990/chr1/ERS326990_chr1.stat/ERS326990_chr1.HQ.model sim2/chr1/RSEM/sample", i, "_rsem.txt 0 10000000 sim2/chr1/FASTQ/sample", i, "/sample", i, " --seed 123")
cat(cmd, "\n")
system(cmd)
}
###############################################################################
## Run kallisto on simulated FASTQ files
###############################################################################
## chr1
if (!file.exists("../annotations/chr1/kallisto_index/Homo_sapiens.GRCh37.71.cdna.chr1.idx")) {
cmd <- "kallisto index -i ../annotations/chr1/kallisto_index/Homo_sapiens.GRCh37.71.cdna.chr1.idx ../annotations/chr1/Homo_sapiens.GRCh37.71.cdna.chr1.fa"
cat(cmd, "\n")
system(cmd)
}
for (i in c(paste0("A", 1:n_per_group), paste0("B", 1:n_per_group))) {
cmd <- paste0("kallisto quant -i ../annotations/chr1/kallisto_index/Homo_sapiens.GRCh37.71.cdna.chr1.idx -o sim2/chr1/kallisto/sample", i, " -b 30 sim2/chr1/FASTQ/sample", i, "/sample", i, "_1.fq sim2/chr1/FASTQ/sample", i, "/sample", i, "_2.fq")
cat(cmd, "\n")
system(cmd)
}
###############################################################################
## Run salmon on simulated FASTQ files
###############################################################################
## chr1
if (!file.exists("../annotations/chr1/salmon_index/Homo_sapiens.GRCh37.71.cdna.chr1.sidx/hash.bin")) {
cmd <- "salmon index -i ../annotations/chr1/salmon_index/Homo_sapiens.GRCh37.71.cdna.chr1.sidx -t ../annotations/chr1/Homo_sapiens.GRCh37.71.cdna.chr1.fa -p 5 --type quasi"
cat(cmd, "\n")
system(cmd)
}
## Swap order of genes and transcripts in tx_to_gene_chr1.txt
cmd <- "awk '{ t=$1 ; $1=$2; $2=t; print }' ../annotations/chr1/tx_to_gene_chr1.txt > ../annotations/chr1/tx_to_gene_chr1_swapped.txt"
cat(cmd, "\n")
system(cmd)
for (i in c(paste0("A", 1:n_per_group), paste0("B", 1:n_per_group))) {
cmd <- paste0("salmon quant -i ../annotations/chr1/salmon_index/Homo_sapiens.GRCh37.71.cdna.chr1.sidx -l IU -1 sim2/chr1/FASTQ/sample", i, "/sample", i, "_1.fq -2 sim2/chr1/FASTQ/sample", i, "/sample", i, "_2.fq -p 5 -o sim2/chr1/salmon/sample", i, " -g ../annotations/chr1/tx_to_gene_chr1_swapped.txt --numBootstraps=30")
cat(cmd, "\n")
system(cmd)
}
###############################################################################
## Align to genome
###############################################################################
## chr1
fqdir <- list.files("sim2/chr1/FASTQ", full.names = TRUE)
fqdir <- fqdir[file.info(fqdir)$isdir]
for (fq in fqdir) {
cmd <- paste0("STAR --genomeDir ../annotations/chr1/STAR_index --readFilesIn ",
paste0(fq, "/", basename(fq), "_1.fq"), " ",
paste0(fq, "/", basename(fq), "_2.fq"), " --runThreadN 10 ",
"--outFileNamePrefix sim2/chr1/BAM/", basename(fq), "/",
" --outSAMtype BAM Unsorted")
cat(cmd, "\n")
system(cmd)
}
###############################################################################
## Run featureCounts
###############################################################################
## chr1
library(Rsubread)
bamdir <- list.files("sim2/chr1/BAM", full.names = TRUE)
bamdir <- bamdir[file.info(bamdir)$isdir]
bamfiles <- paste0(bamdir, "/Aligned.out.bam")
names(bamfiles) <- basename(bamdir)
fc <- featureCounts(files = bamfiles, isGTFAnnotationFile = TRUE,
annot.ext = "../annotations/chr1/Homo_sapiens.GRCh37.71.chr1.gtf",
GTF.featureType = "exon",
GTF.attrType = "gene_id",
useMetaFeatures = TRUE,
isPairedEnd = TRUE)
colnames(fc$counts) <- sapply(colnames(fc$counts), function(w) {
tmp <- strsplit(w, "\\.")[[1]]
tmp[grep("sample", tmp)]
})
save(fc, file = "sim2/chr1/featureCounts/fc.Rdata")
###############################################################################
## Run featureCounts, including multimappers
###############################################################################
## chr1
library(Rsubread)
bamdir <- list.files("sim2/chr1/BAM", full.names = TRUE)
bamdir <- bamdir[file.info(bamdir)$isdir]
bamfiles <- paste0(bamdir, "/Aligned.out.bam")
names(bamfiles) <- basename(bamdir)
fc_fracmm <- featureCounts(files = bamfiles, isGTFAnnotationFile = TRUE,
annot.ext = "../annotations/chr1/Homo_sapiens.GRCh37.71.chr1.gtf",
GTF.featureType = "exon",
GTF.attrType = "gene_id",
useMetaFeatures = TRUE,
isPairedEnd = TRUE,
countMultiMappingReads = TRUE,
fraction = TRUE)
colnames(fc_fracmm$counts) <- sapply(colnames(fc_fracmm$counts), function(w) {
tmp <- strsplit(w, "\\.")[[1]]
tmp[grep("sample", tmp)]
})
save(fc_fracmm, file = "sim2/chr1/featureCounts/fc_fracMM.Rdata")
###############################################################################
###############################################################################
## INCOMPLETE ANNOTATION
###############################################################################
###############################################################################
###############################################################################
## Run kallisto on simulated FASTQ files
###############################################################################
## chr1_incomplete
if (!file.exists("../annotations/chr1_incomplete/kallisto_index/Homo_sapiens.GRCh37.71.cdna.chr1_incomplete80.idx")) {
cmd <- "kallisto index -i ../annotations/chr1_incomplete/kallisto_index/Homo_sapiens.GRCh37.71.cdna.chr1_incomplete80.idx ../annotations/chr1_incomplete/Homo_sapiens.GRCh37.71.cdna.chr1_incomplete80.fa"
cat(cmd, "\n")
system(cmd)
}
for (i in c(paste0("A", 1:n_per_group), paste0("B", 1:n_per_group))) {
cmd <- paste0("kallisto quant -i ../annotations/chr1_incomplete/kallisto_index/Homo_sapiens.GRCh37.71.cdna.chr1_incomplete80.idx -o sim2/chr1_incomplete/kallisto/sample", i, " -b 30 sim2/chr1/FASTQ/sample", i, "/sample", i, "_1.fq sim2/chr1/FASTQ/sample", i, "/sample", i, "_2.fq")
cat(cmd, "\n")
system(cmd)
}
###############################################################################
## Run salmon on simulated FASTQ files
###############################################################################
## chr1_incomplete
if (!file.exists("../annotations/chr1_incomplete/salmon_index/Homo_sapiens.GRCh37.71.cdna.chr1_incomplete80.sidx/hash.bin")) {
cmd <- "salmon index -i ../annotations/chr1_incomplete/salmon_index/Homo_sapiens.GRCh37.71.cdna.chr1_incomplete80.sidx -t ../annotations/chr1_incomplete/Homo_sapiens.GRCh37.71.cdna.chr1_incomplete80.fa -p 5 --type quasi"
cat(cmd, "\n")
system(cmd)
}
for (i in c(paste0("A", 1:n_per_group), paste0("B", 1:n_per_group))) {
cmd <- paste0("salmon quant -i ../annotations/chr1_incomplete/salmon_index/Homo_sapiens.GRCh37.71.cdna.chr1_incomplete80.sidx -l IU -1 sim2/chr1/FASTQ/sample", i, "/sample", i, "_1.fq -2 sim2/chr1/FASTQ/sample", i, "/sample", i, "_2.fq -p 5 -o sim2/chr1_incomplete/salmon/sample", i, " -g ../annotations/chr1/tx_to_gene_chr1_swapped.txt --numBootstraps=30")
cat(cmd, "\n")
system(cmd)
}
###############################################################################
## Align to genome
###############################################################################
## chr1_incomplete
fqdir <- list.files("sim2/chr1/FASTQ", full.names = TRUE)
fqdir <- fqdir[file.info(fqdir)$isdir]
for (fq in fqdir) {
cmd <- paste0("STAR --genomeDir ../annotations/chr1_incomplete/STAR_index --readFilesIn ",
paste0(fq, "/", basename(fq), "_1.fq"), " ",
paste0(fq, "/", basename(fq), "_2.fq"), " --runThreadN 10 ",
"--outFileNamePrefix sim2/chr1_incomplete/BAM/", basename(fq), "/",
" --outSAMtype BAM Unsorted")
cat(cmd, "\n")
system(cmd)
}
###############################################################################
## Run featureCounts
###############################################################################
## chr1
library(Rsubread)
bamdir <- list.files("sim2/chr1_incomplete/BAM", full.names = TRUE)
bamdir <- bamdir[file.info(bamdir)$isdir]
bamfiles <- paste0(bamdir, "/Aligned.out.bam")
names(bamfiles) <- basename(bamdir)
fc_incomplete <- featureCounts(files = bamfiles, isGTFAnnotationFile = TRUE,
annot.ext = "../annotations/chr1_incomplete/Homo_sapiens.GRCh37.71.chr1_incomplete80.gtf",
GTF.featureType = "exon",
GTF.attrType = "gene_id",
useMetaFeatures = TRUE,
isPairedEnd = TRUE)
colnames(fc_incomplete$counts) <- sapply(colnames(fc_incomplete$counts), function(w) {
tmp <- strsplit(w, "\\.")[[1]]
tmp[grep("sample", tmp)]
})
save(fc_incomplete, file = "sim2/chr1_incomplete/featureCounts/fc_incomplete.Rdata")
The samples are named according to the condition they belong to (“A” or “B”). We define a metadata table mapping samples to the correct condition, to be used for differential analysis.
meta <- data.frame(sample = paste0("sample", c("A1", "A2", "A3", "B1", "B2", "B3")),
condition = c("A", "A", "A", "B", "B", "B"),
stringsAsFactors = FALSE)
rownames(meta) <- meta$sample
meta
## sample condition
## sampleA1 sampleA1 A
## sampleA2 sampleA2 A
## sampleA3 sampleA3 A
## sampleB1 sampleB1 B
## sampleB2 sampleB2 B
## sampleB3 sampleB3 B
The following reference files (either pointing to annotation files downloaded from Ensembl or corresponding to files generated by the simulation/pre-processing code above) will be used throughout the analysis.
simdir <- "/home/Shared/data/alt_splicing_simulations/dte_simulation"
gtf <- paste0(simdir, "/annotations/chr1/Homo_sapiens.GRCh37.71.chr1.gtf")
cdna_fasta <- paste0(simdir, "/annotations/chr1/Homo_sapiens.GRCh37.71.cdna.chr1.fa")
truth_tx_sampleA1_file <- paste0(simdir, "/sim_data/sim2/chr1/RSEM/sampleA1_rsem.txt")
salmon_basedir <- paste0(simdir, "/sim_data/sim2/chr1/salmon")
fc_file <- paste0(simdir, "/sim_data/sim2/chr1/featureCounts/fc.Rdata")
fc_file_incomplete <- gsub("fc.Rdata", "fc_incomplete.Rdata", gsub("chr1", "chr1_incomplete", fc_file))
truth_gene_file <- paste0(simdir, "/sim_data/sim2/chr1/truth_gene.txt")
truth_tx_file <- paste0(simdir, "/sim_data/sim2/chr1/truth_transcript.txt")
The following files will be generated by the code below.
feature_lengths_file <- paste0(basedir, "/annotation/Homo_Sapiens_GRCh37.71_chr1/feature_lengths.Rdata")
tx_gene_file <- paste0(basedir, "/annotation/Homo_Sapiens_GRCh37.71_chr1/tx_gene_map.Rdata")
Here we calculate the lengths of all genes and transcript (both defined here by the union of their exons), and define a mapping between gene and transcript identifiers from the reference files.
if (!file.exists(feature_lengths_file)) {
calc_lengths_mapping(gtf = gtf, cdna_fasta = cdna_fasta,
feature_lengths_file = feature_lengths_file,
tx_gene_file = tx_gene_file)
} else {
message("feature lengths and tx-to-gene map already calculated.")
}
## feature lengths and tx-to-gene map already calculated.
load(feature_lengths_file)
load(tx_gene_file)
We use the tximport
package (https://github.com/mikelove/tximport) to generate count matrices and offset matrices (average transcript lengths) from the Salmon transcript-level estimates. We generate two different count matrices (simplesum and scaledTPM), and additionally create offsets to be used with the simplesum matrix.
salmon_files <- list.files(salmon_basedir, pattern = "sample", full.names = TRUE)
salmon_files <- salmon_files[file.info(salmon_files)$isdir]
salmon_files <- paste0(salmon_files, "/quant.sf")
salmon_files <- salmon_files[file.exists(salmon_files)]
names(salmon_files) <- basename(gsub("/quant.sf", "", salmon_files))
txi_salmonsimplesum <- tximport(files = salmon_files, type = "salmon", txIn = TRUE,
txOut = FALSE, countsFromAbundance = "no",
gene2tx = gene2tx)
## reading in files
## 1
## 2
## 3
## 4
## 5
## 6
##
## summarizing abundance
## summarizing counts
## summarizing length
txi_salmonscaledtpm <- tximport(files = salmon_files, type = "salmon", txIn = TRUE,
txOut = FALSE, countsFromAbundance = "scaledTPM",
gene2tx = gene2tx)
## reading in files
## 1
## 2
## 3
## 4
## 5
## 6
##
## summarizing abundance
## summarizing counts
## summarizing length
txi_salmontx <- tximport(files = salmon_files, type = "salmon", txIn = TRUE,
txOut = TRUE, countsFromAbundance = "no", gene2tx = gene2tx)
## reading in files
## 1
## 2
## 3
## 4
## 5
## 6
##
salmon_quant <- list(geneCOUNT_sal_simplesum = txi_salmonsimplesum$counts,
geneCOUNT_sal_scaledTPM = txi_salmonscaledtpm$counts,
avetxlength = txi_salmonsimplesum$length,
geneTPM_sal = txi_salmonsimplesum$abundance,
txTPM_sal = txi_salmontx$abundance,
txCOUNT_sal = txi_salmontx$counts,
txi_salmonsimplesum = txi_salmonsimplesum,
txi_salmonscaledtpm = txi_salmonscaledtpm,
txi_salmontx = txi_salmontx)
salmon_files <- list.files(gsub("chr1", "chr1_incomplete", salmon_basedir),
pattern = "sample", full.names = TRUE)
salmon_files <- salmon_files[file.info(salmon_files)$isdir]
salmon_files <- paste0(salmon_files, "/quant.sf")
salmon_files <- salmon_files[file.exists(salmon_files)]
names(salmon_files) <- basename(gsub("/quant.sf", "", salmon_files))
txi_salmonsimplesum <- tximport(files = salmon_files, type = "salmon", txIn = TRUE,
txOut = FALSE, countsFromAbundance = "no",
gene2tx = gene2tx)
## reading in files
## 1
## 2
## 3
## 4
## 5
## 6
##
## summarizing abundance
## summarizing counts
## summarizing length
txi_salmonscaledtpm <- tximport(files = salmon_files, type = "salmon", txIn = TRUE,
txOut = FALSE, countsFromAbundance = "scaledTPM",
gene2tx = gene2tx)
## reading in files
## 1
## 2
## 3
## 4
## 5
## 6
##
## summarizing abundance
## summarizing counts
## summarizing length
txi_salmontx <- tximport(files = salmon_files, type = "salmon", txIn = TRUE,
txOut = TRUE, countsFromAbundance = "no", gene2tx = gene2tx)
## reading in files
## 1
## 2
## 3
## 4
## 5
## 6
##
salmon_quant_incomplete <- list(geneCOUNT_sal_simplesum = txi_salmonsimplesum$counts,
geneCOUNT_sal_scaledTPM = txi_salmonscaledtpm$counts,
avetxlength = txi_salmonsimplesum$length,
geneTPM_sal = txi_salmonsimplesum$abundance,
txTPM_sal = txi_salmontx$abundance,
txCOUNT_sal = txi_salmontx$counts,
txi_salmonsimplesum = txi_salmonsimplesum,
txi_salmonscaledtpm = txi_salmonscaledtpm,
txi_salmontx = txi_salmontx)
salmon_dirs <- list.files(salmon_basedir, pattern = "sample", full.names = TRUE)
salmon_dirs <- salmon_dirs[file.info(salmon_dirs)$isdir]
names(salmon_dirs) <- basename(salmon_dirs)
## Transcript
CV_tx_salmon <- as.data.frame(sapply(salmon_dirs, function(w) {
tmpdf <- as.data.frame(t(read.delim(paste0(w, "/quant_bootstraps.sf"),
skip = 12, header = TRUE, as.is = TRUE)))
tmpcv <- apply(tmpdf, 1, sd)/apply(tmpdf, 1, mean)
tmpcv[is.na(tmpcv)] <- 0
names(tmpcv) <- rownames(tmpdf)
tmpcv
}))
CV_tx_salmon$mean_cv <- apply(CV_tx_salmon[, names(salmon_dirs)], 1, mean)
CV_tx_salmon$min_cv <- apply(CV_tx_salmon[, names(salmon_dirs)], 1, min)
CV_tx_salmon$max_cv <- apply(CV_tx_salmon[, names(salmon_dirs)], 1, max)
CV_tx_salmon$median_cv <- apply(CV_tx_salmon[, names(salmon_dirs)], 1, median)
## Gene
CV_gene_salmon <- as.data.frame(sapply(salmon_dirs, function(w) {
tmpdf <- as.data.frame(t(read.delim(paste0(w, "/quant_bootstraps.sf"),
skip = 12, header = TRUE, as.is = TRUE)))
tmpdf$gene <- tx2gene$gene[match(rownames(tmpdf), tx2gene$transcript)]
tmpdf <- tmpdf %>% group_by(gene) %>% summarise_each(funs(sum)) %>% as.data.frame
rownames(tmpdf) <- tmpdf$gene
tmpdf$gene <- NULL
tmpcv <- apply(tmpdf, 1, sd)/apply(tmpdf, 1, mean)
tmpcv[is.na(tmpcv)] <- 0
names(tmpcv) <- rownames(tmpdf)
tmpcv
}))
CV_gene_salmon$mean_cv <- apply(CV_gene_salmon[, names(salmon_dirs)], 1, mean)
CV_gene_salmon$min_cv <- apply(CV_gene_salmon[, names(salmon_dirs)], 1, min)
CV_gene_salmon$max_cv <- apply(CV_gene_salmon[, names(salmon_dirs)], 1, max)
CV_gene_salmon$median_cv <- apply(CV_gene_salmon[, names(salmon_dirs)], 1, median)
cmd <- paste0("python /home/Shared/Rlib/release-3.1-lib/DEXSeq/",
"python_scripts/dexseq_prepare_annotation.py --aggregate='no' ",
gtf, " ", gsub("gtf$", "flattened.nomerge.gff", gtf))
message(cmd)
## python /home/Shared/Rlib/release-3.1-lib/DEXSeq/python_scripts/dexseq_prepare_annotation.py --aggregate='no' /home/Shared/data/alt_splicing_simulations/dte_simulation/annotations/chr1/Homo_sapiens.GRCh37.71.chr1.gtf /home/Shared/data/alt_splicing_simulations/dte_simulation/annotations/chr1/Homo_sapiens.GRCh37.71.chr1.flattened.nomerge.gff
if (!file.exists(gsub("gtf$", "flattened.nomerge.gff", gtf))) {
system(cmd)
} else {
message("Flattened annotation already exists.")
}
## Flattened annotation already exists.
for (i in c("A1", "A2", "A3", "B1", "B2", "B3")) {
cmd <- paste0("python /home/Shared/Rlib/release-3.1-lib/DEXSeq/python_scripts/dexseq_count.py ",
"-p yes -r name -s no -f bam ",
gsub("gtf$", "flattened.nomerge.gff", gtf), " ",
paste0(simdir, "/sim_data/sim2/chr1/BAM/sample", i, "/Aligned.out.bam"), " ",
paste0(basedir, "/quantifications/sim2/dexseq_exon/sample", i, ".txt"))
message(cmd)
if (!file.exists(paste0(basedir, "/quantifications/sim2/dexseq_exon/sample", i, ".txt")))
system(cmd)
else
message("DEXSeq counts for sample ", i, " already calculated.")
}
## python /home/Shared/Rlib/release-3.1-lib/DEXSeq/python_scripts/dexseq_count.py -p yes -r name -s no -f bam /home/Shared/data/alt_splicing_simulations/dte_simulation/annotations/chr1/Homo_sapiens.GRCh37.71.chr1.flattened.nomerge.gff /home/Shared/data/alt_splicing_simulations/dte_simulation/sim_data/sim2/chr1/BAM/sampleA1/Aligned.out.bam /home/charlotte/gene_vs_tx_quantification/quantifications/sim2/dexseq_exon/sampleA1.txt
## DEXSeq counts for sample A1 already calculated.
## python /home/Shared/Rlib/release-3.1-lib/DEXSeq/python_scripts/dexseq_count.py -p yes -r name -s no -f bam /home/Shared/data/alt_splicing_simulations/dte_simulation/annotations/chr1/Homo_sapiens.GRCh37.71.chr1.flattened.nomerge.gff /home/Shared/data/alt_splicing_simulations/dte_simulation/sim_data/sim2/chr1/BAM/sampleA2/Aligned.out.bam /home/charlotte/gene_vs_tx_quantification/quantifications/sim2/dexseq_exon/sampleA2.txt
## DEXSeq counts for sample A2 already calculated.
## python /home/Shared/Rlib/release-3.1-lib/DEXSeq/python_scripts/dexseq_count.py -p yes -r name -s no -f bam /home/Shared/data/alt_splicing_simulations/dte_simulation/annotations/chr1/Homo_sapiens.GRCh37.71.chr1.flattened.nomerge.gff /home/Shared/data/alt_splicing_simulations/dte_simulation/sim_data/sim2/chr1/BAM/sampleA3/Aligned.out.bam /home/charlotte/gene_vs_tx_quantification/quantifications/sim2/dexseq_exon/sampleA3.txt
## DEXSeq counts for sample A3 already calculated.
## python /home/Shared/Rlib/release-3.1-lib/DEXSeq/python_scripts/dexseq_count.py -p yes -r name -s no -f bam /home/Shared/data/alt_splicing_simulations/dte_simulation/annotations/chr1/Homo_sapiens.GRCh37.71.chr1.flattened.nomerge.gff /home/Shared/data/alt_splicing_simulations/dte_simulation/sim_data/sim2/chr1/BAM/sampleB1/Aligned.out.bam /home/charlotte/gene_vs_tx_quantification/quantifications/sim2/dexseq_exon/sampleB1.txt
## DEXSeq counts for sample B1 already calculated.
## python /home/Shared/Rlib/release-3.1-lib/DEXSeq/python_scripts/dexseq_count.py -p yes -r name -s no -f bam /home/Shared/data/alt_splicing_simulations/dte_simulation/annotations/chr1/Homo_sapiens.GRCh37.71.chr1.flattened.nomerge.gff /home/Shared/data/alt_splicing_simulations/dte_simulation/sim_data/sim2/chr1/BAM/sampleB2/Aligned.out.bam /home/charlotte/gene_vs_tx_quantification/quantifications/sim2/dexseq_exon/sampleB2.txt
## DEXSeq counts for sample B2 already calculated.
## python /home/Shared/Rlib/release-3.1-lib/DEXSeq/python_scripts/dexseq_count.py -p yes -r name -s no -f bam /home/Shared/data/alt_splicing_simulations/dte_simulation/annotations/chr1/Homo_sapiens.GRCh37.71.chr1.flattened.nomerge.gff /home/Shared/data/alt_splicing_simulations/dte_simulation/sim_data/sim2/chr1/BAM/sampleB3/Aligned.out.bam /home/charlotte/gene_vs_tx_quantification/quantifications/sim2/dexseq_exon/sampleB3.txt
## DEXSeq counts for sample B3 already calculated.
The accuracy of the abundance estimates from Salmon and featureCounts is evaluated by comparing to the true TPMs underlying the simulation.
## Complete annotation
truth_tx_sampleA1 <- read.delim(truth_tx_sampleA1_file, header = TRUE, as.is = TRUE)
## Add number of transcripts per gene
truth_tx_sampleA1 <- truth_tx_sampleA1 %>% group_by(gene_id) %>%
mutate(nbr_tx = length(transcript_id)) %>% ungroup() %>% as.data.frame()
sampleA1_tx_salmon <- Reduce(function(...) merge(..., by = "feature", all = TRUE),
list(data.frame(feature = truth_tx_sampleA1$transcript_id,
true_tpm = truth_tx_sampleA1$TPM,
gene = truth_tx_sampleA1$gene_id,
nbr_tx = truth_tx_sampleA1$nbr_tx,
stringsAsFactors = FALSE),
data.frame(feature = rownames(salmon_quant$txTPM_sal),
est_tpm = salmon_quant$txTPM_sal[, "sampleA1"],
stringsAsFactors = FALSE)))
sampleA1_tx_salmon$measure <- "salmon_transcript"
sampleA1_tx_salmon$annotation <- "complete annotation"
sampleA1_gene_salmon <- sampleA1_tx_salmon %>% group_by(gene) %>%
summarise(true_tpm = sumna(true_tpm), est_tpm = sumna(est_tpm), nbr_tx = nbr_tx[1]) %>% as.data.frame
# sampleA1_gene_salmon <- sampleA1_tx_salmon %>% group_by(gene) %>%
# summarise_each(funs(sumna), -c(feature, measure, annotation, nbr_tx)) %>% as.data.frame
sampleA1_gene_salmon$measure <- "salmon_gene"
sampleA1_gene_salmon$annotation <- "complete annotation"
load(fc_file)
sampleA1_gene_fc <- Reduce(function(...) merge(..., by = "gene", all.x = TRUE),
list(sampleA1_gene_salmon[, c("gene", "true_tpm", "nbr_tx")],
data.frame(gene = rownames(fc$counts),
fc_count = fc$counts[, "sampleA1"],
stringsAsFactors = FALSE),
data.frame(gene = names(gene_length),
gene_length_unionexon = gene_length,
stringsAsFactors = FALSE)))
sampleA1_gene_fc$fpkm_fc_unionexon <-
sampleA1_gene_fc$fc_count/(sampleA1_gene_fc$gene_length_unionexon/1e3 *
sum(sampleA1_gene_fc$fc_count, na.rm = TRUE)/1e6)
sampleA1_gene_fc$est_tpm <-
sampleA1_gene_fc$fpkm_fc_unionexon/sum(sampleA1_gene_fc$fpkm_fc_unionexon, na.rm = TRUE) * 1e6
sampleA1_gene_fc$measure <- "featureCounts_FPKM_gene"
sampleA1_gene_fc$annotation <- "complete annotation"
## Incomplete annotation
truth_tx_sampleA1 <- read.delim(truth_tx_sampleA1_file, header = TRUE, as.is = TRUE)
## Add number of transcripts per gene
truth_tx_sampleA1 <- truth_tx_sampleA1 %>% group_by(gene_id) %>%
mutate(nbr_tx = length(transcript_id)) %>% ungroup() %>% as.data.frame()
sampleA1_tx_salmon_i <- Reduce(function(...) merge(..., by = "feature", all = TRUE),
list(data.frame(feature = truth_tx_sampleA1$transcript_id,
true_tpm = truth_tx_sampleA1$TPM,
gene = truth_tx_sampleA1$gene_id,
nbr_tx = truth_tx_sampleA1$nbr_tx,
stringsAsFactors = FALSE),
data.frame(feature = rownames(salmon_quant_incomplete$txTPM_sal),
est_tpm = salmon_quant_incomplete$txTPM_sal[, "sampleA1"],
stringsAsFactors = FALSE)))
sampleA1_tx_salmon_i$measure <- "salmon_transcript"
sampleA1_tx_salmon_i$annotation <- "incomplete annotation"
sampleA1_gene_salmon_i <- sampleA1_tx_salmon_i %>% group_by(gene) %>%
summarise(true_tpm = sumna(true_tpm), est_tpm = sumna(est_tpm), nbr_tx = nbr_tx[1]) %>% as.data.frame
# sampleA1_gene_salmon_i <- sampleA1_tx_salmon_i %>% group_by(gene) %>%
# summarise_each(funs(sumna), -c(feature, measure, annotation)) %>% as.data.frame
sampleA1_gene_salmon_i$measure <- "salmon_gene"
sampleA1_gene_salmon_i$annotation <- "incomplete annotation"
load(fc_file_incomplete)
sampleA1_gene_fc_i <- Reduce(function(...) merge(..., by = "gene", all.x = TRUE),
list(sampleA1_gene_salmon_i[, c("gene", "true_tpm", "nbr_tx")],
data.frame(gene = rownames(fc_incomplete$counts),
fc_count = fc_incomplete$counts[, "sampleA1"],
stringsAsFactors = FALSE),
data.frame(gene = names(gene_length),
gene_length_unionexon = gene_length,
stringsAsFactors = FALSE)))
sampleA1_gene_fc_i$fpkm_fc_unionexon <-
sampleA1_gene_fc_i$fc_count/(sampleA1_gene_fc_i$gene_length_unionexon/1e3 *
sum(sampleA1_gene_fc_i$fc_count, na.rm = TRUE)/1e6)
sampleA1_gene_fc_i$est_tpm <-
sampleA1_gene_fc_i$fpkm_fc_unionexon/sum(sampleA1_gene_fc_i$fpkm_fc_unionexon, na.rm = TRUE) * 1e6
sampleA1_gene_fc_i$measure <- "featureCounts_FPKM_gene"
sampleA1_gene_fc_i$annotation <- "incomplete annotation"
## Merge into one data frame
df <- rbind(sampleA1_tx_salmon[, c("measure", "annotation", "nbr_tx", "true_tpm", "est_tpm")],
sampleA1_gene_salmon[, c("measure", "annotation", "nbr_tx", "true_tpm", "est_tpm")],
sampleA1_gene_fc[, c("measure", "annotation", "nbr_tx", "true_tpm", "est_tpm")],
sampleA1_tx_salmon_i[, c("measure", "annotation", "nbr_tx", "true_tpm", "est_tpm")],
sampleA1_gene_salmon_i[, c("measure", "annotation", "nbr_tx", "true_tpm", "est_tpm")],
sampleA1_gene_fc_i[, c("measure", "annotation", "nbr_tx", "true_tpm", "est_tpm")])
df$measure <- factor(df$measure, levels = c("salmon_transcript", "salmon_gene",
"featureCounts_FPKM_gene"))
cors <- df %>% group_by(measure, annotation) %>%
summarise(sp = signif(cor(true_tpm, est_tpm, use = "complete", method = "spearman"), 3))
ggplot(df, aes(x = true_tpm, y = est_tpm, col = measure)) +
scale_colour_manual(values = c("blue", "red", "dimgrey"), guide = FALSE) +
geom_abline(intercept = 0, slope = 1) +
facet_grid(annotation ~ measure) +
geom_point(alpha = 0.5) +
scale_x_log10() + scale_y_log10() +
xlab("True TPM") + ylab("Estimated TPM") +
plot_theme() +
geom_text(x = 3, y = -5, aes(label = paste0("cor = ", sp)), size = 3, data = cors)
Split by number of transcripts
df2 <- subset(df, annotation == "complete annotation")
df2$nbr_tx_cat <- Hmisc::cut2(df2$nbr_tx, g = 3)
cors2 <- df2 %>% group_by(measure, nbr_tx_cat) %>%
summarise(sp = signif(cor(true_tpm, est_tpm, use = "complete", method = "spearman"), 3))
ggplot(df2, aes(x = true_tpm, y = est_tpm, col = measure)) +
scale_colour_manual(values = c("blue", "red", "dimgrey"), guide = FALSE) +
geom_abline(intercept = 0, slope = 1) +
facet_grid(nbr_tx_cat ~ measure) +
geom_point(alpha = 0.5) +
scale_x_log10() + scale_y_log10() +
xlab("True TPM") + ylab("Estimated TPM") +
plot_theme() +
geom_text(x = 3, y = -5, aes(label = paste0("cor = ", sp)), size = 3, data = cors2)
df_sub <- subset(df, true_tpm > 0 & est_tpm > 0)
cors_sub <- df_sub %>% group_by(measure, annotation) %>%
summarise(sp = signif(cor(true_tpm, est_tpm, use = "complete", method = "spearman"), 3))
ggplot(df_sub, aes(x = true_tpm, y = est_tpm, col = measure)) +
scale_colour_manual(values = c("blue", "red", "dimgrey"), guide = FALSE) +
geom_abline(intercept = 0, slope = 1) +
facet_grid(annotation ~ measure) +
geom_point(alpha = 0.5) +
scale_x_log10() + scale_y_log10() +
xlab("True TPM") + ylab("Estimated TPM") +
plot_theme() +
geom_text(x = 3, y = -5, aes(label = paste0("cor = ", sp)), size = 3, data = cors_sub)
df_sub <- df
df_sub$est_tpm[is.na(df_sub$est_tpm)] <- 0
cors_sub <- df_sub %>% group_by(measure, annotation) %>%
summarise(sp = signif(cor(true_tpm, est_tpm, use = "complete", method = "spearman"), 3))
ggplot(df_sub, aes(x = true_tpm, y = est_tpm, col = measure)) +
scale_colour_manual(values = c("blue", "red", "dimgrey"), guide = FALSE) +
geom_abline(intercept = 0, slope = 1) +
facet_grid(annotation ~ measure) +
geom_point(alpha = 0.5) +
scale_x_log10() + scale_y_log10() +
xlab("True TPM") + ylab("Estimated TPM") +
plot_theme() +
geom_text(x = 3, y = -5, aes(label = paste0("cor = ", sp)), size = 3, data = cors_sub)
## Consider only gene-level estimates
dfg <- rbind(sampleA1_gene_salmon[, c("gene", "measure", "annotation", "true_tpm", "est_tpm")],
sampleA1_gene_fc[, c("gene", "measure", "annotation", "true_tpm", "est_tpm")])
dfg$measure <- factor(dfg$measure, levels = c("salmon_gene",
"featureCounts_FPKM_gene"))
## Find paralogous genes using biomaRt
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org")
ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
bm <- getBM(attributes = c("ensembl_gene_id", "hsapiens_paralog_ensembl_gene"),
filters = "ensembl_gene_id", values = dfg$gene, mart = ensembl)
bm <- bm[which(bm$hsapiens_paralog_ensembl_gene != ""), ]
bm <- subset(bm, hsapiens_paralog_ensembl_gene %in% dfg$gene)
tt <- sort(table(bm$ensembl_gene_id), decreasing = TRUE)
tt <- tt[which(tt >= 5)]
basisgenes <- c()
for (nm in names(tt)) {
if (!any(c(nm, bm$hsapiens_paralog_ensembl_gene[which(bm$ensembl_gene_id == nm)]) %in% basisgenes)) {
tmpdf <- subset(dfg, gene %in% c(nm, bm$hsapiens_paralog_ensembl_gene[which(bm$ensembl_gene_id == nm)]))
if (median(tmpdf$true_tpm) > 5) {
basisgenes <- c(basisgenes, nm)
}
}
}
pdf("paralogs_sim2.pdf", width = 7, height = 5)
for (gn in basisgenes) {
dfg_sub <- subset(dfg, gene %in% c(gn, bm$hsapiens_paralog_ensembl_gene[which(bm$ensembl_gene_id == gn)]))
cors_sub <- dfg_sub %>% group_by(measure) %>%
summarise(sp = signif(cor(true_tpm, est_tpm, use = "complete", method = "spearman"), 3),
pe = signif(cor(log10(true_tpm + 1), log10(est_tpm + 1),
use = "complete", method = "pearson"), 3))
xpos <- 0.25 * min(log10(dfg_sub$true_tpm[dfg_sub$true_tpm > 0])) +
0.75 * max(log10(dfg_sub$true_tpm[dfg_sub$true_tpm > 0]))
ypos <- 0.75 * min(log10(dfg_sub$est_tpm[dfg_sub$est_tpm > 0])) +
0.25 * max(log10(dfg_sub$est_tpm[dfg_sub$est_tpm > 0]))
ypos2 <- 0.8 * min(log10(dfg_sub$est_tpm[dfg_sub$est_tpm > 0])) +
0.2 * max(log10(dfg_sub$est_tpm[dfg_sub$est_tpm > 0]))
print(ggplot(dfg_sub, aes(x = true_tpm, y = est_tpm, col = measure)) +
scale_colour_manual(values = c("red", "dimgrey"), guide = FALSE) +
geom_abline(intercept = 0, slope = 1) +
facet_grid(annotation ~ measure) +
geom_point(size = 2) +
scale_x_log10() + scale_y_log10() +
xlab("True TPM") + ylab("Estimated TPM") +
plot_theme() + ggtitle(gn) +
geom_text(x = xpos, y = ypos, aes(label = paste0("spearman = ", sp)), size = 3, data = cors_sub) +
geom_text(x = xpos, y = ypos2, aes(label = paste0("pearson = ", pe)), size = 3, data = cors_sub))
}
dev.off()
## pdf
## 2
To quantify the variability of the transcript and gene TPM estimates obtained with Salmon, we calculate the coefficients of variation across 30 bootstrap replicates for one of the samples in the data set.
sampleA1_tx_salmon <- merge(sampleA1_tx_salmon,
data.frame(feature = rownames(CV_tx_salmon),
coefvar = CV_tx_salmon[, "sampleA1"],
stringsAsFactors = FALSE),
by = "feature", all = TRUE)
sampleA1_gene_salmon <- merge(sampleA1_gene_salmon,
data.frame(gene = rownames(CV_gene_salmon),
coefvar = CV_gene_salmon[, "sampleA1"],
stringsAsFactors = FALSE),
by = "gene", all = TRUE)
cvs_sal <- data.frame(cvs = c(sampleA1_tx_salmon$coefvar, sampleA1_gene_salmon$coefvar),
lev = rep(c("transcript", "gene"), c(nrow(sampleA1_tx_salmon), nrow(sampleA1_gene_salmon))))
ggplot(cvs_sal, aes(x = log10(cvs + 0.001), col = lev)) +
geom_line(stat = "density", size = 2) +
ggtitle("") +
xlab("log10(coefficient of variation + 0.001)") + ylab("density") +
scale_colour_manual(values = c("red", "blue"), name = "") +
plot_theme()
geneCOUNT_fc <- fc$counts[match(rownames(salmon_quant$geneCOUNT_sal_simplesum),
rownames(fc$counts)), ]
stopifnot(all(rownames(salmon_quant$geneCOUNT_sal_simplesum) ==
rownames(salmon_quant$geneCOUNT_sal_scaledTPM)))
sampleA1 <- cbind(simplesum_salmon = salmon_quant$geneCOUNT_sal_simplesum[, "sampleA1"],
scaledTPM_salmon = salmon_quant$geneCOUNT_sal_scaledTPM[, "sampleA1"],
featureCounts = geneCOUNT_fc[, "sampleA1"])
rownames(sampleA1) <- rownames(salmon_quant$geneCOUNT_sal_simplesum)
sampleA1 <- log2(sampleA1 + 1)
pairs(sampleA1, upper.panel = panel_smooth, lower.panel = panel_cor)
pairs(sampleA1[which(rowSums(sampleA1 <= 0) == 0), ], upper.panel = panel_smooth,
lower.panel = panel_cor)
idx <- rownames(sampleA1)[order(abs(sampleA1[, "simplesum_salmon"] - sampleA1[, "scaledTPM_salmon"]),
decreasing = TRUE)[1:5]]
2^sampleA1[idx, , drop = FALSE] - 1
## simplesum_salmon scaledTPM_salmon featureCounts
## ENSG00000268552 85.84020 5045.5211 0
## ENSG00000230953 12.00000 640.2443 12
## ENSG00000230777 2.00000 117.5560 2
## ENSG00000236876 9.00000 337.4714 9
## ENSG00000223804 64.39892 1917.5005 13
subset(tx2gene, gene %in% idx)
## transcript gene
## 13728 ENST00000440199 ENSG00000223804
## 13729 ENST00000445753 ENSG00000223804
## 13730 ENST00000418192 ENSG00000223804
## 13731 ENST00000440862 ENSG00000223804
## 13732 ENST00000429388 ENSG00000223804
## 14916 ENST00000452777 ENSG00000230777
## 14966 ENST00000420575 ENSG00000230953
## 15984 ENST00000424567 ENSG00000236876
## 15985 ENST00000536927 ENSG00000236876
## 17201 ENST00000601459 ENSG00000268552
We perform DTE by applying edgeR to transcript-level counts obtained from Salmon. The p-values from edgeR are then aggregated into gene-wise FDR estimates using the perGeneQValue
function from DEXSeq.
res_dte_salmon_tx <- diff_expression_edgeR(counts = salmon_quant$txCOUNT_sal,
meta = meta, cond_name = "condition",
sample_name = "sample",
gene_length_matrix = NULL)
res_dte_salmon_tx$tt$gene <- as.character(tx2gene$gene[match(rownames(res_dte_salmon_tx$tt),
tx2gene$transcript)])
## Summarise on gene level
geneSplit = split(seq(along = res_dte_salmon_tx$tt$gene), res_dte_salmon_tx$tt$gene)
pGene = sapply(geneSplit, function(i) min(res_dte_salmon_tx$tt$PValue[i]))
stopifnot(all(is.finite(pGene)))
theta = unique(sort(pGene))
q = DEXSeq:::perGeneQValueExact(pGene, theta, geneSplit)
qval_dte_salmon_gene = rep(NA_real_, length(pGene))
qval_dte_salmon_gene = q[match(pGene, theta)]
qval_dte_salmon_gene = pmin(1, qval_dte_salmon_gene)
names(qval_dte_salmon_gene) = names(geneSplit)
stopifnot(!any(is.na(qval_dte_salmon_gene)))
Here, we compare the DTE performance evaluated on transcript- and gene-level. Note that both the result level (transcript results vs aggregated gene results) and the truth (differentially expressed transcript vs genes with at least one differentially expressed transcript) are different, and the performance is evaluated by comparing each result table to its respective truth.
truth_tx <- read.delim(truth_tx_file, header = TRUE, as.is = TRUE)
truth_any <- read.delim(truth_gene_file, header = TRUE, as.is = TRUE)
cobra_dte <- COBRAData(padj = data.frame(transcript_salmon = res_dte_salmon_tx$tt$FDR,
row.names = rownames(res_dte_salmon_tx$tt)),
truth = data.frame(truth_tx[, "status", drop = FALSE],
row.names = truth_tx$transcript,
stringsAsFactors = FALSE))
tmp <- truth_any[, "status_any", drop = FALSE]
colnames(tmp) <- "status"
cobra_dte <- COBRAData(padj = data.frame(gene_salmon = qval_dte_salmon_gene,
row.names = names(qval_dte_salmon_gene)),
truth = data.frame(tmp,
row.names = truth_any$gene,
stringsAsFactors = FALSE),
object_to_extend = cobra_dte)
cobraperf_dte <- calculate_performance(cobra_dte, binary_truth = "status",
aspects = c("fdrtpr", "fdrtprcurve"),
onlyshared = TRUE)
cobraplot_dte <- prepare_data_for_plot(cobraperf_dte, colorscheme = c("red", "blue"),
facetted = FALSE)
fdrtpr(cobraplot_dte)$fullmethod <- gsub("_overall", "", fdrtpr(cobraplot_dte)$fullmethod)
fdrtprcurve(cobraplot_dte)$fullmethod <- gsub("_overall", "", fdrtprcurve(cobraplot_dte)$fullmethod)
plot_fdrtprcurve(cobraplot_dte, pointsize = 2, xaxisrange = c(0, 0.7)) +
theme(legend.position = "bottom", legend.text = element_text(size = 20))
g2tx <- gene2tx
g2tx$txstatus <- truth(cobra_dte)$status[match(g2tx$transcript, rownames(truth(cobra_dte)))]
g2tx$txsign <- as.numeric(padj(cobra_dte)$transcript_salmon[match(g2tx$transcript,
rownames(padj(cobra_dte)))] <= 0.05)
g2tx$txsign[which(is.na(g2tx$txsign))] <- 0
nbrtx <- g2tx %>% group_by(gene) %>% summarise(nbr_tx = length(transcript),
nbr_truedetx = sum(txstatus),
nbr_signtx = sum(txsign),
nbr_tptx = sum(txstatus*txsign)) %>% as.data.frame()
G <- merge(nbrtx, truth(cobra_dte), by.x = "gene", by.y = 0)
colnames(G)[colnames(G) == "status"] <- "gene_truedte"
G$gene_sign <- as.numeric(padj(cobra_dte)$gene_salmon[match(G$gene, rownames(padj(cobra_dte)))] <= 0.05)
G$gene_sign[which(is.na(G$gene_sign))] <- 0
G$gene_tp <- G$gene_sign*G$gene_truedte
G$frac_tptx <- G$nbr_tptx/G$nbr_truedetx
G$frac_tptx[which(is.na(G$frac_tptx))] <- 0
ggplot(subset(G, gene_tp == 1), aes(x = frac_tptx)) + geom_bar() +
plot_theme() + xlab("Fraction true positive transcripts") +
ggtitle("True positive genes")
cobra <- COBRAData(score = data.frame(transcript_salmon = abs(res_dte_salmon_tx$tt$logFC),
row.names = rownames(res_dte_salmon_tx$tt)))
truth_tx <- read.delim(truth_tx_file, header = TRUE, as.is = TRUE)
cobra <- COBRAData(truth = data.frame(truth_tx, row.names = truth_tx$transcript,
stringsAsFactors = FALSE), object_to_extend = cobra)
cobraperf <- iCOBRA::calculate_performance(cobra, cont_truth = "logFC")
cobraplot1 <- iCOBRA::prepare_data_for_plot(cobraperf, incltruth = TRUE, facetted = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10)])
corr(cobraplot1)$fullmethod <- gsub("_overall", "", corr(cobraplot1)$fullmethod)
deviation(cobraplot1)$fullmethod <- gsub("_overall", "", deviation(cobraplot1)$fullmethod)
scatter(cobraplot1)$fullmethod <- gsub("_overall", "", scatter(cobraplot1)$fullmethod)
iCOBRA::plot_corr(cobraplot1, pointsize = 5, corrtype = "spearman") + theme(legend.position = "bottom")
corr(cobraplot1)[, c("basemethod", "splitval", "PEARSON", "SPEARMAN")]
## basemethod splitval PEARSON SPEARMAN
## 1 transcript_salmon overall 0.4878148 0.4911797
iCOBRA::plot_deviation(cobraplot1) + theme(legend.position = "bottom")
iCOBRA::plot_scatter(cobraplot1, pointsize = 2) + theme(legend.position = "bottom") +
geom_abline(intercept = 0, slope = 1) + geom_point(size = 2)
################################################################################
## Remove genes with true logFC = 0 or Inf
keep <- rownames(truth(cobra))[which(!truth(cobra)$logFC %in% c(0, Inf))]
cobrasub <- cobra[keep, ]
cobraperf <- iCOBRA::calculate_performance(cobrasub, cont_truth = "logFC")
cobraplot1 <- iCOBRA::prepare_data_for_plot(cobraperf, incltruth = TRUE, facetted = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10)])
corr(cobraplot1)$fullmethod <- gsub("_overall", "", corr(cobraplot1)$fullmethod)
deviation(cobraplot1)$fullmethod <- gsub("_overall", "", deviation(cobraplot1)$fullmethod)
scatter(cobraplot1)$fullmethod <- gsub("_overall", "", scatter(cobraplot1)$fullmethod)
iCOBRA::plot_corr(cobraplot1, pointsize = 5, corrtype = "spearman") + theme(legend.position = "bottom")
corr(cobraplot1)[, c("basemethod", "splitval", "PEARSON", "SPEARMAN")]
## basemethod splitval PEARSON SPEARMAN
## 1 transcript_salmon overall 0.5142394 0.7338135
iCOBRA::plot_deviation(cobraplot1) + theme(legend.position = "bottom")
iCOBRA::plot_scatter(cobraplot1, pointsize = 2) + theme(legend.position = "bottom") +
geom_abline(intercept = 0, slope = 1) + geom_point(size = 2)
Given the gene count matrices defined above (corresponding to the featureCounts, simplesum and scaledTPM matrices mentioned in the manuscript), we apply edgeR (see below for DESeq2 results) to perform differential gene expression. For the simplesum and featureCounts matrices, we also perform the analyses using the offsets derived from the average transcript lengths (simplesum_avetxl).
res_sal_simplesum_edgeR <- diff_expression_edgeR(counts = salmon_quant$geneCOUNT_sal_simplesum,
meta = meta, cond_name = "condition",
sample_name = "sample",
gene_length_matrix = NULL)
res_sal_simplesum_avetxl_edgeR <- diff_expression_edgeR(counts = salmon_quant$geneCOUNT_sal_simplesum,
meta = meta, cond_name = "condition",
sample_name = "sample",
gene_length_matrix = salmon_quant$avetxlength)
res_sal_scaledTPM_edgeR <- diff_expression_edgeR(counts = salmon_quant$geneCOUNT_sal_scaledTPM,
meta = meta, cond_name = "condition",
sample_name = "sample",
gene_length_matrix = NULL)
res_fc_edgeR <- diff_expression_edgeR(counts = geneCOUNT_fc,
meta = meta, cond_name = "condition",
sample_name = "sample",
gene_length_matrix = NULL)
res_fc_avetxl_edgeR <- diff_expression_edgeR(counts = geneCOUNT_fc,
meta = meta, cond_name = "condition",
sample_name = "sample",
gene_length_matrix = salmon_quant$avetxlength)
dfh <- data.frame(pvalue = c(res_fc_edgeR$tt$PValue,
res_fc_avetxl_edgeR$tt$PValue,
res_sal_scaledTPM_edgeR$tt$PValue,
res_sal_simplesum_edgeR$tt$PValue,
res_sal_simplesum_avetxl_edgeR$tt$PValue),
mth = c(rep("featureCounts", nrow(res_fc_edgeR$tt)),
rep("featureCounts_avetxl", nrow(res_fc_avetxl_edgeR$tt)),
rep("scaledTPM_salmon", nrow(res_sal_scaledTPM_edgeR$tt)),
rep("simplesum_salmon", nrow(res_sal_simplesum_edgeR$tt)),
rep("simplesum_salmon_avetxl", nrow(res_sal_simplesum_avetxl_edgeR$tt))))
ggplot(dfh, aes(x = pvalue)) + geom_histogram() + facet_wrap(~mth) +
plot_theme() +
xlab("p-value") + ylab("count")
par(mfrow = c(2, 3))
plotBCV(res_fc_edgeR$dge, main = "featureCounts")
plotBCV(res_fc_avetxl_edgeR$dge, main = "featureCounts_avetxl")
plotBCV(res_sal_scaledTPM_edgeR$dge, main = "scaledTPM_salmon")
plotBCV(res_sal_simplesum_edgeR$dge, main = "simplesum_salmon")
plotBCV(res_sal_simplesum_avetxl_edgeR$dge, main = "simplesum_salmon_avetxl")
par(mfrow = c(2, 3))
plotSmear(res_fc_edgeR$dge, main = "featureCounts", ylim = c(-11, 11))
plotSmear(res_fc_avetxl_edgeR$dge, main = "featureCounts_avetxl", ylim = c(-11, 11))
plotSmear(res_sal_scaledTPM_edgeR$dge, main = "scaledTPM_salmon", ylim = c(-11, 11))
plotSmear(res_sal_simplesum_edgeR$dge, main = "simplesum_salmon", ylim = c(-11, 11))
plotSmear(res_sal_simplesum_avetxl_edgeR$dge, main = "simplesum_salmon_avetxl", ylim = c(-11, 11))
par(mfrow = c(1, 1))
cobra_edgeR <- COBRAData(padj = data.frame(simplesum_salmon = res_sal_simplesum_edgeR$tt$FDR,
row.names = rownames(res_sal_simplesum_edgeR$tt)))
cobra_edgeR <- COBRAData(padj = data.frame(scaledTPM_salmon = res_sal_scaledTPM_edgeR$tt$FDR,
row.names = rownames(res_sal_scaledTPM_edgeR$tt)),
object_to_extend = cobra_edgeR)
cobra_edgeR <- COBRAData(padj = data.frame(simplesum_salmon_avetxl = res_sal_simplesum_avetxl_edgeR$tt$FDR,
row.names = rownames(res_sal_simplesum_avetxl_edgeR$tt)),
object_to_extend = cobra_edgeR)
cobra_edgeR <- COBRAData(padj = data.frame(featureCounts = res_fc_edgeR$tt$FDR,
row.names = rownames(res_fc_edgeR$tt)),
object_to_extend = cobra_edgeR)
cobra_edgeR <- COBRAData(padj = data.frame(featureCounts_avetxl = res_fc_avetxl_edgeR$tt$FDR,
row.names = rownames(res_fc_avetxl_edgeR$tt)),
object_to_extend = cobra_edgeR)
truth_gene <- read.delim(truth_gene_file, header = TRUE, as.is = TRUE, row.names = 1)
cobra_edgeR <- COBRAData(truth = truth_gene, object_to_extend = cobra_edgeR)
cobraperf_edgeR <- calculate_performance(cobra_edgeR, binary_truth = "status_overall")
cobraplot1_edgeR <- prepare_data_for_plot(cobraperf_edgeR, incltruth = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10)])
plot_overlap(cobraplot1_edgeR, cex = c(1, 0.7, 0.7))
## FDR/TPR and FPC
cobraplot2_edgeR <- prepare_data_for_plot(cobraperf_edgeR, incltruth = TRUE,
colorscheme = muted[c(1, 3, 5, 8, 10, 13)],
facetted = FALSE)
fdrtpr(cobraplot2_edgeR)$fullmethod <- gsub("_overall", "", fdrtpr(cobraplot2_edgeR)$fullmethod)
fdrtprcurve(cobraplot2_edgeR)$fullmethod <- gsub("_overall", "", fdrtprcurve(cobraplot2_edgeR)$fullmethod)
plot_fdrtprcurve(cobraplot2_edgeR, pointsize = 3, xaxisrange = c(0, 0.8),
linewidth = 2) +
theme(legend.position = "bottom") + guides(colour = guide_legend(nrow = 2))
fpc(cobraplot2_edgeR)$fullmethod <- gsub("_overall", "", fpc(cobraplot2_edgeR)$fullmethod)
plot_fpc(cobraplot2_edgeR) + plot_theme() +
theme(legend.position = "bottom") + guides(colour = guide_legend(nrow = 2))
## Stratify by diff isoform usage
cobraperf3_edgeR <- calculate_performance(cobra_edgeR, binary_truth = "status_overall",
splv = "diffisouse", maxsplit = 4)
cobraplot3_edgeR <- prepare_data_for_plot(cobraperf3_edgeR, incltruth = TRUE,
incloverall = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10, 13)])
levels(fdrtpr(cobraplot3_edgeR)$splitval)[levels(fdrtpr(cobraplot3_edgeR)$splitval) == "diffisouse:0"] <- "no differential isoform usage"
levels(fdrtpr(cobraplot3_edgeR)$splitval)[levels(fdrtpr(cobraplot3_edgeR)$splitval) == "diffisouse:1"] <- "differential isoform usage"
levels(fdrtprcurve(cobraplot3_edgeR)$splitval)[levels(fdrtprcurve(cobraplot3_edgeR)$splitval) == "diffisouse:0"] <- "no differential isoform usage"
levels(fdrtprcurve(cobraplot3_edgeR)$splitval)[levels(fdrtprcurve(cobraplot3_edgeR)$splitval) == "diffisouse:1"] <- "differential isoform usage"
plot_fdrtprcurve(cobraplot3_edgeR, pointsize = 2, xaxisrange = c(0, 0.8),
linewidth = 2) +
theme(legend.position = "bottom") + guides(colour = guide_legend(nrow = 2))
df1 <- Reduce(function(...) merge(..., by = "gene", all = TRUE),
list(data.frame(gene = rownames(res_fc_edgeR$tt),
featureCounts = res_fc_edgeR$tt$logFC,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_fc_avetxl_edgeR$tt),
featureCounts_avetxl = res_fc_avetxl_edgeR$tt$logFC,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_scaledTPM_edgeR$tt),
scaledTPM_salmon = res_sal_scaledTPM_edgeR$tt$logFC,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_edgeR$tt),
simplesum_salmon = res_sal_simplesum_edgeR$tt$logFC,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_avetxl_edgeR$tt),
simplesum_salmon_avetxl =
res_sal_simplesum_avetxl_edgeR$tt$logFC,
stringsAsFactors = FALSE)))
rownames(df1) <- df1$gene
df1$gene <- NULL
pairs(df1, upper.panel = panel_smooth, lower.panel = panel_cor)
truth_gene <- read.delim(truth_gene_file, header = TRUE, as.is = TRUE, row.names = 1)
df2 <- Reduce(function(...) merge(..., by = "gene", all = TRUE),
list(data.frame(gene = rownames(salmon_quant$geneTPM_sal),
salmon_quant$geneTPM_sal,
stringsAsFactors = FALSE),
data.frame(gene = rownames(truth_gene),
DTU = c("No DTU", "DTU")[truth_gene$dtu + 1],
DTE = c("No DTE", "DTE")[truth_gene$dte + 1],
DGE = c("No DGE", "DGE")[truth_gene$dge + 1],
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_scaledTPM_edgeR$tt),
scaledTPM_salmon_logFC = res_sal_scaledTPM_edgeR$tt$logFC,
scaledTPM_salmon_logCPM = res_sal_scaledTPM_edgeR$tt$logCPM,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_edgeR$tt),
simplesum_salmon_logFC = res_sal_simplesum_edgeR$tt$logFC,
simplesum_salmon_logCPM = res_sal_simplesum_edgeR$tt$logCPM,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_avetxl_edgeR$tt),
simplesum_salmon_avetxl_logFC =
res_sal_simplesum_avetxl_edgeR$tt$logFC,
simplesum_salmon_avetxl_logCPM =
res_sal_simplesum_avetxl_edgeR$tt$logCPM,
stringsAsFactors = FALSE)))
rownames(df2) <- df2$gene
df2$gene <- NULL
df2$scaledTPM_salmon_logCPMbinary <- Hmisc::cut2(df2$scaledTPM_salmon_logCPM, g = 2)
df2$simplesum_salmon_logCPMbinary <- Hmisc::cut2(df2$simplesum_salmon_logCPM, g = 2)
df2$simplesum_salmon_avetxl_logCPMbinary <- Hmisc::cut2(df2$simplesum_salmon_avetxl_logCPM, g = 2)
df2$categ <- rep("Not differential", nrow(df2))
df2$categ[df2$DTU == "DTU"] <- "DTU"
df2$categ[df2$DTE == "DTE"] <- "DTE"
df2$categ[df2$DGE == "DGE"] <- "DGE"
df2$sumA <- df2$sampleA1 + df2$sampleA2 + df2$sampleA3
df2$sumB <- df2$sampleB1 + df2$sampleB2 + df2$sampleB3
df2$allzero_onecond <- "expressed in both groups"
df2$allzero_onecond[union(which(df2$sumA == 0), which(df2$sumB == 0))] <- "expressed in one group"
ggplot(df2, aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC, col = DTU)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
plot_theme() + ggtitle("sim2") + theme(legend.position = "bottom") +
scale_color_manual(values = c("red", "blue"), name = "") +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
guides(colour = guide_legend(override.aes = list(size = 7)))
ggplot(df2, aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC, col = categ)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
plot_theme() + ggtitle("sim2") + theme(legend.position = "bottom") +
scale_color_manual(values = c("red", "blue", "green", "dimgrey"), name = "") +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
guides(colour = guide_legend(override.aes = list(size = 7)))
ggplot(df2, aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC, col = allzero_onecond)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
plot_theme() + ggtitle("sim2") + theme(legend.position = "bottom") +
scale_color_manual(values = c("blue", "red"), name = "") +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
guides(colour = guide_legend(override.aes = list(size = 7)))
ggplot(subset(df2, !is.na(scaledTPM_salmon_logCPMbinary)),
aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC,
col = scaledTPM_salmon_logCPMbinary)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
facet_wrap(~scaledTPM_salmon_logCPMbinary) +
plot_theme() + ggtitle("sim2") + theme(legend.position = "bottom") +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
scale_color_manual(values = c("red", "blue"), name = "scaledTPM_salmon, logCPM") +
guides(colour = guide_legend(override.aes = list(size = 7)))
cobra <- COBRAData(score = data.frame(simplesum_salmon = abs(res_sal_simplesum_edgeR$tt$logFC),
row.names = rownames(res_sal_simplesum_edgeR$tt)))
cobra <- COBRAData(score = data.frame(scaledTPM_salmon = abs(res_sal_scaledTPM_edgeR$tt$logFC),
row.names = rownames(res_sal_scaledTPM_edgeR$tt)),
object_to_extend = cobra)
cobra <- COBRAData(score = data.frame(simplesum_salmon_avetxl = abs(res_sal_simplesum_avetxl_edgeR$tt$logFC),
row.names = rownames(res_sal_simplesum_avetxl_edgeR$tt)),
object_to_extend = cobra)
cobra <- COBRAData(score = data.frame(featureCounts = abs(res_fc_edgeR$tt$logFC),
row.names = rownames(res_fc_edgeR$tt)),
object_to_extend = cobra)
cobra <- COBRAData(score = data.frame(featureCounts_avetxl = abs(res_fc_avetxl_edgeR$tt$logFC),
row.names = rownames(res_fc_avetxl_edgeR$tt)),
object_to_extend = cobra)
truth_gene <- read.delim(truth_gene_file, header = TRUE, as.is = TRUE, row.names = 1)
cobra <- COBRAData(truth = truth_gene, object_to_extend = cobra)
cobraperf <- iCOBRA::calculate_performance(cobra, cont_truth = "logFC_overall")
cobraplot1 <- iCOBRA::prepare_data_for_plot(cobraperf, incltruth = TRUE, facetted = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10)])
corr(cobraplot1)$fullmethod <- gsub("_overall", "", corr(cobraplot1)$fullmethod)
deviation(cobraplot1)$fullmethod <- gsub("_overall", "", deviation(cobraplot1)$fullmethod)
scatter(cobraplot1)$fullmethod <- gsub("_overall", "", scatter(cobraplot1)$fullmethod)
iCOBRA::plot_corr(cobraplot1, pointsize = 5, corrtype = "spearman") + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
corr(cobraplot1)[, c("basemethod", "splitval", "PEARSON", "SPEARMAN")]
## basemethod splitval PEARSON SPEARMAN
## 1 featureCounts overall 0.6637244 0.4336520
## 2 featureCounts_avetxl overall 0.7062586 0.4870360
## 3 scaledTPM_salmon overall 0.5579301 0.4449023
## 4 simplesum_salmon overall 0.6288003 0.4078021
## 5 simplesum_salmon_avetxl overall 0.6651315 0.4558745
iCOBRA::plot_deviation(cobraplot1) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
iCOBRA::plot_scatter(cobraplot1, pointsize = 2) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2)) + geom_abline(intercept = 0, slope = 1) + geom_point(size = 2)
## Stratify by differential isoform usage
cobraperf2 <- iCOBRA::calculate_performance(cobra, cont_truth = "logFC_overall",
splv = "diffisouse")
cobraplot2 <- iCOBRA::prepare_data_for_plot(cobraperf2, incltruth = TRUE, facetted = TRUE,
incloverall = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10)])
levels(corr(cobraplot2)$splitval)[levels(corr(cobraplot2)$splitval) == "diffisouse:0"] <-
"no differential isoform usage"
levels(corr(cobraplot2)$splitval)[levels(corr(cobraplot2)$splitval) == "diffisouse:1"] <-
"differential isoform usage"
levels(deviation(cobraplot2)$splitval)[levels(deviation(cobraplot2)$splitval) == "diffisouse:0"] <-
"no differential isoform usage"
levels(deviation(cobraplot2)$splitval)[levels(deviation(cobraplot2)$splitval) == "diffisouse:1"] <-
"differential isoform usage"
levels(scatter(cobraplot2)$splitval)[levels(scatter(cobraplot2)$splitval) == "diffisouse:0"] <-
"no differential isoform usage"
levels(scatter(cobraplot2)$splitval)[levels(scatter(cobraplot2)$splitval) == "diffisouse:1"] <-
"differential isoform usage"
iCOBRA::plot_corr(cobraplot2, pointsize = 5, corrtype = "spearman") + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
corr(cobraplot2)[, c("basemethod", "splitval", "PEARSON", "SPEARMAN")]
## basemethod splitval PEARSON SPEARMAN
## 1 featureCounts no differential isoform usage 0.7035890 0.5856803
## 2 featureCounts differential isoform usage 0.4942483 0.1140712
## 4 featureCounts_avetxl no differential isoform usage 0.7024402 0.5848455
## 5 featureCounts_avetxl differential isoform usage 0.7269604 0.4459979
## 7 scaledTPM_salmon no differential isoform usage 0.5275662 0.5182567
## 8 scaledTPM_salmon differential isoform usage 0.7990480 0.4988737
## 10 simplesum_salmon no differential isoform usage 0.6434089 0.5440508
## 11 simplesum_salmon differential isoform usage 0.5462938 0.1284064
## 13 simplesum_salmon_avetxl no differential isoform usage 0.6421550 0.5420244
## 14 simplesum_salmon_avetxl differential isoform usage 0.8311927 0.4702301
iCOBRA::plot_deviation(cobraplot2) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
iCOBRA::plot_scatter(cobraplot2, pointsize = 2, doflip = TRUE) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2)) + geom_abline(intercept = 0, slope = 1) + geom_point(size = 2)
################################################################################
## Remove genes with true logFC = 0
keep <- rownames(truth(cobra))[which(truth(cobra)$logFC_overall != 0)]
cobrasub <- cobra[keep, ]
cobraperf <- iCOBRA::calculate_performance(cobrasub, cont_truth = "logFC_overall")
cobraplot1 <- iCOBRA::prepare_data_for_plot(cobraperf, incltruth = TRUE, facetted = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10)])
corr(cobraplot1)$fullmethod <- gsub("_overall", "", corr(cobraplot1)$fullmethod)
deviation(cobraplot1)$fullmethod <- gsub("_overall", "", deviation(cobraplot1)$fullmethod)
scatter(cobraplot1)$fullmethod <- gsub("_overall", "", scatter(cobraplot1)$fullmethod)
iCOBRA::plot_corr(cobraplot1, pointsize = 5, corrtype = "spearman") + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
corr(cobraplot1)[, c("basemethod", "splitval", "PEARSON", "SPEARMAN")]
## basemethod splitval PEARSON SPEARMAN
## 1 featureCounts overall 0.8633384 0.8764170
## 2 featureCounts_avetxl overall 0.8797643 0.8990422
## 3 scaledTPM_salmon overall 0.8561199 0.8932697
## 4 simplesum_salmon overall 0.8742231 0.8758923
## 5 simplesum_salmon_avetxl overall 0.8894210 0.8943751
iCOBRA::plot_deviation(cobraplot1) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
iCOBRA::plot_scatter(cobraplot1, pointsize = 2) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2)) + geom_abline(intercept = 0, slope = 1) + geom_point(size = 2)
## Stratify by diff isoform usage
cobraperf2 <- iCOBRA::calculate_performance(cobrasub, cont_truth = "logFC_overall",
splv = "diffisouse")
cobraplot2 <- iCOBRA::prepare_data_for_plot(cobraperf2, incltruth = TRUE, facetted = TRUE,
incloverall = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10)])
levels(corr(cobraplot2)$splitval)[levels(corr(cobraplot2)$splitval) == "diffisouse:0"] <-
"no differential isoform usage"
levels(corr(cobraplot2)$splitval)[levels(corr(cobraplot2)$splitval) == "diffisouse:1"] <-
"differential isoform usage"
levels(deviation(cobraplot2)$splitval)[levels(deviation(cobraplot2)$splitval) == "diffisouse:0"] <-
"no differential isoform usage"
levels(deviation(cobraplot2)$splitval)[levels(deviation(cobraplot2)$splitval) == "diffisouse:1"] <-
"differential isoform usage"
levels(scatter(cobraplot2)$splitval)[levels(scatter(cobraplot2)$splitval) == "diffisouse:0"] <-
"no differential isoform usage"
levels(scatter(cobraplot2)$splitval)[levels(scatter(cobraplot2)$splitval) == "diffisouse:1"] <-
"differential isoform usage"
iCOBRA::plot_corr(cobraplot2, pointsize = 5, corrtype = "spearman") + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
corr(cobraplot2)[, c("basemethod", "splitval", "PEARSON", "SPEARMAN")]
## basemethod splitval PEARSON SPEARMAN
## 1 featureCounts no differential isoform usage 0.7994075 0.8339107
## 2 featureCounts differential isoform usage 0.8160137 0.7252924
## 4 featureCounts_avetxl no differential isoform usage 0.8026411 0.8389572
## 5 featureCounts_avetxl differential isoform usage 0.8656760 0.7884148
## 7 scaledTPM_salmon no differential isoform usage 0.7499904 0.8163444
## 8 scaledTPM_salmon differential isoform usage 0.9150220 0.7961684
## 10 simplesum_salmon no differential isoform usage 0.8032161 0.8226264
## 11 simplesum_salmon differential isoform usage 0.8588051 0.7348913
## 13 simplesum_salmon_avetxl no differential isoform usage 0.8058894 0.8282040
## 14 simplesum_salmon_avetxl differential isoform usage 0.9132133 0.7831298
iCOBRA::plot_deviation(cobraplot2) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
iCOBRA::plot_scatter(cobraplot2, pointsize = 2, doflip = TRUE) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2)) + geom_abline(intercept = 0, slope = 1) + geom_point(size = 2)
A gene is considered truly differential if any of its transcripts are differential.
cobra <- COBRAData(padj = data.frame(simplesum_salmon_dge = res_sal_simplesum_edgeR$tt$FDR,
row.names = rownames(res_sal_simplesum_edgeR$tt)))
cobra <- COBRAData(padj = data.frame(scaledTPM_salmon_dge = res_sal_scaledTPM_edgeR$tt$FDR,
row.names = rownames(res_sal_scaledTPM_edgeR$tt)),
object_to_extend = cobra)
cobra <- COBRAData(padj = data.frame(simplesum_salmon_avetxl_dge = res_sal_simplesum_avetxl_edgeR$tt$FDR,
row.names = rownames(res_sal_simplesum_avetxl_edgeR$tt)),
object_to_extend = cobra)
cobra <- COBRAData(padj = data.frame(featureCounts_dge = res_fc_edgeR$tt$FDR,
row.names = rownames(res_fc_edgeR$tt)),
object_to_extend = cobra)
cobra <- COBRAData(padj = data.frame(featureCounts_avetxl_dge = res_fc_avetxl_edgeR$tt$FDR,
row.names = rownames(res_fc_avetxl_edgeR$tt)),
object_to_extend = cobra)
cobra <- COBRAData(padj = data.frame(salmon_dte = qval_dte_salmon_gene,
row.names = names(qval_dte_salmon_gene)),
object_to_extend = cobra)
truth_gene <- read.delim(truth_gene_file, header = TRUE, as.is = TRUE, row.names = 1)
cobra <- COBRAData(truth = truth_gene, object_to_extend = cobra)
cobraperf <- calculate_performance(cobra, binary_truth = "status_any")
cobraplot <- prepare_data_for_plot(cobraperf, incltruth = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10, 13)],
facetted = FALSE)
fdrtpr(cobraplot)$fullmethod <- gsub("_overall", "", fdrtpr(cobraplot)$fullmethod)
fdrtprcurve(cobraplot)$fullmethod <- gsub("_overall", "", fdrtprcurve(cobraplot)$fullmethod)
plot_fdrtprcurve(cobraplot, pointsize = 2, xaxisrange = c(0, 0.7)) +
theme(legend.position = "bottom") + guides(colour = guide_legend(nrow = 2))
BPPARAM = MulticoreParam(6)
stopifnot(all(colnames(salmon_quant$txCOUNT_sal) == rownames(meta)))
dxd_sal <- DEXSeqDataSet(countData = round(salmon_quant$txCOUNT_sal),
sampleData = meta,
design = ~sample + exon + condition:exon,
featureID = rownames(salmon_quant$txCOUNT_sal),
groupID = tx2gene$gene[match(rownames(salmon_quant$txCOUNT_sal),
tx2gene$transcript)])
dxd_sal <- estimateSizeFactors(dxd_sal)
dxd_sal <- estimateDispersions(dxd_sal, BPPARAM = BPPARAM)
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
plotDispEsts(dxd_sal)
dxd_sal <- testForDEU(dxd_sal, BPPARAM = BPPARAM)
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
dxr_sal <- DEXSeqResults(dxd_sal)
qval_dtu_salmon <- perGeneQValue(dxr_sal)
BPPARAM = MulticoreParam(6)
dexseqfiles <- list.files(paste0(basedir, "/quantifications/sim2/dexseq_exon"),
full.names = TRUE, pattern = "sample")
names(dexseqfiles) <- gsub("\\.txt$", "", basename(dexseqfiles))
dxd_exon <- DEXSeqDataSetFromHTSeq(countfiles = dexseqfiles,
sampleData = meta[match(names(dexseqfiles),
meta$sample), ],
design = ~sample + exon + condition:exon)
dxd_exon <- estimateSizeFactors(dxd_exon)
dxd_exon <- estimateDispersions(dxd_exon, BPPARAM = BPPARAM)
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
plotDispEsts(dxd_exon)
dxd_exon <- testForDEU(dxd_exon, BPPARAM = BPPARAM)
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
## using supplied model matrix
dxr_exon <- DEXSeqResults(dxd_exon)
qval_dtu_dexseq_exon <- perGeneQValue(dxr_exon)
cobra <- COBRAData(padj = data.frame(salmon_dexseq = qval_dtu_salmon,
row.names = names(qval_dtu_salmon),
stringsAsFactors = FALSE))
cobra <- COBRAData(padj = data.frame(dexseq = qval_dtu_dexseq_exon,
row.names = names(qval_dtu_dexseq_exon),
stringsAsFactors = FALSE),
object_to_extend = cobra)
truth_gene <- read.delim(truth_gene_file, header = TRUE, as.is = TRUE, row.names = 1)
cobra <- COBRAData(truth = truth_gene, object_to_extend = cobra)
cobraperf <- calculate_performance(cobra, binary_truth = "diffisouse")
cobraplot <- prepare_data_for_plot(cobraperf, incltruth = TRUE,
colorscheme = c("blue", "red", "green"),
facetted = FALSE)
fdrtpr(cobraplot)$fullmethod <- gsub("_overall", "", fdrtpr(cobraplot)$fullmethod)
fdrtprcurve(cobraplot)$fullmethod <- gsub("_overall", "", fdrtprcurve(cobraplot)$fullmethod)
plot_fdrtprcurve(cobraplot, pointsize = 2) +
theme(legend.position = "bottom") + guides(colour = guide_legend(nrow = 1))
plot_overlap(cobraplot, cex = c(1, 0.7, 0.7))
cobra <- COBRAData(padj = data.frame(salmon_dte = qval_dte_salmon_gene,
row.names = names(qval_dte_salmon_gene)))
cobra <- COBRAData(padj = data.frame(salmon_dexseq_dtu = qval_dtu_salmon,
row.names = names(qval_dtu_salmon)),
object_to_extend = cobra)
cobra <- COBRAData(padj = data.frame(simplesum_salmon_avetxl_dge = res_sal_simplesum_avetxl_edgeR$tt$FDR,
row.names = rownames(res_sal_simplesum_avetxl_edgeR$tt)),
object_to_extend = cobra)
cobraperf <- calculate_performance(cobra, aspects = "overlap", thr_venn = 0.05)
cobraplot <- prepare_data_for_plot(cobraperf)
plot_overlap(cobraplot, cex = c(1, 0.7, 0.7))
res_sal_simplesum_deseq2 <- diff_expression_DESeq2(txi = NULL,
counts = salmon_quant$geneCOUNT_sal_simplesum,
meta = meta, cond_name = "condition",
level1 = "A", level2 = "B",
sample_name = "sample")
res_sal_simplesum_avetxl_deseq2 <- diff_expression_DESeq2(txi = salmon_quant$txi_salmonsimplesum,
counts = NULL,
meta = meta, cond_name = "condition",
level1 = "A", level2 = "B",
sample_name = "sample")
res_sal_scaledTPM_deseq2 <- diff_expression_DESeq2(txi = NULL,
counts = salmon_quant$geneCOUNT_sal_scaledTPM,
meta = meta, cond_name = "condition",
level1 = "A", level2 = "B",
sample_name = "sample")
res_fc_deseq2 <- diff_expression_DESeq2(txi = NULL,
counts = geneCOUNT_fc,
meta = meta, cond_name = "condition",
level1 = "A", level2 = "B",
sample_name = "sample")
res_fc_avetxl_deseq2 <-
diff_expression_DESeq2(txi = list(counts = geneCOUNT_fc,
length = salmon_quant$avetxlength[match(rownames(geneCOUNT_fc),
rownames(salmon_quant$avetxlength)),
match(colnames(geneCOUNT_fc),
colnames(salmon_quant$avetxlength))],
countsFromAbundance = "no"),
counts = NULL,
meta = meta, cond_name = "condition",
level1 = "A", level2 = "B",
sample_name = "sample")
dfh <- data.frame(pvalue = c(res_fc_deseq2$res$pvalue,
res_fc_avetxl_deseq2$res$pvalue,
res_sal_scaledTPM_deseq2$res$pvalue,
res_sal_simplesum_deseq2$res$pvalue,
res_sal_simplesum_avetxl_deseq2$res$pvalue),
mth = c(rep("featureCounts", nrow(res_fc_deseq2$res)),
rep("featureCounts_avetxl", nrow(res_fc_avetxl_deseq2$res)),
rep("scaledTPM_salmon", nrow(res_sal_scaledTPM_deseq2$res)),
rep("simplesum_salmon", nrow(res_sal_simplesum_deseq2$res)),
rep("simplesum_salmon_avetxl", nrow(res_sal_simplesum_avetxl_deseq2$res))))
ggplot(dfh, aes(x = pvalue)) + geom_histogram() + facet_wrap(~mth) +
plot_theme() +
xlab("p-value") + ylab("count")
par(mfrow = c(2, 3))
plotDispEsts(res_fc_deseq2$dsd, main = "featureCounts")
plotDispEsts(res_fc_avetxl_deseq2$dsd, main = "featureCounts_avetxl")
plotDispEsts(res_sal_scaledTPM_deseq2$dsd, main = "scaledTPM_salmon")
plotDispEsts(res_sal_simplesum_deseq2$dsd, main = "simplesum_salmon")
plotDispEsts(res_sal_simplesum_avetxl_deseq2$dsd, main = "simplesum_salmon_avetxl")
par(mfrow = c(2, 3))
DESeq2::plotMA(res_fc_deseq2$dsd, main = "featureCounts")
DESeq2::plotMA(res_fc_avetxl_deseq2$dsd, main = "featureCounts_avetxl")
DESeq2::plotMA(res_sal_scaledTPM_deseq2$dsd, main = "scaledTPM_salmon")
DESeq2::plotMA(res_sal_simplesum_deseq2$dsd, main = "simplesum_salmon")
DESeq2::plotMA(res_sal_simplesum_avetxl_deseq2$dsd, main = "simplesum_salmon_avetxl")
par(mfrow = c(1, 1))
cobra_deseq2 <- COBRAData(padj = data.frame(simplesum_salmon = res_sal_simplesum_deseq2$res$padj,
row.names = rownames(res_sal_simplesum_deseq2$res)))
cobra_deseq2 <- COBRAData(padj = data.frame(scaledTPM_salmon = res_sal_scaledTPM_deseq2$res$padj,
row.names = rownames(res_sal_scaledTPM_deseq2$res)),
object_to_extend = cobra_deseq2)
cobra_deseq2 <- COBRAData(padj = data.frame(simplesum_salmon_avetxl = res_sal_simplesum_avetxl_deseq2$res$padj,
row.names = rownames(res_sal_simplesum_avetxl_deseq2$res)),
object_to_extend = cobra_deseq2)
cobra_deseq2 <- COBRAData(padj = data.frame(featureCounts = res_fc_deseq2$res$padj,
row.names = rownames(res_fc_deseq2$res)),
object_to_extend = cobra_deseq2)
cobra_deseq2 <- COBRAData(padj = data.frame(featureCounts_avetxl = res_fc_avetxl_deseq2$res$padj,
row.names = rownames(res_fc_avetxl_deseq2$res)),
object_to_extend = cobra_deseq2)
truth_gene <- read.delim(truth_gene_file, header = TRUE, as.is = TRUE, row.names = 1)
cobra_deseq2 <- COBRAData(truth = truth_gene, object_to_extend = cobra_deseq2)
cobraperf_deseq2 <- calculate_performance(cobra_deseq2, binary_truth = "status_overall")
cobraplot1_deseq2 <- prepare_data_for_plot(cobraperf_deseq2, incltruth = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10)])
plot_overlap(cobraplot1_deseq2, cex = c(1, 0.7, 0.7))
## FDR/TPR and FPC
cobraplot2_deseq2 <- prepare_data_for_plot(cobraperf_deseq2, incltruth = TRUE,
colorscheme = muted[c(1, 3, 5, 8, 10, 13)],
facetted = FALSE)
fdrtpr(cobraplot2_deseq2)$fullmethod <- gsub("_overall", "", fdrtpr(cobraplot2_deseq2)$fullmethod)
fdrtprcurve(cobraplot2_deseq2)$fullmethod <- gsub("_overall", "", fdrtprcurve(cobraplot2_deseq2)$fullmethod)
plot_fdrtprcurve(cobraplot2_deseq2, pointsize = 2, xaxisrange = c(0, 0.8)) +
theme(legend.position = "bottom") + guides(colour = guide_legend(nrow = 2))
fpc(cobraplot2_deseq2)$fullmethod <- gsub("_overall", "", fpc(cobraplot2_deseq2)$fullmethod)
plot_fpc(cobraplot2_deseq2) + plot_theme() +
theme(legend.position = "bottom") + guides(colour = guide_legend(nrow = 2))
## Stratify by diff isoform usage
cobraperf3_deseq2 <- calculate_performance(cobra_deseq2, binary_truth = "status_overall",
splv = "diffisouse", maxsplit = 4)
cobraplot3_deseq2 <- prepare_data_for_plot(cobraperf3_deseq2, incltruth = TRUE,
incloverall = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10, 13)])
levels(fdrtpr(cobraplot3_deseq2)$splitval)[levels(fdrtpr(cobraplot3_deseq2)$splitval) == "diffisouse:0"] <- "no differential isoform usage"
levels(fdrtpr(cobraplot3_deseq2)$splitval)[levels(fdrtpr(cobraplot3_deseq2)$splitval) == "diffisouse:1"] <- "differential isoform usage"
levels(fdrtprcurve(cobraplot3_deseq2)$splitval)[levels(fdrtprcurve(cobraplot3_deseq2)$splitval) == "diffisouse:0"] <- "no differential isoform usage"
levels(fdrtprcurve(cobraplot3_deseq2)$splitval)[levels(fdrtprcurve(cobraplot3_deseq2)$splitval) == "diffisouse:1"] <- "differential isoform usage"
plot_fdrtprcurve(cobraplot3_deseq2, pointsize = 2, xaxisrange = c(0, 0.8)) +
theme(legend.position = "bottom") + guides(colour = guide_legend(nrow = 2))
df1 <- Reduce(function(...) merge(..., by = "gene", all = TRUE),
list(data.frame(gene = rownames(res_fc_deseq2$res),
featureCounts = res_fc_deseq2$res$log2FoldChange,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_fc_avetxl_deseq2$res),
featureCounts_avetxl = res_fc_avetxl_deseq2$res$log2FoldChange,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_scaledTPM_deseq2$res),
scaledTPM_salmon = res_sal_scaledTPM_deseq2$res$log2FoldChange,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_deseq2$res),
simplesum_salmon = res_sal_simplesum_deseq2$res$log2FoldChange,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_avetxl_deseq2$res),
simplesum_salmon_avetxl =
res_sal_simplesum_avetxl_deseq2$res$log2FoldChange,
stringsAsFactors = FALSE)))
rownames(df1) <- df1$gene
df1$gene <- NULL
pairs(df1, upper.panel = panel_smooth, lower.panel = panel_cor)
truth_gene <- read.delim(truth_gene_file, header = TRUE, as.is = TRUE, row.names = 1)
df2 <- Reduce(function(...) merge(..., by = "gene", all = TRUE),
list(data.frame(gene = rownames(salmon_quant$geneTPM_sal),
salmon_quant$geneTPM_sal,
stringsAsFactors = FALSE),
data.frame(gene = rownames(truth_gene),
DTU = c("No DTU", "DTU")[truth_gene$dtu + 1],
DTE = c("No DTE", "DTE")[truth_gene$dte + 1],
DGE = c("No DGE", "DGE")[truth_gene$dge + 1],
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_scaledTPM_deseq2$res),
scaledTPM_salmon_logFC = res_sal_scaledTPM_deseq2$res$log2FoldChange,
scaledTPM_salmon_basemean = res_sal_scaledTPM_deseq2$res$baseMean,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_deseq2$res),
simplesum_salmon_logFC = res_sal_simplesum_deseq2$res$log2FoldChange,
simplesum_salmon_basemean = res_sal_simplesum_deseq2$res$baseMean,
stringsAsFactors = FALSE),
data.frame(gene = rownames(res_sal_simplesum_avetxl_deseq2$res),
simplesum_salmon_avetxl_logFC =
res_sal_simplesum_avetxl_deseq2$res$log2FoldChange,
simplesum_salmon_avetxl_basemean =
res_sal_simplesum_avetxl_deseq2$res$baseMean,
stringsAsFactors = FALSE)))
rownames(df2) <- df2$gene
df2$gene <- NULL
df2$scaledTPM_salmon_basemeanbinary <- Hmisc::cut2(df2$scaledTPM_salmon_basemean, g = 2)
df2$simplesum_salmon_basemeanbinary <- Hmisc::cut2(df2$simplesum_salmon_basemean, g = 2)
df2$simplesum_salmon_avetxl_basemeanbinary <- Hmisc::cut2(df2$simplesum_salmon_avetxl_basemean, g = 2)
df2$categ <- rep("Not differential", nrow(df2))
df2$categ[df2$DTU == "DTU"] <- "DTU"
df2$categ[df2$DTE == "DTE"] <- "DTE"
df2$categ[df2$DGE == "DGE"] <- "DGE"
df2$sumA <- df2$sampleA1 + df2$sampleA2 + df2$sampleA3
df2$sumB <- df2$sampleB1 + df2$sampleB2 + df2$sampleB3
df2$allzero_onecond <- "expressed in both groups"
df2$allzero_onecond[union(which(df2$sumA == 0), which(df2$sumB == 0))] <- "expressed in one group"
ggplot(df2, aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC, col = DTU)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
plot_theme() + ggtitle("sim2") + theme(legend.position = "bottom") +
scale_color_manual(values = c("red", "blue"), name = "") +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
guides(colour = guide_legend(override.aes = list(size = 7)))
ggplot(df2, aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC, col = categ)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
plot_theme() + ggtitle("sim2") + theme(legend.position = "bottom") +
scale_color_manual(values = c("red", "blue", "green", "dimgrey"), name = "") +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
guides(colour = guide_legend(override.aes = list(size = 7)))
ggplot(df2, aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC, col = allzero_onecond)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
plot_theme() + ggtitle("sim2") + theme(legend.position = "bottom") +
scale_color_manual(values = c("blue", "red"), name = "") +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
guides(colour = guide_legend(override.aes = list(size = 7)))
ggplot(subset(df2, !is.na(scaledTPM_salmon_basemeanbinary)),
aes(x = simplesum_salmon_logFC, y = scaledTPM_salmon_logFC,
col = scaledTPM_salmon_basemeanbinary)) +
geom_abline(intercept = 0, slope = 1) +
geom_point(size = 2, alpha = 0.5) +
facet_wrap(~scaledTPM_salmon_basemeanbinary) +
plot_theme() + ggtitle("sim2") + theme(legend.position = "bottom") +
xlab("simplesum_salmon, logFC") + ylab("scaledTPM_salmon, logFC") +
scale_color_manual(values = c("red", "blue"), name = "scaledTPM_salmon, base mean") +
guides(colour = guide_legend(override.aes = list(size = 7)))
cobra <- COBRAData(score = data.frame(simplesum_salmon = abs(res_sal_simplesum_deseq2$res$log2FoldChange),
row.names = rownames(res_sal_simplesum_deseq2$res)))
cobra <- COBRAData(score = data.frame(scaledTPM_salmon = abs(res_sal_scaledTPM_deseq2$res$log2FoldChange),
row.names = rownames(res_sal_scaledTPM_deseq2$res)),
object_to_extend = cobra)
cobra <- COBRAData(score = data.frame(simplesum_salmon_avetxl =
abs(res_sal_simplesum_avetxl_deseq2$res$log2FoldChange),
row.names = rownames(res_sal_simplesum_avetxl_deseq2$res)),
object_to_extend = cobra)
cobra <- COBRAData(score = data.frame(featureCounts = abs(res_fc_deseq2$res$log2FoldChange),
row.names = rownames(res_fc_deseq2$res)),
object_to_extend = cobra)
cobra <- COBRAData(score = data.frame(featureCounts_avetxl = abs(res_fc_avetxl_deseq2$res$log2FoldChange),
row.names = rownames(res_fc_avetxl_deseq2$res)),
object_to_extend = cobra)
truth_gene <- read.delim(truth_gene_file, header = TRUE, as.is = TRUE, row.names = 1)
cobra <- COBRAData(truth = truth_gene, object_to_extend = cobra)
cobraperf <- iCOBRA::calculate_performance(cobra, cont_truth = "logFC_overall")
cobraplot1 <- iCOBRA::prepare_data_for_plot(cobraperf, incltruth = TRUE, facetted = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10)])
corr(cobraplot1)$fullmethod <- gsub("_overall", "", corr(cobraplot1)$fullmethod)
deviation(cobraplot1)$fullmethod <- gsub("_overall", "", deviation(cobraplot1)$fullmethod)
scatter(cobraplot1)$fullmethod <- gsub("_overall", "", scatter(cobraplot1)$fullmethod)
iCOBRA::plot_corr(cobraplot1, pointsize = 5, corrtype = "spearman") + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
corr(cobraplot1)[, c("basemethod", "splitval", "PEARSON", "SPEARMAN")]
## basemethod splitval PEARSON SPEARMAN
## 1 featureCounts overall 0.7612140 0.4742037
## 2 featureCounts_avetxl overall 0.8223537 0.5314520
## 3 scaledTPM_salmon overall 0.7730250 0.5086579
## 4 simplesum_salmon overall 0.7441198 0.4547910
## 5 simplesum_salmon_avetxl overall 0.8011819 0.5077898
iCOBRA::plot_deviation(cobraplot1) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
iCOBRA::plot_scatter(cobraplot1, pointsize = 2) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2)) + geom_abline(intercept = 0, slope = 1) + geom_point(size = 2)
## Stratify by differential isoform usage
cobraperf2 <- iCOBRA::calculate_performance(cobra, cont_truth = "logFC_overall",
splv = "diffisouse")
cobraplot2 <- iCOBRA::prepare_data_for_plot(cobraperf2, incltruth = TRUE, facetted = TRUE,
incloverall = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10)])
levels(corr(cobraplot2)$splitval)[levels(corr(cobraplot2)$splitval) == "diffisouse:0"] <-
"no differential isoform usage"
levels(corr(cobraplot2)$splitval)[levels(corr(cobraplot2)$splitval) == "diffisouse:1"] <-
"differential isoform usage"
levels(deviation(cobraplot2)$splitval)[levels(deviation(cobraplot2)$splitval) == "diffisouse:0"] <-
"no differential isoform usage"
levels(deviation(cobraplot2)$splitval)[levels(deviation(cobraplot2)$splitval) == "diffisouse:1"] <-
"differential isoform usage"
levels(scatter(cobraplot2)$splitval)[levels(scatter(cobraplot2)$splitval) == "diffisouse:0"] <-
"no differential isoform usage"
levels(scatter(cobraplot2)$splitval)[levels(scatter(cobraplot2)$splitval) == "diffisouse:1"] <-
"differential isoform usage"
iCOBRA::plot_corr(cobraplot2, pointsize = 5, corrtype = "spearman") + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
corr(cobraplot2)[, c("basemethod", "splitval", "PEARSON", "SPEARMAN")]
## basemethod splitval PEARSON SPEARMAN
## 1 featureCounts no differential isoform usage 0.8288448 0.6326767
## 2 featureCounts differential isoform usage 0.5212447 0.1205842
## 4 featureCounts_avetxl no differential isoform usage 0.8310185 0.6345294
## 5 featureCounts_avetxl differential isoform usage 0.7821129 0.4530669
## 7 scaledTPM_salmon no differential isoform usage 0.7640564 0.5934580
## 8 scaledTPM_salmon differential isoform usage 0.8307163 0.5052982
## 10 simplesum_salmon no differential isoform usage 0.7900246 0.5991675
## 11 simplesum_salmon differential isoform usage 0.5556445 0.1326059
## 13 simplesum_salmon_avetxl no differential isoform usage 0.7940672 0.6002799
## 14 simplesum_salmon_avetxl differential isoform usage 0.8480329 0.4755535
iCOBRA::plot_deviation(cobraplot2) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
iCOBRA::plot_scatter(cobraplot2, pointsize = 2, doflip = TRUE) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2)) + geom_abline(intercept = 0, slope = 1) + geom_point(size = 2)
################################################################################
## Remove genes with true logFC = 0
keep <- rownames(truth(cobra))[which(truth(cobra)$logFC_overall != 0)]
cobrasub <- cobra[keep, ]
cobraperf <- iCOBRA::calculate_performance(cobrasub, cont_truth = "logFC_overall")
cobraplot1 <- iCOBRA::prepare_data_for_plot(cobraperf, incltruth = TRUE, facetted = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10)])
corr(cobraplot1)$fullmethod <- gsub("_overall", "", corr(cobraplot1)$fullmethod)
deviation(cobraplot1)$fullmethod <- gsub("_overall", "", deviation(cobraplot1)$fullmethod)
scatter(cobraplot1)$fullmethod <- gsub("_overall", "", scatter(cobraplot1)$fullmethod)
iCOBRA::plot_corr(cobraplot1, pointsize = 5, corrtype = "spearman") + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
corr(cobraplot1)[, c("basemethod", "splitval", "PEARSON", "SPEARMAN")]
## basemethod splitval PEARSON SPEARMAN
## 1 featureCounts overall 0.8742009 0.8780341
## 2 featureCounts_avetxl overall 0.8925442 0.9007627
## 3 scaledTPM_salmon overall 0.8842811 0.8911180
## 4 simplesum_salmon overall 0.8752962 0.8744314
## 5 simplesum_salmon_avetxl overall 0.8916919 0.8933705
iCOBRA::plot_deviation(cobraplot1) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
iCOBRA::plot_scatter(cobraplot1, pointsize = 2) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2)) + geom_abline(intercept = 0, slope = 1) + geom_point(size = 2)
## Stratify by diff isoform usage
cobraperf2 <- iCOBRA::calculate_performance(cobrasub, cont_truth = "logFC_overall",
splv = "diffisouse")
cobraplot2 <- iCOBRA::prepare_data_for_plot(cobraperf2, incltruth = TRUE, facetted = TRUE,
incloverall = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10)])
levels(corr(cobraplot2)$splitval)[levels(corr(cobraplot2)$splitval) == "diffisouse:0"] <-
"no differential isoform usage"
levels(corr(cobraplot2)$splitval)[levels(corr(cobraplot2)$splitval) == "diffisouse:1"] <-
"differential isoform usage"
levels(deviation(cobraplot2)$splitval)[levels(deviation(cobraplot2)$splitval) == "diffisouse:0"] <-
"no differential isoform usage"
levels(deviation(cobraplot2)$splitval)[levels(deviation(cobraplot2)$splitval) == "diffisouse:1"] <-
"differential isoform usage"
levels(scatter(cobraplot2)$splitval)[levels(scatter(cobraplot2)$splitval) == "diffisouse:0"] <-
"no differential isoform usage"
levels(scatter(cobraplot2)$splitval)[levels(scatter(cobraplot2)$splitval) == "diffisouse:1"] <-
"differential isoform usage"
iCOBRA::plot_corr(cobraplot2, pointsize = 5, corrtype = "spearman") + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
corr(cobraplot2)[, c("basemethod", "splitval", "PEARSON", "SPEARMAN")]
## basemethod splitval PEARSON SPEARMAN
## 1 featureCounts no differential isoform usage 0.8152888 0.8516040
## 2 featureCounts differential isoform usage 0.8359298 0.7265373
## 4 featureCounts_avetxl no differential isoform usage 0.8197844 0.8578961
## 5 featureCounts_avetxl differential isoform usage 0.8906274 0.7877366
## 7 scaledTPM_salmon no differential isoform usage 0.8022713 0.8332482
## 8 scaledTPM_salmon differential isoform usage 0.9121384 0.7938213
## 10 simplesum_salmon no differential isoform usage 0.8099607 0.8341344
## 11 simplesum_salmon differential isoform usage 0.8568606 0.7346615
## 13 simplesum_salmon_avetxl no differential isoform usage 0.8136126 0.8408756
## 14 simplesum_salmon_avetxl differential isoform usage 0.9107156 0.7826426
iCOBRA::plot_deviation(cobraplot2) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2))
iCOBRA::plot_scatter(cobraplot2, pointsize = 2, doflip = TRUE) + theme(legend.position = "bottom") +
guides(colour = guide_legend(nrow = 2)) + geom_abline(intercept = 0, slope = 1) + geom_point(size = 2)
Given the gene count matrices defined above (corresponding to the featureCounts, simplesum and scaledTPM matrices mentioned in the manuscript), we apply limma-voom to perform differential gene expression. For the simplesum and featureCounts matrices, we also perform the analyses using the offsets derived from the average transcript lengths (simplesum_avetxl).
res_sal_simplesum_voom <- diff_expression_voom(counts = salmon_quant$geneCOUNT_sal_simplesum,
meta = meta, cond_name = "condition",
sample_name = "sample",
gene_length_matrix = NULL)
res_sal_simplesum_avetxl_voom <- diff_expression_voom(counts = salmon_quant$geneCOUNT_sal_simplesum,
meta = meta, cond_name = "condition",
sample_name = "sample",
gene_length_matrix = salmon_quant$avetxlength)
res_sal_scaledTPM_voom <- diff_expression_voom(counts = salmon_quant$geneCOUNT_sal_scaledTPM,
meta = meta, cond_name = "condition",
sample_name = "sample",
gene_length_matrix = NULL)
res_fc_voom <- diff_expression_voom(counts = geneCOUNT_fc,
meta = meta, cond_name = "condition",
sample_name = "sample",
gene_length_matrix = NULL)
res_fc_avetxl_voom <- diff_expression_voom(counts = geneCOUNT_fc,
meta = meta, cond_name = "condition",
sample_name = "sample",
gene_length_matrix = salmon_quant$avetxlength)
cobra_voom <- COBRAData(padj = data.frame(simplesum_salmon = res_sal_simplesum_voom$tt$adj.P.Val,
row.names = rownames(res_sal_simplesum_voom$tt)))
cobra_voom <- COBRAData(padj = data.frame(scaledTPM_salmon = res_sal_scaledTPM_voom$tt$adj.P.Val,
row.names = rownames(res_sal_scaledTPM_voom$tt)),
object_to_extend = cobra_voom)
cobra_voom <- COBRAData(padj = data.frame(simplesum_salmon_avetxl = res_sal_simplesum_avetxl_voom$tt$adj.P.Val,
row.names = rownames(res_sal_simplesum_avetxl_voom$tt)),
object_to_extend = cobra_voom)
cobra_voom <- COBRAData(padj = data.frame(featureCounts = res_fc_voom$tt$adj.P.Val,
row.names = rownames(res_fc_voom$tt)),
object_to_extend = cobra_voom)
cobra_voom <- COBRAData(padj = data.frame(featureCounts_avetxl = res_fc_avetxl_voom$tt$adj.P.Val,
row.names = rownames(res_fc_avetxl_voom$tt)),
object_to_extend = cobra_voom)
truth_gene <- read.delim(truth_gene_file, header = TRUE, as.is = TRUE, row.names = 1)
cobra_voom <- COBRAData(truth = truth_gene, object_to_extend = cobra_voom)
cobraperf_voom <- calculate_performance(cobra_voom, binary_truth = "status_overall")
cobraplot1_voom <- prepare_data_for_plot(cobraperf_voom, incltruth = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10)])
plot_overlap(cobraplot1_voom, cex = c(1, 0.7, 0.7))
## FDR/TPR and FPC
cobraplot2_voom <- prepare_data_for_plot(cobraperf_voom, incltruth = TRUE,
colorscheme = muted[c(1, 3, 5, 8, 10, 13)],
facetted = FALSE)
fdrtpr(cobraplot2_voom)$fullmethod <- gsub("_overall", "", fdrtpr(cobraplot2_voom)$fullmethod)
fdrtprcurve(cobraplot2_voom)$fullmethod <- gsub("_overall", "", fdrtprcurve(cobraplot2_voom)$fullmethod)
plot_fdrtprcurve(cobraplot2_voom, pointsize = 2, xaxisrange = c(0, 0.8)) +
theme(legend.position = "bottom") + guides(colour = guide_legend(nrow = 2))
fpc(cobraplot2_voom)$fullmethod <- gsub("_overall", "", fpc(cobraplot2_voom)$fullmethod)
plot_fpc(cobraplot2_voom) + plot_theme() +
theme(legend.position = "bottom") + guides(colour = guide_legend(nrow = 2))
## Stratify by diff isoform usage
cobraperf3_voom <- calculate_performance(cobra_voom, binary_truth = "status_overall",
splv = "diffisouse", maxsplit = 4)
cobraplot3_voom <- prepare_data_for_plot(cobraperf3_voom, incltruth = TRUE,
incloverall = FALSE,
colorscheme = muted[c(1, 3, 5, 8, 10, 13)])
levels(fdrtpr(cobraplot3_voom)$splitval)[levels(fdrtpr(cobraplot3_voom)$splitval) == "diffisouse:0"] <- "no differential isoform usage"
levels(fdrtpr(cobraplot3_voom)$splitval)[levels(fdrtpr(cobraplot3_voom)$splitval) == "diffisouse:1"] <- "differential isoform usage"
levels(fdrtprcurve(cobraplot3_voom)$splitval)[levels(fdrtprcurve(cobraplot3_voom)$splitval) == "diffisouse:0"] <- "no differential isoform usage"
levels(fdrtprcurve(cobraplot3_voom)$splitval)[levels(fdrtprcurve(cobraplot3_voom)$splitval) == "diffisouse:1"] <- "differential isoform usage"
plot_fdrtprcurve(cobraplot3_voom, pointsize = 2, xaxisrange = c(0, 0.8)) +
theme(legend.position = "bottom") + guides(colour = guide_legend(nrow = 2))
panel_cor <- function(x, y, digits = 3, cex.cor) {
## Panel function to print Pearson and Spearman correlations
usr <- par("usr")
on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r1 <- abs(cor(x, y, method = "pearson", use = "complete"))
txt1 <- format(c(r1, 0.123456789), digits = digits)[1]
r2 <- abs(cor(x, y, method = "spearman", use = "complete"))
txt2 <- format(c(r2, 0.123456789), digits = digits)[1]
text(0.5, 0.35, paste("pearson =", txt1), cex = 1.1)
text(0.5, 0.65, paste("spearman =", txt2), cex = 1.1)
}
panel_smooth<-function (x, y, col = "blue", bg = NA, pch = ".",
cex = 0.8, ...) {
## Panel function to plot points
points(x, y, pch = pch, col = col, bg = bg, cex = cex)
}
plot_theme <- function() {
## ggplot2 plotting theme
theme_grey() +
theme(legend.position = "right",
panel.background = element_rect(fill = "white", colour = "black"),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
strip.text = element_text(size = 10),
strip.background = element_rect(fill = NA, colour = "black"),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
plot.title = element_text(colour = "black", size = 20))
}
calc_lengths_mapping <- function(gtf, cdna_fasta, feature_lengths_file,
tx_gene_file) {
## Function to calculate gene and transcript lengths from transcript cDNA
## fasta and gtf file. Also generate mapping between transcript and gene IDs.
suppressPackageStartupMessages(library(GenomicFeatures))
suppressPackageStartupMessages(library(Biostrings))
## Gene/transcript lengths ===============================================
## Transcripts/genes present in gtf file
if (!is.null(gtf)) {
txdb <- makeTxDbFromGFF(gtf, format = "gtf")
ebg <- exonsBy(txdb, "gene")
ebt <- exonsBy(txdb, "tx", use.names = TRUE)
ebg_red <- reduce(ebg)
gene_length <- sum(width(ebg_red))
ebt_red <- reduce(ebt)
tx_length <- sum(width(ebt_red))
} else {
tx_length <- c()
gene_length <- c()
}
## Extend with transcripts from cDNA fasta
cdna <- readDNAStringSet(gsub("\\.gz", "", cdna_fasta))
tx_length2 <- width(cdna)
names(tx_length2) <- sapply(names(cdna), function(i) strsplit(i, " ")[[1]][1])
tx_length2 <- tx_length2[setdiff(names(tx_length2), names(tx_length))]
tx_length <- c(tx_length, tx_length2)
## Gene/transcript mapping ===============================================
## Transcripts/genes present in gtf file
if (!is.null(gtf)) {
tbg <- transcriptsBy(txdb, "gene")
tx2gene <- stack(lapply(tbg, function(w) w$tx_name))
colnames(tx2gene) <- c("transcript", "gene")
} else {
tx2gene <- data.frame(transcript = c(), gene = c())
}
## Extend with mappings from cDNA fasta file
tx <- sapply(names(cdna), function(i) strsplit(i, " ")[[1]][1])
gn <- sapply(names(cdna), function(i) gsub("gene:", "", strsplit(i, " ")[[1]][4]))
tx2gene2 <- data.frame(transcript = tx, gene = gn,
stringsAsFactors = FALSE)
rownames(tx2gene2) <- NULL
tx2gene2 <- tx2gene2[match(setdiff(tx2gene2$transcript,
tx2gene$transcript), tx2gene2$transcript), ]
tx2gene <- rbind(tx2gene, tx2gene2)
gene2tx <- tx2gene[, c("gene", "transcript")]
save(gene_length, tx_length, file = feature_lengths_file)
save(gene2tx, tx2gene, file = tx_gene_file)
}
diff_expression_edgeR <- function(counts, meta, cond_name, sample_name,
gene_length_matrix = NULL) {
## Differential expression analysis with edgeR
suppressPackageStartupMessages(library(edgeR))
## Prepare DGEList
counts <- round(counts)
cts <- counts[rowSums(is.na(counts)) == 0, ]
cts <- cts[rowSums(cts) != 0, ]
dge <-
DGEList(counts = cts, group = meta[, cond_name][match(colnames(cts),
meta[, sample_name])])
## If average transcript lengths provided, add as offset
if (!is.null(gene_length_matrix)) {
egf <- gene_length_matrix[match(rownames(cts), rownames(gene_length_matrix)),
match(colnames(cts), colnames(gene_length_matrix))]
egf <- egf / exp(rowMeans(log(egf)))
o <- log(calcNormFactors(cts/egf)) + log(colSums(cts/egf))
dge$offset <- t(t(log(egf)) + o)
} else {
dge <- calcNormFactors(dge)
}
## Estimate dispersions and fit model
design <- model.matrix(~dge$samples$group)
dge <- estimateGLMCommonDisp(dge, design = design)
dge <- estimateGLMTrendedDisp(dge, design = design)
dge <- estimateGLMTagwiseDisp(dge, design = design)
fit <- glmFit(dge, design = design)
lrt <- glmLRT(fit)
tt <- topTags(lrt, n = Inf)$table
return(list(dge = dge, tt = tt))
}
diff_expression_DESeq2 <- function(txi = NULL, counts, meta, cond_name,
level1, level2, sample_name) {
## Differential expression analysis with DESeq2
suppressPackageStartupMessages(library(DESeq2,
lib.loc = "/home/charlotte/R/x86_64-pc-linux-gnu-library/3.2"))
## If tximport object provided, generate DESeqDataSet from it. Otherwise,
## use the provided count matrix.
if (!is.null(txi)) {
txi$counts <- round(txi$counts)
keep_feat <- rownames(txi$counts[rowSums(is.na(txi$counts)) == 0 & rowSums(txi$counts) != 0, ])
txi <- lapply(txi, function(w) {
if (!is.null(dim(w))) w[match(keep_feat, rownames(w)), ]
else w
})
dsd <- DESeqDataSetFromTximport(txi,
colData = meta[match(colnames(txi$counts),
meta[, sample_name]), ],
design = as.formula(paste0("~", cond_name)))
} else {
counts <- round(counts)
cts = counts[rowSums(is.na(counts)) == 0, ]
cts <- cts[rowSums(cts) != 0, ]
dsd <- DESeqDataSetFromMatrix(countData = round(cts),
colData = meta[match(colnames(cts),
meta[, sample_name]), ],
design = as.formula(paste0("~", cond_name)))
}
## Estimate dispersions and fit model
dsd <- DESeq(dsd, test = "Wald", fitType = "local", betaPrior = TRUE)
res <- as.data.frame(results(dsd, contrast = c(cond_name, level2, level1),
cooksCutoff = FALSE, independentFiltering = FALSE))
return(list(dsd = dsd, res = res))
}
## Slight modification of limma::voom(), allowing offsets to be taken into account
myvoom <- function (counts, design = NULL, lib.size = NULL,
normalize.method = "none",
plot = FALSE, span = 0.5, ...) {
library(limma)
out <- list()
if (is(counts, "DGEList")) {
out$genes <- counts$genes
out$targets <- counts$samples
if (is.null(design) && diff(range(as.numeric(counts$sample$group))) >
0)
design <- model.matrix(~group, data = counts$samples)
if (is.null(lib.size))
lib.size <- exp(t(counts$offset)) ## modification
counts <- counts$counts
}
else {
isExpressionSet <- suppressPackageStartupMessages(is(counts,
"ExpressionSet"))
if (isExpressionSet) {
if (length(Biobase::fData(counts)))
out$genes <- Biobase::fData(counts)
if (length(Biobase::pData(counts)))
out$targets <- Biobase::pData(counts)
counts <- Biobase::exprs(counts)
}
else {
counts <- as.matrix(counts)
}
}
if (is.null(design)) {
design <- matrix(1, ncol(counts), 1)
rownames(design) <- colnames(counts)
colnames(design) <- "GrandMean"
}
if (is.null(lib.size))
lib.size <- colSums(counts)
y <- t(log2(t(counts + 0.5)/(lib.size + 1) * 1e+06))
y <- limma::normalizeBetweenArrays(y, method = normalize.method)
fit <- limma::lmFit(y, design, ...)
if (is.null(fit$Amean))
fit$Amean <- rowMeans(y, na.rm = TRUE)
sx <- fit$Amean + colMeans(log2(lib.size + 1)) - log2(1e+06)
sy <- sqrt(fit$sigma)
allzero <- rowSums(counts) == 0
if (any(allzero)) {
sx <- sx[!allzero]
sy <- sy[!allzero]
}
l <- lowess(sx, sy, f = span)
if (plot) {
plot(sx, sy, xlab = "log2( count size + 0.5 )", ylab = "Sqrt( standard deviation )",
pch = 16, cex = 0.25)
title("voom: Mean-variance trend")
lines(l, col = "red")
}
f <- approxfun(l, rule = 2)
if (fit$rank < ncol(design)) {
j <- fit$pivot[1:fit$rank]
fitted.values <- fit$coef[, j, drop = FALSE] %*% t(fit$design[,
j, drop = FALSE])
}
else {
fitted.values <- fit$coef %*% t(fit$design)
}
fitted.cpm <- 2^fitted.values
fitted.count <- 1e-06 * t(t(fitted.cpm) * (lib.size + 1))
fitted.logcount <- log2(fitted.count)
w <- 1/f(fitted.logcount)^4
dim(w) <- dim(fitted.logcount)
out$E <- y
out$weights <- w
out$design <- design
if (is.null(out$targets))
out$targets <- data.frame(lib.size = lib.size)
else out$targets$lib.size <- lib.size
new("EList", out)
}
diff_expression_voom <- function(counts, meta, cond_name, sample_name,
gene_length_matrix = NULL) {
## Differential expression analysis with voom
suppressPackageStartupMessages(library(limma))
suppressPackageStartupMessages(library(edgeR))
## Prepare DGEList
counts <- round(counts)
cts <- counts[rowSums(is.na(counts)) == 0, ]
cts <- cts[rowSums(cts) != 0, ]
dge <-
DGEList(counts = cts, group = meta[, cond_name][match(colnames(cts),
meta[, sample_name])])
design <- model.matrix(~dge$samples$group)
## If average transcript lengths provided, add as offset
if (!is.null(gene_length_matrix)) {
egf <- gene_length_matrix[match(rownames(cts), rownames(gene_length_matrix)),
match(colnames(cts), colnames(gene_length_matrix))]
egf <- egf / exp(rowMeans(log(egf)))
o <- log(calcNormFactors(cts/egf)) + log(colSums(cts/egf))
dge$offset <- t(t(log(egf)) + o)
v <- myvoom(dge, design = design)
} else {
dge <- calcNormFactors(dge)
v <- voom(dge, design = design)
}
## Transform and fit model
fit <- lmFit(v, design = design)
fit <- eBayes(fit)
tt <- topTable(fit, n = Inf)
return(list(dge = dge, tt = tt))
}
sessionInfo()
## R version 3.2.2 (2015-08-14)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.4 LTS
##
## locale:
## [1] LC_CTYPE=C LC_NUMERIC=C LC_TIME=en_CA.UTF-8
## [4] LC_COLLATE=en_CA.UTF-8 LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8
## [7] LC_PAPER=en_CA.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] biomaRt_2.26.1 DEXSeq_1.16.7 DESeq2_1.11.6
## [4] RcppArmadillo_0.6.400.2.2 Rcpp_0.12.2 SummarizedExperiment_1.0.2
## [7] Biobase_2.30.0 GenomicRanges_1.22.3 GenomeInfoDb_1.6.2
## [10] IRanges_2.4.6 S4Vectors_0.8.6 BiocGenerics_0.16.1
## [13] BiocParallel_1.4.3 iCOBRA_0.99.7 Hmisc_3.17-1
## [16] Formula_1.2-1 survival_2.38-3 lattice_0.20-33
## [19] ggplot2_2.0.0 tximport_0.0.7 dplyr_0.4.3
## [22] edgeR_3.12.0 limma_3.26.3
##
## loaded via a namespace (and not attached):
## [1] splines_3.2.2 gtools_3.5.0 shiny_0.13.0 assertthat_0.1
## [5] statmod_1.4.23 latticeExtra_0.6-26 Rsamtools_1.22.0 yaml_2.1.13
## [9] RSQLite_1.0.0 digest_0.6.9 RColorBrewer_1.1-2 XVector_0.10.0
## [13] colorspace_1.2-6 htmltools_0.3 httpuv_1.3.3 plyr_1.8.3
## [17] XML_3.98-1.3 genefilter_1.52.0 zlibbioc_1.16.0 xtable_1.8-0
## [21] scales_0.3.0 gdata_2.17.0 annotate_1.48.0 DT_0.1
## [25] ROCR_1.0-7 lazyeval_0.1.10 nnet_7.3-11 magrittr_1.5
## [29] mime_0.4 evaluate_0.8 gplots_2.17.0 hwriter_1.3.2
## [33] foreign_0.8-66 shinydashboard_0.5.1 tools_3.2.2 formatR_1.2.1
## [37] stringr_1.0.0 locfit_1.5-9.1 munsell_0.4.2 cluster_2.0.3
## [41] Biostrings_2.38.3 AnnotationDbi_1.32.3 lambda.r_1.1.7 caTools_1.17.1
## [45] RCurl_1.95-4.7 futile.logger_1.4.1 grid_3.2.2 htmlwidgets_0.5
## [49] labeling_0.3 bitops_1.0-6 shinyBS_0.61 rmarkdown_0.9.2
## [53] gtable_0.1.2 DBI_0.3.1 reshape2_1.4.1 R6_2.1.1
## [57] gridExtra_2.0.0 knitr_1.12.3 futile.options_1.0.0 KernSmooth_2.23-15
## [61] stringi_1.0-1 geneplotter_1.48.0 rpart_4.1-10 acepack_1.3-3.3