This report outlines the analysis of the GSE72165 data set, consisting of 3 samples from the carotid body and 3 samples from the adrenal medulla in mice.

Preparation

(Back to top)

Reference directories and packages

(Back to top)

basedir <- "/home/charlotte/gene_vs_tx_quantification"
refdir <- "/home/Shared/data/annotation"
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(iCOBRA))
suppressPackageStartupMessages(library(dplyr))

Metadata definition

(Back to top)

A metadata table was generated using the information provided in Gene Expression Omnibus.

meta <- read.delim(paste0(basedir, "/data/gse72165/gse72165_phenodata.txt"), 
                   header = TRUE, as.is = TRUE)
rownames(meta) <- meta$srr.id
meta
##                gsm.id     srr.id condition
## SRR2172551 GSM1856457 SRR2172551   carotid
## SRR2172552 GSM1856458 SRR2172552   carotid
## SRR2172553 GSM1856459 SRR2172553   carotid
## SRR2172554 GSM1856460 SRR2172554   medulla
## SRR2172555 GSM1856461 SRR2172555   medulla
## SRR2172556 GSM1856462 SRR2172556   medulla

FASTQ file download

(Back to top)

The code below downloads the FASTQ files for the six samples from the SRA.

fastq_files <- c(paste0("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR217/00", 
                        c(1:6), "/", 
                        meta$srr.id, "/", meta$srr.id, ".fastq.gz"))
for (fq in fastq_files) {
  if (!file.exists(paste0(basedir, "/data/gse72165/fastq/", basename(fq)))) {
    cmd <- paste0("wget -P ", basedir, "/data/gse72165/fastq ", fq)
    message(cmd)
    system(cmd)
  } else {
    message(paste0(fq, " is already downloaded."))
  }
}
## ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR217/001/SRR2172551/SRR2172551.fastq.gz is already downloaded.
## ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR217/002/SRR2172552/SRR2172552.fastq.gz is already downloaded.
## ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR217/003/SRR2172553/SRR2172553.fastq.gz is already downloaded.
## ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR217/004/SRR2172554/SRR2172554.fastq.gz is already downloaded.
## ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR217/005/SRR2172555/SRR2172555.fastq.gz is already downloaded.
## ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR217/006/SRR2172556/SRR2172556.fastq.gz is already downloaded.

Reference file preparation and index building

(Back to top)

We use the GRCm38.82 Mus musculus Ensembl annotation for the analysis. Here, we download the fasta file with the cDNA sequences and build an index for quantification with Salmon.

cdna_fasta <- paste0(refdir, "/Mouse/Ensembl_GRCm38.82/cDNA/Mus_musculus.GRCm38.cdna.all.fa.gz")
salmon_index <- paste0(basedir, "/annotation/Mouse_Ensembl_GRCm38.82/salmon_index/Mus_musculus.GRCm38.82.gtf.sidx")

############################### cDNA FASTA ################################
if (!file.exists(cdna_fasta)) {
  cmd <- paste0("wget -P ", refdir, "/Mouse/Ensembl_GRCm38.82/cDNA ", 
                "ftp://ftp.ensembl.org/pub/release-82/fasta/mus_musculus/", 
                "cdna/Mus_musculus.GRCm38.cdna.all.fa.gz")
  message(cmd)
  system(cmd)
  cmd <- paste0("gunzip -c ", cdna_fasta, "> ", gsub("\\.gz$", "", cdna_fasta))
  message(cmd)
  system(cmd)
} else {
  message(paste0("Reference cDNA fasta is already downloaded."))
}
## Reference cDNA fasta is already downloaded.
## Build Salmon index
cmd <- paste("salmon index -i", salmon_index, "-t", gsub("\\.gz$", "", cdna_fasta), "-p 5 --type quasi")
message(cmd)
## salmon index -i /home/charlotte/gene_vs_tx_quantification/annotation/Mouse_Ensembl_GRCm38.82/salmon_index/Mus_musculus.GRCm38.82.gtf.sidx -t /home/Shared/data/annotation/Mouse/Ensembl_GRCm38.82/cDNA/Mus_musculus.GRCm38.cdna.all.fa -p 5 --type quasi
if (!file.exists(paste0(salmon_index, "/hash.bin"))) {
  system(cmd)
} else {
  message("Salmon index already exists.")
}
## Salmon index already exists.

Definition of gene-to-transcript mapping

(Back to top)

Next, we derive a mapping between transcript and gene identifiers from the cDNA file downloaded above.

feature_lengths_file <- paste0(basedir, "/annotation/Mouse_Ensembl_GRCm38.82/feature_lengths.Rdata")
tx_gene_file <- paste0(basedir, "/annotation/Mouse_Ensembl_GRCm38.82/tx_gene_map.Rdata")
## Calculate gene and transcript lengths, get gene-transcript mapping
if (!file.exists(feature_lengths_file)) {
  calc_lengths_mapping(gtf = NULL, cdna_fasta = cdna_fasta, 
                       feature_lengths_file = feature_lengths_file, 
                       tx_gene_file = tx_gene_file) 
} else {
  message("feature lengths and tx-to-gene map already calculated.")
}
## feature lengths and tx-to-gene map already calculated.
load(tx_gene_file)

Salmon abundance quantification

(Back to top)

Next, we use Salmon to calculate transcript abundance estimates for each of the six samples.

salmon_basedir <- paste0(basedir, "/quantifications/gse72165/salmon")

fqs <- list.files(paste0(basedir, "/data/gse72165/fastq"), full.names = TRUE)
names(fqs) <- gsub("\\.fastq.gz", "", basename(fqs))

for (i in 1:length(fqs)) {
  if (!file.exists(paste0(salmon_basedir, "/", names(fqs)[i], "/quant.sf"))) {
    cmd <- sprintf("bash -c 'salmon quant -i %s -l U -r %s -p 5 -o %s'",
                   salmon_index,
                   paste0("<(gunzip -c ", fqs[i], ")"),
                   paste0(salmon_basedir, "/", names(fqs)[i]))
    cat(cmd, "\n")
    system(cmd)
  } else {
    cat("Salmon results for", names(fqs)[i], "already exist.\n")
  }
}
## Salmon results for SRR2172551 already exist.
## Salmon results for SRR2172552 already exist.
## Salmon results for SRR2172553 already exist.
## Salmon results for SRR2172554 already exist.
## Salmon results for SRR2172555 already exist.
## Salmon results for SRR2172556 already exist.

Summarization of Salmon results and offset estimation

(Back to top)

We use the tximport package (https://github.com/mikelove/tximport) to generate count matrices and offset matrices (average transcript lengths) from the Salmon transcript-level estimates. We generate two different count matrices (simplesum and scaledTPM), and additionally create offsets to be used with the simplesum matrix.

salmon_files <- list.files(salmon_basedir, pattern = "SRR", full.names = TRUE)
salmon_files <- salmon_files[file.info(salmon_files)$isdir]
salmon_files <- paste0(salmon_files, "/quant.sf")
salmon_files <- salmon_files[file.exists(salmon_files)]
names(salmon_files) <- basename(gsub("/quant.sf", "", salmon_files))
txi_salmonsimplesum <- tximport(files = salmon_files, type = "salmon", txIn = TRUE,
                                txOut = FALSE, countsFromAbundance = "no", 
                                gene2tx = gene2tx)
## reading in files
## 1 2 3 4 5 6 
## summarizing abundance
## summarizing counts
## summarizing length
txi_salmonscaledtpm <- tximport(files = salmon_files, type = "salmon", txIn = TRUE,
                                txOut = FALSE, countsFromAbundance = "scaledTPM", 
                                gene2tx = gene2tx)
## reading in files
## 1 2 3 4 5 6 
## summarizing abundance
## summarizing counts
## summarizing length
salmon_quant <- list(geneCOUNT_sal_simplesum = txi_salmonsimplesum$counts,
                     geneCOUNT_sal_scaledTPM = txi_salmonscaledtpm$counts,
                     avetxlength = txi_salmonsimplesum$length,
                     geneTPM_sal = txi_salmonsimplesum$abundance,
                     txi_salmonsimplesum = txi_salmonsimplesum,
                     txi_salmonscaledtpm = txi_salmonscaledtpm)

Differential expression analysis

(Back to top)

Given the gene count matrices defined above we apply edgeR and DESeq2 to perform differential gene expression. For the simplesum matrix, we also apply edgeR and DESeq2 using the offsets derived from the average transcript lengths (simplesum_avetxl).

edgeR

(Back to top)

res_sal_simplesum_edgeR <- diff_expression_edgeR(counts = salmon_quant$geneCOUNT_sal_simplesum, 
                                                 meta = meta, cond_name = "condition", 
                                                 sample_name = "srr.id", 
                                                 gene_length_matrix = NULL)
res_sal_simplesum_avetxl_edgeR <- diff_expression_edgeR(counts = salmon_quant$geneCOUNT_sal_simplesum, 
                                                        meta = meta, cond_name = "condition", 
                                                        sample_name = "srr.id", 
                                                        gene_length_matrix = salmon_quant$avetxlength)
res_sal_scaledTPM_edgeR <- diff_expression_edgeR(counts = salmon_quant$geneCOUNT_sal_scaledTPM, 
                                                 meta = meta, cond_name = "condition", 
                                                 sample_name = "srr.id", 
                                                 gene_length_matrix = NULL)

Diagnostics

(Back to top)

dfh <- data.frame(pvalue = c(res_sal_scaledTPM_edgeR$tt$PValue,
                             res_sal_simplesum_avetxl_edgeR$tt$PValue,
                             res_sal_simplesum_edgeR$tt$PValue),
                  mth = c(rep("scaledTPM_salmon, edgeR", nrow(res_sal_scaledTPM_edgeR$tt)),
                          rep("simplesum_salmon_avetxl, edgeR", nrow(res_sal_simplesum_avetxl_edgeR$tt)),
                          rep("simplesum_salmon, edgeR", nrow(res_sal_simplesum_edgeR$tt))))
ggplot(dfh, aes(x = pvalue)) + geom_histogram() + facet_wrap(~mth) + 
  plot_theme() + 
  xlab("p-value") + ylab("count")

par(mfrow = c(1, 3))
plotBCV(res_sal_scaledTPM_edgeR$dge, main = "scaledTPM_salmon, edgeR")
plotBCV(res_sal_simplesum_avetxl_edgeR$dge, main = "simplesum_salmon_avetxl, edgeR")
plotBCV(res_sal_simplesum_edgeR$dge, main = "simplesum_salmon, edgeR")

par(mfrow = c(1, 3))
plotSmear(res_sal_scaledTPM_edgeR$dge, main = "scaledTPM_salmon, edgeR", ylim = c(-13, 13))
plotSmear(res_sal_simplesum_avetxl_edgeR$dge, main = "simplesum_salmon_avetxl, edgeR", ylim = c(-13, 13))
plotSmear(res_sal_simplesum_edgeR$dge, main = "simplesum_salmon, edgeR", ylim = c(-13, 13))