RNA-seq
- Following steps you saw in the theoretical lesson, make a
differential expression analysis. All the data are in the
Datasets/RNA_seq.zip
folder. See
Annotation.csv
for information about samples. Data have
been retrieved from
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE193127
and contain expression counts for 6 prostate cancer samples of VCaP
cells. Three of them have been treated with siRNA of FOXA1 and the other
3 are controls.
- Load
gene.counts
file with “vcap” in the name. Use only
protein coding
genes to obtain a Feature counts matrix
options(stringsAsFactors = F)
f <- list.files(path="Datasets/RNA_seq",
recursive=F,
pattern=".gene.counts"
,full.names=T)
f=f[grep("pc3", f, invert = T)]
a=read.delim2(f[1], header=T, skip=1)
map2=a[,c("Geneid", "Length","gene_name", "gene_type")]
for(i in f){
b=read.delim2(i, header=T, skip=1)
map2$new=b[,ncol(b)]
map2$new=as.numeric(map2$new)
colnames(map2)[ncol(map2)]=gsub("\\.", "-", gsub(".Aligned.sortedByCoord.out.bam", "", colnames(b)[ncol(b)]))
}
rownames(map2)=map2$Geneid
saveRDS(map2, "expression_counts.rds")
#head(map2)
# For this analysis we will use only protein coding
map2=subset(map2, gene_type=="protein_coding")
saveRDS(map2, "expression_counts_protein_coding_genes.rds")
head(map2)
## Geneid Length gene_name gene_type nsi-R1
## ENSG00000186092.6_4 ENSG00000186092.6_4 2618 OR4F5 protein_coding 2.12
## ENSG00000237683.5 ENSG00000237683.5 2661 AL627309.1 protein_coding 124.92
## ENSG00000235249.1 ENSG00000235249.1 995 OR4F29 protein_coding 3.66
## ENSG00000284662.1_1 ENSG00000284662.1_1 995 OR4F16 protein_coding 2.91
## ENSG00000269831.1 ENSG00000269831.1 129 AL669831.1 protein_coding 7.50
## ENSG00000269308.1 ENSG00000269308.1 57 AL645608.2 protein_coding 0.00
## nsi-R2 nsi-R4 siFOXA1-R1 siFOXA1-R2 siFOXA1-R4
## ENSG00000186092.6_4 0.54 0.50 3.50 1.21 3.00
## ENSG00000237683.5 123.30 131.75 125.53 145.73 117.36
## ENSG00000235249.1 5.36 1.82 2.28 3.26 1.16
## ENSG00000284662.1_1 4.86 1.57 1.78 2.59 1.16
## ENSG00000269831.1 17.33 15.00 16.25 11.00 19.50
## ENSG00000269308.1 0.00 0.00 0.00 0.00 0.00
dim(map2)
## [1] 20298 10
#length(unique(map2$Geneid))
#length(unique(map2$gene_name))
#nrow(map2)
- Define a function with all required passages for differential
expression analysis with
DeSeq2
. Use sample info annotated
in Annotation.csv
to distinguish between case and
controls
# First we define a function with all required passages for differential expression analysis with DeSeq2
get_DESEQ = function(counts, samples, cond, ctrl,cores=1){
# Perform DESeq analysis
require(DESeq2)
require(BiocParallel)
tmp= floor(as.matrix(counts[,samples])) # DeSeq2 requires integer values
colData <- data.frame(condition=factor(cond))
colData$condition <- relevel(colData$condition, ref = ctrl)
dds <- DESeqDataSetFromMatrix(tmp, colData, formula(~condition)) # this function normalize data
dds <- dds[ rowSums(counts(dds)) > 1, ]
dds <- DESeq(dds, parallel = T, BPPARAM = MulticoreParam(workers = cores))
return(dds)
}
# DE ======
library(DESeq2)
info=read.csv("Datasets/RNA_seq/Annotation.csv")
rownames(map2) = paste0(map2$gene_name,'_', rownames(map2))
samples=colnames(map2)[5:ncol(map2)]
group=info$Treatment[match(samples, info$Sample_name)]
deg = get_DESEQ(map2
, samples=samples
, cond = group
, ctrl='Control')
## Loading required package: BiocParallel
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 1 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 1 workers
res <- results(deg, name="condition_siFOXA1_vs_Control")
res <- res[order(res$pvalue),]
res$gene_name=map2$gene_name[match(rownames(res), rownames(map2))]
res$gene_id=map2$Geneid[match(rownames(res), rownames(map2))]
head(res)
## log2 fold change (MLE): condition siFOXA1 vs Control
## Wald test p-value: condition siFOXA1 vs Control
## DataFrame with 6 rows and 8 columns
## baseMean log2FoldChange lfcSE stat
## <numeric> <numeric> <numeric> <numeric>
## FGFRL1_ENSG00000127418.14_3 6200.61 -2.79986 0.0692654 -40.4222
## ABCG2_ENSG00000118777.11_4 1697.39 5.96884 0.1413869 42.2163
## FBXO32_ENSG00000156804.7_2 5535.90 5.29011 0.0976957 54.1489
## TGFB3_ENSG00000119699.7_2 2198.95 6.91383 0.1745355 39.6128
## AMOT_ENSG00000126016.15_3 2542.08 3.66825 0.0933480 39.2965
## BTG1_ENSG00000133639.4_3 6240.26 3.03103 0.0832533 36.4073
## pvalue padj gene_name
## <numeric> <numeric> <character>
## FGFRL1_ENSG00000127418.14_3 0.00000e+00 0.00000e+00 FGFRL1
## ABCG2_ENSG00000118777.11_4 0.00000e+00 0.00000e+00 ABCG2
## FBXO32_ENSG00000156804.7_2 0.00000e+00 0.00000e+00 FBXO32
## TGFB3_ENSG00000119699.7_2 0.00000e+00 0.00000e+00 TGFB3
## AMOT_ENSG00000126016.15_3 0.00000e+00 0.00000e+00 AMOT
## BTG1_ENSG00000133639.4_3 3.26186e-290 9.15931e-287 BTG1
## gene_id
## <character>
## FGFRL1_ENSG00000127418.14_3 ENSG00000127418.14_3
## ABCG2_ENSG00000118777.11_4 ENSG00000118777.11_4
## FBXO32_ENSG00000156804.7_2 ENSG00000156804.7_2
## TGFB3_ENSG00000119699.7_2 ENSG00000119699.7_2
## AMOT_ENSG00000126016.15_3 ENSG00000126016.15_3
## BTG1_ENSG00000133639.4_3 ENSG00000133639.4_3
saveRDS(res, "Differential_expression_deseq2.rds")
- Visualize the results with a
Volcano plot
using
0.01
as cutoff for padj
and 1
as
cutoff for log2FoldChange
to identify differentially
expressed genes (DEGs)
library(EnhancedVolcano)
EnhancedVolcano(toptable = res, x = "log2FoldChange", y = "padj", lab = res$gene_name, pCutoff = 0.01, FCcutoff = 1)

dex = subset(res, padj<=0.01 & abs(log2FoldChange)>=1)
dex$diff="Down"
dex$diff[which(sign(dex$log2FoldChange)>0)]="Up"
table(dex$diff)
##
## Down Up
## 754 1145
- Perform an over-representation analysis using DEGs previously
identified. Use
msigdbr
to extract KEGG gene sets and
clusterProfiler
to perform the over-representation
analysis.
library(msigdbr)
library(tidyverse)
library(clusterProfiler)
kegg = msigdbr(species = "Homo sapiens", collection = 'C2',subcollection = "CP:KEGG_LEGACY") %>% dplyr::select(gs_name, gene_symbol)
df=rbind.data.frame()
for(i in c("Up", "Down")){
genes=subset(dex, diff==i)
g=unique(genes$gene_name)
y = enricher(g
, TERM2GENE = kegg
, pAdjustMethod = "BH"
, pvalueCutoff=1
, qvalueCutoff=1 )
if(!is.null(y)){
y = y@result
y$Description=fct_reorder(y$Description, as.numeric(sub("/\\d+", "", y$GeneRatio)), .desc = F)
y$rank = rank(y$pvalue)
}
head(y)
y$gene_set="Kegg"
y$diff=i
df=rbind.data.frame(df, y)
}
head(df)
## ID
## KEGG_STEROID_HORMONE_BIOSYNTHESIS KEGG_STEROID_HORMONE_BIOSYNTHESIS
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM
## KEGG_RETINOL_METABOLISM KEGG_RETINOL_METABOLISM
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM KEGG_ASCORBATE_AND_ALDARATE_METABOLISM
## Description
## KEGG_STEROID_HORMONE_BIOSYNTHESIS KEGG_STEROID_HORMONE_BIOSYNTHESIS
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM
## KEGG_RETINOL_METABOLISM KEGG_RETINOL_METABOLISM
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM KEGG_ASCORBATE_AND_ALDARATE_METABOLISM
## GeneRatio BgRatio
## KEGG_STEROID_HORMONE_BIOSYNTHESIS 18/336 55/5244
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS 11/336 28/5244
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 17/336 70/5244
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM 12/336 41/5244
## KEGG_RETINOL_METABOLISM 15/336 64/5244
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM 9/336 25/5244
## pvalue p.adjust
## KEGG_STEROID_HORMONE_BIOSYNTHESIS 3.436528e-09 5.429714e-07
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS 5.084707e-07 4.016918e-05
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 1.194397e-06 6.290491e-05
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM 5.725068e-06 2.261402e-04
## KEGG_RETINOL_METABOLISM 8.229556e-06 2.600540e-04
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM 1.335901e-05 3.517873e-04
## qvalue
## KEGG_STEROID_HORMONE_BIOSYNTHESIS 4.919661e-07
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS 3.639580e-05
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 5.699579e-05
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM 2.048972e-04
## KEGG_RETINOL_METABOLISM 2.356252e-04
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM 3.187414e-04
## geneID
## KEGG_STEROID_HORMONE_BIOSYNTHESIS AKR1C1/HSD3B1/AKR1C3/HSD17B2/UGT1A6/CYP1A1/CYP3A5/UGT1A5/AKR1C2/HSD11B2/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7/SULT2B1
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS CRYL1/UGP2/UGT1A6/UGT1A5/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 AKR1C1/AKR1C3/UGT1A6/CYP1A1/CYP3A5/UGT1A5/AKR1C2/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7/GSTA1/CYP2C18/GSTA2
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM HMOX1/ALAD/UGT1A6/UGT1A5/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7/FTH1
## KEGG_RETINOL_METABOLISM UGT1A6/CYP1A1/CYP3A5/UGT1A5/UGT1A1/UGT1A8/UGT1A3/BCO1/UGT1A10/UGT1A9/UGT1A4/UGT1A7/CYP26A1/ALDH1A2/CYP2C18
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM UGT1A6/UGT1A5/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7
## Count rank gene_set diff
## KEGG_STEROID_HORMONE_BIOSYNTHESIS 18 1 Kegg Up
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS 11 2 Kegg Up
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 17 3 Kegg Up
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM 12 4 Kegg Up
## KEGG_RETINOL_METABOLISM 15 5 Kegg Up
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM 9 6 Kegg Up
df$Description=gsub("KEGG_", "", df$Description)
df=subset(df, p.adjust<=0.1)
library(DOSE)
library(ggplot2)
df=df[order(parse_ratio(df$GeneRatio), decreasing = F),]
df$Description=factor(df$Description, levels=unique(df$Description))
ggplot(df
,aes(
x= parse_ratio(GeneRatio)
,y = Description)
) +
geom_segment(aes(xend=0, yend = Description)) +
geom_point(aes(fill=p.adjust, size = parse_ratio(GeneRatio), shape=diff)) +
scale_shape_manual(values=c(25, 24))+
scale_fill_viridis_c(option = 'D',direction = -1) +
scale_size_continuous(range=c(2, 10)) +
theme_bw() +
xlab("GeneRatio") +
ylab(NULL) +
ggtitle("Enriched Disease Ontology")+
theme(legend.position="bottom", legend.direction="horizontal")

- Perform an over-representation analysis using DEGs previously
identified. Use
msigdbr
to extract REACTOME
gene sets and clusterProfiler
to perform the
over-representation analysis (check https://www.gsea-msigdb.org/gsea/msigdb/human/genesets.jsp?collection=H).
library(msigdbr)
library(tidyverse)
library(clusterProfiler)
reactome = msigdbr(species = "Homo sapiens", collection = 'C2', subcollection = 'CP:REACTOME') %>% dplyr::select(gs_name, gene_symbol)
df=rbind.data.frame()
for(i in c("Up", "Down")){
genes=subset(dex, diff==i)
g=unique(genes$gene_name)
y = enricher(g
, TERM2GENE = reactome
, pAdjustMethod = "BH"
, pvalueCutoff=1
, qvalueCutoff=1 )
if(!is.null(y)){
y = y@result
y$Description=fct_reorder(y$Description, as.numeric(sub("/\\d+", "", y$GeneRatio)), .desc = F)
y$rank = rank(y$pvalue)
}
head(y)
y$gene_set="REACTOME"
y$diff=i
df=rbind.data.frame(df, y)
}
head(df)
## ID
## REACTOME_BIOLOGICAL_OXIDATIONS REACTOME_BIOLOGICAL_OXIDATIONS
## REACTOME_DRUG_ADME REACTOME_DRUG_ADME
## REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS
## REACTOME_PARACETAMOL_ADME REACTOME_PARACETAMOL_ADME
## REACTOME_GLUCURONIDATION REACTOME_GLUCURONIDATION
## REACTOME_ASPIRIN_ADME REACTOME_ASPIRIN_ADME
## Description
## REACTOME_BIOLOGICAL_OXIDATIONS REACTOME_BIOLOGICAL_OXIDATIONS
## REACTOME_DRUG_ADME REACTOME_DRUG_ADME
## REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS
## REACTOME_PARACETAMOL_ADME REACTOME_PARACETAMOL_ADME
## REACTOME_GLUCURONIDATION REACTOME_GLUCURONIDATION
## REACTOME_ASPIRIN_ADME REACTOME_ASPIRIN_ADME
## GeneRatio BgRatio pvalue
## REACTOME_BIOLOGICAL_OXIDATIONS 43/665 220/11290 1.947395e-12
## REACTOME_DRUG_ADME 27/665 109/11290 1.045996e-10
## REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS 25/665 109/11290 2.927645e-09
## REACTOME_PARACETAMOL_ADME 11/665 29/11290 3.554036e-07
## REACTOME_GLUCURONIDATION 10/665 25/11290 6.848318e-07
## REACTOME_ASPIRIN_ADME 13/665 44/11290 8.677656e-07
## p.adjust qvalue
## REACTOME_BIOLOGICAL_OXIDATIONS 1.914289e-09 1.742406e-09
## REACTOME_DRUG_ADME 5.141068e-08 4.679454e-08
## REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS 9.592916e-07 8.731572e-07
## REACTOME_PARACETAMOL_ADME 8.734044e-05 7.949818e-05
## REACTOME_GLUCURONIDATION 1.346379e-04 1.225489e-04
## REACTOME_ASPIRIN_ADME 1.421689e-04 1.294036e-04
## geneID
## REACTOME_BIOLOGICAL_OXIDATIONS CYP2J2/CYP4F11/MAOA/CYP4F3/AOC1/CYP4F22/CYP4B1/PTGS1/CYP4F8/GLYATL1/SULT2A1/UGP2/SULT1C2/GLYATL2/UGT1A6/CYP2W1/CYP1A1/NR1H4/CYP3A5/UGT1A5/UGT1A1/UGT1A8/UGT1A3/CYP4F12/UGT1A10/UGT1A9/UGT1A4/UGT1A7/DPEP1/TPST2/GSTA1/SULT1C4/SULT1B1/CHAC1/SULT2B1/CYP26A1/GGT6/SULT1A4/CYP2C18/SULT1A3/GSTA2/CYP46A1/GLYAT
## REACTOME_DRUG_ADME ABCG2/BCHE/VAV3/AKR1C1/GLYATL1/SULT2A1/GLYATL2/UGT1A6/XDH/ABCC3/UGT1A5/HSD11B2/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7/GSTA1/SULT1C4/GGT6/SULT1A4/SULT1A3/SLC28A3/GSTA2/GLYAT
## REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS GLYATL1/SULT2A1/UGP2/SULT1C2/GLYATL2/UGT1A6/UGT1A5/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7/TPST2/GSTA1/SULT1C4/SULT1B1/CHAC1/SULT2B1/GGT6/SULT1A4/SULT1A3/GSTA2/GLYAT
## REACTOME_PARACETAMOL_ADME ABCG2/SULT2A1/UGT1A6/ABCC3/UGT1A1/UGT1A10/UGT1A9/SULT1C4/GGT6/SULT1A4/SULT1A3
## REACTOME_GLUCURONIDATION UGP2/UGT1A6/UGT1A5/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7
## REACTOME_ASPIRIN_ADME BCHE/GLYATL1/GLYATL2/UGT1A6/ABCC3/UGT1A5/UGT1A1/UGT1A8/UGT1A3/UGT1A9/UGT1A4/UGT1A7/GLYAT
## Count rank gene_set diff
## REACTOME_BIOLOGICAL_OXIDATIONS 43 1 REACTOME Up
## REACTOME_DRUG_ADME 27 2 REACTOME Up
## REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS 25 3 REACTOME Up
## REACTOME_PARACETAMOL_ADME 11 4 REACTOME Up
## REACTOME_GLUCURONIDATION 10 5 REACTOME Up
## REACTOME_ASPIRIN_ADME 13 6 REACTOME Up
df$Description=gsub("REACTOME_", "", df$Description)
df=subset(df, p.adjust<=0.1)
library(DOSE)
library(ggplot2)
df=df[order(parse_ratio(df$GeneRatio), decreasing = F),]
df$Description=factor(df$Description, levels=unique(df$Description))
ggplot(df
,aes(
x= parse_ratio(GeneRatio)
,y = Description)
) +
geom_segment(aes(xend=0, yend = Description)) +
geom_point(aes(fill=p.adjust, size = parse_ratio(GeneRatio), shape=diff)) +
scale_shape_manual(values=c(25, 24))+
scale_fill_viridis_c(option = 'D',direction = -1) +
scale_size_continuous(range=c(2, 10)) +
theme_bw() +
xlab("GeneRatio") +
ylab(NULL) +
ggtitle("Enriched Disease Ontology")+
theme(legend.position="bottom", legend.direction="horizontal")
