We will go through a full analysis of differential expression.
Suppose we have Feature Counts results for genes (gene counts). We have to build the expression matrix.
options(stringsAsFactors = F)
f <- list.files(path="Data/RNA_seq",
recursive=F,
pattern=".gene.counts"
,full.names=T)
a=read.delim2(f[1], header=T, skip=1)
map2=a[,c("Geneid", "Length","gene_name", "gene_type")]
for(i in f){
b=read.delim2(i, header=T, skip=1)
map2$new=b[,ncol(b)]
map2$new=as.numeric(map2$new)
colnames(map2)[ncol(map2)]=gsub("\\.", "-", gsub(".Aligned.sortedByCoord.out.bam", "", colnames(b)[ncol(b)]))
}
rownames(map2)=map2$Geneid
saveRDS(map2, "Data/RNA_seq/expression_counts.rds")
head(map2)
## Geneid Length gene_name
## ENSG00000223972.5_2 ENSG00000223972.5_2 1735 DDX11L1
## ENSG00000227232.5_2 ENSG00000227232.5_2 1351 WASH7P
## ENSG00000243485.5_4 ENSG00000243485.5_4 1021 RP11-34P13.3
## ENSG00000237613.2_2 ENSG00000237613.2_2 1219 FAM138A
## ENSG00000268020.3_4 ENSG00000268020.3_4 840 OR4G4P
## ENSG00000240361.2_3 ENSG00000240361.2_3 1414 OR4G11P
## gene_type fxa1-psi03-51-131-010
## ENSG00000223972.5_2 transcribed_unprocessed_pseudogene 2
## ENSG00000227232.5_2 unprocessed_pseudogene 120
## ENSG00000243485.5_4 lincRNA 3
## ENSG00000237613.2_2 lincRNA 1
## ENSG00000268020.3_4 unprocessed_pseudogene 0
## ENSG00000240361.2_3 transcribed_unprocessed_pseudogene 2
## fxa1-psi04-51-131-010 fxa1-psi07-51-134-110
## ENSG00000223972.5_2 6 1
## ENSG00000227232.5_2 317 186
## ENSG00000243485.5_4 3 0
## ENSG00000237613.2_2 0 0
## ENSG00000268020.3_4 1 1
## ENSG00000240361.2_3 4 1
## fxa1-psi09-51-134-010 fxa1-pwt03-51-131-000
## ENSG00000223972.5_2 2 2
## ENSG00000227232.5_2 297 155
## ENSG00000243485.5_4 0 2
## ENSG00000237613.2_2 0 0
## ENSG00000268020.3_4 1 0
## ENSG00000240361.2_3 0 1
## fxa1-pwt04-51-131-000 fxa1-pwt07-51-134-100
## ENSG00000223972.5_2 2 0
## ENSG00000227232.5_2 178 86
## ENSG00000243485.5_4 1 3
## ENSG00000237613.2_2 0 0
## ENSG00000268020.3_4 0 1
## ENSG00000240361.2_3 0 0
## fxa1-pwt09-51-134-000
## ENSG00000223972.5_2 1
## ENSG00000227232.5_2 407
## ENSG00000243485.5_4 0
## ENSG00000237613.2_2 1
## ENSG00000268020.3_4 0
## ENSG00000240361.2_3 0
# For this analysis we will use only protein coding
map2=subset(map2, gene_type=="protein_coding")
saveRDS(map2, "Data/RNA_seq/expression_counts_protein_coding_genes.rds")
head(map2)
## Geneid Length gene_name gene_type
## ENSG00000186092.6_4 ENSG00000186092.6_4 2618 OR4F5 protein_coding
## ENSG00000237683.5 ENSG00000237683.5 2661 AL627309.1 protein_coding
## ENSG00000235249.1 ENSG00000235249.1 995 OR4F29 protein_coding
## ENSG00000284662.1_1 ENSG00000284662.1_1 995 OR4F16 protein_coding
## ENSG00000269831.1 ENSG00000269831.1 129 AL669831.1 protein_coding
## ENSG00000269308.1 ENSG00000269308.1 57 AL645608.2 protein_coding
## fxa1-psi03-51-131-010 fxa1-psi04-51-131-010
## ENSG00000186092.6_4 5 2
## ENSG00000237683.5 13 30
## ENSG00000235249.1 3 0
## ENSG00000284662.1_1 0 0
## ENSG00000269831.1 1 6
## ENSG00000269308.1 2 0
## fxa1-psi07-51-134-110 fxa1-psi09-51-134-010
## ENSG00000186092.6_4 2 2
## ENSG00000237683.5 132 196
## ENSG00000235249.1 0 0
## ENSG00000284662.1_1 0 0
## ENSG00000269831.1 2 4
## ENSG00000269308.1 0 1
## fxa1-pwt03-51-131-000 fxa1-pwt04-51-131-000
## ENSG00000186092.6_4 2 0
## ENSG00000237683.5 20 16
## ENSG00000235249.1 1 0
## ENSG00000284662.1_1 0 0
## ENSG00000269831.1 4 2
## ENSG00000269308.1 1 0
## fxa1-pwt07-51-134-100 fxa1-pwt09-51-134-000
## ENSG00000186092.6_4 0 2
## ENSG00000237683.5 45 179
## ENSG00000235249.1 0 0
## ENSG00000284662.1_1 0 0
## ENSG00000269831.1 0 7
## ENSG00000269308.1 0 0
length(unique(map2$Geneid))
## [1] 20298
length(unique(map2$gene_name))
## [1] 20278
nrow(map2)
## [1] 20298
We have our expression matrix, nevertheless we need information about
our data to proceed. We find them in the Exp_info.csv
.
Notice that experiments are done on the same cell line but by different
operators. As we are not interested in this difference we have to
correct for this bias before performing a differential gene
expression.
The same if there were any other sources of unwonted diversity.
info=read.csv("Data/RNA_seq/Exp_info.csv")
info
## Barcode Cell_line Treatment Operator
## 1 fxa1-psi03-51-131-010 PC3 treated A
## 2 fxa1-psi04-51-131-010 PC3 treated A
## 3 fxa1-psi07-51-134-110 PC3 treated B
## 4 fxa1-psi09-51-134-010 PC3 treated B
## 5 fxa1-pwt03-51-131-000 PC3 not_treated A
## 6 fxa1-pwt04-51-131-000 PC3 not_treated A
## 7 fxa1-pwt07-51-134-100 PC3 not_treated B
## 8 fxa1-pwt09-51-134-000 PC3 not_treated B
library(sva)
batch=info$Operator[match(colnames(map2)[5:ncol(map2)], info$Barcode)]
batch
## [1] "A" "A" "B" "B" "A" "A" "B" "B"
group=info$Treatment[match(colnames(map2)[5:ncol(map2)], info$Barcode)]
group
## [1] "treated" "treated" "treated" "treated" "not_treated"
## [6] "not_treated" "not_treated" "not_treated"
c=ComBat_seq(as.matrix(map2[,5:ncol(map2)]), batch=batch, group=group) # requires a matrix of counts only
## Found 2 batches
## Using full model in ComBat-seq.
## Adjusting for 1 covariate(s) or covariate level(s)
## Estimating dispersions
## Fitting the GLM model
## Shrinkage off - using GLM estimates for parameters
## Adjusting the data
c=cbind.data.frame(map2[,1:4], c)
saveRDS(c, "Data/RNA_seq/expression_counts_protein_coding_genes_batch_corrected_by_operator.rds")
We will use DESeq2
package from Bioconductor (see https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
for more info)
# First we define a function with all required passages for differential expression analysis with DeSeq2
get_DESEQ = function(counts, samples, cond, ctrl, cores=1){
# Perform DESeq analysis
require(DESeq2)
require(BiocParallel)
tmp= floor(as.matrix(counts[,samples])) # DeSeq2 requires integer values
colData <- data.frame(condition=factor(cond))
colData$condition <- relevel(colData$condition, ref = ctrl)
dds <- DESeqDataSetFromMatrix(tmp, colData, formula(~condition)) # this function normalize data
dds <- dds[ rowSums(counts(dds)) > 1, ]
dds <- DESeq(dds, parallel = T, BPPARAM = MulticoreParam(workers = cores))
dds
}
# DE ======
library(DESeq2)
rownames(c) = paste0(c$gene_name,'_', rownames(c))
samples=colnames(c)[grep("fxa1", colnames(c))]
group=info$Treatment[match(samples, info$Barcode)]
deg = get_DESEQ(c
, samples=samples
, cond = group
, ctrl='not_treated')
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 1 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 1 workers
res <- results(deg, name="condition_treated_vs_not_treated") #paste0("condition_", case, "_vs_", control)
res <- res[order(res$pvalue),]
res$gene_name=c$gene_name[match(rownames(res), rownames(c))]
res$gene_id=c$Geneid[match(rownames(res), rownames(c))]
head(res)
## log2 fold change (MLE): condition treated vs not treated
## Wald test p-value: condition treated vs not treated
## DataFrame with 6 rows and 8 columns
## baseMean log2FoldChange lfcSE stat
## <numeric> <numeric> <numeric> <numeric>
## UBA7_ENSG00000182179.12_3 1993.23 3.35314 0.0932504 35.9584
## BTN3A1_ENSG00000026950.16_4 2003.18 2.95638 0.0855382 34.5621
## UBE2L6_ENSG00000156587.15_3 6424.09 2.86761 0.0900144 31.8573
## BTN3A3_ENSG00000111801.15_3 1566.13 2.54437 0.0820215 31.0207
## OAS2_ENSG00000111335.12_2 13807.01 3.55329 0.1157605 30.6952
## BTN3A2_ENSG00000186470.13_4 1478.70 2.65564 0.0875166 30.3445
## pvalue padj gene_name
## <numeric> <numeric> <character>
## UBA7_ENSG00000182179.12_3 3.73526e-283 6.21883e-279 UBA7
## BTN3A1_ENSG00000026950.16_4 9.38854e-262 7.81549e-258 BTN3A1
## UBE2L6_ENSG00000156587.15_3 1.04314e-222 5.78910e-219 UBE2L6
## BTN3A3_ENSG00000111801.15_3 2.83156e-211 1.17857e-207 BTN3A3
## OAS2_ENSG00000111335.12_2 6.59275e-207 2.19525e-203 OAS2
## BTN3A2_ENSG00000186470.13_4 2.97288e-202 8.24925e-199 BTN3A2
## gene_id
## <character>
## UBA7_ENSG00000182179.12_3 ENSG00000182179.12_3
## BTN3A1_ENSG00000026950.16_4 ENSG00000026950.16_4
## UBE2L6_ENSG00000156587.15_3 ENSG00000156587.15_3
## BTN3A3_ENSG00000111801.15_3 ENSG00000111801.15_3
## OAS2_ENSG00000111335.12_2 ENSG00000111335.12_2
## BTN3A2_ENSG00000186470.13_4 ENSG00000186470.13_4
saveRDS(res, "Data/RNA_seq/Differential_expression_deseq2.rds")
# Figures ===========
## Estimate dispersion trend and apply a variance stabilizing transformation
vsd <- vst(deg, blind=FALSE)
sampleDists <- dist(t(assay(vsd)))
library(RColorBrewer)
library(pheatmap)
library(ggplot2)
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- colnames(vsd)
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)
pcaData <- DESeq2::plotPCA(vsd, intgroup=c("condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=condition)) +
geom_point(size=3) + ggrepel::geom_label_repel(aes(label=name))+
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))+coord_fixed()+ggpubr::theme_pubr()
# PCA and correlation between samples do not change respect to tested conditions
We will use EnhancedVolcano
package from Bioconductor
(https://www.bioconductor.org/packages/devel/bioc/vignettes/EnhancedVolcano/inst/doc/EnhancedVolcano.html#introduction)
library(EnhancedVolcano)
EnhancedVolcano(toptable = res, x = "log2FoldChange", y = "padj", lab = res$gene_name, pCutoff = 0.01, FCcutoff = 1)
dex = subset(res, padj<=0.01 & abs(log2FoldChange)>=1)
dex$diff="Down"
dex$diff[which(sign(dex$log2FoldChange)>0)]="Up"
table(dex$diff)
##
## Down Up
## 331 908
We will use msigdbr
to extract gene sets from https://www.gsea-msigdb.org/gsea/msigdb/ and
clusterProfiler
to perform the over-representation
analysis.
library(msigdbr)
library(tidyverse)
library(clusterProfiler)
kegg = msigdbr(species = "Homo sapiens", collection = 'C2', subcollection = 'CP:KEGG_LEGACY') %>% dplyr::select(gs_name, gene_symbol)
df=rbind.data.frame()
for(i in c("Up", "Down")){
genes=subset(dex, diff==i)
g=unique(genes$gene_name)
y = enricher(g
, TERM2GENE = kegg
, pAdjustMethod = "BH"
, pvalueCutoff=1
, qvalueCutoff=1 )
if(!is.null(y)){
y = y@result
y$Description=fct_reorder(y$Description, as.numeric(sub("/\\d+", "", y$GeneRatio)), .desc = F)
y$rank = rank(y$pvalue)
}
head(y)
y$gene_set="Kegg"
y$diff=i
df=rbind.data.frame(df, y)
}
head(df)
## ID
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION
## KEGG_TYPE_I_DIABETES_MELLITUS KEGG_TYPE_I_DIABETES_MELLITUS
## KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION
## KEGG_GRAFT_VERSUS_HOST_DISEASE KEGG_GRAFT_VERSUS_HOST_DISEASE
## KEGG_ALLOGRAFT_REJECTION KEGG_ALLOGRAFT_REJECTION
## KEGG_CELL_ADHESION_MOLECULES_CAMS KEGG_CELL_ADHESION_MOLECULES_CAMS
## Description
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION
## KEGG_TYPE_I_DIABETES_MELLITUS KEGG_TYPE_I_DIABETES_MELLITUS
## KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION
## KEGG_GRAFT_VERSUS_HOST_DISEASE KEGG_GRAFT_VERSUS_HOST_DISEASE
## KEGG_ALLOGRAFT_REJECTION KEGG_ALLOGRAFT_REJECTION
## KEGG_CELL_ADHESION_MOLECULES_CAMS KEGG_CELL_ADHESION_MOLECULES_CAMS
## GeneRatio BgRatio pvalue
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 41/305 264/5244 3.362848e-09
## KEGG_TYPE_I_DIABETES_MELLITUS 15/305 43/5244 7.210719e-09
## KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION 21/305 88/5244 1.633183e-08
## KEGG_GRAFT_VERSUS_HOST_DISEASE 14/305 41/5244 3.186525e-08
## KEGG_ALLOGRAFT_REJECTION 13/305 37/5244 6.790217e-08
## KEGG_CELL_ADHESION_MOLECULES_CAMS 25/305 133/5244 1.161059e-07
## p.adjust qvalue
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 4.943386e-07 4.318604e-07
## KEGG_TYPE_I_DIABETES_MELLITUS 5.299879e-07 4.630041e-07
## KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION 8.002595e-07 6.991168e-07
## KEGG_GRAFT_VERSUS_HOST_DISEASE 1.171048e-06 1.023042e-06
## KEGG_ALLOGRAFT_REJECTION 1.996324e-06 1.744014e-06
## KEGG_CELL_ADHESION_MOLECULES_CAMS 2.844594e-06 2.485073e-06
## geneID
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION TNFSF10/CSF1/IL15/LIFR/FLT3LG/FAS/CXCL16/CCL28/TNFRSF14/KITLG/CXCL10/HGF/CCR1/IL7R/CXCL9/CXCL11/IL15RA/CX3CL1/IL12RB1/TNFRSF1B/IL6/CXCL8/IL18R1/TNFRSF10C/IL4R/IL1B/TNFRSF6B/IL22RA1/IL7/EGF/ACVRL1/NGFR/IL2RG/CCL25/CXCL1/CXCL6/CTF1/TNFSF13B/CD40/CSF3/CCL5
## KEGG_TYPE_I_DIABETES_MELLITUS HLA-E/HLA-F/FAS/HLA-B/HLA-C/ICA1/HLA-A/HLA-G/GAD1/IL1B/HLA-DRA/HLA-DPB1/HLA-DMA/HLA-DPA1/HLA-DRB1
## KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION IFI30/PSME2/HLA-E/PSME1/TAP2/TAPBP/HLA-F/TAP1/B2M/HLA-B/HLA-C/CIITA/CD74/CTSS/HLA-A/HLA-G/HLA-DRA/HLA-DPB1/HLA-DMA/HLA-DPA1/HLA-DRB1
## KEGG_GRAFT_VERSUS_HOST_DISEASE HLA-E/HLA-F/FAS/HLA-B/HLA-C/HLA-A/HLA-G/IL6/IL1B/HLA-DRA/HLA-DPB1/HLA-DMA/HLA-DPA1/HLA-DRB1
## KEGG_ALLOGRAFT_REJECTION HLA-E/HLA-F/FAS/HLA-B/HLA-C/HLA-A/HLA-G/HLA-DRA/HLA-DPB1/HLA-DMA/HLA-DPA1/HLA-DRB1/CD40
## KEGG_CELL_ADHESION_MOLECULES_CAMS HLA-E/HLA-F/ICAM1/HLA-B/HLA-C/CD274/PDCD1LG2/HLA-A/CLDN7/HLA-G/SELL/NLGN1/CDH3/HLA-DRA/SELPLG/HLA-DPB1/HLA-DMA/CD226/CLDN23/HLA-DPA1/PTPRC/HLA-DRB1/CNTN1/SDC2/CD40
## Count rank gene_set diff
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 41 1 Kegg Up
## KEGG_TYPE_I_DIABETES_MELLITUS 15 2 Kegg Up
## KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION 21 3 Kegg Up
## KEGG_GRAFT_VERSUS_HOST_DISEASE 14 4 Kegg Up
## KEGG_ALLOGRAFT_REJECTION 13 5 Kegg Up
## KEGG_CELL_ADHESION_MOLECULES_CAMS 25 6 Kegg Up
df$Description=gsub("KEGG_", "", df$Description)
# What are GeneRatio and BgRatio?
# GeneRatio is the ratio of input genes that are annotated in a term
# BgRatio is the ratio of all genes that are annotated in this term
# Visit also: https://www.cell.com/the-innovation/pdf/S2666-6758(21)00066-7.pdf for more details
saveRDS(df, "Data/RNA_seq/Overrepresentation_analysis_on_differentially_expressed_genes.rds")
df=subset(df, p.adjust<=0.1)
library(DOSE)
library(ggplot2)
df=df[order(parse_ratio(df$GeneRatio), decreasing = F),]
df$Description=factor(df$Description, levels=unique(df$Description))
ggplot(df
,aes(
x= parse_ratio(GeneRatio)
,y = Description)
) +
geom_segment(aes(xend=0, yend = Description)) +
geom_point(aes(fill=p.adjust, size = parse_ratio(GeneRatio), shape=diff)) +
scale_shape_manual(values=c(25, 24))+
scale_fill_viridis_c(option = 'D',direction = -1) +
scale_size_continuous(range=c(2, 10)) +
theme_bw() +
xlab("GeneRatio") +
ylab(NULL) +
ggtitle("Enriched Disease Ontology")+
theme(legend.position="bottom", legend.direction="horizontal")