2025-03-25

We will go through a full analysis of differential expression.

1. Import and prepare data

Suppose we have Feature Counts results for genes (gene counts). We have to build the expression matrix.

options(stringsAsFactors = F)

f  <- list.files(path="Data/RNA_seq",
                 recursive=F,
                 pattern=".gene.counts"
                 ,full.names=T)


a=read.delim2(f[1], header=T, skip=1)


map2=a[,c("Geneid", "Length","gene_name", "gene_type")]

for(i in f){
  
  b=read.delim2(i, header=T, skip=1)
  map2$new=b[,ncol(b)]
  map2$new=as.numeric(map2$new)
  colnames(map2)[ncol(map2)]=gsub("\\.", "-", gsub(".Aligned.sortedByCoord.out.bam", "", colnames(b)[ncol(b)]))
}


rownames(map2)=map2$Geneid

saveRDS(map2, "Data/RNA_seq/expression_counts.rds")

head(map2)

##                                  Geneid Length    gene_name
## ENSG00000223972.5_2 ENSG00000223972.5_2   1735      DDX11L1
## ENSG00000227232.5_2 ENSG00000227232.5_2   1351       WASH7P
## ENSG00000243485.5_4 ENSG00000243485.5_4   1021 RP11-34P13.3
## ENSG00000237613.2_2 ENSG00000237613.2_2   1219      FAM138A
## ENSG00000268020.3_4 ENSG00000268020.3_4    840       OR4G4P
## ENSG00000240361.2_3 ENSG00000240361.2_3   1414      OR4G11P
##                                              gene_type fxa1-psi03-51-131-010
## ENSG00000223972.5_2 transcribed_unprocessed_pseudogene                     2
## ENSG00000227232.5_2             unprocessed_pseudogene                   120
## ENSG00000243485.5_4                            lincRNA                     3
## ENSG00000237613.2_2                            lincRNA                     1
## ENSG00000268020.3_4             unprocessed_pseudogene                     0
## ENSG00000240361.2_3 transcribed_unprocessed_pseudogene                     2
##                     fxa1-psi04-51-131-010 fxa1-psi07-51-134-110
## ENSG00000223972.5_2                     6                     1
## ENSG00000227232.5_2                   317                   186
## ENSG00000243485.5_4                     3                     0
## ENSG00000237613.2_2                     0                     0
## ENSG00000268020.3_4                     1                     1
## ENSG00000240361.2_3                     4                     1
##                     fxa1-psi09-51-134-010 fxa1-pwt03-51-131-000
## ENSG00000223972.5_2                     2                     2
## ENSG00000227232.5_2                   297                   155
## ENSG00000243485.5_4                     0                     2
## ENSG00000237613.2_2                     0                     0
## ENSG00000268020.3_4                     1                     0
## ENSG00000240361.2_3                     0                     1
##                     fxa1-pwt04-51-131-000 fxa1-pwt07-51-134-100
## ENSG00000223972.5_2                     2                     0
## ENSG00000227232.5_2                   178                    86
## ENSG00000243485.5_4                     1                     3
## ENSG00000237613.2_2                     0                     0
## ENSG00000268020.3_4                     0                     1
## ENSG00000240361.2_3                     0                     0
##                     fxa1-pwt09-51-134-000
## ENSG00000223972.5_2                     1
## ENSG00000227232.5_2                   407
## ENSG00000243485.5_4                     0
## ENSG00000237613.2_2                     1
## ENSG00000268020.3_4                     0
## ENSG00000240361.2_3                     0

# For this analysis we will use only protein coding

map2=subset(map2, gene_type=="protein_coding")

saveRDS(map2, "Data/RNA_seq/expression_counts_protein_coding_genes.rds")

head(map2)

##                                  Geneid Length  gene_name      gene_type
## ENSG00000186092.6_4 ENSG00000186092.6_4   2618      OR4F5 protein_coding
## ENSG00000237683.5     ENSG00000237683.5   2661 AL627309.1 protein_coding
## ENSG00000235249.1     ENSG00000235249.1    995     OR4F29 protein_coding
## ENSG00000284662.1_1 ENSG00000284662.1_1    995     OR4F16 protein_coding
## ENSG00000269831.1     ENSG00000269831.1    129 AL669831.1 protein_coding
## ENSG00000269308.1     ENSG00000269308.1     57 AL645608.2 protein_coding
##                     fxa1-psi03-51-131-010 fxa1-psi04-51-131-010
## ENSG00000186092.6_4                     5                     2
## ENSG00000237683.5                      13                    30
## ENSG00000235249.1                       3                     0
## ENSG00000284662.1_1                     0                     0
## ENSG00000269831.1                       1                     6
## ENSG00000269308.1                       2                     0
##                     fxa1-psi07-51-134-110 fxa1-psi09-51-134-010
## ENSG00000186092.6_4                     2                     2
## ENSG00000237683.5                     132                   196
## ENSG00000235249.1                       0                     0
## ENSG00000284662.1_1                     0                     0
## ENSG00000269831.1                       2                     4
## ENSG00000269308.1                       0                     1
##                     fxa1-pwt03-51-131-000 fxa1-pwt04-51-131-000
## ENSG00000186092.6_4                     2                     0
## ENSG00000237683.5                      20                    16
## ENSG00000235249.1                       1                     0
## ENSG00000284662.1_1                     0                     0
## ENSG00000269831.1                       4                     2
## ENSG00000269308.1                       1                     0
##                     fxa1-pwt07-51-134-100 fxa1-pwt09-51-134-000
## ENSG00000186092.6_4                     0                     2
## ENSG00000237683.5                      45                   179
## ENSG00000235249.1                       0                     0
## ENSG00000284662.1_1                     0                     0
## ENSG00000269831.1                       0                     7
## ENSG00000269308.1                       0                     0

length(unique(map2$Geneid))

## [1] 20298

length(unique(map2$gene_name))

## [1] 20278

nrow(map2)

## [1] 20298

2. Correct data for batches (if any)

We have our expression matrix, nevertheless we need information about our data to proceed. We find them in the Exp_info.csv. Notice that experiments are done on the same cell line but by different operators. As we are not interested in this difference we have to correct for this bias before performing a differential gene expression.

The same if there were any other sources of unwonted diversity.

info=read.csv("Data/RNA_seq/Exp_info.csv")

info

##                 Barcode Cell_line   Treatment Operator
## 1 fxa1-psi03-51-131-010       PC3     treated        A
## 2 fxa1-psi04-51-131-010       PC3     treated        A
## 3 fxa1-psi07-51-134-110       PC3     treated        B
## 4 fxa1-psi09-51-134-010       PC3     treated        B
## 5 fxa1-pwt03-51-131-000       PC3 not_treated        A
## 6 fxa1-pwt04-51-131-000       PC3 not_treated        A
## 7 fxa1-pwt07-51-134-100       PC3 not_treated        B
## 8 fxa1-pwt09-51-134-000       PC3 not_treated        B

library(sva)

batch=info$Operator[match(colnames(map2)[5:ncol(map2)], info$Barcode)]
batch

## [1] "A" "A" "B" "B" "A" "A" "B" "B"

group=info$Treatment[match(colnames(map2)[5:ncol(map2)], info$Barcode)]
group

## [1] "treated"     "treated"     "treated"     "treated"     "not_treated"
## [6] "not_treated" "not_treated" "not_treated"

c=ComBat_seq(as.matrix(map2[,5:ncol(map2)]), batch=batch, group=group) # requires a matrix of counts only

## Found 2 batches
## Using full model in ComBat-seq.
## Adjusting for 1 covariate(s) or covariate level(s)
## Estimating dispersions
## Fitting the GLM model
## Shrinkage off - using GLM estimates for parameters
## Adjusting the data

c=cbind.data.frame(map2[,1:4], c)
saveRDS(c, "Data/RNA_seq/expression_counts_protein_coding_genes_batch_corrected_by_operator.rds")

3. Perform the Differential Expression Analysis while making a Principal Component Analysis

We will use DESeq2 package from Bioconductor (see https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html for more info)

# First we define a function with all required passages for differential expression analysis with DeSeq2

get_DESEQ = function(counts, samples, cond, ctrl, cores=1){
  
  # Perform DESeq analysis
  require(DESeq2)
  require(BiocParallel)
  
  tmp= floor(as.matrix(counts[,samples])) # DeSeq2 requires integer values
  colData <- data.frame(condition=factor(cond))
  colData$condition <- relevel(colData$condition, ref = ctrl) 
  dds <- DESeqDataSetFromMatrix(tmp, colData, formula(~condition)) # this function normalize data
  
  dds <- dds[ rowSums(counts(dds)) > 1, ]
  
  dds <- DESeq(dds, parallel = T, BPPARAM = MulticoreParam(workers = cores))
  
  dds
  
}

# DE ======

library(DESeq2)

rownames(c) = paste0(c$gene_name,'_', rownames(c))

samples=colnames(c)[grep("fxa1", colnames(c))]

group=info$Treatment[match(samples, info$Barcode)]

deg = get_DESEQ(c
                , samples=samples
                , cond = group
                , ctrl='not_treated')

## converting counts to integer mode

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates: 1 workers

## mean-dispersion relationship

## final dispersion estimates, fitting model and testing: 1 workers

res <- results(deg, name="condition_treated_vs_not_treated") #paste0("condition_", case, "_vs_", control)

res <- res[order(res$pvalue),]

res$gene_name=c$gene_name[match(rownames(res), rownames(c))]
res$gene_id=c$Geneid[match(rownames(res), rownames(c))]

head(res)

## log2 fold change (MLE): condition treated vs not treated 
## Wald test p-value: condition treated vs not treated 
## DataFrame with 6 rows and 8 columns
##                              baseMean log2FoldChange     lfcSE      stat
##                             <numeric>      <numeric> <numeric> <numeric>
## UBA7_ENSG00000182179.12_3     1993.23        3.35314 0.0932504   35.9584
## BTN3A1_ENSG00000026950.16_4   2003.18        2.95638 0.0855382   34.5621
## UBE2L6_ENSG00000156587.15_3   6424.09        2.86761 0.0900144   31.8573
## BTN3A3_ENSG00000111801.15_3   1566.13        2.54437 0.0820215   31.0207
## OAS2_ENSG00000111335.12_2    13807.01        3.55329 0.1157605   30.6952
## BTN3A2_ENSG00000186470.13_4   1478.70        2.65564 0.0875166   30.3445
##                                   pvalue         padj   gene_name
##                                <numeric>    <numeric> <character>
## UBA7_ENSG00000182179.12_3   3.73526e-283 6.21883e-279        UBA7
## BTN3A1_ENSG00000026950.16_4 9.38854e-262 7.81549e-258      BTN3A1
## UBE2L6_ENSG00000156587.15_3 1.04314e-222 5.78910e-219      UBE2L6
## BTN3A3_ENSG00000111801.15_3 2.83156e-211 1.17857e-207      BTN3A3
## OAS2_ENSG00000111335.12_2   6.59275e-207 2.19525e-203        OAS2
## BTN3A2_ENSG00000186470.13_4 2.97288e-202 8.24925e-199      BTN3A2
##                                          gene_id
##                                      <character>
## UBA7_ENSG00000182179.12_3   ENSG00000182179.12_3
## BTN3A1_ENSG00000026950.16_4 ENSG00000026950.16_4
## UBE2L6_ENSG00000156587.15_3 ENSG00000156587.15_3
## BTN3A3_ENSG00000111801.15_3 ENSG00000111801.15_3
## OAS2_ENSG00000111335.12_2   ENSG00000111335.12_2
## BTN3A2_ENSG00000186470.13_4 ENSG00000186470.13_4

saveRDS(res, "Data/RNA_seq/Differential_expression_deseq2.rds")

# Figures ===========

## Estimate dispersion trend and apply a variance stabilizing transformation
vsd <- vst(deg, blind=FALSE)
sampleDists <- dist(t(assay(vsd)))

library(RColorBrewer)
library(pheatmap)
library(ggplot2)

sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- colnames(vsd)
colnames(sampleDistMatrix) <- NULL

colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
              clustering_distance_rows=sampleDists,
              clustering_distance_cols=sampleDists,
              col=colors)

pcaData <- DESeq2::plotPCA(vsd, intgroup=c("condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, color=condition)) +
  geom_point(size=3) + ggrepel::geom_label_repel(aes(label=name))+
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance"))+coord_fixed()+ggpubr::theme_pubr()

# PCA and correlation between samples do not change respect to tested conditions

4. Volcano plot and selection of differentially expressed genes

We will use EnhancedVolcano package from Bioconductor (https://www.bioconductor.org/packages/devel/bioc/vignettes/EnhancedVolcano/inst/doc/EnhancedVolcano.html#introduction)

library(EnhancedVolcano)

EnhancedVolcano(toptable = res, x = "log2FoldChange", y = "padj", lab = res$gene_name, pCutoff = 0.01, FCcutoff = 1)

dex = subset(res, padj<=0.01 & abs(log2FoldChange)>=1)
dex$diff="Down"
dex$diff[which(sign(dex$log2FoldChange)>0)]="Up"

table(dex$diff)

## 
## Down   Up 
##  331  908

5. Over-representation analysis

We will use msigdbr to extract gene sets from https://www.gsea-msigdb.org/gsea/msigdb/ and clusterProfiler to perform the over-representation analysis.

library(msigdbr)
library(tidyverse)
library(clusterProfiler)


kegg = msigdbr(species = "Homo sapiens", collection = 'C2', subcollection = 'CP:KEGG_LEGACY') %>% dplyr::select(gs_name, gene_symbol)


df=rbind.data.frame()

for(i in c("Up", "Down")){
  
  genes=subset(dex, diff==i)
  g=unique(genes$gene_name)

  y = enricher(g
                 , TERM2GENE = kegg
                 , pAdjustMethod = "BH"
                 , pvalueCutoff=1
                 , qvalueCutoff=1 )
  
    if(!is.null(y)){
      y = y@result
      y$Description=fct_reorder(y$Description, as.numeric(sub("/\\d+", "", y$GeneRatio)), .desc = F)
      y$rank = rank(y$pvalue)
    }
  
  head(y)
  
  y$gene_set="Kegg"
  
  y$diff=i
  
  df=rbind.data.frame(df, y)
  
}

head(df)

##                                                                                      ID
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION
## KEGG_TYPE_I_DIABETES_MELLITUS                             KEGG_TYPE_I_DIABETES_MELLITUS
## KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION       KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION
## KEGG_GRAFT_VERSUS_HOST_DISEASE                           KEGG_GRAFT_VERSUS_HOST_DISEASE
## KEGG_ALLOGRAFT_REJECTION                                       KEGG_ALLOGRAFT_REJECTION
## KEGG_CELL_ADHESION_MOLECULES_CAMS                     KEGG_CELL_ADHESION_MOLECULES_CAMS
##                                                                             Description
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION
## KEGG_TYPE_I_DIABETES_MELLITUS                             KEGG_TYPE_I_DIABETES_MELLITUS
## KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION       KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION
## KEGG_GRAFT_VERSUS_HOST_DISEASE                           KEGG_GRAFT_VERSUS_HOST_DISEASE
## KEGG_ALLOGRAFT_REJECTION                                       KEGG_ALLOGRAFT_REJECTION
## KEGG_CELL_ADHESION_MOLECULES_CAMS                     KEGG_CELL_ADHESION_MOLECULES_CAMS
##                                             GeneRatio  BgRatio       pvalue
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION    41/305 264/5244 3.362848e-09
## KEGG_TYPE_I_DIABETES_MELLITUS                  15/305  43/5244 7.210719e-09
## KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION       21/305  88/5244 1.633183e-08
## KEGG_GRAFT_VERSUS_HOST_DISEASE                 14/305  41/5244 3.186525e-08
## KEGG_ALLOGRAFT_REJECTION                       13/305  37/5244 6.790217e-08
## KEGG_CELL_ADHESION_MOLECULES_CAMS              25/305 133/5244 1.161059e-07
##                                                 p.adjust       qvalue
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 4.943386e-07 4.318604e-07
## KEGG_TYPE_I_DIABETES_MELLITUS               5.299879e-07 4.630041e-07
## KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION    8.002595e-07 6.991168e-07
## KEGG_GRAFT_VERSUS_HOST_DISEASE              1.171048e-06 1.023042e-06
## KEGG_ALLOGRAFT_REJECTION                    1.996324e-06 1.744014e-06
## KEGG_CELL_ADHESION_MOLECULES_CAMS           2.844594e-06 2.485073e-06
##                                                                                                                                                                                                                                                                                                    geneID
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION TNFSF10/CSF1/IL15/LIFR/FLT3LG/FAS/CXCL16/CCL28/TNFRSF14/KITLG/CXCL10/HGF/CCR1/IL7R/CXCL9/CXCL11/IL15RA/CX3CL1/IL12RB1/TNFRSF1B/IL6/CXCL8/IL18R1/TNFRSF10C/IL4R/IL1B/TNFRSF6B/IL22RA1/IL7/EGF/ACVRL1/NGFR/IL2RG/CCL25/CXCL1/CXCL6/CTF1/TNFSF13B/CD40/CSF3/CCL5
## KEGG_TYPE_I_DIABETES_MELLITUS                                                                                                                                                                           HLA-E/HLA-F/FAS/HLA-B/HLA-C/ICA1/HLA-A/HLA-G/GAD1/IL1B/HLA-DRA/HLA-DPB1/HLA-DMA/HLA-DPA1/HLA-DRB1
## KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION                                                                                                                             IFI30/PSME2/HLA-E/PSME1/TAP2/TAPBP/HLA-F/TAP1/B2M/HLA-B/HLA-C/CIITA/CD74/CTSS/HLA-A/HLA-G/HLA-DRA/HLA-DPB1/HLA-DMA/HLA-DPA1/HLA-DRB1
## KEGG_GRAFT_VERSUS_HOST_DISEASE                                                                                                                                                                                HLA-E/HLA-F/FAS/HLA-B/HLA-C/HLA-A/HLA-G/IL6/IL1B/HLA-DRA/HLA-DPB1/HLA-DMA/HLA-DPA1/HLA-DRB1
## KEGG_ALLOGRAFT_REJECTION                                                                                                                                                                                          HLA-E/HLA-F/FAS/HLA-B/HLA-C/HLA-A/HLA-G/HLA-DRA/HLA-DPB1/HLA-DMA/HLA-DPA1/HLA-DRB1/CD40
## KEGG_CELL_ADHESION_MOLECULES_CAMS                                                                                                     HLA-E/HLA-F/ICAM1/HLA-B/HLA-C/CD274/PDCD1LG2/HLA-A/CLDN7/HLA-G/SELL/NLGN1/CDH3/HLA-DRA/SELPLG/HLA-DPB1/HLA-DMA/CD226/CLDN23/HLA-DPA1/PTPRC/HLA-DRB1/CNTN1/SDC2/CD40
##                                             Count rank gene_set diff
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION    41    1     Kegg   Up
## KEGG_TYPE_I_DIABETES_MELLITUS                  15    2     Kegg   Up
## KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION       21    3     Kegg   Up
## KEGG_GRAFT_VERSUS_HOST_DISEASE                 14    4     Kegg   Up
## KEGG_ALLOGRAFT_REJECTION                       13    5     Kegg   Up
## KEGG_CELL_ADHESION_MOLECULES_CAMS              25    6     Kegg   Up

df$Description=gsub("KEGG_", "", df$Description)


# What are GeneRatio and BgRatio?

# GeneRatio is the ratio of input genes that are annotated in a term
# BgRatio is the ratio of all genes that are annotated in this term

# Visit also: https://www.cell.com/the-innovation/pdf/S2666-6758(21)00066-7.pdf for more details

saveRDS(df, "Data/RNA_seq/Overrepresentation_analysis_on_differentially_expressed_genes.rds")


df=subset(df, p.adjust<=0.1)


library(DOSE)
library(ggplot2)


df=df[order(parse_ratio(df$GeneRatio), decreasing = F),]
df$Description=factor(df$Description, levels=unique(df$Description))

ggplot(df
       ,aes(
         x=  parse_ratio(GeneRatio)
        ,y = Description)
) + 
  geom_segment(aes(xend=0, yend = Description)) +
  geom_point(aes(fill=p.adjust, size = parse_ratio(GeneRatio), shape=diff)) +
  scale_shape_manual(values=c(25, 24))+
  scale_fill_viridis_c(option = 'D',direction = -1) +
  scale_size_continuous(range=c(2, 10)) +
  theme_bw() + 
  xlab("GeneRatio") +
  ylab(NULL) + 
  ggtitle("Enriched Disease Ontology")+
  theme(legend.position="bottom", legend.direction="horizontal")