RNA-seq


Perform a differential expression analysis using the “pc3” samples in the Datasets/RNA_seq.zip folder (ony protein_coding genes). See Annotation2.csv for information about samples. You need to generate:

  • a ggtexttable plot with UP and DOWN frequencies
  • a volcano plot using the same thresholds of the previous exercises
  • enrichment plots using KEGG, HALLMARK and REACTOME

Then combine all these plot together with ggpubr to obtain a single figure with labels (check the graphical parameters of each plot to find the best arrangement)


options(stringsAsFactors = F)

f  <- list.files(path="Datasets/RNA_seq",
                 recursive=F,
                 pattern=".gene.counts"
                 ,full.names=T)

f=f[grep("pc3", f)]


a=read.delim2(f[1], header=T, skip=1)


map2=a[,c("Geneid", "Length","gene_name", "gene_type")]

for(i in f){
  
  b=read.delim2(i, header=T, skip=1)
  map2$new=b[,ncol(b)]
  map2$new=as.numeric(map2$new)
  colnames(map2)[ncol(map2)]=gsub("\\.", "-", gsub(".Aligned.sortedByCoord.out.bam", "", colnames(b)[ncol(b)]))
}

rownames(map2)=map2$Geneid

saveRDS(map2, "expression_counts.rds")

map2=subset(map2, gene_type=="protein_coding")

saveRDS(map2, "expression_counts_protein_coding_genes.rds")
# First we define a function with all required passages for differential expression analysis with DeSeq2

get_DESEQ = function(counts, samples, cond, ctrl,cores=1){
  
  # Perform DESeq analysis
  require(DESeq2)
  require(BiocParallel)
  
  tmp= floor(as.matrix(counts[,samples])) # DeSeq2 requires integer values
  colData <- data.frame(condition=factor(cond))
  colData$condition <- relevel(colData$condition, ref = ctrl) 
  dds <- DESeqDataSetFromMatrix(tmp, colData, formula(~condition)) # this function normalize data
  
  dds <- dds[ rowSums(counts(dds)) > 1, ]
  
  dds <- DESeq(dds, parallel = T, BPPARAM = MulticoreParam(workers = cores))
  
  return(dds)
  
}
# DE ======

library(DESeq2)
info=read.csv("Datasets/RNA_seq/Annotation2.csv")

rownames(map2) = paste0(map2$gene_name,'_', rownames(map2))

samples=colnames(map2)[5:ncol(map2)]

group=info$Treatment[match(samples, info$Sample_name)]

deg = get_DESEQ(map2
                , samples=samples
                , cond = group
                , ctrl='Control')

res <- results(deg, name="condition_siFOXA1_vs_Control")

res <- res[order(res$pvalue),]

res$gene_name=map2$gene_name[match(rownames(res), rownames(map2))]
res$gene_id=map2$Geneid[match(rownames(res), rownames(map2))]

saveRDS(res, "Differential_expression_deseq2.rds")
library(EnhancedVolcano)

plot1=EnhancedVolcano(toptable = res, x = "log2FoldChange", y = "padj", lab = res$gene_name, pCutoff = 0.01, FCcutoff = 1,axisLabSize = 10,titleLabSize = 10,captionLabSize = 10,labSize = 3,legendLabSize = 10)


dex = subset(res, padj<=0.01 & abs(log2FoldChange)>=1)
dex$diff="Down"
dex$diff[which(sign(dex$log2FoldChange)>0)]="Up"

#table(dex$diff)
library(msigdbr)
library(tidyverse)
library(clusterProfiler)


hallmark = msigdbr(species = "Homo sapiens", collection = 'H') %>% dplyr::select(gs_name, gene_symbol)


df=rbind.data.frame()

for(i in c("Up", "Down")){
  
  genes=subset(dex, diff==i)
  g=unique(genes$gene_name)

  y = enricher(g
                 , TERM2GENE = hallmark
                 , pAdjustMethod = "BH"
                 , pvalueCutoff=1
                 , qvalueCutoff=1 )
  
    if(!is.null(y)){
      y = y@result
      y$Description=fct_reorder(y$Description, as.numeric(sub("/\\d+", "", y$GeneRatio)), .desc = F)
      y$rank = rank(y$pvalue)
    }
  
  head(y)
  
  y$gene_set="Hallmark"
  
  y$diff=i
  
  df=rbind.data.frame(df, y)
  
}


df$Description=gsub("HALLMARK_", "", df$Description)

df=subset(df, p.adjust<=0.1)


library(DOSE)
library(ggplot2)


df=df[order(parse_ratio(df$GeneRatio), decreasing = F),]
df$Description=factor(df$Description, levels=unique(df$Description))

plot2=ggplot(df
       ,aes(
         x=  parse_ratio(GeneRatio)
        ,y = Description)
) + 
  geom_segment(aes(xend=0, yend = Description)) +
  geom_point(aes(fill=p.adjust, size = parse_ratio(GeneRatio), shape=diff)) +
  scale_shape_manual(values=c(25, 24))+
  scale_fill_viridis_c(option = 'D',direction = -1) +
  scale_size_continuous(range=c(2, 10)) +
  theme_bw() + 
  xlab("GeneRatio") +
  ylab(NULL) + 
  ggtitle("Hallmark")+
  theme(legend.position="bottom", legend.direction="horizontal")
kegg = msigdbr(species = "Homo sapiens", collection = 'C2', subcollection = 'CP:KEGG_LEGACY') %>% dplyr::select(gs_name, gene_symbol)


df=rbind.data.frame()

for(i in c("Up", "Down")){
  
  genes=subset(dex, diff==i)
  g=unique(genes$gene_name)

  y = enricher(g
                 , TERM2GENE = kegg
                 , pAdjustMethod = "BH"
                 , pvalueCutoff=1
                 , qvalueCutoff=1 )
  
    if(!is.null(y)){
      y = y@result
      y$Description=fct_reorder(y$Description, as.numeric(sub("/\\d+", "", y$GeneRatio)), .desc = F)
      y$rank = rank(y$pvalue)
    }
  
  head(y)
  
  y$gene_set="KEGG"
  
  y$diff=i
  
  df=rbind.data.frame(df, y)
  
}


df$Description=gsub("KEGG_", "", df$Description)

df=subset(df, p.adjust<=0.001)

df=df[order(parse_ratio(df$GeneRatio), decreasing = F),]
df=df[1:10,]
df$Description=factor(df$Description, levels=unique(df$Description))

plot3=ggplot(df
       ,aes(
         x=  parse_ratio(GeneRatio)
        ,y = Description)
) + 
  geom_segment(aes(xend=0, yend = Description)) +
  geom_point(aes(fill=p.adjust, size = parse_ratio(GeneRatio), shape=diff)) +
  scale_shape_manual(values=c(25, 24))+
  scale_fill_viridis_c(option = 'D',direction = -1) +
  scale_size_continuous(range=c(2, 10)) +
  theme_bw() + 
  xlab("GeneRatio") +
  ylab(NULL) + 
  ggtitle("KEGG")+
  theme(legend.position="bottom", legend.direction="horizontal")
reactome = msigdbr(species = "Homo sapiens", collection = 'C2', subcollection = 'CP:REACTOME') %>% dplyr::select(gs_name, gene_symbol)


df=rbind.data.frame()

for(i in c("Up", "Down")){
  
  genes=subset(dex, diff==i)
  g=unique(genes$gene_name)

  y = enricher(g
                 , TERM2GENE = reactome
                 , pAdjustMethod = "BH"
                 , pvalueCutoff=1
                 , qvalueCutoff=1 )
  
    if(!is.null(y)){
      y = y@result
      y$Description=fct_reorder(y$Description, as.numeric(sub("/\\d+", "", y$GeneRatio)), .desc = F)
      y$rank = rank(y$pvalue)
    }
  
  head(y)
  
  y$gene_set="REACTOME"
  
  y$diff=i
  
  df=rbind.data.frame(df, y)
  
}


df$Description=gsub("REACTOME_", "", df$Description)

df=subset(df, p.adjust<=0.001)

df=df[order(parse_ratio(df$GeneRatio), decreasing = F),]
df=df[1:10,]
df$Description=factor(df$Description, levels=unique(df$Description))

plot4=ggplot(df
       ,aes(
         x=  parse_ratio(GeneRatio)
        ,y = Description)
) + 
  geom_segment(aes(xend=0, yend = Description)) +
  geom_point(aes(fill=p.adjust, size = parse_ratio(GeneRatio), shape=diff)) +
  scale_shape_manual(values=c(25, 24))+
  scale_fill_viridis_c(option = 'D',direction = -1) +
  scale_size_continuous(range=c(2, 10)) +
  theme_bw() + 
  xlab("GeneRatio") +
  ylab(NULL) + 
  ggtitle("REACTOME")+
  theme(legend.position="bottom", legend.direction="horizontal")
library(ggpubr)
freq.table=data.frame(table(dex$diff))
colnames(freq.table)[1]=c("Type")
plot0=ggtexttable(freq.table,rows = NULL,theme = ttheme("minimal"))
ggarrange(ggarrange(plot0,plot1,NULL,nrow = 1,labels = c("A","B")),ggarrange(plot2,plot3,labels = c("C","D")),plot4,ncol = 1,heights = c(1.5,2,2),labels = c(NA,NA,"E"))