Perform a differential expression analysis using the “pc3” samples in
the Datasets/RNA_seq.zip
folder (ony
protein_coding
genes). See Annotation2.csv
for
information about samples. You need to generate:
ggtexttable
plot with UP and DOWN frequenciesvolcano plot
using the same thresholds of the
previous exercisesKEGG
, HALLMARK
and
REACTOME
Then combine all these plot together with ggpubr
to
obtain a single figure with labels (check the graphical parameters of
each plot to find the best arrangement)
options(stringsAsFactors = F)
f <- list.files(path="Datasets/RNA_seq",
recursive=F,
pattern=".gene.counts"
,full.names=T)
f=f[grep("pc3", f)]
a=read.delim2(f[1], header=T, skip=1)
map2=a[,c("Geneid", "Length","gene_name", "gene_type")]
for(i in f){
b=read.delim2(i, header=T, skip=1)
map2$new=b[,ncol(b)]
map2$new=as.numeric(map2$new)
colnames(map2)[ncol(map2)]=gsub("\\.", "-", gsub(".Aligned.sortedByCoord.out.bam", "", colnames(b)[ncol(b)]))
}
rownames(map2)=map2$Geneid
saveRDS(map2, "expression_counts.rds")
map2=subset(map2, gene_type=="protein_coding")
saveRDS(map2, "expression_counts_protein_coding_genes.rds")
# First we define a function with all required passages for differential expression analysis with DeSeq2
get_DESEQ = function(counts, samples, cond, ctrl,cores=1){
# Perform DESeq analysis
require(DESeq2)
require(BiocParallel)
tmp= floor(as.matrix(counts[,samples])) # DeSeq2 requires integer values
colData <- data.frame(condition=factor(cond))
colData$condition <- relevel(colData$condition, ref = ctrl)
dds <- DESeqDataSetFromMatrix(tmp, colData, formula(~condition)) # this function normalize data
dds <- dds[ rowSums(counts(dds)) > 1, ]
dds <- DESeq(dds, parallel = T, BPPARAM = MulticoreParam(workers = cores))
return(dds)
}
# DE ======
library(DESeq2)
info=read.csv("Datasets/RNA_seq/Annotation2.csv")
rownames(map2) = paste0(map2$gene_name,'_', rownames(map2))
samples=colnames(map2)[5:ncol(map2)]
group=info$Treatment[match(samples, info$Sample_name)]
deg = get_DESEQ(map2
, samples=samples
, cond = group
, ctrl='Control')
res <- results(deg, name="condition_siFOXA1_vs_Control")
res <- res[order(res$pvalue),]
res$gene_name=map2$gene_name[match(rownames(res), rownames(map2))]
res$gene_id=map2$Geneid[match(rownames(res), rownames(map2))]
saveRDS(res, "Differential_expression_deseq2.rds")
library(EnhancedVolcano)
plot1=EnhancedVolcano(toptable = res, x = "log2FoldChange", y = "padj", lab = res$gene_name, pCutoff = 0.01, FCcutoff = 1,axisLabSize = 10,titleLabSize = 10,captionLabSize = 10,labSize = 3,legendLabSize = 10)
dex = subset(res, padj<=0.01 & abs(log2FoldChange)>=1)
dex$diff="Down"
dex$diff[which(sign(dex$log2FoldChange)>0)]="Up"
#table(dex$diff)
library(msigdbr)
library(tidyverse)
library(clusterProfiler)
hallmark = msigdbr(species = "Homo sapiens", collection = 'H') %>% dplyr::select(gs_name, gene_symbol)
df=rbind.data.frame()
for(i in c("Up", "Down")){
genes=subset(dex, diff==i)
g=unique(genes$gene_name)
y = enricher(g
, TERM2GENE = hallmark
, pAdjustMethod = "BH"
, pvalueCutoff=1
, qvalueCutoff=1 )
if(!is.null(y)){
y = y@result
y$Description=fct_reorder(y$Description, as.numeric(sub("/\\d+", "", y$GeneRatio)), .desc = F)
y$rank = rank(y$pvalue)
}
head(y)
y$gene_set="Hallmark"
y$diff=i
df=rbind.data.frame(df, y)
}
df$Description=gsub("HALLMARK_", "", df$Description)
df=subset(df, p.adjust<=0.1)
library(DOSE)
library(ggplot2)
df=df[order(parse_ratio(df$GeneRatio), decreasing = F),]
df$Description=factor(df$Description, levels=unique(df$Description))
plot2=ggplot(df
,aes(
x= parse_ratio(GeneRatio)
,y = Description)
) +
geom_segment(aes(xend=0, yend = Description)) +
geom_point(aes(fill=p.adjust, size = parse_ratio(GeneRatio), shape=diff)) +
scale_shape_manual(values=c(25, 24))+
scale_fill_viridis_c(option = 'D',direction = -1) +
scale_size_continuous(range=c(2, 10)) +
theme_bw() +
xlab("GeneRatio") +
ylab(NULL) +
ggtitle("Hallmark")+
theme(legend.position="bottom", legend.direction="horizontal")
kegg = msigdbr(species = "Homo sapiens", collection = 'C2', subcollection = 'CP:KEGG_LEGACY') %>% dplyr::select(gs_name, gene_symbol)
df=rbind.data.frame()
for(i in c("Up", "Down")){
genes=subset(dex, diff==i)
g=unique(genes$gene_name)
y = enricher(g
, TERM2GENE = kegg
, pAdjustMethod = "BH"
, pvalueCutoff=1
, qvalueCutoff=1 )
if(!is.null(y)){
y = y@result
y$Description=fct_reorder(y$Description, as.numeric(sub("/\\d+", "", y$GeneRatio)), .desc = F)
y$rank = rank(y$pvalue)
}
head(y)
y$gene_set="KEGG"
y$diff=i
df=rbind.data.frame(df, y)
}
df$Description=gsub("KEGG_", "", df$Description)
df=subset(df, p.adjust<=0.001)
df=df[order(parse_ratio(df$GeneRatio), decreasing = F),]
df=df[1:10,]
df$Description=factor(df$Description, levels=unique(df$Description))
plot3=ggplot(df
,aes(
x= parse_ratio(GeneRatio)
,y = Description)
) +
geom_segment(aes(xend=0, yend = Description)) +
geom_point(aes(fill=p.adjust, size = parse_ratio(GeneRatio), shape=diff)) +
scale_shape_manual(values=c(25, 24))+
scale_fill_viridis_c(option = 'D',direction = -1) +
scale_size_continuous(range=c(2, 10)) +
theme_bw() +
xlab("GeneRatio") +
ylab(NULL) +
ggtitle("KEGG")+
theme(legend.position="bottom", legend.direction="horizontal")
reactome = msigdbr(species = "Homo sapiens", collection = 'C2', subcollection = 'CP:REACTOME') %>% dplyr::select(gs_name, gene_symbol)
df=rbind.data.frame()
for(i in c("Up", "Down")){
genes=subset(dex, diff==i)
g=unique(genes$gene_name)
y = enricher(g
, TERM2GENE = reactome
, pAdjustMethod = "BH"
, pvalueCutoff=1
, qvalueCutoff=1 )
if(!is.null(y)){
y = y@result
y$Description=fct_reorder(y$Description, as.numeric(sub("/\\d+", "", y$GeneRatio)), .desc = F)
y$rank = rank(y$pvalue)
}
head(y)
y$gene_set="REACTOME"
y$diff=i
df=rbind.data.frame(df, y)
}
df$Description=gsub("REACTOME_", "", df$Description)
df=subset(df, p.adjust<=0.001)
df=df[order(parse_ratio(df$GeneRatio), decreasing = F),]
df=df[1:10,]
df$Description=factor(df$Description, levels=unique(df$Description))
plot4=ggplot(df
,aes(
x= parse_ratio(GeneRatio)
,y = Description)
) +
geom_segment(aes(xend=0, yend = Description)) +
geom_point(aes(fill=p.adjust, size = parse_ratio(GeneRatio), shape=diff)) +
scale_shape_manual(values=c(25, 24))+
scale_fill_viridis_c(option = 'D',direction = -1) +
scale_size_continuous(range=c(2, 10)) +
theme_bw() +
xlab("GeneRatio") +
ylab(NULL) +
ggtitle("REACTOME")+
theme(legend.position="bottom", legend.direction="horizontal")
library(ggpubr)
freq.table=data.frame(table(dex$diff))
colnames(freq.table)[1]=c("Type")
plot0=ggtexttable(freq.table,rows = NULL,theme = ttheme("minimal"))
ggarrange(ggarrange(plot0,plot1,NULL,nrow = 1,labels = c("A","B")),ggarrange(plot2,plot3,labels = c("C","D")),plot4,ncol = 1,heights = c(1.5,2,2),labels = c(NA,NA,"E"))