RDS
files available from
Datasets
folders:Differential_expression_deseq2_pc3.rds
: dataset
including gene expression results of PC3 cell line replicatesDifferential_expression_deseq2_vcap.rds
: dataset
including gene expression results of VCAP cell line replicatesHow many genes are in each table?
vcap = readRDS('Exercises/Datasets/Differential_expression_deseq2_pc3.rds')
pc3 = readRDS('Exercises/Datasets/Differential_expression_deseq2_vcap.rds')
message("Dim of PC3 table:", dim(pc3)[1] , ' ' , dim(pc3)[2])
## Dim of PC3 table:17890 8
message("Dim of VCAP table:", dim(vcap)[1] , ' ' , dim(vcap)[2])
## Dim of VCAP table:19038 8
padj ≤ 0.01
and |log2FoldChange| ≥ 1
.
Create a new column diff
based on the sign of
log2FoldChange
:
log2FoldChange > 0
= up
log2FoldChange ≤ 0
= down
How many differentially expressed genes are there per cell line model? How many are up-regulated and down-regulated?
Summarize the results in a table.
# PC3
pc3 = subset(pc3, padj <= 0.01 & abs(log2FoldChange) >= 1)
pc3$diff = "Down"
pc3$diff[which(sign(pc3$log2FoldChange)>0)] = "Up"
#VCAP
vcap = subset(vcap, padj <= 0.01 & abs(log2FoldChange) >= 1)
vcap$diff = "Down"
vcap$diff[which(sign(vcap$log2FoldChange)>0)] = "Up"
Venn diagram
to show the intersection of
differentially expressed genes between VCAP and PC3.library(VennDiagram)
library(MetBrewer)
my_palette = met.brewer(name = "Cassatt2", n = 2, type = "discrete")
venn.diagram(
x = list(vcap$gene_id, pc3$gene_id),
category.names = c("VCAP", "PC3"),
filename = "venn_diagram_differentially_expressed.png",
output = TRUE,
# Output features
imagetype = "png" ,
height = 480 ,
width = 480 ,
resolution = 300,
cex = .6,
fontface = "bold",
fontfamily = "sans",
# Set names
cat.cex = 0.2,
cat.fontface = "bold",
cat.default.pos = "outer",
# Circles
lwd = 2,
lty = 'blank',
fill = my_palette
)
## [1] 1
UpSet plot
to visualize the intersection of
the following gene sets:data = list(
vcap_up = subset(vcap, diff == "Up")$gene_id,
vcap_down = subset(vcap, diff == "Down")$gene_id,
pc3_up = subset(pc3, diff == "Up")$gene_id,
pc3_down = subset(pc3, diff == "Down")$gene_id
)
library(UpSetR)
upset(fromList(data), nsets = 4, sets.bar.color = "tan" , main.bar.color = "steelblue4")
log2FoldChange
values from both PC3 and VCAP experiments. How many such genes are
there?Now, create a scatter plot, using:
log2FoldChange
values from both datasetsbaseMean
as the size of the points.Evaluate the correlation between log2FoldChange
values.
up_pc3 = subset(pc3, diff == "Up")
up_common = subset(vcap, gene_id %in% up_pc3$gene_id & diff =='Up')
up_common$log2FoldChange_PC3 = pc3$log2FoldChange[match(up_common$gene_id, pc3$gene_id)]
dim(up_common)
## [1] 151 10
library(ggplot2)
library(ggpubr)
up = ggplot(up_common, aes(x = log2FoldChange, y = log2FoldChange_PC3, size = baseMean)) + geom_point(color = "red") + theme_bw() +
xlab('log2FC VCAP') + ylab('log2FC PC3') + stat_cor()
up
log2FoldChange
values from both PC3 and VCAP experiments. How many such genes are
there?Now, create a scatter plot, using:
log2FoldChange
values from both datasetsbaseMean
as the size of the points.Evaluate the correlation between log2FoldChange
values.
down_pc3 = subset(pc3, diff == "Down")
down_common = subset(vcap, gene_id %in% down_pc3$gene_id & diff == 'Down')
down_common$log2FoldChange_PC3 = pc3$log2FoldChange[match(down_common$gene_id, pc3$gene_id)]
dim(down_common)
## [1] 24 10
down = ggplot(down_common, aes(x = log2FoldChange, y = log2FoldChange_PC3, size = baseMean)) + geom_point(color = "blue") + theme_bw() +
xlab('log2FC VCAP') + ylab('log2FC PC3') + stat_cor()
down
library(ggpubr)
ggarrange(up, down)
log2FoldChange
values:log2FoldChange > 0 and < 2
log2FoldChange between 2 and 5
log2FoldChange > 5
log2FoldChange < 0 and > -2
log2FoldChange between -2 and -5
log2FoldChange < -5
assign_de_category = function(log2fc){
if(log2fc > 0){
if(log2fc < 2){status = "LU"}
if(log2fc >= 2 & log2fc < 5){status = "MU"}
if(log2fc >= 5){status = "HU"}
} else if(log2fc < 0){
if(log2fc > -2){status = "LD"}
if(log2fc >- 5 & log2fc <= -2){status = "MD"}
if(log2fc <= -5){status = "HD"}
}
return(status)
}
all
, by concatenating the
differentially expressed gene dataframes from PC3 and VCAP.Before merging, add a new column to each dataframe to indicate the
cell model (PC3
or VCAP
) for proper
distinction.
vcap$cell_model = "VCAP"
pc3$cell_model = "PC3"
all = rbind.data.frame(as.data.frame(vcap), as.data.frame(pc3))
log2FoldChange
column of the all
dataframe:de_category
.all$de_category <- NA
for(i in 1:nrow(all)){
all$de_category[i] = assign_de_category(all$log2FoldChange[i])
}
de_category
.all$de_category = factor(all$de_category, levels = c("HD", "MD", "LD", "LU", "MU", "HU"))
plot1 = ggplot(all, aes(de_category, fill = cell_model)) + geom_bar(position = 'dodge', col = 'black') + theme_bw() +
scale_fill_manual(values = my_palette)
plot2 = ggplot(all, aes(cell_model, fill = de_category)) + geom_bar(position = 'fill', col = 'black') +
ggsci::scale_fill_rickandmorty() +
theme_bw()
ggarrange(plot1, plot2, nrow = 1)
Now, assess whether gene length impacts the category of differentially expressed genes.
Create a TxDb object from the GTF file:
File: gencode.v28lift37.basic.annotation.gtf.gz
(located in the Datasets
folder).
Extract genes that are present in the all
dataframe.
Compute gene length
for these genes.
Add the gene length
as a new column in the
all
dataframe.
library(GenomicFeatures)
# create TxDb from GTF
txdb = makeTxDbFromGFF('Exercises/Datasets/gencode.v28lift37.basic.annotation.gtf.gz')
# extract genes from TxDb
genes = genes(txdb)
# select genes included in all dataframe
genes = subset(genes, gene_id %in% unique(all$gene_id))
# compute gene length of selected genes
genes_length = width(genes)
# create a dataframe with gene id and gene length
length_df = data.frame('gene_id' = genes$gene_id
, 'length' = genes_length)
# add gene length to dataframe
all$gene_length = length_df$length[match(all$gene_id, length_df$gene_id)]
de_category
.Add Wilcoxon test results to assess whether there is a statistically
significant difference in gene length between PC3 and VCAP within each
de_category
.
ggplot(all, aes(cell_model, gene_length)) + geom_boxplot(notch = TRUE) + facet_wrap(~ de_category ) + scale_y_log10() +
stat_compare_means(label.y.npc = 'bottom') + theme_bw()