Exercise summary

Read the following two RDS files available from Datasets folders:

Differential_expression_deseq2_pc3.rds: dataset including gene expression results of PC3 cell line replicates
Differential_expression_deseq2_vcap.rds: dataset including gene expression results of VCAP cell line replicates

How many genes are in each table?

vcap = readRDS('Exercises/Datasets/Differential_expression_deseq2_pc3.rds')
pc3 = readRDS('Exercises/Datasets/Differential_expression_deseq2_vcap.rds')



message("Dim of PC3 table:", dim(pc3)[1] , ' ' , dim(pc3)[2])

## Dim of PC3 table:17890 8

message("Dim of VCAP table:", dim(vcap)[1] , ' ' , dim(vcap)[2])

## Dim of VCAP table:19038 8

For each dataset, select differentially expressed genes, defined as genes with:

padj ≤ 0.01 and |log2FoldChange| ≥ 1.

Create a new column diff based on the sign of log2FoldChange:

If log2FoldChange > 0 = up
If log2FoldChange ≤ 0 = down

How many differentially expressed genes are there per cell line model? How many are up-regulated and down-regulated?

Summarize the results in a table.

# PC3
pc3 = subset(pc3, padj <= 0.01 & abs(log2FoldChange) >= 1)
pc3$diff = "Down"
pc3$diff[which(sign(pc3$log2FoldChange)>0)] = "Up"

#VCAP 
vcap = subset(vcap, padj <= 0.01 & abs(log2FoldChange) >= 1)
vcap$diff = "Down"
vcap$diff[which(sign(vcap$log2FoldChange)>0)] = "Up"

Create a Venn diagram to show the intersection of differentially expressed genes between VCAP and PC3.
Ensure that the cell line names are labeled for each category in the plot.

library(VennDiagram)
library(MetBrewer)

my_palette = met.brewer(name = "Cassatt2", n = 2, type = "discrete")

venn.diagram(
        x = list(vcap$gene_id, pc3$gene_id),
        category.names = c("VCAP", "PC3"),
        filename = "venn_diagram_differentially_expressed.png",
        output = TRUE,
        
        # Output features
        imagetype = "png" ,
        height = 480 , 
        width = 480 , 
        resolution = 300,
        cex = .6,
        fontface = "bold",
        fontfamily = "sans",
        
        # Set names
        cat.cex = 0.2,
        cat.fontface = "bold",
        cat.default.pos = "outer",
        
        # Circles
        lwd = 2,
        lty = 'blank',
        fill = my_palette
        )

## [1] 1

Create an UpSet plot to visualize the intersection of the following gene sets:

up-regulated in PC3
up-regulated in VCAP
down-regulated in PC3
down-regulated in VCAP

data = list(
  vcap_up = subset(vcap, diff == "Up")$gene_id,
  vcap_down = subset(vcap, diff == "Down")$gene_id,
  pc3_up = subset(pc3, diff == "Up")$gene_id,
  pc3_down = subset(pc3, diff == "Down")$gene_id
)


library(UpSetR)
upset(fromList(data), nsets = 4,  sets.bar.color = "tan" , main.bar.color = "steelblue4")

Select genes that are up-regulated in both PC3 and VCAP cell models. Add the correspondent log2FoldChange values from both PC3 and VCAP experiments. How many such genes are there?

Now, create a scatter plot, using:

log2FoldChange values from both datasets
baseMean as the size of the points.

Evaluate the correlation between log2FoldChange values.

up_pc3 = subset(pc3, diff == "Up")
up_common = subset(vcap, gene_id %in% up_pc3$gene_id & diff =='Up')

up_common$log2FoldChange_PC3 = pc3$log2FoldChange[match(up_common$gene_id, pc3$gene_id)]

dim(up_common)

## [1] 151  10

library(ggplot2)
library(ggpubr)

up = ggplot(up_common, aes(x = log2FoldChange, y = log2FoldChange_PC3, size = baseMean)) + geom_point(color = "red") + theme_bw() +
  xlab('log2FC VCAP') + ylab('log2FC PC3') + stat_cor()
up

Select genes that are down-regulated in both PC3 and VCAP cell models. Add the correspondent log2FoldChange values from both PC3 and VCAP experiments. How many such genes are there?

Now, create a scatter plot, using:

log2FoldChange values from both datasets
baseMean as the size of the points.

Evaluate the correlation between log2FoldChange values.

down_pc3 = subset(pc3, diff == "Down")
down_common = subset(vcap, gene_id %in% down_pc3$gene_id & diff == 'Down')

down_common$log2FoldChange_PC3 = pc3$log2FoldChange[match(down_common$gene_id, pc3$gene_id)]

dim(down_common)

## [1] 24 10

down = ggplot(down_common, aes(x = log2FoldChange, y = log2FoldChange_PC3, size = baseMean)) + geom_point(color = "blue") + theme_bw() +
    xlab('log2FC VCAP') + ylab('log2FC PC3') + stat_cor()

down

Arrange the scatter plots of up-regulated and down-regulated genes together in a single layout for better visualization and comparison.

library(ggpubr)

ggarrange(up, down)

Create a function to classify genes based on their log2FoldChange values:

LU (Lowly upregulated): log2FoldChange > 0 and < 2
MU (Moderately upregulated): log2FoldChange between 2 and 5
HU (Highly upregulated): log2FoldChange > 5
LD (Lowly downregulated): log2FoldChange < 0 and > -2
MD (Moderately downregulated): log2FoldChange between -2 and -5
HD (Highly downregulated): log2FoldChange < -5

assign_de_category = function(log2fc){
  if(log2fc > 0){
    if(log2fc < 2){status = "LU"}
    if(log2fc >= 2 & log2fc < 5){status = "MU"}
    if(log2fc >= 5){status = "HU"}
    
  } else if(log2fc < 0){
      if(log2fc > -2){status = "LD"}
      if(log2fc >- 5 & log2fc <= -2){status = "MD"}
      if(log2fc <= -5){status = "HD"}  
    
    
    }
  return(status)
}

Create a unified dataframe all, by concatenating the differentially expressed gene dataframes from PC3 and VCAP.

Before merging, add a new column to each dataframe to indicate the cell model (PC3 or VCAP) for proper distinction.

vcap$cell_model = "VCAP"
pc3$cell_model = "PC3"

all = rbind.data.frame(as.data.frame(vcap), as.data.frame(pc3))

Apply the function created in Exercise 8 to all values in the log2FoldChange column of the all dataframe:

Store the results in a new column called de_category.

all$de_category <- NA
  
for(i in 1:nrow(all)){
  all$de_category[i] = assign_de_category(all$log2FoldChange[i])
}

Create a barplot to visualize the distribution of genes across categories de_category.

all$de_category = factor(all$de_category, levels = c("HD", "MD", "LD", "LU", "MU", "HU"))
plot1 = ggplot(all, aes(de_category, fill = cell_model)) + geom_bar(position = 'dodge', col = 'black') + theme_bw() + 
  scale_fill_manual(values = my_palette)

plot2 = ggplot(all, aes(cell_model, fill = de_category)) + geom_bar(position = 'fill', col = 'black') + 
  ggsci::scale_fill_rickandmorty() + 
  theme_bw()


ggarrange(plot1, plot2, nrow = 1)

Now, assess whether gene length impacts the category of differentially expressed genes.
Create a TxDb object from the GTF file:
File: gencode.v28lift37.basic.annotation.gtf.gz (located in the Datasets folder).
Extract genes that are present in the all dataframe.
Compute gene length for these genes.
Add the gene length as a new column in the all dataframe.

library(GenomicFeatures)

# create TxDb from GTF
txdb = makeTxDbFromGFF('Exercises/Datasets/gencode.v28lift37.basic.annotation.gtf.gz')

# extract genes from TxDb
genes = genes(txdb)

# select genes included in all dataframe
genes = subset(genes, gene_id %in% unique(all$gene_id))

# compute gene length of selected genes
genes_length = width(genes)

# create a dataframe with gene id and gene length
length_df = data.frame('gene_id' = genes$gene_id
                       , 'length' = genes_length)

# add gene length to dataframe
all$gene_length = length_df$length[match(all$gene_id, length_df$gene_id)]

Create a boxplot to visualize gene lengths across cell line models (PC3 and VCAP) within each differential expression category de_category.

Add Wilcoxon test results to assess whether there is a statistically significant difference in gene length between PC3 and VCAP within each de_category.

ggplot(all, aes(cell_model, gene_length)) + geom_boxplot(notch = TRUE) + facet_wrap(~ de_category ) + scale_y_log10() +
    stat_compare_means(label.y.npc = 'bottom') + theme_bw()

2025-03-21

Exercise summary