Exercise summary

  1. Read the following two RDS files available from Datasets folders:


How many genes are in each table?

vcap = readRDS('Exercises/Datasets/Differential_expression_deseq2_pc3.rds')
pc3 = readRDS('Exercises/Datasets/Differential_expression_deseq2_vcap.rds')



message("Dim of PC3 table:", dim(pc3)[1] , ' ' , dim(pc3)[2])
## Dim of PC3 table:17890 8
message("Dim of VCAP table:", dim(vcap)[1] , ' ' , dim(vcap)[2])
## Dim of VCAP table:19038 8

  1. For each dataset, select differentially expressed genes, defined as genes with:


Create a new column diff based on the sign of log2FoldChange:

How many differentially expressed genes are there per cell line model? How many are up-regulated and down-regulated?


Summarize the results in a table.

# PC3
pc3 = subset(pc3, padj <= 0.01 & abs(log2FoldChange) >= 1)
pc3$diff = "Down"
pc3$diff[which(sign(pc3$log2FoldChange)>0)] = "Up"

#VCAP 
vcap = subset(vcap, padj <= 0.01 & abs(log2FoldChange) >= 1)
vcap$diff = "Down"
vcap$diff[which(sign(vcap$log2FoldChange)>0)] = "Up"

  1. Create a Venn diagram to show the intersection of differentially expressed genes between VCAP and PC3.
    Ensure that the cell line names are labeled for each category in the plot.
library(VennDiagram)
library(MetBrewer)

my_palette = met.brewer(name = "Cassatt2", n = 2, type = "discrete")

venn.diagram(
        x = list(vcap$gene_id, pc3$gene_id),
        category.names = c("VCAP", "PC3"),
        filename = "venn_diagram_differentially_expressed.png",
        output = TRUE,
        
        # Output features
        imagetype = "png" ,
        height = 480 , 
        width = 480 , 
        resolution = 300,
        cex = .6,
        fontface = "bold",
        fontfamily = "sans",
        
        # Set names
        cat.cex = 0.2,
        cat.fontface = "bold",
        cat.default.pos = "outer",
        
        # Circles
        lwd = 2,
        lty = 'blank',
        fill = my_palette
        )
## [1] 1

  1. Create an UpSet plot to visualize the intersection of the following gene sets:
data = list(
  vcap_up = subset(vcap, diff == "Up")$gene_id,
  vcap_down = subset(vcap, diff == "Down")$gene_id,
  pc3_up = subset(pc3, diff == "Up")$gene_id,
  pc3_down = subset(pc3, diff == "Down")$gene_id
)


library(UpSetR)
upset(fromList(data), nsets = 4,  sets.bar.color = "tan" , main.bar.color = "steelblue4")


  1. Select genes that are up-regulated in both PC3 and VCAP cell models. Add the correspondent log2FoldChange values from both PC3 and VCAP experiments. How many such genes are there?

Now, create a scatter plot, using:

Evaluate the correlation between log2FoldChange values.

up_pc3 = subset(pc3, diff == "Up")
up_common = subset(vcap, gene_id %in% up_pc3$gene_id & diff =='Up')

up_common$log2FoldChange_PC3 = pc3$log2FoldChange[match(up_common$gene_id, pc3$gene_id)]

dim(up_common)
## [1] 151  10
library(ggplot2)
library(ggpubr)

up = ggplot(up_common, aes(x = log2FoldChange, y = log2FoldChange_PC3, size = baseMean)) + geom_point(color = "red") + theme_bw() +
  xlab('log2FC VCAP') + ylab('log2FC PC3') + stat_cor()
up


  1. Select genes that are down-regulated in both PC3 and VCAP cell models. Add the correspondent log2FoldChange values from both PC3 and VCAP experiments. How many such genes are there?

Now, create a scatter plot, using:

Evaluate the correlation between log2FoldChange values.

down_pc3 = subset(pc3, diff == "Down")
down_common = subset(vcap, gene_id %in% down_pc3$gene_id & diff == 'Down')

down_common$log2FoldChange_PC3 = pc3$log2FoldChange[match(down_common$gene_id, pc3$gene_id)]

dim(down_common)
## [1] 24 10
down = ggplot(down_common, aes(x = log2FoldChange, y = log2FoldChange_PC3, size = baseMean)) + geom_point(color = "blue") + theme_bw() +
    xlab('log2FC VCAP') + ylab('log2FC PC3') + stat_cor()

down


  1. Arrange the scatter plots of up-regulated and down-regulated genes together in a single layout for better visualization and comparison.
library(ggpubr)

ggarrange(up, down)


  1. Create a function to classify genes based on their log2FoldChange values:
assign_de_category = function(log2fc){
  if(log2fc > 0){
    if(log2fc < 2){status = "LU"}
    if(log2fc >= 2 & log2fc < 5){status = "MU"}
    if(log2fc >= 5){status = "HU"}
    
  } else if(log2fc < 0){
      if(log2fc > -2){status = "LD"}
      if(log2fc >- 5 & log2fc <= -2){status = "MD"}
      if(log2fc <= -5){status = "HD"}  
    
    
    }
  return(status)
}

  1. Create a unified dataframe all, by concatenating the differentially expressed gene dataframes from PC3 and VCAP.

Before merging, add a new column to each dataframe to indicate the cell model (PC3 or VCAP) for proper distinction.

vcap$cell_model = "VCAP"
pc3$cell_model = "PC3"

all = rbind.data.frame(as.data.frame(vcap), as.data.frame(pc3))

  1. Apply the function created in Exercise 8 to all values in the log2FoldChange column of the all dataframe:
all$de_category <- NA
  
for(i in 1:nrow(all)){
  all$de_category[i] = assign_de_category(all$log2FoldChange[i])
}

  1. Create a barplot to visualize the distribution of genes across categories de_category.
all$de_category = factor(all$de_category, levels = c("HD", "MD", "LD", "LU", "MU", "HU"))
plot1 = ggplot(all, aes(de_category, fill = cell_model)) + geom_bar(position = 'dodge', col = 'black') + theme_bw() + 
  scale_fill_manual(values = my_palette)

plot2 = ggplot(all, aes(cell_model, fill = de_category)) + geom_bar(position = 'fill', col = 'black') + 
  ggsci::scale_fill_rickandmorty() + 
  theme_bw()


ggarrange(plot1, plot2, nrow = 1)


  1. Now, assess whether gene length impacts the category of differentially expressed genes.

  2. Create a TxDb object from the GTF file:
    File: gencode.v28lift37.basic.annotation.gtf.gz (located in the Datasets folder).

  3. Extract genes that are present in the all dataframe.

  4. Compute gene length for these genes.

  5. Add the gene length as a new column in the all dataframe.

library(GenomicFeatures)

# create TxDb from GTF
txdb = makeTxDbFromGFF('Exercises/Datasets/gencode.v28lift37.basic.annotation.gtf.gz')

# extract genes from TxDb
genes = genes(txdb)

# select genes included in all dataframe
genes = subset(genes, gene_id %in% unique(all$gene_id))

# compute gene length of selected genes
genes_length = width(genes)

# create a dataframe with gene id and gene length
length_df = data.frame('gene_id' = genes$gene_id
                       , 'length' = genes_length)

# add gene length to dataframe
all$gene_length = length_df$length[match(all$gene_id, length_df$gene_id)]

  1. Create a boxplot to visualize gene lengths across cell line models (PC3 and VCAP) within each differential expression category de_category.

Add Wilcoxon test results to assess whether there is a statistically significant difference in gene length between PC3 and VCAP within each de_category.

ggplot(all, aes(cell_model, gene_length)) + geom_boxplot(notch = TRUE) + facet_wrap(~ de_category ) + scale_y_log10() +
    stat_compare_means(label.y.npc = 'bottom') + theme_bw()