RNA-seq


  1. Following steps you saw in the theoretical lesson, make a differential expression analysis. All the data are in the Datasets/RNA_seq.zip folder. See Annotation.csv for information about samples. Data have been retrieved from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE193127 and contain expression counts for 6 prostate cancer samples of VCaP cells. Three of them have been treated with siRNA of FOXA1 and the other 3 are controls.

  1. Load gene.counts file with “vcap” in the name. Use only protein coding genes to obtain a Feature counts matrix
options(stringsAsFactors = F)

f  <- list.files(path="Datasets/RNA_seq",
                 recursive=F,
                 pattern=".gene.counts"
                 ,full.names=T)

f=f[grep("pc3", f, invert = T)]


a=read.delim2(f[1], header=T, skip=1)


map2=a[,c("Geneid", "Length","gene_name", "gene_type")]

for(i in f){
  
  b=read.delim2(i, header=T, skip=1)
  map2$new=b[,ncol(b)]
  map2$new=as.numeric(map2$new)
  colnames(map2)[ncol(map2)]=gsub("\\.", "-", gsub(".Aligned.sortedByCoord.out.bam", "", colnames(b)[ncol(b)]))
}


rownames(map2)=map2$Geneid

saveRDS(map2, "expression_counts.rds")

#head(map2)


# For this analysis we will use only protein coding

map2=subset(map2, gene_type=="protein_coding")

saveRDS(map2, "expression_counts_protein_coding_genes.rds")

head(map2)
##                                  Geneid Length  gene_name      gene_type nsi-R1
## ENSG00000186092.6_4 ENSG00000186092.6_4   2618      OR4F5 protein_coding   2.12
## ENSG00000237683.5     ENSG00000237683.5   2661 AL627309.1 protein_coding 124.92
## ENSG00000235249.1     ENSG00000235249.1    995     OR4F29 protein_coding   3.66
## ENSG00000284662.1_1 ENSG00000284662.1_1    995     OR4F16 protein_coding   2.91
## ENSG00000269831.1     ENSG00000269831.1    129 AL669831.1 protein_coding   7.50
## ENSG00000269308.1     ENSG00000269308.1     57 AL645608.2 protein_coding   0.00
##                     nsi-R2 nsi-R4 siFOXA1-R1 siFOXA1-R2 siFOXA1-R4
## ENSG00000186092.6_4   0.54   0.50       3.50       1.21       3.00
## ENSG00000237683.5   123.30 131.75     125.53     145.73     117.36
## ENSG00000235249.1     5.36   1.82       2.28       3.26       1.16
## ENSG00000284662.1_1   4.86   1.57       1.78       2.59       1.16
## ENSG00000269831.1    17.33  15.00      16.25      11.00      19.50
## ENSG00000269308.1     0.00   0.00       0.00       0.00       0.00
dim(map2)
## [1] 20298    10
#length(unique(map2$Geneid))
#length(unique(map2$gene_name))

#nrow(map2)

  1. Define a function with all required passages for differential expression analysis with DeSeq2. Use sample info annotated in Annotation.csv to distinguish between case and controls
# First we define a function with all required passages for differential expression analysis with DeSeq2

get_DESEQ = function(counts, samples, cond, ctrl,cores=1){
  
  # Perform DESeq analysis
  require(DESeq2)
  require(BiocParallel)
  
  tmp= floor(as.matrix(counts[,samples])) # DeSeq2 requires integer values
  colData <- data.frame(condition=factor(cond))
  colData$condition <- relevel(colData$condition, ref = ctrl) 
  dds <- DESeqDataSetFromMatrix(tmp, colData, formula(~condition)) # this function normalize data
  
  dds <- dds[ rowSums(counts(dds)) > 1, ]
  
  dds <- DESeq(dds, parallel = T, BPPARAM = MulticoreParam(workers = cores))
  
  return(dds)
  
}
# DE ======

library(DESeq2)
info=read.csv("Datasets/RNA_seq/Annotation.csv")

rownames(map2) = paste0(map2$gene_name,'_', rownames(map2))

samples=colnames(map2)[5:ncol(map2)]

group=info$Treatment[match(samples, info$Sample_name)]

deg = get_DESEQ(map2
                , samples=samples
                , cond = group
                , ctrl='Control')
## Loading required package: BiocParallel
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 1 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 1 workers
res <- results(deg, name="condition_siFOXA1_vs_Control")

res <- res[order(res$pvalue),]

res$gene_name=map2$gene_name[match(rownames(res), rownames(map2))]
res$gene_id=map2$Geneid[match(rownames(res), rownames(map2))]

head(res)
## log2 fold change (MLE): condition siFOXA1 vs Control 
## Wald test p-value: condition siFOXA1 vs Control 
## DataFrame with 6 rows and 8 columns
##                              baseMean log2FoldChange     lfcSE      stat
##                             <numeric>      <numeric> <numeric> <numeric>
## FGFRL1_ENSG00000127418.14_3   6200.61       -2.79986 0.0692654  -40.4222
## ABCG2_ENSG00000118777.11_4    1697.39        5.96884 0.1413869   42.2163
## FBXO32_ENSG00000156804.7_2    5535.90        5.29011 0.0976957   54.1489
## TGFB3_ENSG00000119699.7_2     2198.95        6.91383 0.1745355   39.6128
## AMOT_ENSG00000126016.15_3     2542.08        3.66825 0.0933480   39.2965
## BTG1_ENSG00000133639.4_3      6240.26        3.03103 0.0832533   36.4073
##                                   pvalue         padj   gene_name
##                                <numeric>    <numeric> <character>
## FGFRL1_ENSG00000127418.14_3  0.00000e+00  0.00000e+00      FGFRL1
## ABCG2_ENSG00000118777.11_4   0.00000e+00  0.00000e+00       ABCG2
## FBXO32_ENSG00000156804.7_2   0.00000e+00  0.00000e+00      FBXO32
## TGFB3_ENSG00000119699.7_2    0.00000e+00  0.00000e+00       TGFB3
## AMOT_ENSG00000126016.15_3    0.00000e+00  0.00000e+00        AMOT
## BTG1_ENSG00000133639.4_3    3.26186e-290 9.15931e-287        BTG1
##                                          gene_id
##                                      <character>
## FGFRL1_ENSG00000127418.14_3 ENSG00000127418.14_3
## ABCG2_ENSG00000118777.11_4  ENSG00000118777.11_4
## FBXO32_ENSG00000156804.7_2   ENSG00000156804.7_2
## TGFB3_ENSG00000119699.7_2    ENSG00000119699.7_2
## AMOT_ENSG00000126016.15_3   ENSG00000126016.15_3
## BTG1_ENSG00000133639.4_3     ENSG00000133639.4_3
saveRDS(res, "Differential_expression_deseq2.rds")

  1. Visualize the results with a Volcano plot using 0.01 as cutoff for padj and 1 as cutoff for log2FoldChange to identify differentially expressed genes (DEGs)
library(EnhancedVolcano)

EnhancedVolcano(toptable = res, x = "log2FoldChange", y = "padj", lab = res$gene_name, pCutoff = 0.01, FCcutoff = 1)

dex = subset(res, padj<=0.01 & abs(log2FoldChange)>=1)
dex$diff="Down"
dex$diff[which(sign(dex$log2FoldChange)>0)]="Up"

table(dex$diff)
## 
## Down   Up 
##  754 1145

  1. Perform an over-representation analysis using DEGs previously identified. Use msigdbr to extract KEGG gene sets and clusterProfiler to perform the over-representation analysis.
library(msigdbr)
library(tidyverse)
library(clusterProfiler)


kegg = msigdbr(species = "Homo sapiens", collection = 'C2',subcollection = "CP:KEGG_LEGACY") %>% dplyr::select(gs_name, gene_symbol)


df=rbind.data.frame()

for(i in c("Up", "Down")){
  
  genes=subset(dex, diff==i)
  g=unique(genes$gene_name)

  y = enricher(g
                 , TERM2GENE = kegg
                 , pAdjustMethod = "BH"
                 , pvalueCutoff=1
                 , qvalueCutoff=1 )
  
    if(!is.null(y)){
      y = y@result
      y$Description=fct_reorder(y$Description, as.numeric(sub("/\\d+", "", y$GeneRatio)), .desc = F)
      y$rank = rank(y$pvalue)
    }
  
  head(y)
  
  y$gene_set="Kegg"
  
  y$diff=i
  
  df=rbind.data.frame(df, y)
  
}

head(df)
##                                                                                                  ID
## KEGG_STEROID_HORMONE_BIOSYNTHESIS                                 KEGG_STEROID_HORMONE_BIOSYNTHESIS
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS         KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM                 KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM
## KEGG_RETINOL_METABOLISM                                                     KEGG_RETINOL_METABOLISM
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM                       KEGG_ASCORBATE_AND_ALDARATE_METABOLISM
##                                                                                         Description
## KEGG_STEROID_HORMONE_BIOSYNTHESIS                                 KEGG_STEROID_HORMONE_BIOSYNTHESIS
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS         KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM                 KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM
## KEGG_RETINOL_METABOLISM                                                     KEGG_RETINOL_METABOLISM
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM                       KEGG_ASCORBATE_AND_ALDARATE_METABOLISM
##                                                   GeneRatio BgRatio
## KEGG_STEROID_HORMONE_BIOSYNTHESIS                    18/336 55/5244
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS        11/336 28/5244
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450    17/336 70/5244
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM            12/336 41/5244
## KEGG_RETINOL_METABOLISM                              15/336 64/5244
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM                9/336 25/5244
##                                                         pvalue     p.adjust
## KEGG_STEROID_HORMONE_BIOSYNTHESIS                 3.436528e-09 5.429714e-07
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS     5.084707e-07 4.016918e-05
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 1.194397e-06 6.290491e-05
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM         5.725068e-06 2.261402e-04
## KEGG_RETINOL_METABOLISM                           8.229556e-06 2.600540e-04
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM            1.335901e-05 3.517873e-04
##                                                         qvalue
## KEGG_STEROID_HORMONE_BIOSYNTHESIS                 4.919661e-07
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS     3.639580e-05
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 5.699579e-05
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM         2.048972e-04
## KEGG_RETINOL_METABOLISM                           2.356252e-04
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM            3.187414e-04
##                                                                                                                                                                              geneID
## KEGG_STEROID_HORMONE_BIOSYNTHESIS                 AKR1C1/HSD3B1/AKR1C3/HSD17B2/UGT1A6/CYP1A1/CYP3A5/UGT1A5/AKR1C2/HSD11B2/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7/SULT2B1
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS                                                            CRYL1/UGP2/UGT1A6/UGT1A5/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450            AKR1C1/AKR1C3/UGT1A6/CYP1A1/CYP3A5/UGT1A5/AKR1C2/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7/GSTA1/CYP2C18/GSTA2
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM                                                           HMOX1/ALAD/UGT1A6/UGT1A5/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7/FTH1
## KEGG_RETINOL_METABOLISM                                                  UGT1A6/CYP1A1/CYP3A5/UGT1A5/UGT1A1/UGT1A8/UGT1A3/BCO1/UGT1A10/UGT1A9/UGT1A4/UGT1A7/CYP26A1/ALDH1A2/CYP2C18
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM                                                                              UGT1A6/UGT1A5/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7
##                                                   Count rank gene_set diff
## KEGG_STEROID_HORMONE_BIOSYNTHESIS                    18    1     Kegg   Up
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS        11    2     Kegg   Up
## KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450    17    3     Kegg   Up
## KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM            12    4     Kegg   Up
## KEGG_RETINOL_METABOLISM                              15    5     Kegg   Up
## KEGG_ASCORBATE_AND_ALDARATE_METABOLISM                9    6     Kegg   Up
df$Description=gsub("KEGG_", "", df$Description)

df=subset(df, p.adjust<=0.1)


library(DOSE)
library(ggplot2)


df=df[order(parse_ratio(df$GeneRatio), decreasing = F),]
df$Description=factor(df$Description, levels=unique(df$Description))

ggplot(df
       ,aes(
         x=  parse_ratio(GeneRatio)
        ,y = Description)
) + 
  geom_segment(aes(xend=0, yend = Description)) +
  geom_point(aes(fill=p.adjust, size = parse_ratio(GeneRatio), shape=diff)) +
  scale_shape_manual(values=c(25, 24))+
  scale_fill_viridis_c(option = 'D',direction = -1) +
  scale_size_continuous(range=c(2, 10)) +
  theme_bw() + 
  xlab("GeneRatio") +
  ylab(NULL) + 
  ggtitle("Enriched Disease Ontology")+
  theme(legend.position="bottom", legend.direction="horizontal")


  1. Perform an over-representation analysis using DEGs previously identified. Use msigdbr to extract REACTOME gene sets and clusterProfiler to perform the over-representation analysis (check https://www.gsea-msigdb.org/gsea/msigdb/human/genesets.jsp?collection=H).
library(msigdbr)
library(tidyverse)
library(clusterProfiler)


reactome = msigdbr(species = "Homo sapiens", collection = 'C2', subcollection = 'CP:REACTOME') %>% dplyr::select(gs_name, gene_symbol)


df=rbind.data.frame()

for(i in c("Up", "Down")){
  
  genes=subset(dex, diff==i)
  g=unique(genes$gene_name)

  y = enricher(g
                 , TERM2GENE = reactome
                 , pAdjustMethod = "BH"
                 , pvalueCutoff=1
                 , qvalueCutoff=1 )
  
    if(!is.null(y)){
      y = y@result
      y$Description=fct_reorder(y$Description, as.numeric(sub("/\\d+", "", y$GeneRatio)), .desc = F)
      y$rank = rank(y$pvalue)
    }
  
  head(y)
  
  y$gene_set="REACTOME"
  
  y$diff=i
  
  df=rbind.data.frame(df, y)
  
}

head(df)
##                                                                                    ID
## REACTOME_BIOLOGICAL_OXIDATIONS                         REACTOME_BIOLOGICAL_OXIDATIONS
## REACTOME_DRUG_ADME                                                 REACTOME_DRUG_ADME
## REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS
## REACTOME_PARACETAMOL_ADME                                   REACTOME_PARACETAMOL_ADME
## REACTOME_GLUCURONIDATION                                     REACTOME_GLUCURONIDATION
## REACTOME_ASPIRIN_ADME                                           REACTOME_ASPIRIN_ADME
##                                                                           Description
## REACTOME_BIOLOGICAL_OXIDATIONS                         REACTOME_BIOLOGICAL_OXIDATIONS
## REACTOME_DRUG_ADME                                                 REACTOME_DRUG_ADME
## REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS
## REACTOME_PARACETAMOL_ADME                                   REACTOME_PARACETAMOL_ADME
## REACTOME_GLUCURONIDATION                                     REACTOME_GLUCURONIDATION
## REACTOME_ASPIRIN_ADME                                           REACTOME_ASPIRIN_ADME
##                                            GeneRatio   BgRatio       pvalue
## REACTOME_BIOLOGICAL_OXIDATIONS                43/665 220/11290 1.947395e-12
## REACTOME_DRUG_ADME                            27/665 109/11290 1.045996e-10
## REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS    25/665 109/11290 2.927645e-09
## REACTOME_PARACETAMOL_ADME                     11/665  29/11290 3.554036e-07
## REACTOME_GLUCURONIDATION                      10/665  25/11290 6.848318e-07
## REACTOME_ASPIRIN_ADME                         13/665  44/11290 8.677656e-07
##                                                p.adjust       qvalue
## REACTOME_BIOLOGICAL_OXIDATIONS             1.914289e-09 1.742406e-09
## REACTOME_DRUG_ADME                         5.141068e-08 4.679454e-08
## REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS 9.592916e-07 8.731572e-07
## REACTOME_PARACETAMOL_ADME                  8.734044e-05 7.949818e-05
## REACTOME_GLUCURONIDATION                   1.346379e-04 1.225489e-04
## REACTOME_ASPIRIN_ADME                      1.421689e-04 1.294036e-04
##                                                                                                                                                                                                                                                                                                                                                  geneID
## REACTOME_BIOLOGICAL_OXIDATIONS             CYP2J2/CYP4F11/MAOA/CYP4F3/AOC1/CYP4F22/CYP4B1/PTGS1/CYP4F8/GLYATL1/SULT2A1/UGP2/SULT1C2/GLYATL2/UGT1A6/CYP2W1/CYP1A1/NR1H4/CYP3A5/UGT1A5/UGT1A1/UGT1A8/UGT1A3/CYP4F12/UGT1A10/UGT1A9/UGT1A4/UGT1A7/DPEP1/TPST2/GSTA1/SULT1C4/SULT1B1/CHAC1/SULT2B1/CYP26A1/GGT6/SULT1A4/CYP2C18/SULT1A3/GSTA2/CYP46A1/GLYAT
## REACTOME_DRUG_ADME                                                                                                                                              ABCG2/BCHE/VAV3/AKR1C1/GLYATL1/SULT2A1/GLYATL2/UGT1A6/XDH/ABCC3/UGT1A5/HSD11B2/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7/GSTA1/SULT1C4/GGT6/SULT1A4/SULT1A3/SLC28A3/GSTA2/GLYAT
## REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS                                                                                                                              GLYATL1/SULT2A1/UGP2/SULT1C2/GLYATL2/UGT1A6/UGT1A5/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7/TPST2/GSTA1/SULT1C4/SULT1B1/CHAC1/SULT2B1/GGT6/SULT1A4/SULT1A3/GSTA2/GLYAT
## REACTOME_PARACETAMOL_ADME                                                                                                                                                                                                                                                 ABCG2/SULT2A1/UGT1A6/ABCC3/UGT1A1/UGT1A10/UGT1A9/SULT1C4/GGT6/SULT1A4/SULT1A3
## REACTOME_GLUCURONIDATION                                                                                                                                                                                                                                                           UGP2/UGT1A6/UGT1A5/UGT1A1/UGT1A8/UGT1A3/UGT1A10/UGT1A9/UGT1A4/UGT1A7
## REACTOME_ASPIRIN_ADME                                                                                                                                                                                                                                          BCHE/GLYATL1/GLYATL2/UGT1A6/ABCC3/UGT1A5/UGT1A1/UGT1A8/UGT1A3/UGT1A9/UGT1A4/UGT1A7/GLYAT
##                                            Count rank gene_set diff
## REACTOME_BIOLOGICAL_OXIDATIONS                43    1 REACTOME   Up
## REACTOME_DRUG_ADME                            27    2 REACTOME   Up
## REACTOME_PHASE_II_CONJUGATION_OF_COMPOUNDS    25    3 REACTOME   Up
## REACTOME_PARACETAMOL_ADME                     11    4 REACTOME   Up
## REACTOME_GLUCURONIDATION                      10    5 REACTOME   Up
## REACTOME_ASPIRIN_ADME                         13    6 REACTOME   Up
df$Description=gsub("REACTOME_", "", df$Description)

df=subset(df, p.adjust<=0.1)


library(DOSE)
library(ggplot2)


df=df[order(parse_ratio(df$GeneRatio), decreasing = F),]
df$Description=factor(df$Description, levels=unique(df$Description))

ggplot(df
       ,aes(
         x=  parse_ratio(GeneRatio)
        ,y = Description)
) + 
  geom_segment(aes(xend=0, yend = Description)) +
  geom_point(aes(fill=p.adjust, size = parse_ratio(GeneRatio), shape=diff)) +
  scale_shape_manual(values=c(25, 24))+
  scale_fill_viridis_c(option = 'D',direction = -1) +
  scale_size_continuous(range=c(2, 10)) +
  theme_bw() + 
  xlab("GeneRatio") +
  ylab(NULL) + 
  ggtitle("Enriched Disease Ontology")+
  theme(legend.position="bottom", legend.direction="horizontal")