GenomicFeatures and GenomicRanges packages

Read the CpG island data as previously shown (refer to the Input/Output or Conditional/Loops exercises). Assign it to the variable cpgi, using stringsAsFactors = FALSE.

cpgi = read.csv("Exercises/Datasets/CpGi.table.hg18.csv", stringsAsFactors = F)

Subset cpgi, keeping only CpG islands located on canonical chromosomes (chr1–chr22, chrX, and chrY). Reassign the result to cpgi.

cpgi = subset(cpgi,chrom %in% c(paste0("chr", 1:22), "chrX", "chrY") )

Convert cpgi into a GRanges Object. Use the options keep.extra.columns = TRUE and ignore.strand = TRUE.
Hint: If necessary, adjust the seqnames.field, start.field, end.field, and strand.field options.

library(GenomicRanges)
cpgi = makeGRangesFromDataFrame(cpgi,
                         , keep.extra.columns = TRUE
                         , ignore.strand = TRUE
                         , seqinfo = NULL
                         , seqnames.field = "chrom"
                         , start.field = "chromStart"
                         , end.field = "chromEnd")

cpgi

## GRanges object with 27639 ranges and 7 metadata columns:
##           seqnames            ranges strand |        name    length    cpgNum
##              <Rle>         <IRanges>  <Rle> | <character> <integer> <integer>
##       [1]     chr1       18598-19673      * |    CpG: 116      1075       116
##       [2]     chr1     124987-125426      * |     CpG: 30       439        30
##       [3]     chr1     317653-318092      * |     CpG: 29       439        29
##       [4]     chr1     427014-428027      * |     CpG: 84      1013        84
##       [5]     chr1     439136-440407      * |     CpG: 99      1271        99
##       ...      ...               ...    ... .         ...       ...       ...
##   [27635]    chr22 49482536-49482984      * |     CpG: 44       448        44
##   [27636]    chr22 49489668-49490174      * |     CpG: 38       506        38
##   [27637]    chr22 49505252-49506926      * |    CpG: 167      1674       167
##   [27638]    chr22 49515893-49516885      * |     CpG: 81       992        81
##   [27639]    chr22 49568638-49569183      * |     CpG: 63       545        63
##               gcNum    perCpg     perGc    obsExp
##           <integer> <numeric> <numeric> <numeric>
##       [1]       787      21.6      73.2      0.83
##       [2]       295      13.7      67.2      0.64
##       [3]       295      13.2      67.2      0.62
##       [4]       734      16.6      72.5      0.64
##       [5]       777      15.6      61.1      0.84
##       ...       ...       ...       ...       ...
##   [27635]       279      19.6      62.3      1.01
##   [27636]       328      15.0      64.8      0.72
##   [27637]      1153      20.0      68.9      0.85
##   [27638]       564      16.3      56.9      1.02
##   [27639]       407      23.1      74.7      0.83
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths

Assign the values from the name column as the names of the object using names(), then remove the name column.

names(cpgi) = cpgi$name
cpgi$name <- NULL

Install the TxDb package related to the human UCSC hg18 assembly, then load it and assign it to the variable genome. Take some time to explore this object.

BiocManager::install("TxDb.Hsapiens.UCSC.hg18.knownGene")

library(TxDb.Hsapiens.UCSC.hg18.knownGene)

genome = TxDb.Hsapiens.UCSC.hg18.knownGene

Extract promoters from the genome object and assign them to the variable prom. Extend the TSS by 1000 bp upstream and 100 bp downstream (into the gene body).
How many ranges do you obtain?
(If you receive a warning message, ignore it: it refers to a non-canonical chromosome)

library(GenomicFeatures)
prom = promoters(genome
                 , upstream = 1000
                 , downstream = 100)
prom

## GRanges object with 66803 ranges and 2 metadata columns:
##                 seqnames          ranges strand |     tx_id     tx_name
##                    <Rle>       <IRanges>  <Rle> | <integer> <character>
##   uc001aaa.2        chr1        116-1215      + |         1  uc001aaa.2
##   uc009vip.1        chr1        116-1215      + |         2  uc009vip.1
##   uc009vjg.1        chr1     18418-19517      + |         3  uc009vjg.1
##   uc009vjh.1        chr1     54425-55524      + |         4  uc009vjh.1
##   uc001aal.1        chr1     57954-59053      + |         5  uc001aal.1
##          ...         ...             ...    ... .       ...         ...
##   uc004foe.2 chrX_random   982088-983187      - |     66799  uc004foe.2
##   uc010nvr.1 chrX_random   982088-983187      - |     66800  uc010nvr.1
##   uc010nvs.1 chrX_random   982088-983187      - |     66801  uc010nvs.1
##   uc004foh.1 chrX_random 1326518-1327617      - |     66802  uc004foh.1
##   uc004foj.1 chrX_random 1397361-1398460      - |     66803  uc004foj.1
##   -------
##   seqinfo: 49 sequences (1 circular) from hg18 genome

Keep only the ranges that belong to canonical chromosomes (chr1–chr22, chrX, and chrY), and reassign the result to prom.
After subsetting, remember to update seqlevels.

prom = subset(prom, seqnames(prom) %in% c(paste0("chr", 1:22), "chrX", "chrY"))
seqlevels(prom)

##  [1] "chr1"          "chr2"          "chr3"          "chr4"         
##  [5] "chr5"          "chr6"          "chr7"          "chr8"         
##  [9] "chr9"          "chr10"         "chr11"         "chr12"        
## [13] "chr13"         "chr14"         "chr15"         "chr16"        
## [17] "chr17"         "chr18"         "chr19"         "chr20"        
## [21] "chr21"         "chr22"         "chrX"          "chrY"         
## [25] "chrM"          "chr1_random"   "chr2_random"   "chr3_random"  
## [29] "chr4_random"   "chr5_h2_hap1"  "chr5_random"   "chr6_cox_hap1"
## [33] "chr6_qbl_hap2" "chr6_random"   "chr7_random"   "chr8_random"  
## [37] "chr9_random"   "chr10_random"  "chr11_random"  "chr13_random" 
## [41] "chr15_random"  "chr16_random"  "chr17_random"  "chr18_random" 
## [45] "chr19_random"  "chr21_random"  "chr22_h2_hap1" "chr22_random" 
## [49] "chrX_random"

seqlevels(prom) <- c(paste0("chr", 1:22), "chrX", "chrY")

Retrieve the CpG islands that overlap with promoters and assign them to a new variable cpg_prom.

How many CpG islands are present in the initial cpgi object?
How many CpG islands intersect with promoters prom?

Hint: When retrieving the query hits from the overlap, use the unique() function to avoid redundant ranges.

ov = findOverlaps(cpgi, prom)

cpg_prom = unique(cpgi[queryHits(ov)])

length(cpgi)

## [1] 27639

length(cpg_prom)

## [1] 13919

Read the BED file containing methylation sites (H3K4me3) in untreated HeLa cells (H3K4me3_unstim_hg18_xset200_dupsN_ht5.sub.peaks_manipulated.bed in the Datasets folder) and assign it to the variable meth.
The data were retrieved from (BCGSC)[https://www.bcgsc.ca/data/histone-modification/histone-modification-data] and slightly modified.

library(rtracklayer)
meth = import.bed("Exercises/Datasets/H3K4me3_unstim_hg18_xset200_dupsN_ht5.sub.peaks_manipulated.bed")
meth

## GRanges object with 54487 ranges and 0 metadata columns:
##           seqnames              ranges strand
##              <Rle>           <IRanges>  <Rle>
##       [1]     chr3       281233-282145      *
##       [2]     chr3       393929-394278      *
##       [3]     chr3       441947-442992      *
##       [4]     chr3       564458-565228      *
##       [5]     chr3       566039-566320      *
##       ...      ...                 ...    ...
##   [54483]     chrX 154494644-154494860      *
##   [54484]     chrX 154494886-154496142      *
##   [54485]     chrX 154649633-154650466      *
##   [54486]     chrX 154650651-154650863      *
##   [54487]     chrX 154764236-154765286      *
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths

Find overlaps between cpg_prom and meth, then:

Retrieve the subset of cpg_prom that overlaps with meth and assign it to the variable cpg_prom_Ov
Hint: Use unique() to retrieve only unique positions.
Retrieve the subset of meth that overlaps with cpg_prom and assign it to the variable meth_Ov.
Hint: Use unique() to retrieve only unique positions.

ov = findOverlaps(cpg_prom, meth)

cpg_prom_Ov = unique(cpg_prom[queryHits(ov)])
meth_Ov = unique(meth[subjectHits(ov)])
meth_Ov

## GRanges object with 16479 ranges and 0 metadata columns:
##           seqnames            ranges strand
##              <Rle>         <IRanges>  <Rle>
##       [1]     chr1       18277-18635      *
##       [2]     chr1     703599-703962      *
##       [3]     chr1     704119-704692      *
##       [4]     chr1     751266-752745      *
##       [5]     chr1     848690-849158      *
##       ...      ...               ...    ...
##   [16475]    chr22 49412959-49413427      *
##   [16476]    chr22 49413590-49414224      *
##   [16477]    chr22 49459608-49459993      *
##   [16478]    chr22 49460164-49460900      *
##   [16479]    chr22 49568385-49568982      *
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths

Import the GTF file for Mouse version M24 located in the Datasets folder. Use the makeTxDbFromGFF() function. (Note: This operation may take some time.)
Assign the result to the variable mouse.
Explore this object using the columns() function: it contains a lot of useful information.

#     BiocManager::install("txdbmaker")
library(txdbmaker)
mouse = makeTxDbFromGFF("Exercises/Datasets/gencode.vM24.basic.annotation.gtf" )
mouse

## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: Exercises/Datasets/gencode.vM24.basic.annotation.gtf
## # Organism: NA
## # Taxonomy ID: NA
## # miRBase build ID: NA
## # Genome: NA
## # Nb of transcripts: 81726
## # Db created by: txdbmaker package from Bioconductor
## # Creation time: 2025-03-18 11:39:02 +0100 (Tue, 18 Mar 2025)
## # txdbmaker version at creation time: 1.2.1
## # RSQLite version at creation time: 2.3.9
## # DBSCHEMAVERSION: 1.2

Extract transcripts from mouse and assign them to the variable transc. Use the parameter columns = c("tx_name", "gene_id") to include additional information.

transc = transcripts(mouse, columns = c("tx_name", "gene_id"))
transc

## GRanges object with 81726 ranges and 2 metadata columns:
##           seqnames          ranges strand |              tx_name
##              <Rle>       <IRanges>  <Rle> |          <character>
##       [1]     chr1 3073253-3074322      + | ENSMUST00000193812.1
##       [2]     chr1 3102016-3102125      + | ENSMUST00000082908.1
##       [3]     chr1 3252757-3253236      + | ENSMUST00000192857.1
##       [4]     chr1 3466587-3513553      + | ENSMUST00000161581.1
##       [5]     chr1 3531795-3532720      + | ENSMUST00000192183.1
##       ...      ...             ...    ... .                  ...
##   [81722]     chrM       5260-5326      - | ENSMUST00000082401.1
##   [81723]     chrM       6870-6938      - | ENSMUST00000082403.1
##   [81724]     chrM     13552-14070      - | ENSMUST00000082419.1
##   [81725]     chrM     14071-14139      - | ENSMUST00000082420.1
##   [81726]     chrM     15356-15422      - | ENSMUST00000082423.1
##                        gene_id
##                <CharacterList>
##       [1] ENSMUSG00000102693.1
##       [2] ENSMUSG00000064842.1
##       [3] ENSMUSG00000102851.1
##       [4] ENSMUSG00000089699.1
##       [5] ENSMUSG00000103147.1
##       ...                  ...
##   [81722] ENSMUSG00000064350.1
##   [81723] ENSMUSG00000064352.1
##   [81724] ENSMUSG00000064368.1
##   [81725] ENSMUSG00000064369.1
##   [81726] ENSMUSG00000064372.1
##   -------
##   seqinfo: 22 sequences (1 circular) from an unspecified genome; no seqlengths

Create a vector all_transcripts containing all unique transcript names from transc.
Then, evaluate the length of the vector.

all_transcripts = unique(transc$tx_name)
head(all_transcripts)

## [1] "ENSMUST00000193812.1" "ENSMUST00000082908.1" "ENSMUST00000192857.1"
## [4] "ENSMUST00000161581.1" "ENSMUST00000192183.1" "ENSMUST00000193244.1"