Input/Output
or Conditional/Loops
exercises).
Assign it to the variable cpgi
, using
stringsAsFactors = FALSE
.cpgi = read.csv("Exercises/Datasets/CpGi.table.hg18.csv", stringsAsFactors = F)
cpgi
, keeping only CpG islands located on
canonical chromosomes (chr1–chr22, chrX, and chrY). Reassign the result
to cpgi.
cpgi = subset(cpgi,chrom %in% c(paste0("chr", 1:22), "chrX", "chrY") )
cpgi
into a GRanges Object
. Use
the options keep.extra.columns = TRUE
and
ignore.strand = TRUE
.seqnames.field
, start.field
,
end.field
, and strand.field
options.library(GenomicRanges)
cpgi = makeGRangesFromDataFrame(cpgi,
, keep.extra.columns = TRUE
, ignore.strand = TRUE
, seqinfo = NULL
, seqnames.field = "chrom"
, start.field = "chromStart"
, end.field = "chromEnd")
cpgi
## GRanges object with 27639 ranges and 7 metadata columns:
## seqnames ranges strand | name length cpgNum
## <Rle> <IRanges> <Rle> | <character> <integer> <integer>
## [1] chr1 18598-19673 * | CpG: 116 1075 116
## [2] chr1 124987-125426 * | CpG: 30 439 30
## [3] chr1 317653-318092 * | CpG: 29 439 29
## [4] chr1 427014-428027 * | CpG: 84 1013 84
## [5] chr1 439136-440407 * | CpG: 99 1271 99
## ... ... ... ... . ... ... ...
## [27635] chr22 49482536-49482984 * | CpG: 44 448 44
## [27636] chr22 49489668-49490174 * | CpG: 38 506 38
## [27637] chr22 49505252-49506926 * | CpG: 167 1674 167
## [27638] chr22 49515893-49516885 * | CpG: 81 992 81
## [27639] chr22 49568638-49569183 * | CpG: 63 545 63
## gcNum perCpg perGc obsExp
## <integer> <numeric> <numeric> <numeric>
## [1] 787 21.6 73.2 0.83
## [2] 295 13.7 67.2 0.64
## [3] 295 13.2 67.2 0.62
## [4] 734 16.6 72.5 0.64
## [5] 777 15.6 61.1 0.84
## ... ... ... ... ...
## [27635] 279 19.6 62.3 1.01
## [27636] 328 15.0 64.8 0.72
## [27637] 1153 20.0 68.9 0.85
## [27638] 564 16.3 56.9 1.02
## [27639] 407 23.1 74.7 0.83
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
name
column as the names of
the object using names()
, then remove the name
column.names(cpgi) = cpgi$name
cpgi$name <- NULL
TxDb package
related to the
human UCSC hg18 assembly
, then load it and assign it to the
variable genome
. Take some time to explore this
object.BiocManager::install("TxDb.Hsapiens.UCSC.hg18.knownGene")
library(TxDb.Hsapiens.UCSC.hg18.knownGene)
genome = TxDb.Hsapiens.UCSC.hg18.knownGene
genome
object and assign
them to the variable prom
. Extend the TSS by
1000
bp upstream and 100
bp downstream (into
the gene body).library(GenomicFeatures)
prom = promoters(genome
, upstream = 1000
, downstream = 100)
prom
## GRanges object with 66803 ranges and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## uc001aaa.2 chr1 116-1215 + | 1 uc001aaa.2
## uc009vip.1 chr1 116-1215 + | 2 uc009vip.1
## uc009vjg.1 chr1 18418-19517 + | 3 uc009vjg.1
## uc009vjh.1 chr1 54425-55524 + | 4 uc009vjh.1
## uc001aal.1 chr1 57954-59053 + | 5 uc001aal.1
## ... ... ... ... . ... ...
## uc004foe.2 chrX_random 982088-983187 - | 66799 uc004foe.2
## uc010nvr.1 chrX_random 982088-983187 - | 66800 uc010nvr.1
## uc010nvs.1 chrX_random 982088-983187 - | 66801 uc010nvs.1
## uc004foh.1 chrX_random 1326518-1327617 - | 66802 uc004foh.1
## uc004foj.1 chrX_random 1397361-1398460 - | 66803 uc004foj.1
## -------
## seqinfo: 49 sequences (1 circular) from hg18 genome
prom
.seqlevels
.prom = subset(prom, seqnames(prom) %in% c(paste0("chr", 1:22), "chrX", "chrY"))
seqlevels(prom)
## [1] "chr1" "chr2" "chr3" "chr4"
## [5] "chr5" "chr6" "chr7" "chr8"
## [9] "chr9" "chr10" "chr11" "chr12"
## [13] "chr13" "chr14" "chr15" "chr16"
## [17] "chr17" "chr18" "chr19" "chr20"
## [21] "chr21" "chr22" "chrX" "chrY"
## [25] "chrM" "chr1_random" "chr2_random" "chr3_random"
## [29] "chr4_random" "chr5_h2_hap1" "chr5_random" "chr6_cox_hap1"
## [33] "chr6_qbl_hap2" "chr6_random" "chr7_random" "chr8_random"
## [37] "chr9_random" "chr10_random" "chr11_random" "chr13_random"
## [41] "chr15_random" "chr16_random" "chr17_random" "chr18_random"
## [45] "chr19_random" "chr21_random" "chr22_h2_hap1" "chr22_random"
## [49] "chrX_random"
seqlevels(prom) <- c(paste0("chr", 1:22), "chrX", "chrY")
cpg_prom
.cpgi
object?prom
?Hint: When retrieving the query hits from the
overlap, use the unique()
function to avoid redundant
ranges.
ov = findOverlaps(cpgi, prom)
cpg_prom = unique(cpgi[queryHits(ov)])
length(cpgi)
## [1] 27639
length(cpg_prom)
## [1] 13919
BED
file containing methylation sites
(H3K4me3) in untreated HeLa cells
(H3K4me3_unstim_hg18_xset200_dupsN_ht5.sub.peaks_manipulated.bed
in the Datasets
folder) and assign it to the variable
meth
.library(rtracklayer)
meth = import.bed("Exercises/Datasets/H3K4me3_unstim_hg18_xset200_dupsN_ht5.sub.peaks_manipulated.bed")
meth
## GRanges object with 54487 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr3 281233-282145 *
## [2] chr3 393929-394278 *
## [3] chr3 441947-442992 *
## [4] chr3 564458-565228 *
## [5] chr3 566039-566320 *
## ... ... ... ...
## [54483] chrX 154494644-154494860 *
## [54484] chrX 154494886-154496142 *
## [54485] chrX 154649633-154650466 *
## [54486] chrX 154650651-154650863 *
## [54487] chrX 154764236-154765286 *
## -------
## seqinfo: 23 sequences from an unspecified genome; no seqlengths
cpg_prom
and meth
,
then:Retrieve the subset of cpg_prom
that overlaps with
meth
and assign it to the variable
cpg_prom_Ov
Hint: Use unique()
to retrieve only unique
positions.
Retrieve the subset of meth
that overlaps with
cpg_prom
and assign it to the variable
meth_Ov
.
Hint: Use unique()
to retrieve only unique
positions.
ov = findOverlaps(cpg_prom, meth)
cpg_prom_Ov = unique(cpg_prom[queryHits(ov)])
meth_Ov = unique(meth[subjectHits(ov)])
meth_Ov
## GRanges object with 16479 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 18277-18635 *
## [2] chr1 703599-703962 *
## [3] chr1 704119-704692 *
## [4] chr1 751266-752745 *
## [5] chr1 848690-849158 *
## ... ... ... ...
## [16475] chr22 49412959-49413427 *
## [16476] chr22 49413590-49414224 *
## [16477] chr22 49459608-49459993 *
## [16478] chr22 49460164-49460900 *
## [16479] chr22 49568385-49568982 *
## -------
## seqinfo: 23 sequences from an unspecified genome; no seqlengths
GTF
file for Mouse version M24 located in
the Datasets
folder. Use the makeTxDbFromGFF()
function. (Note: This operation may take some time.)mouse
.columns()
function: it
contains a lot of useful information.# BiocManager::install("txdbmaker")
library(txdbmaker)
mouse = makeTxDbFromGFF("Exercises/Datasets/gencode.vM24.basic.annotation.gtf" )
mouse
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: Exercises/Datasets/gencode.vM24.basic.annotation.gtf
## # Organism: NA
## # Taxonomy ID: NA
## # miRBase build ID: NA
## # Genome: NA
## # Nb of transcripts: 81726
## # Db created by: txdbmaker package from Bioconductor
## # Creation time: 2025-03-18 11:39:02 +0100 (Tue, 18 Mar 2025)
## # txdbmaker version at creation time: 1.2.1
## # RSQLite version at creation time: 2.3.9
## # DBSCHEMAVERSION: 1.2
mouse
and assign them to the
variable transc
. Use the parameter
columns = c("tx_name", "gene_id")
to include additional
information.transc = transcripts(mouse, columns = c("tx_name", "gene_id"))
transc
## GRanges object with 81726 ranges and 2 metadata columns:
## seqnames ranges strand | tx_name
## <Rle> <IRanges> <Rle> | <character>
## [1] chr1 3073253-3074322 + | ENSMUST00000193812.1
## [2] chr1 3102016-3102125 + | ENSMUST00000082908.1
## [3] chr1 3252757-3253236 + | ENSMUST00000192857.1
## [4] chr1 3466587-3513553 + | ENSMUST00000161581.1
## [5] chr1 3531795-3532720 + | ENSMUST00000192183.1
## ... ... ... ... . ...
## [81722] chrM 5260-5326 - | ENSMUST00000082401.1
## [81723] chrM 6870-6938 - | ENSMUST00000082403.1
## [81724] chrM 13552-14070 - | ENSMUST00000082419.1
## [81725] chrM 14071-14139 - | ENSMUST00000082420.1
## [81726] chrM 15356-15422 - | ENSMUST00000082423.1
## gene_id
## <CharacterList>
## [1] ENSMUSG00000102693.1
## [2] ENSMUSG00000064842.1
## [3] ENSMUSG00000102851.1
## [4] ENSMUSG00000089699.1
## [5] ENSMUSG00000103147.1
## ... ...
## [81722] ENSMUSG00000064350.1
## [81723] ENSMUSG00000064352.1
## [81724] ENSMUSG00000064368.1
## [81725] ENSMUSG00000064369.1
## [81726] ENSMUSG00000064372.1
## -------
## seqinfo: 22 sequences (1 circular) from an unspecified genome; no seqlengths
all_transcripts
containing all unique
transcript names from transc
.all_transcripts = unique(transc$tx_name)
head(all_transcripts)
## [1] "ENSMUST00000193812.1" "ENSMUST00000082908.1" "ENSMUST00000192857.1"
## [4] "ENSMUST00000161581.1" "ENSMUST00000192183.1" "ENSMUST00000193244.1"