"TGCTCAGGTAGCCTCACCTCC"
into a
DNAString object
, then evaluate:Hint: explore DNAString( )
function.
library(Biostrings)
sequence = "TGCTCAGGTAGCCTCACCTCC"
sequence = DNAString(sequence)
reverse(sequence)
## 21-letter DNAString object
## seq: CCTCCACTCCGATGGACTCGT
complement(sequence)
## 21-letter DNAString object
## seq: ACGAGTCCATCGGAGTGGAGG
reverseComplement(sequence)
## 21-letter DNAString object
## seq: GGAGGTGAGGCTACCTGAGCA
sequences <- c("AAATCGA", "ATACAACAT", "TTGCCA")
,
transform these sequences into a DNAStringSet object
.length()
,
nchar()
, [
, and sample()
?Hint: explore DNAStringSet( )
function.
sequences = c("AAATCGA", "ATACAACAT", "TTGCCA")
sequences = DNAStringSet(sequences)
sequences
## DNAStringSet object of length 3:
## width seq
## [1] 7 AAATCGA
## [2] 9 ATACAACAT
## [3] 6 TTGCCA
length(sequences)
## [1] 3
nchar(sequences)
## [1] 7 9 6
sequences[1]
## DNAStringSet object of length 1:
## width seq
## [1] 7 AAATCGA
sample(sequences, 2)
## DNAStringSet object of length 2:
## width seq
## [1] 9 ATACAACAT
## [2] 7 AAATCGA
reverseComplement(sequences)
## DNAStringSet object of length 3:
## width seq
## [1] 7 TCGATTT
## [2] 9 ATGTTGTAT
## [3] 6 TGGCAA
DNAStringSet()
from an object that does not contain a DNA sequence?DNAStringSet("ACGTMRW")
? Why?Check this resource for more information.
not_dna = "HHJU"
tryCatch({DNAStringSet(not_dna) # J and U are not DNA bases
}, error = function(e) {
print(paste("Error:", e$message))
})
## [1] "Error: key 74 (char 'J') not in lookup table"
DNAStringSet("ACGTMRW") # T, M, R and W are degenerated bases ...
## DNAStringSet object of length 1:
## width seq
## [1] 7 ACGTMRW
BSgenome
genomes, identify the
Apis mellifera assembly from BeeBase and install the related
package.dna
.library(BSgenome)
head(available.genomes())
## [1] "BSgenome.Alyrata.JGI.v1"
## [2] "BSgenome.Amellifera.BeeBase.assembly4"
## [3] "BSgenome.Amellifera.NCBI.AmelHAv3.1"
## [4] "BSgenome.Amellifera.UCSC.apiMel2"
## [5] "BSgenome.Amellifera.UCSC.apiMel2.masked"
## [6] "BSgenome.Aofficinalis.NCBI.V1"
#BiocManager::install("BSgenome.Amellifera.BeeBase.assembly4")
library(BSgenome.Amellifera.BeeBase.assembly4)
dna <- BSgenome.Amellifera.BeeBase.assembly4
dna
## | BSgenome object for Honey Bee
## | - organism: Apis mellifera
## | - provider: BeeBase
## | - genome: assembly4
## | - release date: Feb. 2008
## | - 16 sequence(s):
## | Group1 Group2 Group3 Group4 Group5 Group6 Group7 Group8 Group9
## | Group10 Group11 Group12 Group13 Group14 Group15 Group16
## |
## | Tips: call 'seqnames()' on the object to get all the sequence names, call
## | 'seqinfo()' to get the full sequence info, use the '$' or '[[' operator to
## | access a given sequence, see '?BSgenome' for more information.
seqnames(dna)
## [1] "Group1" "Group2" "Group3" "Group4" "Group5" "Group6" "Group7"
## [8] "Group8" "Group9" "Group10" "Group11" "Group12" "Group13" "Group14"
## [15] "Group15" "Group16"
seqlengths(dna)
## Group1 Group2 Group3 Group4 Group5 Group6 Group7 Group8
## 29934090 16072177 13621520 12256690 14500692 17739083 12848973 13189223
## Group9 Group10 Group11 Group12 Group13 Group14 Group15 Group16
## 11082907 12642577 14521977 11309010 10266737 9976661 10159687 7072872
letterFrequency()
to evaluate the frequency of N
bases into the “Group1”letterFrequency(x = dna$Group1, letters = "N")
## N
## 4542078
prob = TRUE
as option in
letterFrequency()
. What does it change?letterFrequency(x = dna$Group1, letters = "N", as.prob = TRUE)
## N
## 0.151736
[
is used to subset a DNAStringSet
, it
cannot be used to take substrings from a sequence. Instead, this can be
done usiing the subseq( )
function.DNAStringSet
by extracting sequences for
Group1
, Group2
and Group5
from
the genome of the previous exercises.subseq()
.dna2 = DNAStringSet(c(as.character(dna$Group1)
, as.character(dna$Group2)
, as.character(dna$Group5))
)
dna2
## DNAStringSet object of length 3:
## width seq
## [1] 29934090 AGCCTAACCCAGCCTGACCTAGCCTAACCCAG...GTTTCTTGGAAAAAGTGATACGTAAGCGAATA
## [2] 16072177 TCGAAAAGTGAGGGTGAAATGAATATAAGAAG...CGCTGTAATATCGATCTTAGTCGATAACTTAT
## [3] 14500692 TAAAATAATAAGATATATAATAAAATATAATA...AACTGTCGGCACGCAGAAATTAAAGAAGAAGC
dna2 = subseq(dna2, start = 2, end = 10)
dna2
## DNAStringSet object of length 3:
## width seq
## [1] 9 GCCTAACCC
## [2] 9 CGAAAAGTG
## [3] 9 AAAATAATA
Homo_sapiens.GRCh38.dna.chromosome.11.22.fa
from the
Datasets
folder. Explore the obtained object.fasta_file = readDNAStringSet("Exercises/Datasets/Homo_sapiens.GRCh38.dna.chromosome.11.22.fa")
fasta_file
## DNAStringSet object of length 2:
## width seq names
## [1] 135086622 NNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNN 11 dna:chromosome...
## [2] 50818468 NNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNN 22 dna:chromosome...