Strings and string matching

Goals

After working through this handout, you should:

Be able to split, combine, and extract information from character strings

Strings

Text data in R is stored in character vectors; each element in a character vector is a string. In other words, a string is a sequence of zero or more text characters (i.e., letters, digits, spaces and other special characters). Strings can be combined, split and searched. First, let’s focus on creating and combining strings.

#The paste function combines strings together; by default the pieces
#are separated by a single space
paste("The", "dog")

## [1] "The dog"

#paste is vectorized: the single string "The" is combined with each
#element of the second argument in turn
paste("The", c("dog", "cat", "rat"))

## [1] "The dog" "The cat" "The rat"

#Use sep to specify how strings are combined
paste("The", c("dog", "cat", "rat"), sep = "-")

## [1] "The-dog" "The-cat" "The-rat"

#Without collapse, the result keeps one string per input element
sVec <- paste("The", c("dog", "cat", "rat"), sep = "-")
length(sVec)

## [1] 3

sVec[1]

## [1] "The-dog"

#Use collapse to join the pasted strings into one string (a vector
#of length 1); its value is the text put between the joined strings
sVec <- paste("The", c("dog", "cat", "rat"), sep = "-", collapse = " ")
sVec

## [1] "The-dog The-cat The-rat"

#An empty collapse string joins the pieces with nothing in between
paste("The", c("dog", "cat", "rat"), sep = "-", collapse = "")

## [1] "The-dogThe-catThe-rat"

Next, we will see how you can subset and split strings to create new strings and character vectors. We will explore two functions for this, substr for extracting sub-strings and strsplit for splitting strings.

#substr extracts the characters from position start to position stop
#(both inclusive, counting from 1); the apostrophe and the spaces
#each count as one character
aString <- "Don't eat the cat"
substr(aString, start = 4, stop = 11)

## [1] "'t eat t"

#Grab the first letter from a vector of strings; substr is applied
#to each element separately
sVec <- c("AAT", "AAC", "TCG", "TTG", "CAG")
substr(sVec, start = 1, stop = 1)

## [1] "A" "A" "T" "T" "C"

#Now let's split a string on spaces; this makes every 'word' its own
#string in a character vector... actually strsplit returns a list
#(one element per input string), so be careful
aString <- c("atg cac ttg agc agg gaa gaa atc cac aag gac tca cca gtc tcc tgg tct gca gag aag")
strsplit(x = aString, split = " ", fixed = TRUE)

## [[1]]
##  [1] "atg" "cac" "ttg" "agc" "agg" "gaa" "gaa" "atc" "cac" "aag" "gac" "tca"
## [13] "cca" "gtc" "tcc" "tgg" "tct" "gca" "gag" "aag"

#Convert from list to vector
sVec <- unlist(strsplit(x = aString, split = " ", fixed = TRUE))
sVec

##  [1] "atg" "cac" "ttg" "agc" "agg" "gaa" "gaa" "atc" "cac" "aag" "gac" "tca"
## [13] "cca" "gtc" "tcc" "tgg" "tct" "gca" "gag" "aag"

#This works for vectors too: build a character vector of three
#strings (the third is the numbers 1 to 10 joined by spaces)
someText <- c(aString, "This is some text", paste(1:10, collapse = " "))
someText

## [1] "atg cac ttg agc agg gaa gaa atc cac aag gac tca cca gtc tcc tgg tct gca gag aag"
## [2] "This is some text"                                                              
## [3] "1 2 3 4 5 6 7 8 9 10"

#Each element of someText is split separately, giving a list with
#one character vector per input string
strsplit(x = someText, split = " ", fixed = TRUE)

## [[1]]
##  [1] "atg" "cac" "ttg" "agc" "agg" "gaa" "gaa" "atc" "cac" "aag" "gac" "tca"
## [13] "cca" "gtc" "tcc" "tgg" "tct" "gca" "gag" "aag"
## 
## [[2]]
## [1] "This" "is"   "some" "text"
## 
## [[3]]
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"

#Rebuild someText, this time joining the numbers with commas
someText <- c(aString, "This is some text", paste(1:10, collapse = ","))
someText

## [1] "atg cac ttg agc agg gaa gaa atc cac aag gac tca cca gtc tcc tgg tct gca gag aag"
## [2] "This is some text"                                                              
## [3] "1,2,3,4,5,6,7,8,9,10"

#Split on one versus two alternative characters
#First on a space only: a string that contains no space (here the
#comma-separated numbers) comes back unsplit
strsplit(x = someText, split = " ", fixed = TRUE)

## [[1]]
##  [1] "atg" "cac" "ttg" "agc" "agg" "gaa" "gaa" "atc" "cac" "aag" "gac" "tca"
## [13] "cca" "gtc" "tcc" "tgg" "tct" "gca" "gag" "aag"
## 
## [[2]]
## [1] "This" "is"   "some" "text"
## 
## [[3]]
## [1] "1,2,3,4,5,6,7,8,9,10"

#With fixed = FALSE, split is treated as a regular expression; the
#character class [ ,] matches either a space or a comma
strsplit(x = someText, split = "[ ,]", fixed = FALSE)

## [[1]]
##  [1] "atg" "cac" "ttg" "agc" "agg" "gaa" "gaa" "atc" "cac" "aag" "gac" "tca"
## [13] "cca" "gtc" "tcc" "tgg" "tct" "gca" "gag" "aag"
## 
## [[2]]
## [1] "This" "is"   "some" "text"
## 
## [[3]]
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"

Pattern matching

R has several functions for pattern matching and string substitutions. Some of these use regular expressions, that is, character sequences that specify more flexible search patterns. Let’s look at how this works. We will start with grep for pattern matching.

#Rebuild the vector of three-letter strings to search
aString <- c("atg cac ttg agc agg gaa gaa atc cac aag gac tca cca gtc tcc tgg tct gca gag aag")
sVec <- unlist(strsplit(x = aString, split = " ", fixed = TRUE))

#Here grep returns the indexes of all elements that contain 'a';
#fixed = TRUE matches the pattern as literal text rather than as a
#regular expression
x <- grep(pattern = "a", x = sVec, fixed = TRUE)
x

##  [1]  1  2  4  5  6  7  8  9 10 11 12 13 18 19 20

#Use the indexes to pull out the matching elements
sVec[x]

##  [1] "atg" "cac" "agc" "agg" "gaa" "gaa" "atc" "cac" "aag" "gac" "tca" "cca"
## [13] "gca" "gag" "aag"

#Now let's find those that have 'a' followed by 'c' or 't'; the
#character class [tc] matches a single 't' or 'c'
x <- grep(pattern = "a[tc]", x = sVec, fixed = FALSE)
x

## [1]  1  2  8  9 11

sVec[x]

## [1] "atg" "cac" "atc" "cac" "gac"

#Now let's find those that have 'a' followed by 'c' or 't', with `a`
#as the first character (^ anchors the match to the start of the
#string)
x <- grep(pattern = "^a[tc]", x = sVec, fixed = FALSE)
x

## [1] 1 8

sVec[x]

## [1] "atg" "atc"

#Now let's find `a` followed by two characters in a row that are
#each `t` or `c` ({2} repeats the preceding character class twice)
x <- grep(pattern = "a[tc]{2}", x = sVec, fixed = FALSE)
x

## [1] 8

sVec[x]

## [1] "atc"

#Same as above, but directly return the matching elements instead
#of their indexes
grep(pattern = "a[tc]{2}", x = sVec, fixed = FALSE, value = TRUE)

## [1] "atc"

Next, let’s try substitutions of strings that match a pattern. We will use gsub for this.

#Let's make some messy data specifying male/female: draw 100 values
#at random, with replacement, from seven different spellings
#NOTE: no seed is set before sample(), so the values drawn will
#presumably differ each time this is run
dat <- sample(c("Male", "male", "M", "MALE", "Female", "female", "F"),
    100, replace = TRUE)
dat

##   [1] "female" "Female" "Female" "F"      "Male"   "Male"   "F"      "Female"
##   [9] "male"   "MALE"   "Female" "MALE"   "Male"   "female" "female" "Male"  
##  [17] "M"      "female" "male"   "F"      "MALE"   "M"      "Female" "F"     
##  [25] "M"      "M"      "Female" "F"      "Female" "F"      "female" "Male"  
##  [33] "Male"   "MALE"   "M"      "MALE"   "male"   "male"   "F"      "Female"
##  [41] "MALE"   "M"      "Female" "F"      "female" "M"      "female" "MALE"  
##  [49] "Male"   "MALE"   "Male"   "MALE"   "female" "F"      "M"      "male"  
##  [57] "male"   "Male"   "Male"   "Female" "F"      "MALE"   "F"      "F"     
##  [65] "male"   "male"   "male"   "MALE"   "F"      "female" "MALE"   "female"
##  [73] "MALE"   "Male"   "Female" "female" "F"      "MALE"   "female" "Male"  
##  [81] "Male"   "Male"   "Female" "F"      "MALE"   "F"      "Female" "M"     
##  [89] "F"      "Female" "M"      "Male"   "Male"   "Female" "Female" "MALE"  
##  [97] "Female" "M"      "F"      "female"

#Replace Male, male and MALE with M: the pattern matches an M at the
#start of the string followed by one or more letters, and
#ignore.case = TRUE lets it match upper- or lower-case text. Values
#that are already a bare "M" do not match and are left unchanged
dat <- gsub(pattern = "^M[a-z]+", replacement = "M", x = dat, ignore.case = TRUE,
    fixed = FALSE)
#Same with female: replace Female and female with F (replacing them
#with M would merge the two groups)
dat <- gsub(pattern = "^F[a-z]+", replacement = "F", x = dat, ignore.case = TRUE,
    fixed = FALSE)
dat

##   [1] "F" "F" "F" "F" "M" "M" "F" "F" "M" "M" "F" "M" "M" "F" "F" "M" "M" "F"
##  [19] "M" "F" "M" "M" "F" "F" "M" "M" "F" "F" "F" "F" "F" "M" "M" "M" "M" "M"
##  [37] "M" "M" "F" "F" "M" "M" "F" "F" "F" "M" "F" "M" "M" "M" "M" "M" "F" "F"
##  [55] "M" "M" "M" "M" "M" "F" "F" "M" "F" "F" "M" "M" "M" "M" "F" "F" "M" "F"
##  [73] "M" "M" "F" "F" "F" "M" "F" "M" "M" "M" "F" "F" "M" "F" "F" "M" "F" "F"
##  [91] "M" "M" "M" "F" "F" "M" "F" "M" "F" "F"

Let’s look at one more example, gregexpr, a function that is especially useful for pattern matching in a long string.

#An example DNA sequence
aSeq <- c("TTGGAATCTGAACAGGACTAGTAGCCACGAGAATGAGACTCCTAATTCGAGCTGAGCTTGGACAACCTGGAACTCTTCTAGGAGACGATCAAATTTATAATTGCCTTATTACCGCTCATGGTCTATTAATGATATTTTTTGTAGTCCTACCTATTTTAATAGGAGGATTTGGAAATTGACTAGTTCCCTTAATACTAGGAGCTCCAGACATGGCTTTTCCCCGGATTAATAATCTTGGGTTCTGACTTATTCCCCCCGCAGTAATTCTCCTAGTAATATCCGCTTTTATCGAAAAAGGGGCTGGAACAGGATGAACTGTCTACCCTCCTTTAGCCTCTAATATTGCCCATGCAGGGCCATGCATTGATTTAGCTATTTTTGCCCTTCATTTATCCGGAGTATCCTCAATTCTAGCCTCTATCAACTTTATTACAACTGTAATAAATATACGATATAAAGGTCTTCGACTAGAACGAGTTCCTTTATTTGTATGAAGAGTAAAACTAACTGCAGTTCTTCTTCTTCTCTCAATTCCAGTTCTTGCCGGTGGACTTACTATACTTCTCACCGATCGAAATTTAAATACGTCCTTCTTTGACCCCGCAGGAGGAGGGGACCCAGTTC")

#Let's find all of the places in this sequence where a G is
#followed by a C
out <- gregexpr(text = aSeq, pattern = "GC")
#Reports the start position of each match, with the length of each
#match attached as the match.length attribute
out

## [[1]]
##  [1]  24  51  56 103 114 201 213 258 282 300 333 345 351 356 361 372 381 414 510
## [20] 543 603
## attr(,"match.length")
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE

#Now G followed by one or more C
out <- gregexpr(text = aSeq, pattern = "G[C]+")
#Note that the matches now vary in length
out

## [[1]]
##  [1]  24  51  56 103 114 201 213 258 282 300 333 345 351 356 361 372 381 414 510
## [20] 543 603
## attr(,"match.length")
##  [1] 3 2 2 3 2 2 2 2 2 2 3 4 2 3 2 2 4 3 2 3 2
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE

#Unlist and extract 'attributes' to view matches
aout <- attributes(out[[1]])  #This lets us get at the match lengths
out <- unlist(out)  #Start position of each match
#substring is vectorized over first and last, so every match is cut
#out of aSeq in a single call: each one runs from its start position
#for match.length characters
sMatch <- substring(aSeq, first = out, last = out + aout$match.length - 1)
sMatch

##  [1] "GCC"  "GC"   "GC"   "GCC"  "GC"   "GC"   "GC"   "GC"   "GC"   "GC"  
## [11] "GCC"  "GCCC" "GC"   "GCC"  "GC"   "GC"   "GCCC" "GCC"  "GC"   "GCC" 
## [21] "GC"

#Count how many times each distinct match occurs
table(sMatch)

## sMatch
##   GC  GCC GCCC 
##   13    6    2

\[ \] \[ \] \[ \] \[ \]

Strings and string matching

Brian Kissmer

2024-11-21

Goals

Strings

Pattern matching