chr_data <- c("Data", "Daft", "YouTube", "channel",
              "learn", "and", "have", "FUN!")

10 stringr tutorial
This is an overview of the stringr package that is part of the “tidyverse” family of packages.
The info in this section comes from this youtube playlist: https://www.youtube.com/watch?v=oIu5jK8DeX8&list=PLiC1doDIe9rDwsUhd3FtN1XGCV2ES1xZ2
10.1 tidyverse
See these resources for more info about the entire tidyverse family of packages.
10.2 Other tidyverse material
- tibble package: https://tibble.tidyverse.org/
- magrittr: https://magrittr.tidyverse.org/
- lubridate package: https://lubridate.tidyverse.org/
- hms package: https://hms.tidyverse.org/
- dplyr package: https://dplyr.tidyverse.org/
- Also see this playlist for info about dplyr: https://www.youtube.com/watch?v=THGFXV4RW8U&list=PLiC1doDIe9rC8RgWPAWqDETE-VbKOWfWl
10.3 stringr: Basic String Manipulation
# Check the length of a string
str_length("Hi there! How are you?")
[1] 22
str_length(chr_data)
[1] 4 4 7 7 5 3 4 4
# Convert string letters to uppercase
str_to_upper(chr_data)
[1] "DATA" "DAFT" "YOUTUBE" "CHANNEL" "LEARN" "AND" "HAVE"
[8] "FUN!"
# Convert string letters to lowercase
str_to_lower(chr_data)
[1] "data" "daft" "youtube" "channel" "learn" "and" "have"
[8] "fun!"
# Convert string to title (first letter uppercase)
str_to_title(chr_data)
[1] "Data" "Daft" "Youtube" "Channel" "Learn" "And" "Have"
[8] "Fun!"
# Convert string to sentence (only first letter of first word uppercase)
str_to_sentence("make me into a SENTENCE!")
[1] "Make me into a sentence!"
# Trim whitespace
str_trim(" Trim Me! ")
[1] "Trim Me!"
# Pad strings with whitespace
str_pad("Pad Me!", width = 15, side="both")
[1] "    Pad Me!    "
# Truncate strings to a given length
str_trunc("If you have a long string, you might want to truncate it!",
          width = 50)
[1] "If you have a long string, you might want to tr..."
10.4 stringr: Split and Join Strings
# Split strings
str_split("Split Me!", pattern = " ")
[[1]]
[1] "Split" "Me!"
food <- c(
"apples and oranges and pears and bananas",
"pineapples and mangos and guavas"
)
stringr::str_split(food, " and ")
[[1]]
[1] "apples" "oranges" "pears" "bananas"
[[2]]
[1] "pineapples" "mangos" "guavas"
# Join strings (equivalent to base R paste())
str_c("Join", "Me!", sep="_")
[1] "Join_Me!"
# Join strings (equivalent to base R paste())
str_c(c("Join", "vectors"), c("Me!", "too!"), sep="_")
[1] "Join_Me!" "vectors_too!"
# Collapse a vector of strings into a single string
str_c(c("Turn", "me", "into", "one", "string!"), collapse= " ")
[1] "Turn me into one string!"
# Convert NA values in character vector to string "NA"
str_replace_na(c("Make", NA, "strings!"))
[1] "Make" "NA" "strings!"
10.5 stringr: Sorting Strings
sort_data <- c("sort", "me", "please!")
# Get vector of indices that would sort a string alphabetically
str_order(sort_data)
[1] 2 3 1
# Use discovered ordering to extract data in sorted order
sort_data[str_order(sort_data)]
[1] "me" "please!" "sort"
# Directly extract sorted strings
str_sort(sort_data)
[1] "me" "please!" "sort"
# Extract in reverse sorted order
str_sort(sort_data, decreasing = TRUE)
[1] "sort" "please!" "me"
10.6 stringr: String Interpolation
first <- c("Luke", "Han", "Jean-Luc")
last <- c("Skywalker", "Solo", "Picard")
# Interpolate (insert variable values) into strings with str_glue()
str_glue("My name is {first}. {first} {last}.")
My name is Luke. Luke Skywalker.
My name is Han. Han Solo.
My name is Jean-Luc. Jean-Luc Picard.
minimum_age <- 18
over_minimum <- c(5, 17, 33)
# Interpolate the result of an execution into a string
str_glue("{first} {last} is {minimum_age + over_minimum} years old.")
Luke Skywalker is 23 years old.
Han Solo is 35 years old.
Jean-Luc Picard is 51 years old.
num <- c(1:5)
# Interpolate the result of function calls
str_glue("The square root of {num} is {round(sqrt(num), 3)}.")
The square root of 1 is 1.
The square root of 2 is 1.414.
The square root of 3 is 1.732.
The square root of 4 is 2.
The square root of 5 is 2.236.
fuel_efficiency <- 30
# Interpolate strings using data from a data frame
mtcars %>% rownames_to_column("Model") %>%
filter(mpg > fuel_efficiency) %>%
str_glue_data("The {Model} gets {mpg} mpg.")
The Fiat 128 gets 32.4 mpg.
The Honda Civic gets 30.4 mpg.
The Toyota Corolla gets 33.9 mpg.
The Lotus Europa gets 30.4 mpg.
10.7 stringr: String Matching
head(data, 8)
       author score
1 butt_ghost 3
2 buntaro_pup 1
3 iidealized 2
4 [deleted] 1
5 stathibus 6
6 soulslicer0 2
7 swiftsecond 1
body
1 Hdf5. It's structured, it's easy to get data in and out, and it's fast. Plus it will scale if you ever get up there in dataset size.
2 yep, good point.
3 Google must have done (and is doing) serious internal research in ranking. I've heard they're pretty good at that and they've even made some money doing it :P
4 [deleted]
5 Sebastian Thrun's book, Probabilistic Robotics, goes through this in great detail. Get it, read it, make it your bible.
6 This. Such a legendary book. Kalman filters, particle filters, recursive Bayesian filters and a whole bunch of other stuff. I learnt so much. Read these 3 for starts from the book, then come back and ask the questions
7 Do you still need help?
# Detecting the presence of a pattern in strings
str_detect(data$body[1:100], pattern="deep")
  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE NA NA NA NA NA
[13] NA NA NA NA NA NA NA NA NA NA NA NA
[25] NA NA NA NA NA NA NA NA NA NA NA NA
[37] NA NA NA NA NA NA NA NA NA NA NA NA
[49] NA NA NA NA NA NA NA NA NA NA NA NA
[61] NA NA NA NA NA NA NA NA NA NA NA NA
[73] NA NA NA NA NA NA NA NA NA NA NA NA
[85] NA NA NA NA NA NA NA NA NA NA NA NA
[97] NA NA NA NA
# Get the indices of matched strings
str_inds <- str_which(data$body[1:100], pattern="deep")
str_inds
integer(0)
# Extract matched strings using detected indices
data$body[str_inds]
character(0)
# Count the number of matches
str_count(data$body[1:100], "deep")
 [1] 0 0 0 0 0 0 0 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[51] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[76] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
# Get the position of matches
str_locate_all(data$body[1], "deep")
[[1]]
start end
# Get a list of the first match found in each string as a vector
str_extract(data$body[1:3], "deep|the|and")
[1] "and" NA "and"
# Get a list of the first match found in each string as matrix
str_match(data$body[1:3], "deep|the|and")
     [,1]
[1,] "and"
[2,] NA
[3,] "and"
# Get a list of all matches found in each string as a list of matrices
str_match_all(data$body[1:3], "deep|the|and")
[[1]]
[,1]
[1,] "and"
[2,] "and"
[3,] "the"
[[2]]
[,1]
[[3]]
[,1]
[1,] "and"
[2,] "the"
[3,] "and"
[4,] "the"
10.8 stringr: Subset and Replace Strings
head(data, 8)
       author score
1 butt_ghost 3
2 buntaro_pup 1
3 iidealized 2
4 [deleted] 1
5 stathibus 6
6 soulslicer0 2
7 swiftsecond 1
body
1 Hdf5. It's structured, it's easy to get data in and out, and it's fast. Plus it will scale if you ever get up there in dataset size.
2 yep, good point.
3 Google must have done (and is doing) serious internal research in ranking. I've heard they're pretty good at that and they've even made some money doing it :P
4 [deleted]
5 Sebastian Thrun's book, Probabilistic Robotics, goes through this in great detail. Get it, read it, make it your bible.
6 This. Such a legendary book. Kalman filters, particle filters, recursive Bayesian filters and a whole bunch of other stuff. I learnt so much. Read these 3 for starts from the book, then come back and ask the questions
7 Do you still need help?
# Get a string subset based on character position
str_sub(data$body[1], start=1, end=100)
[1] "Hdf5. It's structured, it's easy to get data in and out, and it's fast. Plus it will scale if you ev"
# Get a string subset based on words
word(data$body[1], start=1, end=10)
[1] "Hdf5. It's structured, it's easy to get data in and"
# Get the strings that contain a certain pattern
str_subset(data$body[1:100], pattern="deep")
character(0)
# Replace a substring with a new string by substring position
str_sub(data$body[1], start=1, end=100) <- str_to_upper(str_sub(data$body[1],
start=1,
end=100))
str_sub(data$body[1], start=1, end=100)
[1] "HDF5. IT'S STRUCTURED, IT'S EASY TO GET DATA IN AND OUT, AND IT'S FAST. PLUS IT WILL SCALE IF YOU EV"
# Replace first occurrence of a substring with a new string by matching
str_replace(data$body[1], pattern="deep|DEEP", replacement="multi-layer")
[1] "HDF5. IT'S STRUCTURED, IT'S EASY TO GET DATA IN AND OUT, AND IT'S FAST. PLUS IT WILL SCALE IF YOU EVer get up there in dataset size."
# Replace all occurrences of a substring with a new string by matching
str_replace_all(data$body[1], pattern="deep|DEEP", replacement="multi-layer")
[1] "HDF5. IT'S STRUCTURED, IT'S EASY TO GET DATA IN AND OUT, AND IT'S FAST. PLUS IT WILL SCALE IF YOU EVer get up there in dataset size."
10.9 stringr: Viewing Strings
# Basic printing
print(data$body[1:10])
 [1] "HDF5. IT'S STRUCTURED, IT'S EASY TO GET DATA IN AND OUT, AND IT'S FAST. PLUS IT WILL SCALE IF YOU EVer get up there in dataset size."
[2] "yep, good point."
[3] "Google must have done (and is doing) serious internal research in ranking. I've heard they're pretty good at that and they've even made some money doing it :P"
[4] "[deleted]"
[5] "Sebastian Thrun's book, Probabilistic Robotics, goes through this in great detail. Get it, read it, make it your bible."
[6] "This. Such a legendary book. Kalman filters, particle filters, recursive Bayesian filters and a whole bunch of other stuff. I learnt so much. Read these 3 for starts from the book, then come back and ask the questions"
[7] "Do you still need help?"
[8] NA
[9] NA
[10] NA
deep_learning_posts <- data$body[str_which(data$body, "deep learning")]
# View strings in HTML format with the first occurrence of a pattern highlighted
str_view(deep_learning_posts, pattern="deep")

# View strings in HTML format with all occurrences highlighted
str_view_all(deep_learning_posts, pattern="deep")
Warning: `str_view_all()` was deprecated in stringr 1.5.0.
ℹ Please use `str_view()` instead.
# Format strings into paragraphs of a given width with str_wrap()
wrapped <- str_wrap(data$body[str_which(data$body, "deep learning")][1],
width = 50)
wrapped
[1] NA
# Print wrapped string with output obeying newlines
wrapped %>% cat()
NA
# Display wrapped paragraph as HTML, inserting paragraph breaks
str_wrap(data$body[str_which(data$body, "deep learning")][1], width = 50) %>%
str_replace_all("\n", "<br>") %>%
str_view_all(pattern = "deep")
[1] │ NA