chr_data <- c("Data", "Daft", "YouTube", "channel",
              "learn", "and", "have", "FUN!")
10 stringr tutorial
This is an overview of the stringr package that is part of the “tidyverse” family of packages.
The info in this section comes from this youtube playlist: https://www.youtube.com/watch?v=oIu5jK8DeX8&list=PLiC1doDIe9rDwsUhd3FtN1XGCV2ES1xZ2
10.1 tidyverse
See these resources for more info about the entire tidyverse family of packages.
10.2 Other tidyverse material
- tibble package: https://tibble.tidyverse.org/
- magrittr: https://magrittr.tidyverse.org/
- lubridate package: https://lubridate.tidyverse.org/
- hms package: https://hms.tidyverse.org/
- dplyr package: https://dplyr.tidyverse.org/
- Also see this playlist for info about dplyr: https://www.youtube.com/watch?v=THGFXV4RW8U&list=PLiC1doDIe9rC8RgWPAWqDETE-VbKOWfWl
10.3 stringr: Basic String Manipulation
# Check the length of a string
str_length("Hi there! How are you?")
[1] 22
str_length(chr_data)
[1] 4 4 7 7 5 3 4 4
# Convert string letters to uppercase
str_to_upper(chr_data)
[1] "DATA" "DAFT" "YOUTUBE" "CHANNEL" "LEARN" "AND" "HAVE"
[8] "FUN!"
# Convert string letters to lowercase
str_to_lower(chr_data)
[1] "data" "daft" "youtube" "channel" "learn" "and" "have"
[8] "fun!"
# Convert string to title (first letter uppercase)
str_to_title(chr_data)
[1] "Data" "Daft" "Youtube" "Channel" "Learn" "And" "Have"
[8] "Fun!"
# Convert string to sentence (only first letter of first word uppercase)
str_to_sentence("make me into a SENTENCE!")
[1] "Make me into a sentence!"
# Trim whitespace
str_trim(" Trim Me! ")
[1] "Trim Me!"
# Pad strings with whitespace
str_pad("Pad Me!", width = 15, side="both")
[1] "    Pad Me!    "
# Truncate strings to a given length
str_trunc("If you have a long string, you might want to truncate it!",
width = 50)
[1] "If you have a long string, you might want to tr..."
10.4 stringr: Split and Join Strings
# Split strings
str_split("Split Me!", pattern = " ")
[[1]]
[1] "Split" "Me!"
food <- c(
  "apples and oranges and pears and bananas",
  "pineapples and mangos and guavas"
)
stringr::str_split(food, " and ")
[[1]]
[1] "apples" "oranges" "pears" "bananas"
[[2]]
[1] "pineapples" "mangos" "guavas"
# Join strings (equivalent to base R paste())
str_c("Join", "Me!", sep="_")
[1] "Join_Me!"
# Join strings (equivalent to base R paste())
str_c(c("Join", "vectors"), c("Me!", "too!"), sep="_")
[1] "Join_Me!" "vectors_too!"
# Collapse a vector of strings into a single string
str_c(c("Turn", "me", "into", "one", "string!"), collapse= " ")
[1] "Turn me into one string!"
# Convert NA values in character vector to string "NA"
str_replace_na(c("Make", NA, "strings!"))
[1] "Make" "NA" "strings!"
10.5 stringr: Sorting Strings
sort_data <- c("sort", "me", "please!")
# Get vector of indices that would sort a string alphabetically
str_order(sort_data)
[1] 2 3 1
# Use discovered ordering to extract data in sorted order
sort_data[str_order(sort_data)]
[1] "me" "please!" "sort"
# Directly extract sorted strings
str_sort(sort_data)
[1] "me" "please!" "sort"
# Extract in reverse sorted order
str_sort(sort_data, decreasing = TRUE)
[1] "sort" "please!" "me"
10.6 stringr: String Interpolation
first <- c("Luke", "Han", "Jean-Luc")
last <- c("Skywalker", "Solo", "Picard")
# Interpolate (insert variable values) into strings with str_glue()
str_glue("My name is {first}. {first} {last}.")
My name is Luke. Luke Skywalker.
My name is Han. Han Solo.
My name is Jean-Luc. Jean-Luc Picard.
minimum_age <- 18
over_minimum <- c(5, 17, 33)
# Interpolate the result of an execution into a string
str_glue("{first} {last} is {minimum_age + over_minimum} years old.")
Luke Skywalker is 23 years old.
Han Solo is 35 years old.
Jean-Luc Picard is 51 years old.
num <- c(1:5)
# Interpolate the result of function calls
str_glue("The square root of {num} is {round(sqrt(num), 3)}.")
The square root of 1 is 1.
The square root of 2 is 1.414.
The square root of 3 is 1.732.
The square root of 4 is 2.
The square root of 5 is 2.236.
fuel_efficiency <- 30
# Interpolate strings using data from a data frame
mtcars %>%
  rownames_to_column("Model") %>%
  filter(mpg > fuel_efficiency) %>%
  str_glue_data("The {Model} gets {mpg} mpg.")
The Fiat 128 gets 32.4 mpg.
The Honda Civic gets 30.4 mpg.
The Toyota Corolla gets 33.9 mpg.
The Lotus Europa gets 30.4 mpg.
10.7 stringr: String Matching
head(data,8)
author score
1 butt_ghost 3
2 buntaro_pup 1
3 iidealized 2
4 [deleted] 1
5 stathibus 6
6 soulslicer0 2
7 swiftsecond 1
body
1 Hdf5. It's structured, it's easy to get data in and out, and it's fast. Plus it will scale if you ever get up there in dataset size.
2 yep, good point.
3 Google must have done (and is doing) serious internal research in ranking. I've heard they're pretty good at that and they've even made some money doing it :P
4 [deleted]
5 Sebastian Thrun's book, Probabilistic Robotics, goes through this in great detail. Get it, read it, make it your bible.
6 This. Such a legendary book. Kalman filters, particle filters, recursive Bayesian filters and a whole bunch of other stuff. I learnt so much. Read these 3 for starts from the book, then come back and ask the questions
7 Do you still need help?
# Detecting the presence of a pattern in strings
str_detect(data$body[1:100], pattern="deep")
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE NA NA NA NA NA
[13] NA NA NA NA NA NA NA NA NA NA NA NA
[25] NA NA NA NA NA NA NA NA NA NA NA NA
[37] NA NA NA NA NA NA NA NA NA NA NA NA
[49] NA NA NA NA NA NA NA NA NA NA NA NA
[61] NA NA NA NA NA NA NA NA NA NA NA NA
[73] NA NA NA NA NA NA NA NA NA NA NA NA
[85] NA NA NA NA NA NA NA NA NA NA NA NA
[97] NA NA NA NA
# Get the indices of matched strings
str_inds <- str_which(data$body[1:100], pattern="deep")
str_inds
integer(0)
# Extract matched strings using detected indices
data$body[str_inds]
character(0)
# Count the number of matches
str_count(data$body[1:100], "deep")
[1] 0 0 0 0 0 0 0 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[51] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[76] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
# Get the position of matches
str_locate_all(data$body[1], "deep")
[[1]]
start end
# Get a list of the first match found in each string as a vector
str_extract(data$body[1:3], "deep|the|and")
[1] "and" NA "and"
# Get a list of the first match found in each string as matrix
str_match(data$body[1:3], "deep|the|and")
[,1]
[1,] "and"
[2,] NA
[3,] "and"
# Get a list of all the matches found in each string as a list of matrices
str_match_all(data$body[1:3], "deep|the|and")
[[1]]
[,1]
[1,] "and"
[2,] "and"
[3,] "the"
[[2]]
[,1]
[[3]]
[,1]
[1,] "and"
[2,] "the"
[3,] "and"
[4,] "the"
10.8 stringr: Subset and Replace Strings
head(data,8)
author score
1 butt_ghost 3
2 buntaro_pup 1
3 iidealized 2
4 [deleted] 1
5 stathibus 6
6 soulslicer0 2
7 swiftsecond 1
body
1 Hdf5. It's structured, it's easy to get data in and out, and it's fast. Plus it will scale if you ever get up there in dataset size.
2 yep, good point.
3 Google must have done (and is doing) serious internal research in ranking. I've heard they're pretty good at that and they've even made some money doing it :P
4 [deleted]
5 Sebastian Thrun's book, Probabilistic Robotics, goes through this in great detail. Get it, read it, make it your bible.
6 This. Such a legendary book. Kalman filters, particle filters, recursive Bayesian filters and a whole bunch of other stuff. I learnt so much. Read these 3 for starts from the book, then come back and ask the questions
7 Do you still need help?
# Get a string subset based on character position
str_sub(data$body[1], start=1, end=100)
[1] "Hdf5. It's structured, it's easy to get data in and out, and it's fast. Plus it will scale if you ev"
# Get a string subset based on words
word(data$body[1], start=1, end=10)
[1] "Hdf5. It's structured, it's easy to get data in and"
# Get the strings that contain a certain pattern
str_subset(data$body[1:100], pattern="deep")
character(0)
# Replace a substring with a new string by substring position
str_sub(data$body[1], start=1, end=100) <- str_to_upper(str_sub(data$body[1],
start=1,
end=100))
str_sub(data$body[1], start=1, end=100)
[1] "HDF5. IT'S STRUCTURED, IT'S EASY TO GET DATA IN AND OUT, AND IT'S FAST. PLUS IT WILL SCALE IF YOU EV"
# Replace first occurrence of a substring with a new string by matching
str_replace(data$body[1], pattern="deep|DEEP", replacement="multi-layer")
[1] "HDF5. IT'S STRUCTURED, IT'S EASY TO GET DATA IN AND OUT, AND IT'S FAST. PLUS IT WILL SCALE IF YOU EVer get up there in dataset size."
# Replace all occurrences of a substring with a new string by matching
str_replace_all(data$body[1], pattern="deep|DEEP", replacement="multi-layer")
[1] "HDF5. IT'S STRUCTURED, IT'S EASY TO GET DATA IN AND OUT, AND IT'S FAST. PLUS IT WILL SCALE IF YOU EVer get up there in dataset size."
10.9 stringr: Viewing Strings
# Basic printing
print(data$body[1:10])
[1] "HDF5. IT'S STRUCTURED, IT'S EASY TO GET DATA IN AND OUT, AND IT'S FAST. PLUS IT WILL SCALE IF YOU EVer get up there in dataset size."
[2] "yep, good point."
[3] "Google must have done (and is doing) serious internal research in ranking. I've heard they're pretty good at that and they've even made some money doing it :P"
[4] "[deleted]"
[5] "Sebastian Thrun's book, Probabilistic Robotics, goes through this in great detail. Get it, read it, make it your bible."
[6] "This. Such a legendary book. Kalman filters, particle filters, recursive Bayesian filters and a whole bunch of other stuff. I learnt so much. Read these 3 for starts from the book, then come back and ask the questions"
[7] "Do you still need help?"
[8] NA
[9] NA
[10] NA
deep_learning_posts <- data$body[str_which(data$body, "deep learning")]
# View strings in HTML format with the first occurrence of a pattern highlighted
str_view(deep_learning_posts, pattern="deep")
# View strings in HTML format with all occurrences of a pattern highlighted
str_view_all(deep_learning_posts, pattern="deep")
Warning: `str_view_all()` was deprecated in stringr 1.5.0.
ℹ Please use `str_view()` instead.
# Format strings into paragraphs of a given width with str_wrap()
wrapped <- str_wrap(data$body[str_which(data$body, "deep learning")][1],
                    width = 50)
wrapped
[1] NA
# Print wrapped string with output obeying newlines
wrapped %>% cat()
NA
# Display wrapped paragraph as HTML, inserting paragraph breaks
str_wrap(data$body[str_which(data$body, "deep learning")][1], width = 50) %>%
str_replace_all("\n", "<br>") %>%
str_view_all(pattern = "deep")
[1] │ NA