6  regular expressions (regex)

rm(list=ls())

#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@ Intro to Regular Expressions (Also Known As "regex") ####
#@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#@ Regular expressions are used to slice and dice character (i.e. textual)
#@ data in a variety of ways. There are many different features. The best 
#@ way to understand is just to dive right in.
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

# Relevant help pages
#
# ?regex     ####
# ?grep      ####
# ?strsplit  ####
#
# Also, see this tutorial for more info:
#   https://ryanstutorials.net/regular-expressions-tutorial/  ####

6.1 Data for examples

#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# SAMPLE DATA ####
#
# Before we start, let's define some data to be used with examples in this file.
#
# (NOTE: I made up N. American apple and S. Korean Fig 
#        so that I can use them in some examples.)
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

# Sample fruit names used by the examples in this file.
# The entries vary: single words, multi-word names, and a few that
# contain capital letters and periods.
fruit = c(
  "apple",
  "N. American apple",
  "S. Korean Fig",
  "fig",
  "star fruit",
  "pear",
  "prickly pear",
  "Beurre Hardy pear",
  "cherry",
  "black cherry",
  "peach",
  "plum",
  "kumquat",
  "banana",
  "blueberry",
  "strawberry",
  "honeydew",
  "strawberries",
  "yumberry"
)

fruit
 [1] "apple"             "N. American apple" "S. Korean Fig"    
 [4] "fig"               "star fruit"        "pear"             
 [7] "prickly pear"      "Beurre Hardy pear" "cherry"           
[10] "black cherry"      "peach"             "plum"             
[13] "kumquat"           "banana"            "blueberry"        
[16] "strawberry"        "honeydew"          "strawberries"     
[19] "yumberry"         
# Sample street addresses used by the examples in this file.
# A few entries contain characters that are special in regex patterns
# (a dollar sign, periods and a backslash).
addresses = c(
  "12345 Sesame Street",
  "One Micro$oft Way",                  # contains a $ sign
  "3 Olive St.",
  "Two 1st Ave.",
  "5678 Park Place",
  "Forty Five 2nd Street",
  "Ninety Nine Cone St. apartment 7",
  "9 Main St. apt. 623",
  "Five Google Drive",
  "4\\2 Rechov Yafo",                   # "\\" in R source is ONE backslash (\) in the data
  "Fifteen Watchamacallit Boulevard",   # some long words
  "Nineteen Watchamacallit Boulevard",  # some longer words
  "One Main Street Apt 12b",
  "Two Main Street Apt 123c",
  "Three Main Street Apt 12343",
  "City Hall Lockport, NY"
)


addresses
 [1] "12345 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "5678 Park Place"                   "Forty Five 2nd Street"            
 [7] "Ninety Nine Cone St. apartment 7"  "9 Main St. apt. 623"              
 [9] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
[11] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[15] "Three Main Street Apt 12343"       "City Hall Lockport, NY"           
# show each address, one per line
cat(addresses, sep="\n")
12345 Sesame Street
One Micro$oft Way
3 Olive St.
Two 1st Ave.
5678 Park Place
Forty Five 2nd Street
Ninety Nine Cone St. apartment 7
9 Main St. apt. 623
Five Google Drive
4\2 Rechov Yafo
Fifteen Watchamacallit Boulevard
Nineteen Watchamacallit Boulevard
One Main Street Apt 12b
Two Main Street Apt 123c
Three Main Street Apt 12343
City Hall Lockport, NY

6.1.1 stringr package

#-------------------------------------------------------------------------.
# NOTE - most of the examples in this file were created using the data above.
# The stringr package also contains some data that can be used to experiment
# with these functions. 
#-------------------------------------------------------------------------.

# Load the stringr package. require() returns FALSE (instead of stopping
# with an error) when the package cannot be loaded; in that case the
# package is installed and then loaded with a second call to require().
if(!require(stringr)){install.packages("stringr");require(stringr)}
Loading required package: stringr

Attaching package: 'stringr'
The following object is masked _by_ '.GlobalEnv':

    fruit
stringr::words
  [1] "a"           "able"        "about"       "absolute"    "accept"     
  [6] "account"     "achieve"     "across"      "act"         "active"     
 [11] "actual"      "add"         "address"     "admit"       "advertise"  
 [16] "affect"      "afford"      "after"       "afternoon"   "again"      
 [21] "against"     "age"         "agent"       "ago"         "agree"      
 [26] "air"         "all"         "allow"       "almost"      "along"      
 [31] "already"     "alright"     "also"        "although"    "always"     
 [36] "america"     "amount"      "and"         "another"     "answer"     
 [41] "any"         "apart"       "apparent"    "appear"      "apply"      
 [46] "appoint"     "approach"    "appropriate" "area"        "argue"      
 [51] "arm"         "around"      "arrange"     "art"         "as"         
 [56] "ask"         "associate"   "assume"      "at"          "attend"     
 [61] "authority"   "available"   "aware"       "away"        "awful"      
 [66] "baby"        "back"        "bad"         "bag"         "balance"    
 [71] "ball"        "bank"        "bar"         "base"        "basis"      
 [76] "be"          "bear"        "beat"        "beauty"      "because"    
 [81] "become"      "bed"         "before"      "begin"       "behind"     
 [86] "believe"     "benefit"     "best"        "bet"         "between"    
 [91] "big"         "bill"        "birth"       "bit"         "black"      
 [96] "bloke"       "blood"       "blow"        "blue"        "board"      
[101] "boat"        "body"        "book"        "both"        "bother"     
[106] "bottle"      "bottom"      "box"         "boy"         "break"      
[111] "brief"       "brilliant"   "bring"       "britain"     "brother"    
[116] "budget"      "build"       "bus"         "business"    "busy"       
[121] "but"         "buy"         "by"          "cake"        "call"       
[126] "can"         "car"         "card"        "care"        "carry"      
[131] "case"        "cat"         "catch"       "cause"       "cent"       
[136] "centre"      "certain"     "chair"       "chairman"    "chance"     
[141] "change"      "chap"        "character"   "charge"      "cheap"      
[146] "check"       "child"       "choice"      "choose"      "Christ"     
[151] "Christmas"   "church"      "city"        "claim"       "class"      
[156] "clean"       "clear"       "client"      "clock"       "close"      
[161] "closes"      "clothe"      "club"        "coffee"      "cold"       
[166] "colleague"   "collect"     "college"     "colour"      "come"       
[171] "comment"     "commit"      "committee"   "common"      "community"  
[176] "company"     "compare"     "complete"    "compute"     "concern"    
[181] "condition"   "confer"      "consider"    "consult"     "contact"    
[186] "continue"    "contract"    "control"     "converse"    "cook"       
[191] "copy"        "corner"      "correct"     "cost"        "could"      
[196] "council"     "count"       "country"     "county"      "couple"     
[201] "course"      "court"       "cover"       "create"      "cross"      
[206] "cup"         "current"     "cut"         "dad"         "danger"     
[211] "date"        "day"         "dead"        "deal"        "dear"       
[216] "debate"      "decide"      "decision"    "deep"        "definite"   
[221] "degree"      "department"  "depend"      "describe"    "design"     
[226] "detail"      "develop"     "die"         "difference"  "difficult"  
[231] "dinner"      "direct"      "discuss"     "district"    "divide"     
[236] "do"          "doctor"      "document"    "dog"         "door"       
[241] "double"      "doubt"       "down"        "draw"        "dress"      
[246] "drink"       "drive"       "drop"        "dry"         "due"        
[251] "during"      "each"        "early"       "east"        "easy"       
[256] "eat"         "economy"     "educate"     "effect"      "egg"        
[261] "eight"       "either"      "elect"       "electric"    "eleven"     
[266] "else"        "employ"      "encourage"   "end"         "engine"     
[271] "english"     "enjoy"       "enough"      "enter"       "environment"
[276] "equal"       "especial"    "europe"      "even"        "evening"    
[281] "ever"        "every"       "evidence"    "exact"       "example"    
[286] "except"      "excuse"      "exercise"    "exist"       "expect"     
[291] "expense"     "experience"  "explain"     "express"     "extra"      
[296] "eye"         "face"        "fact"        "fair"        "fall"       
[301] "family"      "far"         "farm"        "fast"        "father"     
[306] "favour"      "feed"        "feel"        "few"         "field"      
[311] "fight"       "figure"      "file"        "fill"        "film"       
[316] "final"       "finance"     "find"        "fine"        "finish"     
[321] "fire"        "first"       "fish"        "fit"         "five"       
[326] "flat"        "floor"       "fly"         "follow"      "food"       
[331] "foot"        "for"         "force"       "forget"      "form"       
[336] "fortune"     "forward"     "four"        "france"      "free"       
[341] "friday"      "friend"      "from"        "front"       "full"       
[346] "fun"         "function"    "fund"        "further"     "future"     
[351] "game"        "garden"      "gas"         "general"     "germany"    
[356] "get"         "girl"        "give"        "glass"       "go"         
[361] "god"         "good"        "goodbye"     "govern"      "grand"      
[366] "grant"       "great"       "green"       "ground"      "group"      
[371] "grow"        "guess"       "guy"         "hair"        "half"       
[376] "hall"        "hand"        "hang"        "happen"      "happy"      
[381] "hard"        "hate"        "have"        "he"          "head"       
[386] "health"      "hear"        "heart"       "heat"        "heavy"      
[391] "hell"        "help"        "here"        "high"        "history"    
[396] "hit"         "hold"        "holiday"     "home"        "honest"     
[401] "hope"        "horse"       "hospital"    "hot"         "hour"       
[406] "house"       "how"         "however"     "hullo"       "hundred"    
[411] "husband"     "idea"        "identify"    "if"          "imagine"    
[416] "important"   "improve"     "in"          "include"     "income"     
[421] "increase"    "indeed"      "individual"  "industry"    "inform"     
[426] "inside"      "instead"     "insure"      "interest"    "into"       
[431] "introduce"   "invest"      "involve"     "issue"       "it"         
[436] "item"        "jesus"       "job"         "join"        "judge"      
[441] "jump"        "just"        "keep"        "key"         "kid"        
[446] "kill"        "kind"        "king"        "kitchen"     "knock"      
[451] "know"        "labour"      "lad"         "lady"        "land"       
[456] "language"    "large"       "last"        "late"        "laugh"      
[461] "law"         "lay"         "lead"        "learn"       "leave"      
[466] "left"        "leg"         "less"        "let"         "letter"     
[471] "level"       "lie"         "life"        "light"       "like"       
[476] "likely"      "limit"       "line"        "link"        "list"       
[481] "listen"      "little"      "live"        "load"        "local"      
[486] "lock"        "london"      "long"        "look"        "lord"       
[491] "lose"        "lot"         "love"        "low"         "luck"       
[496] "lunch"       "machine"     "main"        "major"       "make"       
[501] "man"         "manage"      "many"        "mark"        "market"     
[506] "marry"       "match"       "matter"      "may"         "maybe"      
[511] "mean"        "meaning"     "measure"     "meet"        "member"     
[516] "mention"     "middle"      "might"       "mile"        "milk"       
[521] "million"     "mind"        "minister"    "minus"       "minute"     
[526] "miss"        "mister"      "moment"      "monday"      "money"      
[531] "month"       "more"        "morning"     "most"        "mother"     
[536] "motion"      "move"        "mrs"         "much"        "music"      
[541] "must"        "name"        "nation"      "nature"      "near"       
[546] "necessary"   "need"        "never"       "new"         "news"       
[551] "next"        "nice"        "night"       "nine"        "no"         
[556] "non"         "none"        "normal"      "north"       "not"        
[561] "note"        "notice"      "now"         "number"      "obvious"    
[566] "occasion"    "odd"         "of"          "off"         "offer"      
[571] "office"      "often"       "okay"        "old"         "on"         
[576] "once"        "one"         "only"        "open"        "operate"    
[581] "opportunity" "oppose"      "or"          "order"       "organize"   
[586] "original"    "other"       "otherwise"   "ought"       "out"        
[591] "over"        "own"         "pack"        "page"        "paint"      
[596] "pair"        "paper"       "paragraph"   "pardon"      "parent"     
[601] "park"        "part"        "particular"  "party"       "pass"       
[606] "past"        "pay"         "pence"       "pension"     "people"     
[611] "per"         "percent"     "perfect"     "perhaps"     "period"     
[616] "person"      "photograph"  "pick"        "picture"     "piece"      
[621] "place"       "plan"        "play"        "please"      "plus"       
[626] "point"       "police"      "policy"      "politic"     "poor"       
[631] "position"    "positive"    "possible"    "post"        "pound"      
[636] "power"       "practise"    "prepare"     "present"     "press"      
[641] "pressure"    "presume"     "pretty"      "previous"    "price"      
[646] "print"       "private"     "probable"    "problem"     "proceed"    
[651] "process"     "produce"     "product"     "programme"   "project"    
[656] "proper"      "propose"     "protect"     "provide"     "public"     
[661] "pull"        "purpose"     "push"        "put"         "quality"    
[666] "quarter"     "question"    "quick"       "quid"        "quiet"      
[671] "quite"       "radio"       "rail"        "raise"       "range"      
[676] "rate"        "rather"      "read"        "ready"       "real"       
[681] "realise"     "really"      "reason"      "receive"     "recent"     
[686] "reckon"      "recognize"   "recommend"   "record"      "red"        
[691] "reduce"      "refer"       "regard"      "region"      "relation"   
[696] "remember"    "report"      "represent"   "require"     "research"   
[701] "resource"    "respect"     "responsible" "rest"        "result"     
[706] "return"      "rid"         "right"       "ring"        "rise"       
[711] "road"        "role"        "roll"        "room"        "round"      
[716] "rule"        "run"         "safe"        "sale"        "same"       
[721] "saturday"    "save"        "say"         "scheme"      "school"     
[726] "science"     "score"       "scotland"    "seat"        "second"     
[731] "secretary"   "section"     "secure"      "see"         "seem"       
[736] "self"        "sell"        "send"        "sense"       "separate"   
[741] "serious"     "serve"       "service"     "set"         "settle"     
[746] "seven"       "sex"         "shall"       "share"       "she"        
[751] "sheet"       "shoe"        "shoot"       "shop"        "short"      
[756] "should"      "show"        "shut"        "sick"        "side"       
[761] "sign"        "similar"     "simple"      "since"       "sing"       
[766] "single"      "sir"         "sister"      "sit"         "site"       
[771] "situate"     "six"         "size"        "sleep"       "slight"     
[776] "slow"        "small"       "smoke"       "so"          "social"     
[781] "society"     "some"        "son"         "soon"        "sorry"      
[786] "sort"        "sound"       "south"       "space"       "speak"      
[791] "special"     "specific"    "speed"       "spell"       "spend"      
[796] "square"      "staff"       "stage"       "stairs"      "stand"      
[801] "standard"    "start"       "state"       "station"     "stay"       
[806] "step"        "stick"       "still"       "stop"        "story"      
[811] "straight"    "strategy"    "street"      "strike"      "strong"     
[816] "structure"   "student"     "study"       "stuff"       "stupid"     
[821] "subject"     "succeed"     "such"        "sudden"      "suggest"    
[826] "suit"        "summer"      "sun"         "sunday"      "supply"     
[831] "support"     "suppose"     "sure"        "surprise"    "switch"     
[836] "system"      "table"       "take"        "talk"        "tape"       
[841] "tax"         "tea"         "teach"       "team"        "telephone"  
[846] "television"  "tell"        "ten"         "tend"        "term"       
[851] "terrible"    "test"        "than"        "thank"       "the"        
[856] "then"        "there"       "therefore"   "they"        "thing"      
[861] "think"       "thirteen"    "thirty"      "this"        "thou"       
[866] "though"      "thousand"    "three"       "through"     "throw"      
[871] "thursday"    "tie"         "time"        "to"          "today"      
[876] "together"    "tomorrow"    "tonight"     "too"         "top"        
[881] "total"       "touch"       "toward"      "town"        "trade"      
[886] "traffic"     "train"       "transport"   "travel"      "treat"      
[891] "tree"        "trouble"     "true"        "trust"       "try"        
[896] "tuesday"     "turn"        "twelve"      "twenty"      "two"        
[901] "type"        "under"       "understand"  "union"       "unit"       
[906] "unite"       "university"  "unless"      "until"       "up"         
[911] "upon"        "use"         "usual"       "value"       "various"    
[916] "very"        "video"       "view"        "village"     "visit"      
[921] "vote"        "wage"        "wait"        "walk"        "wall"       
[926] "want"        "war"         "warm"        "wash"        "waste"      
[931] "watch"       "water"       "way"         "we"          "wear"       
[936] "wednesday"   "wee"         "week"        "weigh"       "welcome"    
[941] "well"        "west"        "what"        "when"        "where"      
[946] "whether"     "which"       "while"       "white"       "who"        
[951] "whole"       "why"         "wide"        "wife"        "will"       
[956] "win"         "wind"        "window"      "wish"        "with"       
[961] "within"      "without"     "woman"       "wonder"      "wood"       
[966] "word"        "work"        "world"       "worry"       "worse"      
[971] "worth"       "would"       "write"       "wrong"       "year"       
[976] "yes"         "yesterday"   "yet"         "you"         "young"      
head(stringr::words, 100)
  [1] "a"           "able"        "about"       "absolute"    "accept"     
  [6] "account"     "achieve"     "across"      "act"         "active"     
 [11] "actual"      "add"         "address"     "admit"       "advertise"  
 [16] "affect"      "afford"      "after"       "afternoon"   "again"      
 [21] "against"     "age"         "agent"       "ago"         "agree"      
 [26] "air"         "all"         "allow"       "almost"      "along"      
 [31] "already"     "alright"     "also"        "although"    "always"     
 [36] "america"     "amount"      "and"         "another"     "answer"     
 [41] "any"         "apart"       "apparent"    "appear"      "apply"      
 [46] "appoint"     "approach"    "appropriate" "area"        "argue"      
 [51] "arm"         "around"      "arrange"     "art"         "as"         
 [56] "ask"         "associate"   "assume"      "at"          "attend"     
 [61] "authority"   "available"   "aware"       "away"        "awful"      
 [66] "baby"        "back"        "bad"         "bag"         "balance"    
 [71] "ball"        "bank"        "bar"         "base"        "basis"      
 [76] "be"          "bear"        "beat"        "beauty"      "because"    
 [81] "become"      "bed"         "before"      "begin"       "behind"     
 [86] "believe"     "benefit"     "best"        "bet"         "between"    
 [91] "big"         "bill"        "birth"       "bit"         "black"      
 [96] "bloke"       "blood"       "blow"        "blue"        "board"      
# The stringr package also includes many functions. For example one of 
# those functions is str_length.

# The str_length function is part of the stringr package.
# To use it you must install stringr (or install tidyverse, which is a 
# collection of packages one of which is stringr)
str_length(c("abc", "hello", "I like ice cream!"))
[1]  3  5 17
# This function is very similar to the
# nchar function that is built into base R.
nchar(c("abc", "hello", "I like ice cream!"))  
[1]  3  5 17

6.2 grep function

# Show all words that
# "start with a p, end with a y (with anything in the middle)"
grep(stringr::words, pattern="^p.*y$", value=TRUE)
[1] "party"  "pay"    "play"   "policy" "pretty"
# Starts with a p, ends with a y, nothing in the middle.
# Only matches "py". 
# There are no words that match.
grep(stringr::words, pattern="^py$", value=TRUE)
character(0)
# match any word that starts with p, ends with y and has a single
# character between them
grep(stringr::words, pattern="^p.y$", value=TRUE)
[1] "pay"
# match any word that starts with p, ends with y and
# has exactly two characters between them.
grep(stringr::words, pattern="^p..y$", value=TRUE)
[1] "play"
# match any word that starts with p, ends with y and
# has exactly four characters between them.
grep(stringr::words, pattern="^p....y$", value=TRUE)
[1] "policy" "pretty"
# match any sequence of characters between the p and the y
grep(stringr::words, pattern="^p.*y$", value=TRUE)
[1] "party"  "pay"    "play"   "policy" "pretty"
# starts with a p
grep(stringr::words, pattern="^p", value=TRUE)
 [1] "pack"       "page"       "paint"      "pair"       "paper"     
 [6] "paragraph"  "pardon"     "parent"     "park"       "part"      
[11] "particular" "party"      "pass"       "past"       "pay"       
[16] "pence"      "pension"    "people"     "per"        "percent"   
[21] "perfect"    "perhaps"    "period"     "person"     "photograph"
[26] "pick"       "picture"    "piece"      "place"      "plan"      
[31] "play"       "please"     "plus"       "point"      "police"    
[36] "policy"     "politic"    "poor"       "position"   "positive"  
[41] "possible"   "post"       "pound"      "power"      "practise"  
[46] "prepare"    "present"    "press"      "pressure"   "presume"   
[51] "pretty"     "previous"   "price"      "print"      "private"   
[56] "probable"   "problem"    "proceed"    "process"    "produce"   
[61] "product"    "programme"  "project"    "proper"     "propose"   
[66] "protect"    "provide"    "public"     "pull"       "purpose"   
[71] "push"       "put"       

6.3 grep and grepl

# When value=FALSE grep returns the positions in the vector of 
# values that matched
grep(stringr::words, pattern="^p.*y$", value=TRUE)
[1] "party"  "pay"    "play"   "policy" "pretty"
grep(stringr::words, pattern="^p.*y$", value=FALSE)
[1] 604 607 623 628 643
# The default is value=FALSE, so leaving out the value argument entirely
# gives the same result as value=FALSE (i.e. the positions of the matches).
grep(stringr::words, pattern="^p.*y$")
[1] 604 607 623 628 643
#@ grep and grepl
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@ grep stands for "Globally search for a Regular Expression and Print the result"
#@
#@ Grep will search through the entries in a character vector and display those
#@ entries that match a specified pattern (see examples below). These patterns
#@ are known as regular expressions or "regex".
#@
#@ The history of grep started with a command that was used on the Unix operating
#@ system. It has been adapted for use with many programming environments. R has
#@ its own version.
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@


# grep ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# grep returns character values or the indexes (i.e. position numbers) 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Find all fruit whose name contains the letter "h"
grep(pattern="h", x=fruit, value=TRUE)   # value=TRUE, show the actual values that match the pattern 
[1] "cherry"       "black cherry" "peach"        "honeydew"    
grep(pattern="h", x=fruit, value=FALSE)  # value=FALSE, show the index (ie. position) of the values that match 
[1]  9 10 11 17
# grepl ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# grepl returns logical values (i.e. TRUE/FALSE vectors)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

grepl(pattern="h", x=fruit)    # find which values include an "h"
 [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE
[13] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Summary: 3 ways to use grep or grepl
#
# - grep ( regexPattern , SOME_VECTOR , value=TRUE)  # returns the actual values that match
# - grep ( regexPattern , SOME_VECTOR , value=FALSE) # returns the index numbers of the values that match
# - grepl ( regexPattern , SOME_VECTOR )             # returns a logical vector that indicates which values match
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# For now, let's focus on grep(... , value=TRUE) as it is easier to understand the results. 


# The pattern is searched for in the entire entry ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The pattern is considered "matched" if it appears anywhere in the data value.
# For example:   grep("h", fruit)
#
# returns all fruit that contain an "h", no matter whether the h is at the 
# beginning, end or middle of the word.
#
# You can change this behavior with the ^ and $ metacharacters (see below)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




# spaces in regex patterns are meaningful ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Spaces are NOT ignored. Spaces count as part of the pattern 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# fruits that contain a space
grep(pattern=" ", x=fruit, value=TRUE) # all fruit that contain a space
[1] "N. American apple" "S. Korean Fig"     "star fruit"       
[4] "prickly pear"      "Beurre Hardy pear" "black cherry"     
# search for "k"  (i.e. JUST a "k")
grep("k", fruit, value=TRUE) # "prickly pear"  "black cherry"  "kumquat"
[1] "prickly pear" "black cherry" "kumquat"     
# search for "ck"  (i.e. a "c" immediately followed by a "k")
grep("ck", fruit, value=TRUE) # "prickly pear"  "black cherry"  (NOT "kumquat" - it has no "ck")
[1] "prickly pear" "black cherry"
# search for "k "  (i.e. k followed by a space)
grep("k ", fruit, value=TRUE) # "black cherry"
[1] "black cherry"

6.4 regex patterns do NOT understand “numbers”

# regex patterns do NOT understand "numbers" ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Digits are NOT treated as numbers. They are treated the same as any
# other character. Therefore grep("12", SOME_VECTOR) will match any value
# that contains a 1 followed by a 2, including "123" and "34321234".
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

addresses # show all the addresses
 [1] "12345 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "5678 Park Place"                   "Forty Five 2nd Street"            
 [7] "Ninety Nine Cone St. apartment 7"  "9 Main St. apt. 623"              
 [9] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
[11] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[15] "Three Main Street Apt 12343"       "City Hall Lockport, NY"           
grep("23", addresses, value=TRUE)  # matches anything that contains 23
[1] "12345 Sesame Street"         "9 Main St. apt. 623"        
[3] "Two Main Street Apt 123c"    "Three Main Street Apt 12343"

6.5 case sensitivity

# Regular expressions (in R) are case sensitive by default ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# By default, R's version of grep is case sensitive.
# There are a few different approaches for searching case-INsensitively ####.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# The first way - use ignore.case = TRUE
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

grep("H",fruit, value=TRUE)  # contains a capital "H"
[1] "Beurre Hardy pear"
grep("h",fruit, value=TRUE)  # contains a lowercase "h"
[1] "cherry"       "black cherry" "peach"        "honeydew"    
grep("h", fruit, value=TRUE, ignore.case=TRUE) # contains AnY h
[1] "Beurre Hardy pear" "cherry"            "black cherry"     
[4] "peach"             "honeydew"         
grep("H", fruit, value=TRUE, ignore.case=TRUE) # same thing
[1] "Beurre Hardy pear" "cherry"            "black cherry"     
[4] "peach"             "honeydew"         
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Another way to search for both CAPITAL and lowercase characters is to list both, e.g. [Hh]
# For example, [hH] indicates that h or H is valid to be matched. 
# We will describe the exact meaning of the [square brackets] in a lot more 
# detail below.
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

grep("[hH]", fruit, value=TRUE)
[1] "Beurre Hardy pear" "cherry"            "black cherry"     
[4] "peach"             "honeydew"         
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# another way using R's toupper or tolower functions.
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

msg = "She said 'Hello' to Joe."
msg
[1] "She said 'Hello' to Joe."
toupper(msg)
[1] "SHE SAID 'HELLO' TO JOE."
tolower(msg)
[1] "she said 'hello' to joe."
grep("h", tolower(fruit), value=TRUE)
[1] "beurre hardy pear" "cherry"            "black cherry"     
[4] "peach"             "honeydew"         

6.6 str_view from the stringr package

The str_view function from the stringr package can be very helpful when you’re trying to understand a regular expression. str_view shows exactly what parts of a string match the pattern. See the example below.

# str_view is part of the stringr package
library(stringr)

greetings = c("hi there", "yo dude", "shalom", "bon jour")
cat(greetings, sep="\n")
hi there
yo dude
shalom
bon jour
# match the letter h in each greeting
str_view(greetings, "h")
[1] │ <h>i t<h>ere
[3] │ s<h>alom

6.7 sub and gsub functions

#@ sub and gsub functions ####
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@
#@ sub (SOME_REGEX_PATTERN, REPLACEMENT, SOME_VECTOR)
#@    sub returns a new vector. The return value is the same as SOME_VECTOR
#@    except that the FIRST match of the pattern in each entry of SOME_VECTOR
#@    is replaced with REPLACEMENT - see the examples below.
#@
#@ gsub (SOME_REGEX_PATTERN, REPLACEMENT, SOME_VECTOR)
#@    same as sub but ALL matches of the pattern are replaced (not just the
#@    first in each entry of the vector) - see the examples below
#@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@


# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# replace the first letter "e" that appears in any fruit with the letter "X"
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWER
sub(pattern="e", replacement="X", x=fruit)  # "applX"    "N. AmXrican apple"     etc
 [1] "applX"             "N. AmXrican apple" "S. KorXan Fig"    
 [4] "fig"               "star fruit"        "pXar"             
 [7] "prickly pXar"      "BXurre Hardy pear" "chXrry"           
[10] "black chXrry"      "pXach"             "plum"             
[13] "kumquat"           "banana"            "bluXberry"        
[16] "strawbXrry"        "honXydew"          "strawbXrries"     
[19] "yumbXrry"         
# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# replace ALL of the "e"s that appear in any fruit with the letter "X"
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWER
gsub(pattern="e", replacement="X", fruit)   # "applX"    "N. AmXrican applX"     etc
 [1] "applX"             "N. AmXrican applX" "S. KorXan Fig"    
 [4] "fig"               "star fruit"        "pXar"             
 [7] "prickly pXar"      "BXurrX Hardy pXar" "chXrry"           
[10] "black chXrry"      "pXach"             "plum"             
[13] "kumquat"           "banana"            "bluXbXrry"        
[16] "strawbXrry"        "honXydXw"          "strawbXrriXs"     
[19] "yumbXrry"         
# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# remove all spaces from the addresses
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWER
gsub(pattern=" ", replacement="", addresses)   # "12345SesameStreet"   "OneMicro$oftWay"   etc.
 [1] "12345SesameStreet"               "OneMicro$oftWay"                
 [3] "3OliveSt."                       "Two1stAve."                     
 [5] "5678ParkPlace"                   "FortyFive2ndStreet"             
 [7] "NinetyNineConeSt.apartment7"     "9MainSt.apt.623"                
 [9] "FiveGoogleDrive"                 "4\\2RechovYafo"                 
[11] "FifteenWatchamacallitBoulevard"  "NineteenWatchamacallitBoulevard"
[13] "OneMainStreetApt12b"             "TwoMainStreetApt123c"           
[15] "ThreeMainStreetApt12343"         "CityHallLockport,NY"            
# We will revisit sub and gsub later with more complex examples ...

6.8 strsplit function

##################################################################.
# strsplit 
#
# strsplit is used to split a string based on a "delimiter" that appears
# between the different values. This "delimiter" can be a regular expression.
# We'll come back to strsplit later, but let's introduce it here.
##################################################################.

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Example
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sentences = c("He said hi. She said bye. We went to the park.",
              "I like ice cream! Do you? Sue likes pizza.")
sentences
[1] "He said hi. She said bye. We went to the park."
[2] "I like ice cream! Do you? Sue likes pizza."    
# QUESTION - 
# Use strsplit to split the values in the sentences vector by 
# splitting based on spaces. Assign the result to the variable "sentenceWords".
#
# Write code to get the 3rd "word" from the 1st entry in the sentences 
# vector.


?strsplit
starting httpd help server ... done
sentenceWords = strsplit(sentences, split=" ")
sentenceWords
[[1]]
 [1] "He"    "said"  "hi."   "She"   "said"  "bye."  "We"    "went"  "to"   
[10] "the"   "park."

[[2]]
[1] "I"      "like"   "ice"    "cream!" "Do"     "you?"   "Sue"    "likes" 
[9] "pizza."
# Notice that the result is a LIST:
str(sentenceWords)
List of 2
 $ : chr [1:11] "He" "said" "hi." "She" ...
 $ : chr [1:9] "I" "like" "ice" "cream!" ...
# Show the 3rd word in the 1st sentence
sentenceWords[[1]][3]
[1] "hi."

6.8.1 — practice —

# QUESTION - split each entry in the sentences variable into individual 
# sentences. 
# 
# WARNING - the value of the split argument is interpreted as a
# regular expression pattern. Be careful.

# 1st attempt - doesn't work. 
strsplit(sentences, ".")
[[1]]
 [1] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
[26] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""

[[2]]
 [1] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
[26] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
# This doesn't work since the 2nd argument is a regular expression.
# The following will split based on periods.
sentences
[1] "He said hi. She said bye. We went to the park."
[2] "I like ice cream! Do you? Sue likes pizza."    
strsplit(sentences, "\\.")
[[1]]
[1] "He said hi"           " She said bye"        " We went to the park"

[[2]]
[1] "I like ice cream! Do you? Sue likes pizza"
# Use a "regular expression" to instead split on any of a period, 
# question mark, or exclamation point.
sentences
[1] "He said hi. She said bye. We went to the park."
[2] "I like ice cream! Do you? Sue likes pizza."    
strsplit(sentences, "[.?!]")  # split on any one of .?!
[[1]]
[1] "He said hi"           " She said bye"        " We went to the park"

[[2]]
[1] "I like ice cream" " Do you"          " Sue likes pizza"

6.9 “anchors” ^ and $

# ^ and $    ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Regular Expressions use special characters to control what is matched.
# These characters are called "meta-characters".
#
#   ^  "matches" the start of the character value
#   $  "matches" the end of the character value
#
#   The [square brackets] described above are also "meta characters" in
#   regular expressions. (We will describe those in more detail next)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

grep(pattern="^a", x=fruit, value=TRUE)  # find fruits that START with an "a"
[1] "apple"
grep("a$", fruit, value=TRUE)  # find fruits that END with an "a"
[1] "banana"
# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Write a command using grep to display all the fruits that start
# with a c or an s. Make your search case insensitive.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#         ANSWERS ####
grep("^[csCS]", fruit, value=TRUE)
[1] "S. Korean Fig" "star fruit"    "cherry"        "strawberry"   
[5] "strawberries" 
grep("^[cs]", fruit, value=TRUE, ignore.case = TRUE)
[1] "S. Korean Fig" "star fruit"    "cherry"        "strawberry"   
[5] "strawberries" 
grep("^[cs]", tolower(fruit), value=TRUE)
[1] "s. korean fig" "star fruit"    "cherry"        "strawberry"   
[5] "strawberries" 

6.10 periods

# . (i.e. a period) ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# a period (ie  .  ) "matches" any single character.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

grep("^.a", fruit, value=TRUE)  # 2nd letter is a
[1] "banana"
grep("^..a", fruit, value=TRUE)  # 3rd letter is a
[1] "star fruit"   "pear"         "black cherry" "peach"       
grep("a.$", fruit, value=TRUE)  # 2nd to last letter is an a
[1] "pear"              "prickly pear"      "Beurre Hardy pear"
[4] "kumquat"          
grep("....", fruit, value=TRUE) # all fruit that are AT LEAST 4 characters long
 [1] "apple"             "N. American apple" "S. Korean Fig"    
 [4] "star fruit"        "pear"              "prickly pear"     
 [7] "Beurre Hardy pear" "cherry"            "black cherry"     
[10] "peach"             "plum"              "kumquat"          
[13] "banana"            "blueberry"         "strawberry"       
[16] "honeydew"          "strawberries"      "yumberry"         
grep("^....$", fruit, value=TRUE) # all fruit that are EXACTLY 4 characters long
[1] "pear" "plum"

6.11 “character classes” eg. [abc]

# [abc] matches a SINGLE "a", "b" or "c".    ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# characters in [square brackets] match a single copy of
# any of those characters, e.g.
#  [abc]    matches exactly one of a,b or c
#
# DEFINITION: 
# The [square brackets] with the characters in them are often referred to as 
# "character classes" or "character sets"
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

6.11.1 Examples

grep("[qbxz]", fruit, value=TRUE)  # find fruit that contain q,b,x or z
[1] "black cherry" "kumquat"      "banana"       "blueberry"    "strawberry"  
[6] "strawberries" "yumberry"    
grep("[aeiou][aeiou]", fruit, value=TRUE)  # two vowels in a row
[1] "S. Korean Fig"     "star fruit"        "pear"             
[4] "prickly pear"      "Beurre Hardy pear" "peach"            
[7] "kumquat"           "blueberry"         "strawberries"     
grep("^.[aeiou][aeiou]", fruit, value=TRUE)  # vowels in the 2nd and 3rd positions
[1] "pear"              "Beurre Hardy pear" "peach"            
grep("[aeiou][aeiou].$", fruit, value=TRUE)  # 2nd & 3rd to last characters are vowels
[1] "star fruit"        "pear"              "prickly pear"     
[4] "Beurre Hardy pear" "kumquat"           "strawberries"     
grep("[aeiou][aeiou][aeiou]", fruit, value=TRUE)  # 3 vowels in a row
character(0)
# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# replace the FIRST vowel that appears in any fruit with the letter "X"
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWER
# sub replaces only the FIRST match in each entry. Both lowercase and
# uppercase vowels are listed so that a leading capital vowel
# (e.g. the "A" in "American") also counts as the first vowel.
sub("[aeiouAEIOU]", "X", fruit)   # "Xpple"    "N. Xmerican apple"     etc

# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# replace ALL vowels that appear in any fruit with the letter "X"
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWER
gsub("[aeiouAEIOU]", "X", fruit)
 [1] "XpplX"             "N. XmXrXcXn XpplX" "S. KXrXXn FXg"    
 [4] "fXg"               "stXr frXXt"        "pXXr"             
 [7] "prXckly pXXr"      "BXXrrX HXrdy pXXr" "chXrry"           
[10] "blXck chXrry"      "pXXch"             "plXm"             
[13] "kXmqXXt"           "bXnXnX"            "blXXbXrry"        
[16] "strXwbXrry"        "hXnXydXw"          "strXwbXrrXXs"     
[19] "yXmbXrry"         
# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# remove ALL of the vowels that appear in any fruit
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWER
gsub("[aeiouAEIOU]", "", fruit)
 [1] "ppl"         "N. mrcn ppl" "S. Krn Fg"   "fg"          "str frt"    
 [6] "pr"          "prckly pr"   "Brr Hrdy pr" "chrry"       "blck chrry" 
[11] "pch"         "plm"         "kmqt"        "bnn"         "blbrry"     
[16] "strwbrry"    "hnydw"       "strwbrrs"    "ymbrry"     

6.11.2 dashes, e.g. [a-d]

# Specify ranges with dash, e.g. [a-d] or [0-3] or [a-d0-3], etc ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Specify ranges with dash, e.g. [a-d] is same as [abcd], [0-3] is same as [0123] ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

6.11.3 Examples

grep("[j-m]", fruit, value=TRUE)  # fruits that contain any of j,k,l,m
[1] "apple"             "N. American apple" "prickly pear"     
[4] "black cherry"      "plum"              "kumquat"          
[7] "blueberry"         "yumberry"         
addresses  # this was defined above
 [1] "12345 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "5678 Park Place"                   "Forty Five 2nd Street"            
 [7] "Ninety Nine Cone St. apartment 7"  "9 Main St. apt. 623"              
 [9] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
[11] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[15] "Three Main Street Apt 12343"       "City Hall Lockport, NY"           
grep("[0-9]", addresses, value=TRUE)  # contains a digit
 [1] "12345 Sesame Street"              "3 Olive St."                     
 [3] "Two 1st Ave."                     "5678 Park Place"                 
 [5] "Forty Five 2nd Street"            "Ninety Nine Cone St. apartment 7"
 [7] "9 Main St. apt. 623"              "4\\2 Rechov Yafo"                
 [9] "One Main Street Apt 12b"          "Two Main Street Apt 123c"        
[11] "Three Main Street Apt 12343"     
grep("[6-9]", addresses, value=TRUE)  # contains 6,7,8 or 9
[1] "5678 Park Place"                  "Ninety Nine Cone St. apartment 7"
[3] "9 Main St. apt. 623"             
grep("[0-9][0-9]", addresses, value=TRUE)  # contains a number with at least 2 digits
[1] "12345 Sesame Street"         "5678 Park Place"            
[3] "9 Main St. apt. 623"         "One Main Street Apt 12b"    
[5] "Two Main Street Apt 123c"    "Three Main Street Apt 12343"
grep("[0-9][0-9][0-9][0-9]", addresses, value=TRUE)  # contains a # with at least 4 digits 
[1] "12345 Sesame Street"         "5678 Park Place"            
[3] "Three Main Street Apt 12343"
grep("^[6-9]", addresses, value=TRUE)  # contains 6,7,8 or 9 as the first character
[1] "9 Main St. apt. 623"
grep("^.[6-9]", addresses, value=TRUE) # 6,7,8 or 9 is second character
[1] "5678 Park Place"
grep("[0-9]$", addresses, value=TRUE) # last character is a digit
[1] "Ninety Nine Cone St. apartment 7" "9 Main St. apt. 623"             
[3] "Three Main Street Apt 12343"     
grep("[0-9][0-9][0-9]$", addresses, value=TRUE) # ends with at least 3 digits
[1] "9 Main St. apt. 623"         "Three Main Street Apt 12343"
grep("[0-9][0-9][0-9][0-9]$", addresses, value=TRUE) # ends with at least 4 digits
[1] "Three Main Street Apt 12343"

6.11.4 combine multiple ranges in one [brackets]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# You can combine multiple ranges and values in a single [brackets]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

grep("[1-3x-z]$", addresses, value=TRUE) # ends with 1,2,3,x,y or z
[1] "One Micro$oft Way"           "9 Main St. apt. 623"        
[3] "Three Main Street Apt 12343"
grep("[of-hq]", fruit, value=TRUE)  # searches for any of o,f,g,h,q
[1] "S. Korean Fig" "fig"           "star fruit"    "cherry"       
[5] "black cherry"  "peach"         "kumquat"       "honeydew"     
# REMEMBER THE [BRACKETS]!!!
grep("of-hq", fruit, value=TRUE)  # searches for exactly :  "of-hq"
character(0)

6.11.5 [^abc]

# [^abc] matches a single character that is NOT a,b or c ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# When ^ is the FIRST character in the [^brackets] it means to match a 
# single character that is NOT one of the characters in the brackets
#
# [^abc]  - i.e. a single character that is NOT a,b or c
# [^0-9]  - a non-digit
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

grep("^[^0-9]", addresses, value=TRUE)  # does NOT start with a digit
 [1] "One Micro$oft Way"                 "Two 1st Ave."                     
 [3] "Forty Five 2nd Street"             "Ninety Nine Cone St. apartment 7" 
 [5] "Five Google Drive"                 "Fifteen Watchamacallit Boulevard" 
 [7] "Nineteen Watchamacallit Boulevard" "One Main Street Apt 12b"          
 [9] "Two Main Street Apt 123c"          "Three Main Street Apt 12343"      
[11] "City Hall Lockport, NY"           
grep("[^0-9]$", addresses, value=TRUE)  # does NOT end with a digit
 [1] "12345 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "5678 Park Place"                   "Forty Five 2nd Street"            
 [7] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
 [9] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[11] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[13] "City Hall Lockport, NY"           
grep("[^0-9]", addresses, value=TRUE)
 [1] "12345 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "5678 Park Place"                   "Forty Five 2nd Street"            
 [7] "Ninety Nine Cone St. apartment 7"  "9 Main St. apt. 623"              
 [9] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
[11] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[15] "Three Main Street Apt 12343"       "City Hall Lockport, NY"           
# Contains 5 non-vowels in a row (notice that space counts as a non-vowel)
grep("[^aeiou][^aeiou][^aeiou][^aeiou][^aeiou]", fruit, value=TRUE, ignore.case=TRUE)
[1] "prickly pear"      "Beurre Hardy pear" "black cherry"     

6.11.6 metachars lose special meaning in brackets eg. [.$*]

# meta characters in [brackets] other than ^ - and ] lose their special meaning ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Most meta characters inside of [brackets] are treated like any other character.
# They do NOT have any special meaning in the brackets. Therefore you can use
# them without any problem inside a character class. For example [.$]
# matches either a period or a dollar sign (see examples below).
#
# The only exceptions are ^ - and ] which DO have a special meaning inside of
# the [square brackets] - see more info below.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

stuff = c("This is a period: .", "apple", "45 oranges", "$2", "This is an open bracket: [")
stuff
[1] "This is a period: ."        "apple"                     
[3] "45 oranges"                 "$2"                        
[5] "This is an open bracket: ["
# EXAMPLE
#
# Match a period, left-square-bracket, or a dollar sign
# You do NOT need backslashes inside of the [brackets]
grep("[.[$]", stuff, value=TRUE)
[1] "This is a period: ."        "$2"                        
[3] "This is an open bracket: ["
# ANOTHER EXAMPLE
#
# The following matches either a period or a digit.
# (The range 0-9 covers every digit, including 0.)
# You do NOT need to use a backslash before the period.
grep("[.0-9]", stuff, value=TRUE)
[1] "This is a period: ." "45 oranges"          "$2"                 
# The backslash will not hurt (but it isn't necessary inside the character class)
# (below we will explain why there are TWO backslashes - for now you can leave
# off both of the backslashes)
grep("[\\.0-9]", stuff, value=TRUE)
[1] "This is a period: ." "45 oranges"          "$2"                 

6.11.7 Special cases: ^ - ]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Special cases:    ^    -    ]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The following characters have to be addressed in a special way inside
# of a character class. 
#
# ^   As we saw above, if the first character in the brackets is ^ the regex will
#     look for characters that are NOT in the brackets. If ^ appears anywhere else
#     inside the brackets it has no special meaning.
#
# -   As we saw above, [a-d] is the same as [abcd]. Therefore the - has a special
#     meaning inside of a character class. If you want to actually search
#     for a -, it must be the first, e.g. [-abc] or last character, eg. [abc-]
#     in the class.
#
# ]   has a special meaning - i.e. to end the character class. Therefore if 
#     you want to search for an actual "]", the "]" should be specified
#     as the very FIRST character in the class, e.g. []abc]
#-------------------------------------------------------------------

# Examples:

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# The "." inside of [brackets] simply means an actual period.
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

stuff = c("...", "def", "...bbbzzzbz.bzz...z.b", "^^^")
stuff
[1] "..."                   "def"                   "...bbbzzzbz.bzz...z.b"
[4] "^^^"                  
grep ("[.x]", stuff, value=TRUE)
[1] "..."                   "...bbbzzzbz.bzz...z.b"

6.11.8 matching ^ inside [square brackets]

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# The "^" inside of [brackets] has a different meaning if it is in the 
# first position or if it is anywhere else. For example:
#
# [^abc]   matches anything that is NOT an "a","b" or "c"
#
# [a^bc]   
# [abc^]   both of these examples matches one "a","b","c" or "^" character
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

caretStuff = c("^^^", "hello", "???")
caretStuff
[1] "^^^"   "hello" "???"  
grep ("[^a-z]", caretStuff, value=TRUE)   # "^^^" "???"
[1] "^^^" "???"
grep ("[a-z^]", caretStuff, value=TRUE)   # "^^^" "hello"
[1] "^^^"   "hello"
grep ("[^^]", caretStuff, value=TRUE)     # "hello" "???"
[1] "hello" "???"  
# find all entries that have any symbol that is not a ".", "b" or "z"

stuff = c("...", "def", "...bbbzzzbz.bzz...z.b", "^^^")
stuff
[1] "..."                   "def"                   "...bbbzzzbz.bzz...z.b"
[4] "^^^"                  
grep ("[^.bz]", stuff, value=TRUE)
[1] "def" "^^^"
#grep("[a-z.]", c())

6.11.9 Matching a - inside [square brackets]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Matching a dash (i.e. - ) inside a character class
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# As we saw earlier, inside of [brackets] a dash has a special meaning (to indicate a range).
# To actually match a dash as one of the characters place the dash as either the 
# first or last character in the brackets.


dashStuff = c("---", "hello", "xxx")
dashStuff
[1] "---"   "hello" "xxx"  
grep ("[-xyz]", dashStuff, value=TRUE)   # "---" "xxx"
[1] "---" "xxx"
grep ("[xyz-]", dashStuff, value=TRUE)   # "---" "xxx"   (same thing)
[1] "---" "xxx"
grep ("[a-f]", dashStuff, value=TRUE)   # "hello"
[1] "hello"
grep ("[-a-f]", dashStuff, value=TRUE)   # "---" "hello"
[1] "---"   "hello"

6.11.10 Matching a ] inside [square brackets]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To match a closing-square-bracket "]" inside a character class you must
# specify the ] as the very FIRST symbol in the character class.
#
# NOTE - there are no special rules for matching an open-square-bracket, "["
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

stuff = c("]", "apple", "zzz" )
stuff
[1] "]"     "apple" "zzz"  
grep("]", stuff, value=TRUE)    #  "]"
[1] "]"
# The pattern "[]a]" matches a single character that is either "]" or "a".
# It finds "]" and also "apple" (since "apple" contains an "a").
#
# This works since "]" is placed as the very first character in the 
# [brackets] so it is simply one of the characters that is searched for.

grep("[]a]", stuff, value=TRUE) #  "]"  "apple"
[1] "]"     "apple"
# This is VERY different for the pattern "[a]]"
#
# The following example shows what happens if you put the "]" in any
# position other than the first. The pattern "[a]]" is broken down as follows:
#
#   [a]   This is a complete character class that matches the single
#         character "a". The first "]" is what ends the class.
#
#   ]     This second "]" does NOT signify the end of a character class, but is 
#         rather just a regular character that must be part of the text to be 
#         matched.
#
# Therefore [a]] is looking for the EXACT text "a]" somewhere in the text
# being searched.

grep("[a]]", stuff, value=TRUE) # No matches - looking for "a]" in the text
character(0)

6.12 Searching with regular expressions in a “text editor”

A “text editor” is a program that is used to edit “text files”. A text file can only contain “plain text” - i.e. no pictures, no music, only one font, etc.

RStudio’s text editor

The text editor that is in RStudio can be used to create many different types of files. For example, it can be used to create both “R Script files” (i.e. .R files) and “Quarto Documents” (i.e. .qmd files). In addition, it can be used to create “plain text files”. To do so, choose the following menu choices from RStudio’s menu: “File | New File | Text File”

# NOTE: regular expressions are used in many different languages and environments.
# In general, when using regular expressions in OTHER environments, 
# if you want to actually match a metacharacter (e.g. period, parentheses,
# caret, dollar sign, etc) you precede the metacharacter with a backslash.
#
# For example, you can do this in RStudio's text editor - just type ctrl-f or cmd-f
# and click "regex" checkbox. Then type your regular expression into the 
# search box.
# For example
# Try searching the addresses.txt file for the following in the RStudio text
# editor:
#
#   one|1
#   .
#   \.
#   $
#   \$
#
# Matching meta characters requires that you "escape" the meta-character
# by preceding it with a backslash e.g.  \.

6.13 Matching meta-characters

# Matching meta-characters ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# NOTE: regular expressions are used in many different languages and environments.
# In general, when using regular expressions in OTHER environments, 
# if you want to actually match a metacharacter (e.g. period, parentheses,
# caret, dollar sign, etc) you precede the metacharacter with a backslash.
#
# For example, you can do this in RStudio's text editor - just type ctrl-f or cmd-f
# and click "regex" checkbox. Then type your regular expression into the 
# search box.
#
# For example
# Try searching the addresses.txt file for the following in the RStudio text
# editor:
#
#   one|1
#   .
#   \.
#   $
#   \$
#
# Matching meta characters requires that you "escape" the meta-character
# by preceding it with a backslash e.g.  \.
#
#
#
# When writing regular expression patterns in R you must use TWO \\'s to escape a metacharacter ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# HOWEVER, in R, since character values already use a backslash
# such as \n for a new line, you must use TWO backslashes in the regex
# pattern. The first backslash escapes the 2nd backslash from R
# so that R's character values don't interpret it in a special way.
# See the examples below.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

6.13.1 Reminder of how backslashes (\) are used in R

# Reminder of how backslashes (\) are used in R ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Remember that R uses backslashes to change the meaning (or to "escape" 
# the meaning) of the character that follows the backslash. 
# For example in the following cat command, \n, is displayed as a 
# "newline character" and \t is displayed as a tab.

cat("Hello\nJoe\thow are you\n\ndoing?\n\tI'm fine.")
Hello
Joe how are you

doing?
    I'm fine.
# Similarly in the following cat command the \" escapes the meaning
# of the quote. It no longer implies the end of the quotation. The 
# meaning of \" is simply to include a quotation mark as part of the 
# text.

cat("Lincoln said \"Four score and seven years ago today...\"")
Lincoln said "Four score and seven years ago today..."
# If the following line were not commented it would cause an error
# because the quotation is not actually closed due to the \ before the
# final quotation mark.
#
#cat("This is a backslash: \")   # ERROR

# The following works correctly. Note that \\ is needed to escape
# the normal meaning of the backslash character!
#cat("This is a backslash: \\")   # works (if uncommented) - displays:  This is a backslash: \

#cat("This is a period \.")   # ERROR \. is NOT an R escape sequence

# You must use TWO backslashes in R's regular expressions ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# In R you must use two backslashes in a regex pattern to 
# escape a metacharacter.


grep("\\.", addresses, value=TRUE)  # all addresses that contain a period
[1] "3 Olive St."                      "Two 1st Ave."                    
[3] "Ninety Nine Cone St. apartment 7" "9 Main St. apt. 623"             
grep(".", addresses, value=TRUE)  # All the addresses
 [1] "12345 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "5678 Park Place"                   "Forty Five 2nd Street"            
 [7] "Ninety Nine Cone St. apartment 7"  "9 Main St. apt. 623"              
 [9] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
[11] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[15] "Three Main Street Apt 12343"       "City Hall Lockport, NY"           
stuff = c("", "apple", "", "banana")
stuff
[1] ""       "apple"  ""       "banana"
grep(".", stuff, value=FALSE)
[1] 2 4
grep(".", stuff, value=TRUE)
[1] "apple"  "banana"
# This is an ERROR in R but would be correct in other 
# languages or environments that use regular expressions
#grep("\.", addresses, value=TRUE)  # ERROR - R doesn't recognize \.

# Without the backslash you will find all addresses that contain
# at least a single character (i.e. all the addresses).
# Note: value is a logical argument, so pass TRUE (not the string "TRUE").
grep(".", addresses, value=TRUE)  
 [1] "12345 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "5678 Park Place"                   "Forty Five 2nd Street"            
 [7] "Ninety Nine Cone St. apartment 7"  "9 Main St. apt. 623"              
 [9] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
[11] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[15] "Three Main Street Apt 12343"       "City Hall Lockport, NY"           
grep("\\$", addresses, value=TRUE)   # addresses that contain a dollar sign
[1] "One Micro$oft Way"
grep("$", addresses, value=TRUE)    # all addresses - why?? - they all have an ending
 [1] "12345 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "5678 Park Place"                   "Forty Five 2nd Street"            
 [7] "Ninety Nine Cone St. apartment 7"  "9 Main St. apt. 623"              
 [9] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
[11] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[15] "Three Main Street Apt 12343"       "City Hall Lockport, NY"           

6.13.2 To search for an actual backslash you must use 4 backslashes in the pattern

# To search for an actual backslash you must use 4 backslashes in the pattern ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Searching for an actual backslash in the data can be tricky. 
# Remember, one of our addresses had a backslash in it. Let's find it.
#
# To look for a single backslash in the data you must use FOUR backslashes.
# Just as R character values need to "escape" a backslash with a 2nd backslash,
# so too do regular expressions need to escape a backslash with a 2nd backslash.
# Therefore if you want to write a regular expression in R that searches for
# a backslash, you must write FOUR backslashes in a row. The first two resolve
# to a single backslash. The 3rd and 4th resolve to a single backslash. Then finally
# the two single backslashes are used in the regular expression to match a 
# single actual backslash in the data.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

answer = grep("\\\\", addresses, value=TRUE)  # look for a single backslash in the data

answer
[1] "4\\2 Rechov Yafo"
cat(answer)
4\2 Rechov Yafo
stuff="\\\\"
stuff
[1] "\\\\"
cat(stuff)
\\

6.14 Matching QUOTES

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Matching QUOTES
#
# "Quotation marks" are NOT meta-characters in regular expressions. They
# have no special meaning in a regular expression. However, as with all 
# R code you must make sure to use a single backslash if the quotation mark
# is inside of quotation marks (e.g.  "\"" ) - see the example below.
#
# Note that when using R's regular expression functions, regex
# meta-characters, such as the period or ^ for which you want to remove 
# the special meaning require a DOUBLE backslash (as explained above).
#
# A regex pattern in VS Code (or a similar editor) that includes " or ' 
# would not need any backslashes since these aren't regex meta characters.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
quoteStuff = c("Joe says great stuff.", "Franklin said \"a penny saved ...\"")
quoteStuff
[1] "Joe says great stuff."               "Franklin said \"a penny saved ...\""
cat(quoteStuff, sep="\n")
Joe says great stuff.
Franklin said "a penny saved ..."
grep ("\"", quoteStuff, value=TRUE) #  "Franklin said \"a penny saved ...\""
[1] "Franklin said \"a penny saved ...\""
grep ("\\.", quoteStuff, value=TRUE) #  BOTH values match - each one contains a period
[1] "Joe says great stuff."               "Franklin said \"a penny saved ...\""

6.15 Different “flavors” or “dialects” of regular expressions.

# Different "flavors" or "dialects" of regular expressions. ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Regular expressions have been around for a long time. Different "dialects"
# of regular expressions have popped up over the years. 
#
# Some programming languages and tools use slightly different "rules" 
# for regular expressions. This can be frustrating. However, the basic set
# of regular expression rules remains the same for most programming languages 
# and tools. 
#
# Regular expressions first became popular with the Unix operating system in 
# the 1970s. There were many different versions of Unix being marketed by 
# different companies, each with slight differences. POSIX is a standard that
# defines how things should be done in a standard way across all the different
# versions of Unix. POSIX addresses regular expressions too. 
#
# POSIX introduced "named character classes" as described below. R will 
# recognize these. 
#
# Other additions to the regular expression notation were introduced by 
# the once very popular Perl programming language. You can get these features
# to work in R by specifying perl=TRUE as one of the arguments for grep
# and other functions in R that work with regular expressions. 
# For more details about perl regular expressions, see ?regex.
#
# As we said above, regular expressions are NOT totally standardized across all 
# languages and environments. For example (as of Feb 10, 2022)
# there are subtle differences between the rules for regular expressions
# that are used in R and those that are used in the 
# Visual Studio Code (VS Code) text editor. You can see a summary of the 
# rules used by VS Code here:
#   https://docs.microsoft.com/en-us/visualstudio/ide/using-regular-expressions-in-visual-studio?view=vs-2022
#
# Although there may be some differences between different languages and 
# environments, the vast majority of regular expression meta characters
# work the same across the different environments. 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

6.15.1 [[:digit:]] vs \d - different shorthand notations for character classes

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Two different shorthand notations for character classes
# - POSIX named character classes , e.g. [[:alnum:]]  [[:digit:]]   etc. 
# - backslash shortcuts , e.g. \s \S \d \D  etc.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

“POSIX” named character classes, e.g. [[:digit:]]

# "POSIX" named character classes, e.g. [[:alnum:]] ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# NOTE: These are available in R. 
#
#       They currently are NOT available in VSCode
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Bracket notation in regular expressions (e.g. [aeiou] or [0-9]) are known as 
# character classes. 
#
# You can use several "named character classes" as shorthand for some common
# character classes. These are shown below. Notice the [[double brackets]].
# We'll explain more about the [[double brackets]] below.
#
# [[:upper:]]   same as [A-Z]
# [[:lower:]]   same as [a-z]
# [[:space:]]   same as [ \r\n\t]
# [[:punct:]]   all "special" characters, eg. !@#$% etc...
# [[:digit:]]   same as [0-9]
# [[:alpha:]]   same as [a-zA-Z]
# [[:alnum:]]   same as [a-zA-Z0-9]
#
# The [[double brackets]] shown above are necessary since these
# "named character classes" must actually be placed inside a pair of 
# [square brackets]. For example, you can also use the named
# character classes inside a larger character class.
#
# For example the following will match any single character
# from the following list:    -,+,*,/,(,),0,1,2,3,4,5,6,7,8,9
#
#   [-+*/()[:digit:]]    is the same as [-+*/()0-9]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

6.15.2 — practice —

# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Search for addresses that contain at least one digit. Use a POSIX
# named character class.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWER 
grep("[[:digit:]]", addresses, value=TRUE)   # uses POSIX named character classes
 [1] "12345 Sesame Street"              "3 Olive St."                     
 [3] "Two 1st Ave."                     "5678 Park Place"                 
 [5] "Forty Five 2nd Street"            "Ninety Nine Cone St. apartment 7"
 [7] "9 Main St. apt. 623"              "4\\2 Rechov Yafo"                
 [9] "One Main Street Apt 12b"          "Two Main Street Apt 123c"        
[11] "Three Main Street Apt 12343"     
str_view(addresses, "[[:digit:]]")
 [1] │ <1><2><3><4><5> Sesame Street
 [3] │ <3> Olive St.
 [4] │ Two <1>st Ave.
 [5] │ <5><6><7><8> Park Place
 [6] │ Forty Five <2>nd Street
 [7] │ Ninety Nine Cone St. apartment <7>
 [8] │ <9> Main St. apt. <6><2><3>
[10] │ <4>\<2> Rechov Yafo
[13] │ One Main Street Apt <1><2>b
[14] │ Two Main Street Apt <1><2><3>c
[15] │ Three Main Street Apt <1><2><3><4><3>
# NOTE - the pattern "[:digit:]" with one set of [brackets] does NOT work.
#
# Since there is only one set of [brackets], the pattern matches any one of
# the characters that are between the [brackets], i.e. match
# one of the characters ":", "d", "i", "g", "i", "t" or ":"
# This is equivalent to "[:digt]"  (I removed the 2nd ":" and the 2nd "i" as
# they are repetitive.)

# THIS DOESN'T WORK! - see note above
grep("[:digit:]", addresses, value=TRUE)  # looks for one of the following :,d,i,g,i,t,:
 [1] "12345 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "Forty Five 2nd Street"             "Ninety Nine Cone St. apartment 7" 
 [7] "9 Main St. apt. 623"               "Five Google Drive"                
 [9] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[11] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[13] "Three Main Street Apt 12343"       "City Hall Lockport, NY"           
grep("^[N[:digit:]]", addresses, value=TRUE)   # same as ^[N0-9]
[1] "12345 Sesame Street"               "3 Olive St."                      
[3] "5678 Park Place"                   "Ninety Nine Cone St. apartment 7" 
[5] "9 Main St. apt. 623"               "4\\2 Rechov Yafo"                 
[7] "Nineteen Watchamacallit Boulevard"
# QUESTION:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Write a command that replaces any sequence of digits or mathematical 
# operators with the text "<<MATH-EXPRESSION>>"
#
# You can use the following "mathStuff" variable to test your answer.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

mathStuff <- c("What is 3+2 ? Do you know the answer?",
               "99.5 desgrees in Farenheit is 99.5*(5/9)-32 degrees in Celcius")

mathStuff
[1] "What is 3+2 ? Do you know the answer?"                         
[2] "99.5 desgrees in Farenheit is 99.5*(5/9)-32 degrees in Celcius"
# ANSWER

gsub("[-+*/().[:digit:]]+", "<<MATH-EXPRESSION>>", mathStuff)
[1] "What is <<MATH-EXPRESSION>> ? Do you know the answer?"                              
[2] "<<MATH-EXPRESSION>> desgrees in Farenheit is <<MATH-EXPRESSION>> degrees in Celcius"
# QUESTION:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Write a grep command that matches punctuation and letters, but not numbers.
# You can use the following data to test your answer. Use POSIX named
# character classes.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

stuff = c("1234",  # This should NOT match since it doesn't contain letters or punctuation
          "12.34", # This SHOULD match since it contains punctuation.
          ".",     # This SHOULD match since it contains punctuation.
          "hi")    # This SHOULD match since it contains at least one letter
# ANSWER

# The following matches every value that contains at least one punctuation
# character or letter. Values made up of ONLY digits (e.g. "1234") do not match.
grep("[[:punct:][:alpha:]]", stuff, value=TRUE)   #   "12.34"   "."    "hi"
[1] "12.34" "."     "hi"   
# QUESTION 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Search for fruits that contain spaces using the POSIX
# named character classes for spaces
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ANSWER
grep("[[:space:]]", fruit, value=TRUE)
[1] "N. American apple" "S. Korean Fig"     "star fruit"       
[4] "prickly pear"      "Beurre Hardy pear" "black cherry"     
# QUESTION 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Search fruit for those that contain punctuation (e.g. periods, commas, etc)
# using the POSIX named character classes
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ANSWER

grep ("[[:punct:]]", fruit, value=TRUE)
[1] "N. American apple" "S. Korean Fig"    
# QUESTION 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Show fruit that contain either an x,y,z or some punctuation.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ANSWER

# NOTE that there are TWO sets of brackets. The POSIX named character
# class, [:punct:], is itself inside a set of [brackets].

grep("[xyz[:punct:]]", fruit, value=TRUE)
 [1] "N. American apple" "S. Korean Fig"     "prickly pear"     
 [4] "Beurre Hardy pear" "cherry"            "black cherry"     
 [7] "blueberry"         "strawberry"        "honeydew"         
[10] "yumberry"         
# QUESTION 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Search all ADDRESSES for those that contain punctuation (e.g. periods,
# commas, etc) or actual digits (e.g. 0123456789) using POSIX named
# character classes
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ANSWER

grep("[[:punct:][:digit:]]", addresses, value=TRUE)
 [1] "12345 Sesame Street"              "One Micro$oft Way"               
 [3] "3 Olive St."                      "Two 1st Ave."                    
 [5] "5678 Park Place"                  "Forty Five 2nd Street"           
 [7] "Ninety Nine Cone St. apartment 7" "9 Main St. apt. 623"             
 [9] "4\\2 Rechov Yafo"                 "One Main Street Apt 12b"         
[11] "Two Main Street Apt 123c"         "Three Main Street Apt 12343"     
[13] "City Hall Lockport, NY"          
# This also works
grep("[[:punct:]0-9]", addresses, value=TRUE)
 [1] "12345 Sesame Street"              "One Micro$oft Way"               
 [3] "3 Olive St."                      "Two 1st Ave."                    
 [5] "5678 Park Place"                  "Forty Five 2nd Street"           
 [7] "Ninety Nine Cone St. apartment 7" "9 Main St. apt. 623"             
 [9] "4\\2 Rechov Yafo"                 "One Main Street Apt 12b"         
[11] "Two Main Street Apt 123c"         "Three Main Street Apt 12343"     
[13] "City Hall Lockport, NY"          
# QUESTION 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Search all ADDRESSES for those that contain some punctuation that 
# comes immediately after the letter t. Use POSIX named classes.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ANSWER
grep("[tT][[:punct:]]", addresses, value=TRUE)
[1] "3 Olive St."                      "Ninety Nine Cone St. apartment 7"
[3] "9 Main St. apt. 623"              "City Hall Lockport, NY"          
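# ANOTHER ATTEMPT - NOT equivalent to the answer above. [^[:alnum:]] matches
# ANY character that is not a letter or digit, including a space, so this
# also returns addresses in which a "t" is simply followed by a space
# (e.g. "Two 1st Ave."). To exclude spaces too, use "[tT][^[:alnum:][:space:]]".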
grep("[tT][^[:alnum:]]", addresses, value=TRUE)
 [1] "One Micro$oft Way"                 "3 Olive St."                      
 [3] "Two 1st Ave."                      "Ninety Nine Cone St. apartment 7" 
 [5] "9 Main St. apt. 623"               "Fifteen Watchamacallit Boulevard" 
 [7] "Nineteen Watchamacallit Boulevard" "One Main Street Apt 12b"          
 [9] "Two Main Street Apt 123c"          "Three Main Street Apt 12343"      
[11] "City Hall Lockport, NY"           
# QUESTION 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# answer the previous question without using POSIX named classes.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

grep("[tT][,.!?]", addresses, value=TRUE)
[1] "3 Olive St."                      "Ninety Nine Cone St. apartment 7"
[3] "9 Main St. apt. 623"              "City Hall Lockport, NY"          

backslash shortcuts for character classes, e.g. \s \S \d \D etc.

# backslash shortcuts for character classes, e.g. \s \S \d \D etc. ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The following are also shorthand notations that you can use for some
# character classes.
#
# Note that in R you must use a double backslash, e.g. \\s instead of \s
#
# \s   is the same as [ \n\t\r]  also same as [[:space:]]
#      It matches anything which is considered whitespace.
#      This could be a space, tab, line break etc.
#
# \S   is the same as [^ \n\t\r]
#      It matches the opposite of \s, that is anything which is not considered
#      whitespace.
#
# \d   is the same as [0-9]  (ie. it matches a single digit)  same as [[:digit:]]
#
# \D   is the same as [^0-9] (i.e. it matches a single NON-digit)
#
# \w - matches anything which is considered a word character. That is
#      [A-Za-z0-9_]. Note the inclusion of the underscore character '_'. This is
#      because in programming and other areas we regularly use the underscore as part
#      of, say, a variable or function name.
#
# \W - matches  [^A-Za-z0-9_] the opposite of \w, that is anything which is not considered a
#      word character.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

6.16 WORD BOUNDARIES: \b

# WORD BOUNDARIES:  \b ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# \b matches a "word boundary"   (remember in R use \\b)
# 
# You can use this before a pattern to mean that the pattern must
# come at the beginning of a word - see examples below.
#
# You can use this after a pattern to mean that the pattern must
# come at the end of a word - see examples below.
#
# Note that a "word boundary" is not a particular character such as a 
# space or comma, but rather is a position in the text. 
#
# NOTE - a "word" may include letters, digits or underscores
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

heWords = c("he", "here", "there", "the", "who was she?", "headache")
heWords
[1] "he"           "here"         "there"        "the"          "who was she?"
[6] "headache"    
# Example: word starts with "he"
grep ("\\bhe", heWords, value=TRUE)  # "he" "here" "headache" (NOT "there","the","who was she?")
[1] "he"       "here"     "headache"
# Example: word ends with "he"
grep ("he\\b", heWords, value=TRUE)  # "he" "the" "who was she?" "headache" (NOT "here","there")
[1] "he"           "the"          "who was she?" "headache"    
# Example: match only the word "he"
grep ("\\bhe\\b", heWords, value=TRUE)  # "he"
[1] "he"
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# You can use \b with more complicated patterns too.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Examples: 
grep ("\\b[Hh]", fruit, value=TRUE)  # fruits that have a word that starts with H or h
[1] "Beurre Hardy pear" "honeydew"         
grep ("[aeiouAEIOU]\\b", fruit, value=TRUE)  # fruits that have a word that ends with a vowel
[1] "apple"             "N. American apple" "Beurre Hardy pear"
[4] "banana"           

:::

6.17 “pattern1|pattern2” matches pattern1 OR pattern2

# "pattern1|pattern2"  matches pattern1 OR pattern2 ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Combining patterns
#
#    PATTERN1|PATTERN2  matches if either PATTERN1 or PATTERN2 is found
#
#    (PATTERN)          you may surround patterns with (parentheses) if necessary
#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

grep("black|blue|green", fruit, value=TRUE) # contains black,blue or green
[1] "black cherry" "blueberry"   
grep("^(1|One)", addresses, value=TRUE, ignore.case=TRUE) # 1 or One at beginning 
[1] "12345 Sesame Street"     "One Micro$oft Way"      
[3] "One Main Street Apt 12b"
grep("(^1|^One)", addresses, value=TRUE) # same results here (but this one is case sensitive)
[1] "12345 Sesame Street"     "One Micro$oft Way"      
[3] "One Main Street Apt 12b"
grep("^1|^One", addresses, value=TRUE) # same thing 
[1] "12345 Sesame Street"     "One Micro$oft Way"      
[3] "One Main Street Apt 12b"
grep("[0-9]", addresses, value=TRUE)
 [1] "12345 Sesame Street"              "3 Olive St."                     
 [3] "Two 1st Ave."                     "5678 Park Place"                 
 [5] "Forty Five 2nd Street"            "Ninety Nine Cone St. apartment 7"
 [7] "9 Main St. apt. 623"              "4\\2 Rechov Yafo"                
 [9] "One Main Street Apt 12b"          "Two Main Street Apt 123c"        
[11] "Three Main Street Apt 12343"     
grep("0|1|2|3|4|5|6|7|8|9", addresses, value=TRUE)  # Same as [0-9]
 [1] "12345 Sesame Street"              "3 Olive St."                     
 [3] "Two 1st Ave."                     "5678 Park Place"                 
 [5] "Forty Five 2nd Street"            "Ninety Nine Cone St. apartment 7"
 [7] "9 Main St. apt. 623"              "4\\2 Rechov Yafo"                
 [9] "One Main Street Apt 12b"          "Two Main Street Apt 123c"        
[11] "Three Main Street Apt 12343"     

6.18 breaking up long patterns with paste0

# breaking up long patterns with paste0 ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Do NOT include extra whitespace in patterns!!!
# 
# For long patterns you can use paste0 to break up the pattern
# so it is more readable in the code.
#
# See examples below.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

6.18.1 example

# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Show those addresses that contain one of the numbers 1-9 spelled out in words,
# e.g. "one", "two", etc
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWER - note that building the pattern from several pieces with paste0
# (which joins its arguments with no separator) makes the pattern easy to
# understand. You can also comment on individual portions of the pattern. 

pattern = paste0("\\bone\\b|",     # match the word "one"
                 "\\btwo\\b|",     # match the word "two"
                 "\\bthree\\b|",   # etc.
                 "\\bfour\\b|",
                 "\\bfive\\b|",
                 "\\bsix\\b|",
                 "\\bseven\\b|",
                 "\\beight\\b|",
                 "\\bnine\\b")

grep(pattern, addresses, value=TRUE, ignore.case = TRUE)
[1] "One Micro$oft Way"                "Two 1st Ave."                    
[3] "Forty Five 2nd Street"            "Ninety Nine Cone St. apartment 7"
[5] "Five Google Drive"                "One Main Street Apt 12b"         
[7] "Two Main Street Apt 123c"         "Three Main Street Apt 12343"     
# Note that the following also works but is
#   - MUCH harder to read 
#   - MUCH harder to check for errors and
#   - cannot be commented on for different parts of the pattern

grep(
  "\\bone\\b|\\btwo\\b|\\bthree\\b|\\bfour\\b|\\bfive\\b|\\bsix\\b|\\bseven\\b|\\beight\\b|\\bnine\\b",
  addresses, value=TRUE, ignore.case = TRUE)
[1] "One Micro$oft Way"                "Two 1st Ave."                    
[3] "Forty Five 2nd Street"            "Ninety Nine Cone St. apartment 7"
[5] "Five Google Drive"                "One Main Street Apt 12b"         
[7] "Two Main Street Apt 123c"         "Three Main Street Apt 12343"     

6.18.2 example

# A more complex example ####

# QUESTION 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Display those addresses that contain a number that is exactly one
# digit long. For example: 
#
#   addresses =
#    c("3 Olive St.",              # should match (because of 3)
#      "Forty Five 2nd Street",    # should match (because of 2nd)
#      "Ninety Nine Cone St. apartment 7",
#                                  # should match (because of 7)
#      "7",                        # should match
#
#      "12345 Sesame Street",      # should NOT match (12345 is five digits)
#      "One main Street Apt 12b",  # should NOT match (12 is two digits)
#      "Two Main St. Apt 99",      # should NOT match (99 is two digits) 
#      "45")                       # should NOT match
#
#   > YOUR COMMAND GOES HERE
#   [1] "3 Olive St."
#   [2] "Forty Five 2nd Street"
#   [3] "Ninety Nine Cone St. apartment 7"
#   [4] "7"
#
# NOTE: the pattern "[0-9]" will NOT work as it will match every one of 
# values above
#
# NOTE: the pattern "\\b[0-9]\\b" is a good try but will not match
# "Forty Five 2nd Street" as the 2 in "2nd" is NOT followed by a word boundary.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWER

# Note - there are 4 possible ways for a single digit to appear in the text:
#
# (a) The digit can appear at the very beginning of the text and be followed by
#     a non-digit, e.g. "3 Olive street". 
#
#     The pattern: "^[0-9][^0-9]"     
#     matches "3 Olive street"
#     but doesn't match "Forty Five 2nd Street" (since the 2 is not at the
#                                                beginning of the text).
#
# Similarly, each of the following patterns will match a single digit for
# some texts but not for others. 
#
# (b) [^0-9][0-9][^0-9] : NONdigit digit NONdigit anywhere in the text
# (c) [^0-9][0-9]$      : last two characters are a NONdigit followed by a single digit
# (d) ^[0-9]$           : whole thing is JUST one digit
#
# For actual addresses you probably don't have to worry about the last
# case, but for other types of data you might.
#
# You can write a pattern that deals with all of these cases by
# separating the different "sub-patterns" from each other with "|" symbols.
# For example, the following answers the question, but the pattern is VERY
# hard to read. (see below for a better way to write this code.)

grep("^[0-9][^0-9]|[^0-9][0-9][^0-9]|[^0-9][0-9]$|^[0-9]$", addresses, value=TRUE)
[1] "3 Olive St."                      "Two 1st Ave."                    
[3] "Forty Five 2nd Street"            "Ninety Nine Cone St. apartment 7"
[5] "9 Main St. apt. 623"              "4\\2 Rechov Yafo"                
# we can use paste0 to make this easier to read

# Build the pattern from one piece per case. paste0 joins the pieces with
# no separator, so the "|" that separates the alternatives is written at
# the end of each piece (except the last one).
pattern <- paste0(
  "^[0-9][^0-9]|",        # starts with a digit followed by a NONdigit
  "[^0-9][0-9][^0-9]|",   # NONdigit digit NONdigit anywhere in the text
  "[^0-9][0-9]$|",        # ends with a NONdigit followed by a single digit
  "^[0-9]$"               # whole thing is JUST one digit
)

grep(pattern, addresses, value=TRUE)
[1] "3 Olive St."                      "Two 1st Ave."                    
[3] "Forty Five 2nd Street"            "Ninety Nine Cone St. apartment 7"
[5] "9 Main St. apt. 623"              "4\\2 Rechov Yafo"                

6.18.3 example

# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Write a command that displays all addresses that contain the
# number "one" or 1.
#
# Notice that the following will NOT work. This gets "Cone" and 12345 too:
#
#   grep("one|1", addresses, value=TRUE, ignore.case=TRUE) # NO - matches Cone and 12345
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWER

# The following is the way to do it. Regular expressions require 
# a very thoughtful approach!
#
# The word "one" or the number "1" (not including the number 123)
pattern = paste0("^one[^a-z]|",       # one at beginning
                 "[^a-z]one[^a-z]|",  # one in middle
                 "[^a-z]one$|",       # one at end
                 "^one$|",            # ONLY the word "one"
                 "^1[^0-9]|",         # 1 at beginning
                 "[^0-9]1[^0-9]|",    # 1 in middle
                 "[^0-9]1$|",         # 1 at end
                 "^1$")               # ONLY the number 1


pattern
[1] "^one[^a-z]|[^a-z]one[^a-z]|[^a-z]one$|^one$|^1[^0-9]|[^0-9]1[^0-9]|[^0-9]1$|^1$"
grep(pattern, addresses, value=TRUE, ignore.case=TRUE)
[1] "One Micro$oft Way"       "Two 1st Ave."           
[3] "One Main Street Apt 12b"
# Same thing but MUCH harder to read!!!
# You should break up long patterns with paste0 and comment them as shown above.

grep(
  "^one[^a-z]|[^a-z]one[^a-z]|[^a-z]one$|^one$|^1[^0-9]|[^0-9]1[^0-9]|[^0-9]1$|^1$",
  addresses, value=TRUE, ignore.case=TRUE)
[1] "One Micro$oft Way"       "Two 1st Ave."           
[3] "One Main Street Apt 12b"

6.18.4 example

# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# df is a dataframe. Write a command that shows all rows from df
# for which the 2nd character in the first column is "x". 
# 
# Hints: 
#   a. Access a dataframe as you normally would but use grep or grepl to
#      return either the row numbers or TRUE/FALSE
#      values that identify the rows to be displayed.
# 
#   b. Remember that you are NOT told what the column names are. Therefore you
#      must use a number to stipulate the first column and NOT a column name.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Use this example data
df = data.frame( partNum = c("ax4321", "az12", "bx1234", "bw987"),
                 partName = c("widget","thingie","gadget","gizmo"),
                 price =    c(0.50, 0.60, 1.70, 0.80),
                 stringsAsFactors = FALSE)
df
  partNum partName price
1  ax4321   widget   0.5
2    az12  thingie   0.6
3  bx1234   gadget   1.7
4   bw987    gizmo   0.8
# Show the rows that contain "x" as the 2nd character in the partNum


# One answer - using grep (returns the matching row numbers).
# The first column is selected by position, df[[1]], because the question
# does not tell us what the column names are.
df[ grep ( "^.x", df[[1]] , ignore.case=TRUE ) ,   ]
  partNum partName price
1  ax4321   widget   0.5
3  bx1234   gadget   1.7
# Another answer - using grepl (returns TRUE/FALSE for each row).
# The first column is selected by position, df[[1]], because the question
# does not tell us what the column names are.
df[ grepl ( "^.x", df[[1]] , ignore.case=TRUE ) ,   ]
  partNum partName price
1  ax4321   widget   0.5
3  bx1234   gadget   1.7

6.18.5 example

# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Using the same data as above, only show those rows that contain an "x"
# in the 2nd character of the partNum whose price is also less than 1.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
df
  partNum partName price
1  ax4321   widget   0.5
2    az12  thingie   0.6
3  bx1234   gadget   1.7
4   bw987    gizmo   0.8
df[ grepl ( "^.x", df$partNum , ignore.case=TRUE ) & df$price < 1 ,   ]
  partNum partName price
1  ax4321   widget   0.5

6.19 Quantifiers: {n,m} {n} {n,} + * ?

#@ Quantifiers: {n,m}  {n}  {n,}  +  *  ?  ####
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@
#@ Quantifiers are symbols in the pattern that identify how many repetitions
#@ to look for of a particular sub-pattern. The quantifiers include
#@
#@   SOME_SUB_PATTERN{n,m}    (where n and m are numbers)
#@   SOME_SUB_PATTERN{n}      (where n is a number)
#@   SOME_SUB_PATTERN{n,}      (where n is a number)
#@   SOME_SUB_PATTERN+
#@   SOME_SUB_PATTERN*
#@   SOME_SUB_PATTERN?
#@
#@ See below for an explanation of each type of quantifier.
#@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

6.19.1 Quantifiers with {curly braces} eg. {3} {2,5} {3,}

# Quantifiers with {curly braces}   eg. {3}   {2,5}   {3,}   ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 
#    SOME_PATTERN{3}       three matches in a row
#    SOME_PATTERN{3,6}     between three and six matches in a row
#    SOME_PATTERN{3,}      at least 3 matches in a row
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# EXAMPLE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# "[0-9]{4}" matches 4 digits in a row, same as "[0-9][0-9][0-9][0-9]"
#
# Note that this will also return those entries that have more than 4 digits
# in a row since these entries ALSO have 4 digits in a row (plus some extra
# digits)

grep("[0-9]{4}", addresses, value=TRUE)  # 4 digits , same as "[0-9][0-9][0-9][0-9]"
[1] "12345 Sesame Street"         "5678 Park Place"            
[3] "Three Main Street Apt 12343"
# Note that if we use gsub, only the first 4 digits will be substituted.
gsub("[0-9]{4}", "xxxx", addresses)  # 4 digits , same as "[0-9][0-9][0-9][0-9]"
 [1] "xxxx5 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "xxxx Park Place"                   "Forty Five 2nd Street"            
 [7] "Ninety Nine Cone St. apartment 7"  "9 Main St. apt. 623"              
 [9] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
[11] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[15] "Three Main Street Apt xxxx3"       "City Hall Lockport, NY"           
grep("[0-9]{3}", addresses, value=TRUE)  # 3 digits 
[1] "12345 Sesame Street"         "5678 Park Place"            
[3] "9 Main St. apt. 623"         "Two Main Street Apt 123c"   
[5] "Three Main Street Apt 12343"
# Note that if we use gsub, each run of 3 digits in a row is substituted; any
# leftover digits (e.g. the "45" in "12345") are left as they are.
# (The replacement text "xxxx" has 4 characters even though 3 digits are matched.)
gsub("[0-9]{3}", "xxxx", addresses)  # 3 digits , same as "[0-9][0-9][0-9]"
 [1] "xxxx45 Sesame Street"              "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "xxxx8 Park Place"                  "Forty Five 2nd Street"            
 [7] "Ninety Nine Cone St. apartment 7"  "9 Main St. apt. xxxx"             
 [9] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
[11] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b"           "Two Main Street Apt xxxxc"        
[15] "Three Main Street Apt xxxx43"      "City Hall Lockport, NY"           
grep("[0-9]{3}", addresses, value=TRUE)  # 3 digits 
[1] "12345 Sesame Street"         "5678 Park Place"            
[3] "9 Main St. apt. 623"         "Two Main Street Apt 123c"   
[5] "Three Main Street Apt 12343"
grep("\\b[0-9]{3}\\b", addresses, value=TRUE)  # 3 digits 
[1] "9 Main St. apt. 623"
# EXAMPLE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# match a word that is exactly 4 letters long

# Long-hand version: a run of exactly 4 letters that is bordered on each side
# by either a non-letter or by the start/end of the text. The four alternatives
# cover, in order:
#   - a word at the very start of the text
#   - a word in the middle of the text
#   - a word at the very end of the text
#   - a word that is the entire text
#
# (A shorter pattern that uses \b is shown further below.)
pattern = 
  paste0 ("^[a-zA-Z]{4}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{4}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{4}$",
          "|^[a-zA-Z]{4}$")

# use the same pattern for both addresses and fruit

grep(pattern, addresses, value=TRUE)
 [1] "5678 Park Place"                  "Forty Five 2nd Street"           
 [3] "Ninety Nine Cone St. apartment 7" "9 Main St. apt. 623"             
 [5] "Five Google Drive"                "4\\2 Rechov Yafo"                
 [7] "One Main Street Apt 12b"          "Two Main Street Apt 123c"        
 [9] "Three Main Street Apt 12343"      "City Hall Lockport, NY"          
grep(pattern, fruit, value=TRUE)
[1] "star fruit"        "pear"              "prickly pear"     
[4] "Beurre Hardy pear" "plum"             
# Another way - using \b
pattern = "\\b[a-zA-Z]{4}\\b"
grep(pattern, addresses, value=TRUE)
 [1] "5678 Park Place"                  "Forty Five 2nd Street"           
 [3] "Ninety Nine Cone St. apartment 7" "9 Main St. apt. 623"             
 [5] "Five Google Drive"                "4\\2 Rechov Yafo"                
 [7] "One Main Street Apt 12b"          "Two Main Street Apt 123c"        
 [9] "Three Main Street Apt 12343"      "City Hall Lockport, NY"          
# Use a text editor (e.g. the one in RStudio) to see how this works ...
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Paste the following in the text editor to see how this pattern works.
#
#  \b[a-zA-Z]{4}\b
#
# This will match any 4 character word. It will NOT match 3 or 5 character words.
# Note that in VS Code you should use only one backslash (i.e. \b ) but in R
# you would use two backslashes (i.e. \\b ) as explained above.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# match a word that is at least 4 letters long
# Use {4,} instead of {4}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Paste this into VSCode regex search.
#   ^[a-zA-Z]{4,}[^a-zA-Z]|[^a-zA-Z][a-zA-Z]{4,}[^a-zA-Z]|[^a-zA-Z][a-zA-Z]{4,}$|^[a-zA-Z]{4,}$
# match a word that is 3, 4 or 5 letters long
# Use {3,5} instead of {4}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Paste this into VSCode regex search.
#   ^[a-zA-Z]{3,5}[^a-zA-Z]|[^a-zA-Z][a-zA-Z]{3,5}[^a-zA-Z]|[^a-zA-Z][a-zA-Z]{3,5}$|^[a-zA-Z]{3,5}$
# ANOTHER EXAMPLE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# match a word that is exactly 7 letters long
pattern = 
  paste0 ("^[a-zA-Z]{7}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{7}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{7}$",
          "|^[a-zA-Z]{7}$")

# use the same pattern for both addresses and fruit
grep(pattern, addresses, value=TRUE)
[1] "Fifteen Watchamacallit Boulevard"
grep(pattern, fruit, value=TRUE)
[1] "prickly pear" "kumquat"     
# match any word that is between 4 and 7 letters long
pattern = 
  paste0 ("^[a-zA-Z]{4,7}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{4,7}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{4,7}$",
          "|^[a-zA-Z]{4,7}$")

grep(pattern, addresses, value=TRUE)
 [1] "12345 Sesame Street"              "One Micro$oft Way"               
 [3] "3 Olive St."                      "5678 Park Place"                 
 [5] "Forty Five 2nd Street"            "Ninety Nine Cone St. apartment 7"
 [7] "9 Main St. apt. 623"              "Five Google Drive"               
 [9] "4\\2 Rechov Yafo"                 "Fifteen Watchamacallit Boulevard"
[11] "One Main Street Apt 12b"          "Two Main Street Apt 123c"        
[13] "Three Main Street Apt 12343"      "City Hall Lockport, NY"          
grep(pattern, fruit, value=TRUE)      # ... "apple" ... "peach" ... (5 letter long words too)
 [1] "apple"             "N. American apple" "S. Korean Fig"    
 [4] "star fruit"        "pear"              "prickly pear"     
 [7] "Beurre Hardy pear" "cherry"            "black cherry"     
[10] "peach"             "plum"              "kumquat"          
[13] "banana"           
# match a word that is at least 7 letters long
pattern = 
  paste0 ("^[a-zA-Z]{7,}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{7,}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{7,}$",
          "|^[a-zA-Z]{7,}$")

grep(pattern, addresses, value=TRUE)
[1] "Ninety Nine Cone St. apartment 7"  "Fifteen Watchamacallit Boulevard" 
[3] "Nineteen Watchamacallit Boulevard" "City Hall Lockport, NY"           
grep(pattern, fruit, value=TRUE)
[1] "N. American apple" "prickly pear"      "kumquat"          
[4] "blueberry"         "strawberry"        "honeydew"         
[7] "strawberries"      "yumberry"         
# Exactly 3 digits (see example in last section of exactly one digit)
pattern = paste0(
  "^[0-9]{3}[^0-9]",
  "|[^0-9][0-9]{3}[^0-9]",
  "|[^0-9][0-9]{3}$",
  "|^[0-9]{3}$"
)
grep(pattern, addresses, value=TRUE)
[1] "9 Main St. apt. 623"      "Two Main Street Apt 123c"
grep("[^aeiou]{5}", fruit, value=TRUE)  # at least 5 non vowels in a row
[1] "N. American apple" "prickly pear"      "Beurre Hardy pear"
[4] "black cherry"     
grep("[^aeiou]{6}", fruit, value=TRUE)  # at least 6 non vowels in a row
[1] "prickly pear"
grep("[^aeiou]{7}", fruit, value=TRUE)  # at least 7 non vowels in a row
character(0)
grep("^.[aeiou]{2}", fruit, value=TRUE)  # vowels in the 2nd and 3rd positions
[1] "pear"              "Beurre Hardy pear" "peach"            
grep("[aeiou]{2}.$", fruit, value=TRUE)  # 2nd & 3rd to last characters are vowels
[1] "star fruit"        "pear"              "prickly pear"     
[4] "Beurre Hardy pear" "kumquat"           "strawberries"     
# QUESTION
# Search for fruit that are 4 or 6 letters long.
#

grep ("^[a-zA-Z]{4}$|^[a-zA-Z]{6}$", fruit, value=TRUE)
[1] "pear"   "cherry" "plum"   "banana"
grep ("(^[a-zA-Z]{4}$)|(^[a-zA-Z]{6}$)", fruit, value=TRUE)
[1] "pear"   "cherry" "plum"   "banana"
grep ("^(([a-zA-Z]{4})|([a-zA-Z]{6}))$", fruit, value=TRUE)
[1] "pear"   "cherry" "plum"   "banana"

6.20 Quantifiers with * + and ?

# Quantifiers with    *   +  and  ?  ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#      PATTERN* is the same as PATTERN{0,}  i.e. zero or more repetitions
# 
#      PATTERN+ is the same as PATTERN{1,}  i.e. one or more repetitions
#
#      PATTERN? is the same as PATTERN{0,1}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


grep("^[^aeiouAEIOU].*[^aeiouAEIOU]$", fruit, value=TRUE) # start and end with non-vowel 
 [1] "S. Korean Fig"     "fig"               "star fruit"       
 [4] "pear"              "prickly pear"      "Beurre Hardy pear"
 [7] "cherry"            "black cherry"      "peach"            
[10] "plum"              "kumquat"           "blueberry"        
[13] "strawberry"        "honeydew"          "strawberries"     
[16] "yumberry"         
# match at least two spaces in the text (including just two spaces)
pattern = ".* .* .*"       

pattern = ".*e.*e.*"       
grep(pattern, fruit, value=TRUE) 
[1] "N. American apple" "Beurre Hardy pear" "blueberry"        
[4] "honeydew"          "strawberries"     
spacesStuff = c("nospaces",
          "this has three spaces",
          "just two spaces", 
          "one space", 
          "two  spaces", 
          "three   spaces", 
          "",
          " ", 
          "  ", 
          "   ")
spacesStuff
 [1] "nospaces"              "this has three spaces" "just two spaces"      
 [4] "one space"             "two  spaces"           "three   spaces"       
 [7] ""                      " "                     "  "                   
[10] "   "                  
# The pattern was last set to ".*e.*e.*" (at least two e's), so set it back to
# the "at least two spaces" pattern before searching spacesStuff.
# Entries with no spaces or with just one space are NOT returned.
pattern = ".* .* .*"
grep(pattern, spacesStuff, value=TRUE) 
[1] "this has three spaces" "just two spaces"       "two  spaces"          
[4] "three   spaces"        "  "                    "   "                  
# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Find solutions for the questions at the website.
# The website is free to use.
#
#         http://play.inginf.units.it/
#
# 1. When you get to this page, scroll all the way to the bottom.
#
# 2. You will be prompted for your "regex skill level" and other basic signup
#    info. It, seems that you must fill in this info in order for the example
#    questions to work correctly. However, the exact answers to these questions
#    don't seem to matter (it seems that you get the same questions no 
#    matter what "skill level" you choose)
# 
# 3. Press the "Next" button.
#
# 4. You will then be prompted with a list of regex metacharacters 
# Some of these are a little challenging: http://play.inginf.units.it/#/
#
# You can find sample answers here: https://avicoder.me/2019/01/21/regex-fun/
# There could definitely be other valid answers.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWERS THAT WE DID TOGETHER IN CLASS

# level 1
#   \d+


# level 2
#   [a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}
#   ([a-zA-Z0-9]{2}:){5}[a-zA-Z0-9]{2}
#   ([0-9a-zA-Z][0-9a-zA-Z]:){5}[0-9a-zA-Z][0-9a-zA-Z]


# level 3
#
#   ftp://ftp[^.]*\.[a-zA-Z]+\.[a-zA-Z]+(\.[a-zA-Z]+)?/pub/FreeBSD/
#   ftp://ftp[^.]*(\.[a-zA-Z]+)+/pub/FreeBSD/
#   ftp://ftp\d*(\.[a-zA-Z]+)+/pub/FreeBSD/


# level 4
#   \$[^$]+\$

6.21 gsub works with patterns

# gsub ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# gsub works with patterns
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

gsub("[aeiou]", "x", fruit)   # replace all vowels with x's
 [1] "xpplx"             "N. Amxrxcxn xpplx" "S. Kxrxxn Fxg"    
 [4] "fxg"               "stxr frxxt"        "pxxr"             
 [7] "prxckly pxxr"      "Bxxrrx Hxrdy pxxr" "chxrry"           
[10] "blxck chxrry"      "pxxch"             "plxm"             
[13] "kxmqxxt"           "bxnxnx"            "blxxbxrry"        
[16] "strxwbxrry"        "hxnxydxw"          "strxwbxrrxxs"     
[19] "yxmbxrry"         
gsub("[^aeiou]", "x", fruit)   # replace all non-vowels with x's
 [1] "axxxe"             "xxxxxexixaxxaxxxe" "xxxxoxeaxxxix"    
 [4] "xix"               "xxaxxxxuix"        "xeax"             
 [7] "xxixxxxxxeax"      "xeuxxexxaxxxxxeax" "xxexxx"           
[10] "xxaxxxxxexxx"      "xeaxx"             "xxux"             
[13] "xuxxuax"           "xaxaxa"            "xxuexexxx"        
[16] "xxxaxxexxx"        "xoxexxex"          "xxxaxxexxiex"     
[19] "xuxxexxx"         
gsub("[^aeiou]+", "x", fruit)   # replace one or more non-vowels with a single x
 [1] "axe"         "xexixaxaxe"  "xoxeaxix"    "xix"         "xaxuix"     
 [6] "xeax"        "xixeax"      "xeuxexaxeax" "xex"         "xaxex"      
[11] "xeax"        "xux"         "xuxuax"      "xaxaxa"      "xuexex"     
[16] "xaxex"       "xoxexex"     "xaxexiex"    "xuxex"      
gsub("[^aeiou]*", "x", fruit)
 [1] "xaxex"         "xexixaxaxex"   "xoxexaxix"     "xix"          
 [5] "xaxuxix"       "xexax"         "xixexax"       "xexuxexaxexax"
 [9] "xex"           "xaxex"         "xexax"         "xux"          
[13] "xuxuxax"       "xaxaxax"       "xuxexex"       "xaxex"        
[17] "xoxexex"       "xaxexixex"     "xuxex"        
gsub("[^aeiou]*", "x", "apple")
[1] "xaxex"

6.22 backreferences \1 \2 etc

# BACKREFERENCES  ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Parenthesized expressions in a regex can be referred "back" to 
# with \1, \2 ... 
# (remember in R you need two backslashes - i.e. \\1, \\2, ...)
#
# The original regex standard only allowed for up to nine
# backreferences, ie. \1 \2 \3 ... \9  
# It did not allow for \10. Some environments have ways to
# allow you to reference \10 and further but I personally
# don't know how to do that in R ... I guess you could 
# research that if you need to but it usually doesn't
# come up. If it becomes an issue, there is almost always
# a simple way to workaround the situation
# using loops and other coding approaches.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



# Find fruits that have 3 letters in the pattern xyx or aba
grep ("([a-z])[a-z]\\1", fruit, value=TRUE)
[1] "banana"    "blueberry"
grep ("([a-z])([a-z])\\1\\2", fruit, value=TRUE)
[1] "banana"
grep("([a-z][a-z])\\1", fruit, value=TRUE)
[1] "banana"
gsub ("([a-z])([a-z])\\1", "\\2\\1\\2", fruit)
 [1] "apple"             "N. American apple" "S. Korean Fig"    
 [4] "fig"               "star fruit"        "pear"             
 [7] "prickly pear"      "Beurre Hardy pear" "cherry"           
[10] "black cherry"      "peach"             "plum"             
[13] "kumquat"           "bnanna"            "blubebrry"        
[16] "strawberry"        "honeydew"          "strawberries"     
[19] "yumberry"         
gsub("^(.)(.)(.)",   # reverse the first 3 characters  
     "\\3\\2\\1", 
     fruit)
 [1] "ppale"             " .NAmerican apple" " .SKorean Fig"    
 [4] "gif"               "atsr fruit"        "aepr"             
 [7] "irpckly pear"      "ueBrre Hardy pear" "ehcrry"           
[10] "albck cherry"      "aepch"             "ulpm"             
[13] "mukquat"           "nabana"            "ulbeberry"        
[16] "rtsawberry"        "noheydew"          "rtsawberries"     
[19] "muyberry"         
# QUESTION
# Write a command to swap the first character and last character of
# each fruit

gsub( "^(.)(.*)(.)$" , "\\3\\2\\1" , fruit)
 [1] "eppla"             "e. American applN" "g. Korean FiS"    
 [4] "gif"               "ttar fruis"        "reap"             
 [7] "rrickly peap"      "reurre Hardy peaB" "yherrc"           
[10] "ylack cherrb"      "heacp"             "mlup"             
[13] "tumquak"           "aananb"            "ylueberrb"        
[16] "ytrawberrs"        "woneydeh"          "strawberries"     
[19] "yumberry"         
gsub( "^(.)(.*)(.)$" , "\\3-\\2-\\1" , fruit)
 [1] "e-ppl-a"             "e-. American appl-N" "g-. Korean Fi-S"    
 [4] "g-i-f"               "t-tar frui-s"        "r-ea-p"             
 [7] "r-rickly pea-p"      "r-eurre Hardy pea-B" "y-herr-c"           
[10] "y-lack cherr-b"      "h-eac-p"             "m-lu-p"             
[13] "t-umqua-k"           "a-anan-b"            "y-lueberr-b"        
[16] "y-trawberr-s"        "w-oneyde-h"          "s-trawberrie-s"     
[19] "y-umberr-y"         
# QUESTION 
# Find fruits that start and end with the same letter
#

grep("^(.).*\\1$", fruit, value=TRUE)
[1] "strawberries" "yumberry"    
shoppingList = c("35 yumberry pops", 
                 "four strawberries         ", 
                 " five apples",
                 "six yumberry and strawberries pops")
shoppingList
[1] "35 yumberry pops"                   "four strawberries         "        
[3] " five apples"                       "six yumberry and strawberries pops"
# QUESTION
# Use sub or gsub to replace words that start and end with the same letter
# with the first letter then "XXXX" then the last letter of the word

gsub("\\b(.).*\\1\\b", "\\1XXXX\\1", shoppingList)
[1] "35 yumberry pops"           "four strawberries         "
[3] " XXXX apples"               "sXXXXs"                    
# Make the * UN-greedy by following it with a ?
gsub("\\b(.).*?\\1\\b", "\\1XXXX\\1", shoppingList)
[1] "35 yumberry pops"           "four strawberries         "
[3] " XXXX apples"               "sXXXXs"                    
#
gsub("\\b([a-z])[a-z]*?\\1\\b", "\\1XXXX\\1", shoppingList)
[1] "35 yXXXXy pops"             "four sXXXXs         "      
[3] " five apples"               "six yXXXXy and sXXXXs pops"

6.23 “greedy” vs “non-greedy” quantifiers

# "greedy" vs "non-greedy" quantifiers ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# By default, quantifiers (e.g. + * ?) are "greedy"
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# By **default** quantifiers are "greedy". In other words ...
#
# A "greedy" match works as follows:
#
#   1. The regex engine tries to start matching the regex at the beginning
#      of the text
#
#   2. If there are any quantifiers (e.g. * + ?) in the regex, the regular
#      expression engine tries to match AS MUCH of the text as possible.
#      (see the example in VSCode described below).
#
#   3. The quantifier settles for less text only when that is necessary for
#      the rest of the regex to be able to match.
#
# In other words, a greedy quantifier extends the match by as much as it can.
# This behavior can be changed by using 
# "non-greedy" quantifiers as shown below. To make a quantifier non-greedy
# just follow it with a question mark.
#


# Greedy quantifiers: match as MUCH as possible while still being able to
# match rest of the pattern. The greedy quantifiers:
#
#   PATTERN{n,m}  minimum of n, maximum of m
#   PATTERN{n,}   n or more
#   PATTERN+      same as {1,} i.e. one or more 
#   PATTERN*      same as {0,} i.e. zero or more of the preceding pattern
#   PATTERN?      same as {0,1} ie. zero or one (i.e. optional)
#
# UNgreedy (or stingy) quantifiers:
# match as LITTLE as possible while still being able to
# match rest of the pattern. The UNgreedy quantifiers:
#
#   PATTERN{n,m}?  minimum of n, maximum of m
#   PATTERN{n,}?   n or more
#   PATTERN+?      same as {1,}? i.e. one or more 
#   PATTERN*?      same as {0,}? i.e. zero or more of the preceding pattern
#   PATTERN??      same as {0,1}? ie. zero or one (i.e. optional)
#

# EXAMPLES:

# greedy   {n,m}
# ungreedy {n,m}?

sub("[0-9]{3,5}", "x", "123456 1234")  # {3,5} greedy   "x6 1234"
[1] "x6 1234"
sub("[0-9]{3,5}?", "x", "123456 1234") # {3,5}? UNgreedy "x456 1234"
[1] "x456 1234"
gsub("[0-9]{3,5}", "x", "123456 1234")  # {3,5} greedy   "x6 x"
[1] "x6 x"
gsub("[0-9]{3,5}?", "x", "123456 1234") # {3,5}? UNgreedy "xx x4"
[1] "xx x4"
# greedy   +
# ungreedy +?

sub("[0-9]+", "x", "123456 1234")      # +  greedy   "x 1234"       
[1] "x 1234"
sub("[0-9]+?", "x", "123456 1234")     # +? UNgreedy "x23456 1234"  
[1] "x23456 1234"
gsub("[0-9]+", "x", "123456 1234")     # +     greedy   "x x"
[1] "x x"
gsub("[0-9]+?", "x", "123456 1234")    # +?    UNgreedy "xxxxxx xxxx"
[1] "xxxxxx xxxx"
# greedy   *
# ungreedy *?

sub("[0-9]*", "x", "123456 1234")      # *  greedy   "x 1234"
[1] "x 1234"
sub("[0-9]*?", "x", "123456 1234")     # *? UNgreedy "x123456 1234"
[1] "x123456 1234"
gsub("[0-9]*", "x", "123456 1234")     # *     greedy   "x x"
[1] "x x"
gsub("[0-9]*?", "x", "123456 1234")    # *?    UNgreedy "x1x2x3x4x5x6x x1x2x3x4x"
[1] "x1x2x3x4x5x6x x1x2x3x4x"
# greedy   ?
# ungreedy ??

sub("[0-9]?", "x", "123456 1234")      # ?  greedy   "x23456 1234"
[1] "x23456 1234"
sub("[0-9]??", "x", "123456 1234")     # ?? UNgreedy "x123456 1234"
[1] "x123456 1234"
gsub("[0-9]?", "x", "123456 1234")     # ?   greedy   "xxxxxx xxxx"
[1] "xxxxxx xxxx"
gsub("[0-9]??", "x", "123456 1234")    # ??  UNgreedy  "x1x2x3x4x5x6x x1x2x3x4x"
[1] "x1x2x3x4x5x6x x1x2x3x4x"
sub("[a-z]*?", "x", "abcde")
[1] "xabcde"

6.23.1 examples

# Question
# Extract JUST the first quotation from each of the following.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
quotations = c('Bill said "hi" to Jill. She replied "bye" to him and "hello" to the driver.',
               'Tony said "I love ice cream!" to his mom. He then said "I love mom:)"')

# ANSWER: use greedy and UNgreedy quantifiers as appropriate
sub('(.*?)(".*?")(.*)', "\\2", quotations)
[1] "\"hi\""                "\"I love ice cream!\""
# ANOTHER ANSWER: This also works and perhaps is easier to understand (or perhaps not :)
sub('([^"]*)("[^"]*")(.*)', "\\2", quotations)
[1] "\"hi\""                "\"I love ice cream!\""
###################################.
# The following are NOT answers
###################################.

# NOT an answer - Compare with the following - this one gets the last quotation
sub('(.*)(".*")(.*)', "\\2", quotations)
[1] "\"hello\""        "\"I love mom:)\""
# It's helpful to see what happens with the str_view function in the following
# cases

# This matches everything from the first quotation mark to the last
str_view(quotations, '".*"')
[1] │ Bill said <"hi" to Jill. She replied "bye" to him and "hello"> to the driver.
[2] │ Tony said <"I love ice cream!" to his mom. He then said "I love mom:)">
# The following is more what we want. *? is now a non-greedy quantifier. 
# Therefore, it matches the first quoted info.
# Then str_view continues by showing you the next match (i.e. the 2nd
# quoted info), then the 3rd match, etc.
str_view(quotations, '".*?"')
[1] │ Bill said <"hi"> to Jill. She replied <"bye"> to him and <"hello"> to the driver.
[2] │ Tony said <"I love ice cream!"> to his mom. He then said <"I love mom:)">

AN ASIDE - the 1st question mark is not actually necessary in this case

# This also works - see below for why the first ? is not necessary in this case.
sub('(.*)(".*?")(.*)', "\\2", quotations)
[1] "\"hi\""                "\"I love ice cream!\""
# Use VSCode to understand greedy VS non-greedy quantifiers
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Use VSCode to open a text file that contains a lot of English text.
#
# Do a regex search in VSCode for
#   e.*e
#
# This is a "greedy" search (since it uses * instead of *?).
# It searches for "e" following by anything followed by another "e".
# This will highlight on each line all the text starting from the 
# first e on the line until the last e on the line.
#
# Now search again using a non-greedy quantifier, i.e. .*?
#   e.*?e
#
# The results will be potentially several matches on each line. 
# Each match starts with an "e" and extends to the next "e" but no further.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

6.24 The complete list of quantifiers

# The complete list of quantifiers 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# In general you can use ? to turn any greedy quantifier into a non-greedy
# quantifier
#
#   *  - zero or more (greedy)
#   *? - zero or more (non-greedy)
#
#   +  - one or more (greedy)
#   +? - one or more (non-greedy)
#
#   ?  - zero or one (greedy)
#   ?? - zero or one (non-greedy)
#
# NOTE the following also allow for non-greedy ? modifier. However, these
# are not really necessary - see the notes below.
#
#   {3,5}  - 3,4 or 5 repetitions (greedy - i.e. will match all 5 if they are there)
#   {3,5}? - (non-greedy - will match 3 even if there are 5)
#            Notice - you could just write {3} instead of {3,5}? (think about it)
#
#   {3,}   - (greedy)     3 or more matches in a row, matches as many as there are
#   {3,}?  - (non-greedy) will always match first 3 even if there are more (non-greedy)
#            Notice - you could just write {3} instead of {3,}? (think about it)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

6.24.1 examples

# EXAMPLE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

quotedCsv = c('"a,b,c","apple,orange","watermellon"')
cat(quotedCsv)
"a,b,c","apple,orange","watermellon"
# The following gsub uses a greedy quantifier, ie. *.
# It will match as much as it can.

# greedy
gsub('".*"', 'QUOTES', quotedCsv)   # "QUOTES"
[1] "QUOTES"
# non-greedy
gsub('".*?"', 'QUOTES', quotedCsv) # "QUOTES,QUOTES,QUOTES"
[1] "QUOTES,QUOTES,QUOTES"
# A MORE COMPLEX EXAMPLE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

text = "She sells sea shells by the sea shore."

result = gsub("(.*)(sea)(.*)", "1st \\1\n2nd \\2\n3rd \\3", text)

cat(result)
1st She sells sea shells by the 
2nd sea
3rd  shore.
# Results are "greedy", i.e. the .* in the beginning matches as much as 
# it can as long as the whole regex will work. The result is:

# 1st part: She sells sea shells by the 
# 2nd part: sea
# 3rd part: shore.

# The following DOES NOT happen
#
# 1st part:    She sells 
# 2nd part:    sea
# 3rd part:    shells by the sea shore.


# we can make the regex UN-GREEDY by using a ? AFTER the *
#

text = "She sells sea shells by the sea shore."


result = gsub("(.*?)(sea)(.*)", "1st \\1\n2nd \\2\n3rd \\3", text)

cat(result)
1st She sells 
2nd sea
3rd  shells by the sea shore.

6.25 Challenges

# Challenges ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Use the following for the challenges below

# Sample text for the challenges. Entries 2, 3, 4, 5 and 9 contain a telephone
# number. Entry 1 contains a 9 digit number that is NOT a telephone number and
# the remaining entries do not contain a telephone number at all.
stuff = c("His ssn is 876543890.",
          "Call me at 212 950 3216 when you have time.",
          "Please call Joe at 777-7777",
          "Sue's number is (555)123   4567.",
          "7182345678 is the number for the helpdesk.",
          "Email Anne at anne@anneco.com and explain.",
          "Meet me @ 10pm.",
          "Mikes company is called mike@large",
          "To work in Whatsapp internationally you need to enter his number as +1 555 555 5555.")
stuff

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Write a regular expression to find telephone numbers
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

pattern = paste0 ( "(^|\\D)",              # start with a non-digit or the start of the text
                   "(\\(?\\d{3}\\)?)?",    # optional area code with optional (parentheses)
                   " *",                   # zero or more spaces
                   "\\d{3}",               # first 3 digits
                   " *-? *",               # any number of spaces surrounding an optional dash
                   "\\d{4}",               # last 4 digits
                   "(\\D|$)")              # end with a non-digit or the end of the text

pattern
grep (pattern, stuff, value=TRUE)



#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Challenge: 
# 
# Extract just the telephone numbers in a standard format.
#
# Make sure to use parentheses in the pattern.
#
# Substitute JUST the parts you want.
#
# Use grep (... value=FALSE ...) to get the positions that 
# matched and keep only those.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# Show how to extract just the area code, first 3 and last 4 digits
#
# newpattern is written so that it matches the ENTIRE text (everything before
# the telephone number, the number itself and everything after it). That way
# gsub replaces the whole text with just the parts that we ask for.
#
# The parenthesized groups are numbered by the position of their opening
# parenthesis:
#
#   \\1  the text before the telephone number
#   \\2  the area code INCLUDING its optional (parentheses)
#   \\3  just the 3 digits of the area code
#   \\4  the spaces after the area code
#   \\5  the first 3 digits
#   \\6  the spaces and optional dash
#   \\7  the last 4 digits
#   \\8  the text after the telephone number
#
# NOTE: the first group uses the NON-greedy .*? so that it stops right before
# the telephone number. With a greedy .* the first group would swallow the
# area code (e.g. "212 " or "(555)") since the area code part of the pattern
# is optional and a greedy .* grabs as much text as it can.
#
# perl=TRUE is passed to grep and gsub below so that the pattern is
# interpreted using the Perl-compatible matching rules.
stuff
newpattern = paste0 ( "(^|.*?\\D)", # tel# at beginning or after a non-digit (NON-greedy)
                      "(\\(?(\\d{3})\\)?)?", # optional area code with optional (parentheses)
                      "( *)",      # zero or more spaces
                      "(\\d{3})",  # first 3 digits
                      "( *-? *)",  # any number of spaces surrounding an optional dash
                      "(\\d{4})",  # last 4 digits
                      "(\\D.*|$)"  # tel # at end or before non-digit
)
grep(newpattern, stuff, value=TRUE, perl=TRUE)
positionsWithTelNums = grep(newpattern, stuff, perl=TRUE)

# \\3 is just the digits of the area code (\\2 would also include the
# parentheses if they were typed). Entries without an area code start with
# a space since \\3 is empty for them.
reformatted = gsub(newpattern, "\\3 \\5 \\7", stuff, perl=TRUE) 

# gsub leaves the entries that did not match unchanged, so keep only the 
# positions that actually contain a telephone number.
reformatted[positionsWithTelNums]




# Show what each of the 8 parenthesized groups matched.
result = gsub (newpattern, 
               paste0("1stPart \\1\n",
                      "2ndPart \\2\n",
                      "3rdPart \\3\n4thPart \\4\n",
                      "5thPart \\5\n6thPart \\6\n7thPart \\7\n8thPart \\8\n"),
               stuff, perl=TRUE)

result

cat(result[1])
cat(result[2])
cat(result[3])

# The following example shows why the first group must be NON-greedy. 
# The area code is surrounded by parentheses. If the first group used a 
# greedy .* then "(555)" would be matched in the 1st part and the 2nd and
# 3rd parts would be empty. With .*? the area code is matched by the 2nd
# and 3rd parts as intended.
cat(result[4])

cat(result[5])
cat(result[6])
cat(result[7])
cat(result[8])
cat(result[9])

# See the pattern used above

telNums = gsub(newpattern, "\\3 \\5-\\7", stuff, perl=TRUE)
telNums
cat(telNums, sep="\n")

positionsWithTelNums = grep(newpattern, stuff, value=FALSE, perl=TRUE)
positionsWithTelNums

telNums
telNums [ positionsWithTelNums ]


stuff
gsub(newpattern, "XXXXXXXXXX", stuff, perl=TRUE)






#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CHALLENGE
#
# - write a regular expression to find email addresses and extract them 
#   from a character vector
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CHALLENGE
#
# Get a vector of all the words from "The adventures of Sherlock Holmes".
# You can find the UTF-8 encoding version here:
#    https://www.gutenberg.org/files/1661/1661-0.txt
#
# HINT: use
#  - readLines  with  url("https://www.gutenberg.org/files/1661/1661-0.txt")
#  - strsplit
#  - unlist    (remember that strsplit returns a LIST)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Read the book - each line of the file becomes one entry in the vector.
#
# readLines opens the connection just for the duration of the call but it
# does not destroy the connection, so we keep the connection in a variable
# and explicitly close it when we are done with it.
bookCon = url("https://www.gutenberg.org/files/1661/1661-0.txt")
book = readLines(bookCon, encoding="UTF-8")
close(bookCon)
head(book)

# Split every line wherever there are one or more spaces.
# strsplit returns a LIST with one entry for each line of the book.
words = strsplit(book, " +")

head(words)

# Combine all of the entries of the list into a single character vector.
words = unlist(words)

# A line that starts with spaces produces an empty "word" (i.e. "") as its
# first piece. These are not real words so remove them.
words = words[words != ""]

head(words, 100)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CHALLENGE
#
# Get a vector of the 10 most common words in "The Adventures of Sherlock Holmes"
# HINT: Use the table function.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Count how many times each word appears. The names of the table are the
# words and the values are the counts.
wordTable = table(words)
head(wordTable)

# Sort by the counts - from the least common word to the most common word.
sorted = sort(wordTable)
head(sorted)          # some of the least common words

# The 10 most common words are the LAST 10 entries of the sorted table.
tail(sorted, 10)

# Get just the words (i.e. the names) as a character vector. 
# rev puts the most common word first.
mostCommon = rev(names(tail(sorted, 10)))
mostCommon


# Lookahead and lookbehind ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Lookahead and lookbehind are used to match a portion of the text but NOT
# consider it part of the match. To use this you MUST set the following
# argument in grep, strsplit, gsub, etc:    perl=TRUE
#
#    Positive Lookahead (?=pattern)
#    Negative Lookahead (?!pattern)
#    Positive Lookbehind    (?<=pattern)
#    Negative Lookbehind    (?<!pattern)
# 
# See this page
#  https://debuggingdata.com/post/r/regular-expressions-look-arounds/
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Four test strings: digits after a letter, digits before a letter,
# digits after a non-letter and digits before a non-letter
stuff <- c("a100", "200b", "@300", "400@")
stuff

# Positive lookbehind: digits that come immediately AFTER a lowercase letter.
# The letter itself is not part of the match, so it is kept.
gsub(pattern = "(?<=[a-z])\\d+", replacement = "NUMBER",
     x = stuff, perl = TRUE)


# Negative lookbehind: digits that are NOT immediately preceded by a
# lowercase letter or by another digit
gsub(pattern = "(?<![a-z0-9])\\d+", replacement = "NUMBER",
     x = stuff, perl = TRUE)


# Positive lookahead: digits that come immediately BEFORE a lowercase letter
gsub(pattern = "\\d+(?=[a-z])", replacement = "NUMBER",
     x = stuff, perl = TRUE)

# Negative lookahead: digits that are NOT immediately followed by a
# lowercase letter or by another digit
gsub(pattern = "\\d+(?![a-z0-9])", replacement = "NUMBER",
     x = stuff, perl = TRUE)


# Show the sample data
quotations

# Replace the text between double quotation marks with XXXX.
# The lookbehind (?<=") and the lookahead (?=") require a quotation mark on
# each side of the match but leave both quotation marks in place.
# Only runs made up of letters, spaces and the characters ! . ? are matched.
# NOTE: because the quotation marks are not consumed, such a run lying between
#       the closing mark of one quotation and the opening mark of the next
#       quotation in the same string is replaced too.
gsub(pattern = '(?<=")[a-zA-Z!.? ]+(?=")', replacement = "XXXX",
     x = quotations, perl = TRUE)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CHALLENGE
#
# Get a vector of all the sentences from "The adventures of Sherlock Holmes".
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Read the book, one element per line of the text file.
# Keep the connection in a variable and close() it afterwards: readLines()
# opens and closes an unopened connection for the duration of the call but
# does not destroy it, which otherwise leads to a later
# "closing unused connection" warning.
bookConnection <- url("https://www.gutenberg.org/files/1661/1661-0.txt")
book <- readLines(bookConnection, encoding = "UTF-8")
close(bookConnection)

book[100:110]

# Sentences can run across several lines, so glue all of the lines together
# into ONE long string, with a space where each line break was
newBook <- paste0(book, collapse = " ")

# newBook is a character vector of length 1
length(newBook)

str(newBook)

# Split the text into sentences.
#
# Splitting on "[.?!]" alone would DELETE the punctuation mark that ends each
# sentence and would leave a space at the start of every sentence. Instead,
# split on the whitespace that comes right after a . ? or ! - the lookbehind
# (?<=[.?!]) checks for the punctuation mark without making it part of the
# separator, so each sentence keeps its final punctuation mark.
# (Lookbehind requires perl=TRUE.)
#
# NOTE: this is still only an approximation - abbreviations such as "Mr."
#       also end a "sentence" here.
#
# strsplit returns a LIST; newBook has length 1, so take the 1st element.
sentences <- strsplit(newBook, "(?<=[.?!])\\s+", perl = TRUE)[[1]]

str(sentences)

length(sentences)

sentences[3]
head(sentences)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CHALLENGE
#
# Get a vector of all quotations from "The adventures of Sherlock Holmes".
# HINT: use a non-greedy (lazy) search, e.g. .*? instead of .*
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




# Other arguments and functions ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# help pages

# Syntax of regular expressions as used by R
help("regex")

# grep, grepl, sub, gsub, regexpr, gregexpr, regexec
help("grep")

# Splitting character vectors into pieces
help("strsplit")


# grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE,
#      fixed = FALSE, useBytes = FALSE, invert = FALSE)
#
# grepl(pattern, x, ignore.case = FALSE, perl = FALSE,
#       fixed = FALSE, useBytes = FALSE)
#
# sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
#     fixed = FALSE, useBytes = FALSE)
#
# gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
#      fixed = FALSE, useBytes = FALSE)
#

text <- "She sells sea shells by the sea shore."

# sub replaces only the FIRST match, gsub replaces ALL of the matches
sub(pattern = "sea", replacement = "xxxxx", x = text)

gsub(pattern = "sea", replacement = "xxxxx", x = text)

# The pattern is case sensitive - the capital "S" in "She" is not a match
sub(pattern = "s", replacement = "x", x = text)

gsub(pattern = "s", replacement = "x", x = text)

# An unescaped period matches ANY single character, so everything is replaced
gsub(pattern = ".", replacement = "x", x = text)

# Escape the period to match only an actual period ...
gsub(pattern = "\\.", replacement = "x", x = text)

# ... or use fixed=TRUE so that the pattern is taken literally (no regex)
gsub(pattern = ".", replacement = "x", x = text, fixed = TRUE)


# strsplit returns a LIST with one character vector for each element of x
words <- strsplit(x = text, split = " ")

words

# Get the 2nd word from the text:
# [[1]] gets the vector of words for the 1st (and only) string, [2] the 2nd word
words[[1]][2]


# Splitting a vector with several strings gives a list with one
# character vector of pieces for each string
words <- strsplit(x = addresses, split = " ")


words

# Get the 2nd word from the 3rd address:
# [[3]] gets the pieces of the 3rd address, [2] gets the 2nd piece
words[[3]][2]


# Other functions - sub vs gsub, regexpr, gregexpr, regexec ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# See the documentation for the following functions: 
#
# regexpr(pattern, text, ignore.case = FALSE, perl = FALSE,
#         fixed = FALSE, useBytes = FALSE)
#
# gregexpr(pattern, text, ignore.case = FALSE, perl = FALSE,
#          fixed = FALSE, useBytes = FALSE)
#
# regexec(pattern, text, ignore.case = FALSE, perl = FALSE,
#         fixed = FALSE, useBytes = FALSE)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

6.26 More practice with regex - see the following websites

# More practice with regex  ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# See the following sites:

# http://regextutorials.com/
# https://regexone.com/
# https://librarycarpentry.org/lc-data-intro/03-quiz/index.html
# https://www.hackerrank.com/domains/regex
# https://regex.sketchengine.co.uk/


# The examples below use the `sentences` dataset that comes with the stringr
# package. It is written as stringr::sentences because the Sherlock Holmes
# challenge earlier in this file assigns a global variable that is also named
# `sentences`; that variable would otherwise be used here instead.

# 1st attempt: move the first word of each sentence to the end.
# Problem: the final period is matched as part of "rest of the line", so the
#          period comes between the rest of the line and the first word.
pattern <- paste0("(^[A-Za-z]+)",     # first word on line
                  "(\\s+)",           # spaces after the first word
                  "(.*)"              # rest of the line
                  )

stringr::str_replace_all(stringr::sentences, pattern = pattern,
                         replacement = "\\3\\2\\1")



# 2nd attempt: capture the final punctuation mark in its own group (\\4)
# so that it can be put back at the very end.
pattern <- paste0("(^[A-Za-z]+)",     # first word on line
                  "(\\s+)",           # spaces after the first word
                  "(.*)",             # rest of the line except for the final punctuation mark
                  "([?!.])"           # final punctuation mark
)

movedFirstWordToEnd <- stringr::str_replace_all(stringr::sentences,
                                                pattern = pattern,
                                                replacement = "\\3\\2\\1\\4")

# Fix the capitalization after the first word has been moved
stringr::str_to_sentence(movedFirstWordToEnd)

#########################

# change the case on the first word to uppercase
# change the case on the last word to lowercase

toupper(c("hello", "goodbye"))



#str_replace_all(sentences, pattern="(^[A-Za-z]+)(\\s+)(.*)", "\\3\\2\\1")