# This is accompanying code for
# https://jozef.io/r007-string-manipulation/
#
# A flat, top-to-bottom tour of base R string handling. Every statement is a
# self-contained example meant to be run interactively; the script defines the
# top-level objects a, b, c, d, weirdList, myStrings and cors along the way.

# Quick overview of the very basics ===========================================

# String constants can be assigned using
# double quotes
a <- "this is a character string"
# or single quotes
b <- 'this is a character string, too'

# To use literal quotes, we can escape with `\`:
# (Naming the object `c` does not break later c(...) calls: when evaluating a
# call, R skips non-function objects while looking up the function name.)
c <- "this is \"it\""

# To make a character vector with multiple elements:
d <- c("this", "vector", "has", "five", "elements")

# To get the length of a character vector
# (how many elements are in a character vector)
length(d)

# To get the number of characters in elements of a vector
# ("how many characters in each of the elements")
nchar(d)

# To create a missing character value
NA_character_

# To test if an object is a character vector
is.character("is this a character vector?")

# To convert other objects to character vectors
# Can surprise the unwary: c() combines the values into a plain numeric
# vector first, so the date-time and the factor lose their classes before
# as.character() ever sees them.
as.character(c(
  42,
  Sys.time(),
  factor("A", levels = LETTERS)
))

# One of the ways to output a vector is `cat`
cat("Show me this")

# To include line breaks use `"\n"`
# To include tabs use `"\t"`:
cat("Break\ta\ta\nline")

# When in doubt about an object
# str or summary may help
weirdList <- list(
  "What is this?",
  Sys.time(),
  b = 5L,
  c = c("one", 2),
  d = factor(c("red", "blue")),
  e = NA_character_,
  f = NA_integer_
)
str(weirdList)
summary(weirdList)


# String concatenation ========================================================

# |- Simple concatenation -----------------------------------------------------

# We will use these vectors for our examples:
1:3
month.name

# Use paste to concatenate
# R recycles 1:3 4 times to fit the length of month.name
paste(1:3, month.name)

# Specify the sep argument to
# separate the elements differently
paste(1:3, month.name, sep = ": ")

# A shorthand for sep = ""
paste0(1:3, month.name)

# Alternatively, sprintf is very useful
sprintf("%s: %s", 1:3, month.name)

# |- Concatenate a vector into a single character string ----------------------

# Provide the collapse argument to paste
# to get a character string (length 1 vector):
paste(1:3, month.name, sep = ": ", collapse = ", ")

# Or, use toString
toString(paste(1:3, month.name, sep = ": "))


# String manipulation and properties ==========================================

# |- String lengths -----------------------------------------------------------

# How many elements does a vector have?
length(month.name)

# To get the number of characters in elements of a vector
# ("how many characters in each of the elements?")
nchar(month.name)

# Are the elements non-empty strings?
nzchar(month.name)

# |- Switching to upper/lower case --------------------------------------------

# Switch to all lower case
tolower(month.name)

# Switch to all upper case
toupper(month.name)

# Casefold is a wrapper for S-PLUS compatibility
casefold(month.name, upper = FALSE)
casefold(month.name, upper = TRUE)

# Also, custom translation:
# each character of the first string is replaced by the character at the
# same position in the second string
chartr("OIZEASGTC", "01234567(", toupper(month.name))

# |- Removing white spaces ----------------------------------------------------

# Remove all leading and trailing whitespaces
trimws(" This has trailing spaces. ")

# Remove leading whitespaces
trimws(" This has trailing spaces. ", which = "left")

# Remove trailing whitespaces
trimws(" This has trailing spaces. ", which = "right")

# |- Encoding conversion ------------------------------------------------------

# Convert a character vector between encodings
# (sub = "?" substitutes bytes that cannot be represented in the target)
iconv("šibrinkuje", from = "UTF-8", to = "ASCII", sub = "?")

# |- Quoting ------------------------------------------------------------------

# Quoting text for fancier printing:
sQuote(month.name)
dQuote(month.name)

# Not to be confused with quoting strings for passing to OS shell
system(paste("echo", shQuote("Weird\nstuff")))

# Also not to be confused with quoting expressions
str(quote(1 + 1))

# |- Retrieving and working with substrings -----------------------------------

# Get the first three characters from all the month.names
substr(month.name, 1, 3)

# Get the last three characters from all the month.names
substr(month.name, nchar(month.name) - 2, nchar(month.name))

# Wrapper around substr for S compatibility:
substring(month.name, 1, 3)

# Check whether elements start with a string
startsWith(month.name, "J")

# Check whether elements end with a string
endsWith(month.name, "ember")

# Trim character strings to specified display widths.
strtrim(month.name, 3)

# Abbreviate strings to at least minlength characters
abbreviate(month.name, minlength = 3)


# Basic pattern matching and replacement ======================================

# |- Replace substring with other strings -------------------------------------

myStrings <- paste(1:3, month.name, sep = ". ")

# Replace all ones with zeros:
# fixed will match the first argument as is
gsub("1", "0", myStrings, fixed = TRUE)

# Replace only the first "a" in each element with "A"
sub("a", "A", myStrings, fixed = TRUE)

# Replace any number with 0
# note that the fixed argument is now FALSE (default)
gsub("[0-9]", "0", myStrings)

# Replace literal dots with 0
gsub(".", "0", myStrings, fixed = TRUE)

# This will replace all characters with zeros
# (as a regular expression, "." matches any character)
gsub(".", "0", myStrings)

# |- Check if a pattern is present within elements of a character vector ------

myStrings <- paste(1:3, month.name, sep = ". ")

# Is a pattern present (returns a logical vector)?
grepl("ember", myStrings)

# In which elements is a pattern present (returns indices)?
grep("ember", myStrings)

# In which elements is a pattern present (returns the values)?
grep("ember", myStrings, value = TRUE)

# |- Check where the matches are within the elements of a character vector ----

myStrings <- paste(1:3, month.name, sep = ". ")

# Where is the first "a" located in each of the elements?
# If pattern is not found in that element, returns -1
regexpr("a", myStrings)

# Where are all the "a" located in each of the elements?
# If pattern is not found in that element, returns -1
gregexpr("a", myStrings)

# Where are all the "a" located in the first element?
gregexpr("a", myStrings[1])
# or also
gregexpr("a", myStrings)[[1]]


# Bonuses =====================================================================

# The Levenshtein distance between strings
adist(c("lazy", "lasso", "lassie"), c("lazy", "lazier", "laser"))

# Repeat elements of a character vector a given number of times
strrep(c(":)", ":P ", ";) "), 1:3)

# Convert strings to integers of a given base
strtoi(c("101010", "11111000101"), base = 2L)
strtoi(c("2A", "7C5"), base = 16L)

# Symbolic Number Coding
# (one correlation matrix of the four numeric columns per iris species,
# then each matrix shown as symbols)
cors <- lapply(split(iris, iris$Species), function(x) cor(x[, 1:4]))
lapply(cors, symnum, abbr.colnames = 6)