Wordclouds in R

Wordclouds are a great way to visualize the most frequent terms in a text. Additionally, R provides some great tools to convert PDFs into plain text and to clean the texts so that non-informative terms (e.g., articles, prepositions) are ignored.
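Before anything else, the packages used throughout this post can be installed from CRAN (a one-off step, shown here for completeness):

# Install the required packages (only needed once)
install.packages(c("pdftools", "wordcloud", "tm", "tidyverse"))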

Converting data from PDF to text

library(pdftools)
library(wordcloud)
library(tm)
library(tidyverse)


# List the PDF files in the pdf/ folder
files <- list.files("pdf/",
                    pattern = "\\.pdf$",
                    full.names = TRUE)
pdfs <- sapply(files, function(x){
  pdftools::pdf_text(x) %>%
    # Collapse the pages into a single string
    paste(collapse = " ") %>%
    # Replace line breaks, tabs and quotes with spaces
    stringr::str_replace_all(fixed("\n"), " ") %>%
    stringr::str_replace_all(fixed("\r"), " ") %>%
    stringr::str_replace_all(fixed("\t"), " ") %>%
    stringr::str_replace_all(fixed("\""), " ") %>%
    stringr::str_squish() %>%
    # Re-join words split by end-of-line hyphenation
    stringr::str_replace_all("- ", "")
})
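At this point a quick sanity check helps confirm the extraction worked, assuming at least one PDF was found in the folder:

# Number of PDFs read and a peek at the first extracted text
length(pdfs)
substr(pdfs[1], 1, 300)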

# Build a corpus from the extracted texts, then clean it by removing
# numbers, extra spaces, punctuation, etc.
arts_text_clean <- Corpus(VectorSource(pdfs))

# Remove punctuation
arts_text_clean <- tm_map(arts_text_clean, removePunctuation)
# Convert all words to lowercase
arts_text_clean <- tm_map(arts_text_clean, content_transformer(tolower))
# Remove numbers
arts_text_clean <- tm_map(arts_text_clean, removeNumbers)
# Collapse extra whitespace
arts_text_clean <- tm_map(arts_text_clean, stripWhitespace)
# Remove stopwords
arts_text_clean <- tm_map(arts_text_clean, removeWords, stopwords('english'))
arts_text_clean <- tm_map(arts_text_clean, removeWords, stopwords('spanish'))
arts_text_clean <- tm_map(arts_text_clean, removeWords, stopwords('portuguese'))
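Since removeWords accepts any character vector, known noise terms (the same publisher boilerplate filtered manually below) could optionally be dropped at this stage as well:

# Optionally remove known noise terms here instead of filtering them later
arts_text_clean <- tm_map(arts_text_clean, removeWords, c("doi", "crossref", "springer"))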

# Build a term-document matrix and sum word counts across documents
arts_text_clean <- TermDocumentMatrix(arts_text_clean)
arts_text_clean <- as.matrix(arts_text_clean)
arts_text_clean <- sort(rowSums(arts_text_clean), decreasing = TRUE)
df <- data.frame(word = names(arts_text_clean), freq = arts_text_clean)

# Remove leftover punctuation and words that should be omitted
df <- df |>
  # The freq != 466 condition is a workaround: that entry is a hyphen
  # that resists matching by character
  filter(!word %in% c("–", "−", "•",
                      "crossref", "doi", "thus", "two", "one", "fig",
                      "three", "can", "may", "therefore", "first", "also",
                      "author", "journal", "among", "figure",
                      "solórzano", "gallardocruz", "jiménezlópez",
                      "springer", "although", "however", "authors"),
         freq != 466) |>
  # Restore abbreviated terms to full words
  mutate(word = recode(word,
                       ecol = "ecology",
                       sens = "sensing",
                       environ = "environment")) |>
  group_by(word) |>
  summarise(freq2 = sum(freq))
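Before plotting, a quick look at the most frequent terms is a good way to confirm the cleaning worked as intended:

# Inspect the ten most frequent terms after cleaning
df |>
  arrange(desc(freq2)) |>
  head(10)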

Wordcloud

To draw the wordcloud you need a data frame containing the words and their corresponding frequencies. Here that object is saved as df and contains two columns, word and freq2. The remaining arguments let you choose the minimum frequency a word needs to be shown, the maximum number of words in the plot, whether words should be placed in random order, the proportion of words rotated 90 degrees, and the colors of the words.

set.seed(1234) # for reproducibility 

png("wordcloud.png",
    width = 10,
    height = 10,
    units = "cm",
    res = 300)
wordcloud(words = df$word,
          freq = df$freq2,
          scale = c(3.5, 0.25),
          min.freq = 20,
          max.words = 200,
          random.order = FALSE,
          rot.per = 0.35,
          colors = rev(brewer.pal(6, "Dark2")))
dev.off()

An example of a wordcloud.

Wordcloud of my publications.