tidytext::unnest_tokens() — unnests the values in a text column into a word column.
tidytext::get_sentiments() — retrieves sentiment lexicons.
You need to have the following R packages installed and loaded:
The example text derives from the commencement speech given by David Foster Wallace at Kenyon College, Ohio on 21 May 2005.
# Download and parse the article's HTML (rvest); returns an xml_document
web_page <- read_html("https://www.theguardian.com/books/2008/sep/20/fiction")
SelectorGadget is a JavaScript bookmarklet that you use in your web browser to identify CSS selectors in a web page. Read vignette("selectorgadget")
for more information about using SelectorGadget.
# Select the article-body paragraph nodes via their CSS selector
# (found with SelectorGadget)
web_nodes <- html_nodes(web_page, ".js-article__body p")
# Extract the plain text of each selected paragraph (one string per node)
web_text <- html_text(web_nodes)
Or using %>%
# Same result as above, written as a single pipeline:
# parse the page, select the body paragraphs, then pull out their text.
web_text <- read_html("https://www.theguardian.com/books/2008/sep/20/fiction") %>%
  html_nodes(".js-article__body p") %>%
  html_text()
Extract the text of J. K. Rowling’s commencement speech at Harvard University on 5 June 2008 from the following web page: http://news.harvard.edu/gazette/story/2008/06/text-of-j-k-rowling-speech/
Answer:
# Scrape the Rowling speech: parse the page, select the article-body
# paragraphs, then extract their text as a character vector.
example_web_text <- read_html("http://news.harvard.edu/gazette/story/2008/06/text-of-j-k-rowling-speech/") %>%
  html_nodes(".article-body p") %>%
  html_text()
# Put the scraped paragraphs into a tibble, one paragraph per row.
# tibble() replaces the deprecated data_frame().
text_df <- tibble(text = web_text)
# Keep rows up to and including the paragraph ending in: this is water."
# fixed = TRUE matches the pattern literally (an unescaped '.' would match
# any character); [1] guards against grep() returning multiple indices.
text_df <- text_df %>% slice(1:(grep("this is water.\"", text, fixed = TRUE)[1]))
# Drop empty paragraphs
text_df <- text_df %>% filter(nzchar(text))
# Number the remaining paragraphs
text_df <- text_df %>% mutate(paragraph = row_number())
Or using %>%
# One-pipeline equivalent: build the tibble (data_frame() is deprecated),
# truncate at the closing line (fixed = TRUE for a literal match, [1] in
# case of multiple hits), drop empty paragraphs, and number the rest.
text_df <- web_text %>%
  tibble(text = .) %>%
  slice(1:(grep("this is water.\"", text, fixed = TRUE)[1])) %>%
  filter(nzchar(text)) %>%
  mutate(paragraph = row_number())
Convert example_web_text to a data frame using data_frame() and remove the unwanted rows using slice(). (Hint: you can use a vector in slice().)
Answer:
# Convert the speech to a tibble (tibble() replaces the deprecated
# data_frame()), keep only the speech paragraphs (rows 2-14 and 16 to the
# end), and number them.
example_text_df <- example_web_text %>%
  tibble(text = .) %>%
  slice(c(2:14, 16:n())) %>%
  mutate(paragraph = row_number())
tidytext
# Tokenise the paragraphs into one word per row, then remove common
# stop words ("the", "of", ...) in a single pipeline.
words <- text_df %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")
# Word frequencies, most frequent first
words %>% count(word, sort = TRUE)
Unnest the values in the text column of example_text_df into a word column, and remove stop words.
Answer:
# Tokenise the speech into single words and drop stop words
example_words <- example_text_df %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")
# Most frequent words first
example_words %>% count(word, sort = TRUE)
# Tokenise into bigrams (overlapping word pairs) and keep only pairs where
# neither word is a stop word.
bigram <- text_df %>%
  unnest_tokens(word, text, token = "ngrams", n = 2) %>% # tokenise into word pairs
  # texts shorter than 2 words yield NA bigrams, which would otherwise
  # survive the stop-word filter (NA %in% ... is FALSE); drop them here
  filter(!is.na(word)) %>%
  separate(word, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word, # remove 'stop words'
         !word2 %in% stop_words$word) %>%
  unite(word, word1, word2, sep = " ")
Frequency of bigrams
# Horizontal bar chart of the 10 most frequent bigrams.
bigram %>%
  count(word, sort = TRUE) %>%
  # slice_max() replaces the superseded top_n(); keeps ties like top_n() did
  slice_max(n, n = 10) %>%
  # reorder so bars are drawn from least to most frequent
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "grey", alpha = 0.8) +
  coord_flip() +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = NULL, y = "Number of mentions",
       title = "2-word combinations in Foster Wallace's commencement speech") +
  theme_minimal()
Frequency of positive words
# Word cloud of positive words: attach Bing sentiment labels, keep the
# positive ones, count them, and plot. `.` inside wordcloud2() refers to
# the piped-in count data frame.
words %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(sentiment == "positive") %>%
  count(word) %>%
  wordcloud2(size = 0.7, fontFamily = "RobotoCondensed-Regular",
             color = rep(c("orange", "skyblue"), length.out = nrow(.)))
Frequency of negative words
# Word cloud of negative words, mirroring the positive-word cloud above
words %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(sentiment == "negative") %>%
  count(word) %>%
  wordcloud2(size = 0.7, fontFamily = "RobotoCondensed-Regular",
             color = rep(c("black", "grey"), length.out = nrow(.)))
Distribution of positive and negative words
# Diverging bar chart: positive words extend right, negative words left.
words %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  # count() already returns an ungrouped result, so the original ungroup()
  # was redundant and has been removed
  count(word, sentiment, sort = TRUE) %>%
  filter(n > 1) %>%
  # negate counts of negative words so their bars point left of zero
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col() +
  labs(x = NULL, y = NULL, fill = "Sentiment") +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "bottom")