# Environment
suppressMessages(library(rvest))
suppressMessages(library(tidyverse))
suppressMessages(library(stringr))
suppressMessages(library(tidytext))
suppressMessages(library(stopwords))
suppressMessages(library(tm))
suppressMessages(library(wordcloud2))
# Target URL to scrape
#base_url <- "https://www.bestbuy.com/site/reviews/hp-instant-ink-50-page-monthly-plan-for-select-hp-printers/5119176"
# Load page
#page <- read_html(base_url)
# Scrape just the comments from the page
#comments <- page %>%
#  html_nodes(".pre-white-space") %>%
#  html_text() %>%
#  as_tibble()
# Be nice if you're using this approach: don't overtax someone's website (a pause between requests helps; see the sketch below).
# Loop to do the same over pages 2 to 330
#for (i in 2:330) {
#  url <- paste0(base_url, "?page=", i)
#  page <- read_html(url)
#  new_comments <- page %>%
#    html_nodes(".pre-white-space") %>%
#    html_text() %>%
#    as_tibble()
#  comments <- rbind(comments, new_comments)
#}
#write_csv(comments, "../../data/ink_comments.csv")
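If the scrape ever needs to be re-run, a pause between requests keeps the load on Best Buy's servers light. Below is a minimal sketch of the same loop with that delay built in, kept commented out like the chunk above so it never runs on knit; the `scrape_page()` helper, the one-second pause, and `map_dfr()` are illustrative choices here, not what the original run used.
#scrape_page <- function(url) {
#  Sys.sleep(1)  # illustrative one-second pause before each request
#  txt <- read_html(url) %>%
#    html_nodes(".pre-white-space") %>%
#    html_text()
#  tibble(value = txt)  # one-column tibble, matching the saved CSV
#}
#comments <- map_dfr(1:330, function(i) {
#  url <- if (i == 1) base_url else paste0(base_url, "?page=", i)
#  scrape_page(url)
#})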
comments <- read_csv("ink_comments.csv")
Rows: 6622 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): value
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
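As the message suggests, the column types can be declared up front (or the message silenced with `show_col_types = FALSE`). A small sketch of the explicit version, assuming only the single character column `value` reported above:
#comments <- read_csv("ink_comments.csv",
#                     col_types = cols(value = col_character()))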