This code snippet uses the RSelenium and seleniumPipes packages. The code below drives Chrome through rsDriver(); if you would rather use phantomjs, download it, unzip it, and place the .exe file in your R working directory.
The technique relies on an automated browser (optionally a headless one such as phantomjs) that mimics user actions, which allows it to read values generated by JavaScript.
library(rvest)
library(RSelenium) # start a server with utility function
library(seleniumPipes)
# Start a local Selenium server (with the Chrome driver) on port 4444.
rD <- rsDriver(browser = "chrome", chromever = "latest", port = 4444L)
# Open a seleniumPipes session against that server; the port is spelled out
# so it visibly matches the one passed to rsDriver().
remDr <- remoteDr(browserName = "chrome", port = 4444L)
main_page_url <- "http://www.napo.net/search/newsearch.asp"
# go to the search page
remDr %>% go(main_page_url)
# switch into the iframe that holds the search results
remDr %>% switchToFrame(Id = "SearchResultsFrame")
# collect the relative link of every result on the page
relative_path <- remDr %>%
  getPageSource() %>%
  html_nodes(".lineitem a[href]") %>%
  html_attr("href")
# Build the absolute URLs. paste0() would turn an empty link vector into the
# bare site root, so return an empty vector explicitly in that case.
full_paths <- if (length(relative_path) > 0) {
  paste0("http://www.napo.net", relative_path)
} else {
  character(0)
}
#scrape email from each page
# Accumulates one element named "email" per visited profile page.
email_address <- list()
# Retrieve the e-mail address from the first three results. head() is used
# instead of full_paths[1:3] so that fewer than three links does not pad the
# vector with NA and send the browser to an NA URL.
for (profile_url in head(full_paths, 3)) {
  remDr %>% go(profile_url)
  # text of every mailto: link on the profile page
  page_emails <- remDr %>%
    getPageSource() %>%
    html_nodes('a[href^="mailto"]') %>%
    html_text()
  email_address <- c(email_address, list(email = page_emails))
  # pause between requests
  Sys.sleep(3)
}
# display result
email_address[1]
$email
[1] "[email protected]"
The above pertains to page one; to navigate to page two:
# Return to the search page and re-enter the results iframe.
remDr %>% go(main_page_url)
remDr %>% switchToFrame(Id = "SearchResultsFrame")
# click on page two within the iframe to proceed to page 2
remDr %>%
  findElement(
    using = "css selector",
    value = ".DotNetPager a:nth-child(2)"
  ) %>%
  elementClick()
# Give the iframe time to load page two before reading its source; reading
# immediately after the click may still return the page-one links.
Sys.sleep(3)
# get relative and full paths again
relative_path <- remDr %>%
  getPageSource() %>%
  html_nodes(".lineitem a[href]") %>%
  html_attr("href")
# paste0() would turn an empty link vector into the bare site root, so
# return an empty vector explicitly in that case.
full_paths <- if (length(relative_path) > 0) {
  paste0("http://www.napo.net", relative_path)
} else {
  character(0)
}
# Visit the first three results and append their e-mail addresses to
# email_address. head() avoids the NA padding that full_paths[1:3] produces
# when fewer than three links were found.
for (profile_url in head(full_paths, 3)) {
  remDr %>% go(profile_url)
  # text of every mailto: link on the profile page
  page_emails <- remDr %>%
    getPageSource() %>%
    html_nodes('a[href^="mailto"]') %>%
    html_text()
  email_address <- c(email_address, list(email = page_emails))
  # pause between requests
  Sys.sleep(3)
}
#display the sixth result, i.e. email_address[6]:
$email
[1] "[email protected]"
# print every e-mail address collected so far
email_address
# You can also use a loop to scrape all pages
# -----
# Shut everything down: end the seleniumPipes session, close the browser
# that rsDriver() opened for its own client, then stop the server.
remDr %>% deleteSession()
rD[["client"]]$close()
rD[["server"]]$stop()