可以考慮使用 XML 套件:它是你唯一需要的套件,只要呼叫 xpathSApply 即可:
library(XML)
# Download the first page of search results and parse it as HTML.
theURL <- "http://www.r-bloggers.com/search/web%20scraping"
page_data <- htmlParse(readLines(theURL, warn = FALSE))

# Read the pager label out of the parsed document held in `page_data`.
pages <- xpathSApply(
  page_data,
  '//*[@id="leftcontent"]/div[11]/span[1]',
  xmlValue
)
# Keep only the trailing run of digits of that label as the page count.
pages <- as.numeric(regmatches(pages, regexpr("[0-9]+$", pages)))
# Fail early with a clear message instead of breaking later in the page loop.
if (length(pages) != 1L || is.na(pages)) {
  stop(
    "Could not determine the number of result pages from ", theURL,
    call. = FALSE
  )
}
#' Extract the blog posts listed on one parsed search-results page.
#'
#' @param doc A parsed HTML document, as produced by `htmlParse()`.
#' @param page Page identifier stored alongside every row of the result.
#' @return A data frame with one row per post and the columns `title`,
#'   `description`, `author`, `date`, `url` and `page`.
scrape_r_bloggers_page <- function(doc, page) {
  # Every post is a <div> whose id contains "post"; its heading link carries
  # both the title (link text) and the post address (href attribute).
  post_link <- '//div[contains(@id,"post")]/h2/a'

  titles <- xpathSApply(doc, post_link, xmlValue)
  descriptions <- xpathSApply(
    doc, '//div[contains(@id,"post")]/div[2]/p[1]', xmlValue
  )
  dates <- xpathSApply(doc, '//div[contains(@id,"post")]/div[1]/div', xmlValue)
  authors <- xpathSApply(doc, '//div[contains(@id,"post")]/div[1]/a', xmlValue)
  # Read the href attribute: xmlValue on this node would repeat the title text.
  urls <- xpathSApply(doc, post_link, xmlGetAttr, "href")

  # Returned as the last expression so the result is visible to the caller.
  data.frame(
    title = titles,
    description = descriptions,
    author = authors,
    date = dates,
    url = urls,
    page = page,
    # Keep text columns as character regardless of the R version's default.
    stringsAsFactors = FALSE
  )
}
# Scrape the first results page, which is already parsed in `page_data`.
blogsdf <- scrape_r_bloggers_page(page_data, 1)

# Visit pages 2 .. (pages - 1). seq_len() gives an empty vector when there are
# fewer than three pages, whereas 2:(pages - 1) would count downwards (2, 1, 0)
# and request pages that were already scraped or do not exist.
# NOTE(review): the upper bound pages - 1 leaves out the last results page;
# confirm whether that is intended.
blogsList <- lapply(seq_len(max(pages - 1, 0))[-1], function(page) {
  Sys.sleep(1)  # pause between consecutive requests
  theURL <- paste0(
    "http://www.r-bloggers.com/search/web%20scraping/page/", page, "/"
  )
  page_data <- htmlParse(readLines(theURL, warn = FALSE))
  scrape_r_bloggers_page(page_data, page)
})

# Stack the first page and all further pages into a single data frame;
# do.call(rbind, list()) is NULL, which rbind() ignores.
finaldf <- rbind(blogsdf, do.call(rbind, blogsList))
這可能是 RCurl 的問題,因爲 XML 套件運作正常:`doc <- htmlParse(readLines(theURL)); xpathSApply(doc, '//*[@id="leftcontent"]/div[11]/span[1]', xmlValue); # [1] "Page 1 of 25"` – Parfait
@Parfait 謝謝。這是我的一段程式碼:`theURL <- "http://www.r-bloggers.com/search/web%20scraping"; page_data <- html(theURL); # 取得總頁數; pages <- page_data %>% html_nodes(xpath = '//*[@id="leftcontent"]/div[11]/span[1]'); pages <- sapply(pages, xmlValue)` // 錯誤就出現在最後這行程式碼。 –
我完全理解代碼來自哪裏。這個錯誤可能是一個RCurl問題,因爲正如我上面顯示的那樣,精確的XPath在XML中起作用。 – Parfait