install.packages("rvest") # Delet after first install
install.packages("dplyr") # Delet after first install
install.packages("ggplot2") # Delet after first install
library(rvest)
library(dplyr)
library(ggplot2)
# Function scrap.f: scrape one Baidu News results page into a data frame
scrap.f <- function(url) {
  # rvest 0.3.0 deprecated html(); read_html() accepts an explicit encoding
  web <- read_html(url, encoding = "UTF-8")
  # Headlines and links are taken from the same nodes so the columns stay aligned
  Headline <- web %>% html_nodes("h3.c-title a") %>% html_text() %>% as.character()
  Source <- web %>% html_nodes("p.c-author") %>% html_text() %>% as.character()
  keyword <- web %>% html_nodes("em") %>% html_text()
  keyword <- keyword[1:length(Headline)] %>% as.factor()
  # Media name: everything before the first digit of the timestamp
  Media <- Source %>% substring(1, regexpr('[0-9]', .) - 2) %>% as.factor()
  # a: indices of results carrying a "相同新闻" (same news) link;
  # b: the duplicate count pulled out of that link's text ([0-9]+ grabs the
  # whole number, which the original two-branch alternation did not guarantee)
  a <- web %>% html_nodes("a.c-more_link") %>% html_text() %>% grep("相同新闻", .)
  b <- web %>% html_nodes("a.c-more_link") %>%
    html_text() %>%
    substring(regexpr('[0-9]+', .),
              regexpr('[0-9]+', .) + attr(regexpr('[0-9]+', .), 'match.length') - 1)
  Same_New <- vector(length = length(Headline), mode = "numeric")
  Same_New[a] <- b
  Same_New <- Same_New %>% as.numeric()
  # Publication time: either an absolute date ("2015年..月..日") or a relative
  # one ("..小时前"); relative times are replaced with today's date
  TIME <- Source %>% substring(regexpr('((201.{1,})|([0-9].{1,}))(前|日)', .),
                               regexpr('((201.{1,})|([0-9].{1,}))(前|日)', .) +
                                 attr(regexpr('((201.{1,})|([0-9].{1,}))(前|日)', .), 'match.length') - 1)
  TIME <- sub('[0-9].{1,}前', Sys.Date(), TIME)
  TIME <- gsub('年|月', '-', TIME)
  TIME <- gsub('日', '', TIME)
  DATE <- as.Date(TIME, format = "%Y-%m-%d")
  # pluck() takes the first attribute of each <a>, assumed to be its href
  Link <- web %>% html_nodes("h3.c-title a") %>% html_attrs() %>% pluck(1, character(1))
  data.frame(Headline, Media, keyword, Same_New, DATE, Link,
             stringsAsFactors = FALSE)
}
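# A quick single-page sanity check of scrap.f (a sketch; test_url is an
# illustrative name, and the query term is the first keyword from the list
# below, 奶粉). Testing one page first surfaces encoding or selector problems
# before the full loop runs.
test_url <- paste0("http://news.baidu.com/ns?word=%E5%A5%B6%E7%B2%89",
                   "&pn=0&cl=2&ct=1&tn=news&rn=20&ie=utf-8&bt=0&et=0")
str(scrap.f(test_url))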
# How many results to fetch: offsets run from 0 to 200 in steps of 20 (20 results per page)
Pages <- seq(0, 200, 20)
# Keywords: URL-encoded (UTF-8) Chinese search terms, i.e. what goes into the search box
Keywords <- c("%E5%A5%B6%E7%B2%89",
"%E7%BE%8E%E8%B5%9E%E8%87%A3",
"%E5%A4%9A%E7%BE%8E%E6%BB%8B",
"%E8%B4%9D%E5%9B%A0%E7%BE%8E",
"%E4%BC%8A%E5%88%A9",
"%E9%9B%85%E5%9F%B9",
"%E6%83%A0%E6%B0%8F",
"%E6%83%A0%E6%B0%8F%E5%90%AF%E8%B5%8B",
"%E5%90%88%E7%94%9F%E5%85%83",
"%E8%AF%BA%E4%BC%98%E8%83%BD",
"%E9%A3%9E%E9%B9%A4",
"%E7%88%B1%E4%BB%96%E7%BE%8E",
"%E5%8F%AF%E7%91%9E%E5%BA%B7",
"%E7%BE%8E%E7%B4%A0%E4%BD%B3%E5%84%BF")
Sites <- data.frame()
for (i in 1:length(Keywords)) {
  for (j in 1:length(Pages)) {
    # Note the "=" after pn: the original "&pn" dropped it, so the page
    # offset was never applied
    Sites[i, j] <- paste("http://news.baidu.com/ns?word=",
                         Keywords[i],
                         "&pn=",
                         Pages[j],
                         "&cl=2&ct=1&tn=news&rn=20&ie=utf-8&bt=0&et=0",
                         sep = "")
  }
}
Sites_vector <- unlist(Sites)
News <- data.frame()
for (i in 1:length(Sites_vector)) {
  News <- rbind(News, scrap.f(Sites_vector[i]))
}
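# A more defensive variant of the loop above (a sketch; News_safe is an
# illustrative name): tryCatch() keeps one failed page from aborting the whole
# run, and Sys.sleep() spaces requests out so Baidu is less likely to block them.
News_safe <- data.frame()
for (i in 1:length(Sites_vector)) {
  page <- tryCatch(scrap.f(Sites_vector[i]), error = function(e) NULL)
  if (!is.null(page)) News_safe <- rbind(News_safe, page)
  Sys.sleep(1)  # one-second pause between requests
}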
# .keep_all = TRUE keeps every column while de-duplicating on the headline;
# distinct(Headline) alone would drop all the other columns
News <- News %>% distinct(Headline, .keep_all = TRUE)
# # # # # # # # # # # # # # # # # PART 4 WHERE TO SAVE # # # # # # # # # # # # # # # # # # # # #
# fileEncoding keeps the Chinese text intact when the CSV is written on Windows
write.csv(News, "E:/RCase/Scrap/DailyNews/IMF/New.csv", row.names = FALSE,
          fileEncoding = "UTF-8")
The code above scrapes Baidu industry news, but under rvest 0.3.0, even after switching html() to the new package's reader, Chinese pages still come back as garbled text.
Any guidance would be appreciated.
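A minimal diagnostic sketch for the garbled-text problem, assuming it comes from a mismatch between the page's actual encoding and the one passed to read_html(); rvest 0.3.0 provides guess_encoding() and repair_encoding() for exactly this case (url here stands for any one of the result pages built above):

url <- Sites_vector[1]
# 1. Read the page and inspect what encoding the scraped text looks like
txt <- read_html(url) %>% html_nodes("h3.c-title a") %>% html_text()
guess_encoding(txt)  # ranked candidate encodings, e.g. UTF-8 vs GB18030
# 2. If the top guess is not UTF-8, re-read with that encoding instead
web <- read_html(url, encoding = "GB18030")  # substitute the top guess
# 3. Or repair already-scraped text in place
txt_fixed <- repair_encoding(txt, from = "UTF-8")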