install.packages("rvest") # Delet after first install
install.packages("dplyr") # Delet after first install
install.packages("ggplot2") # Delet after first install
library(rvest)
library(dplyr)
library(ggplot2)
# Function scrap.f: scrape one Baidu News results page into a data frame
scrap.f <- function(url) {
  # rvest 0.3.0 deprecated html(); read_html() accepts an explicit encoding
  web <- read_html(url, encoding = "UTF-8")
  # Headlines and links are taken from the same nodes so the columns stay aligned
  Headline <- web %>% html_nodes("h3.c-title a") %>% html_text() %>% as.character()
  Source <- web %>% html_nodes("p.c-author") %>% html_text() %>% as.character()
  keyword <- web %>% html_nodes("em") %>% html_text()
  keyword <- keyword[1:length(Headline)] %>% as.factor()
  # Media name: everything before the first digit of the timestamp
  Media <- Source %>% substring(1, regexpr('[0-9]', .) - 2) %>% as.factor()
  # a: indices of results carrying a "相同新闻" (same news) link;
  # b: the duplicate count pulled out of that link's text ([0-9]+ grabs the
  # whole number, which the original two-branch alternation did not guarantee)
  a <- web %>% html_nodes("a.c-more_link") %>% html_text() %>% grep("相同新闻", .)
  b <- web %>% html_nodes("a.c-more_link") %>%
    html_text() %>%
    substring(regexpr('[0-9]+', .),
              regexpr('[0-9]+', .) + attr(regexpr('[0-9]+', .), 'match.length') - 1)
  Same_New <- vector(length = length(Headline), mode = "numeric")
  Same_New[a] <- b
  Same_New <- Same_New %>% as.numeric()
  # Publication time: either an absolute date ("2015年..月..日") or a relative
  # one ("..小时前"); relative times are replaced with today's date
  TIME <- Source %>% substring(regexpr('((201.{1,})|([0-9].{1,}))(前|日)', .),
                               regexpr('((201.{1,})|([0-9].{1,}))(前|日)', .) +
                                 attr(regexpr('((201.{1,})|([0-9].{1,}))(前|日)', .), 'match.length') - 1)
  TIME <- sub('[0-9].{1,}前', Sys.Date(), TIME)
  TIME <- gsub('年|月', '-', TIME)
  TIME <- gsub('日', '', TIME)
  DATE <- as.Date(TIME, format = "%Y-%m-%d")
  # pluck() takes the first attribute of each <a>, assumed to be its href
  Link <- web %>% html_nodes("h3.c-title a") %>% html_attrs() %>% pluck(1, character(1))
  data.frame(Headline, Media, keyword, Same_New, DATE, Link,
             stringsAsFactors = FALSE)
}
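# A quick single-page sanity check of scrap.f (a sketch; test_url is an
# illustrative name, and the query term is the first keyword from the list
# below, 奶粉). Testing one page first surfaces encoding or selector problems
# before the full loop runs.
test_url <- paste0("http://news.baidu.com/ns?word=%E5%A5%B6%E7%B2%89",
                   "&pn=0&cl=2&ct=1&tn=news&rn=20&ie=utf-8&bt=0&et=0")
str(scrap.f(test_url))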
# How many results to fetch: offsets run from 0 to 200 in steps of 20 (20 results per page)
Pages <- seq(0, 200, 20)
# Keywords: URL-encoded (UTF-8) Chinese search terms, i.e. what goes into the search box
Keywords <- c("%E5%A5%B6%E7%B2%89",
"%E7%BE%8E%E8%B5%9E%E8%87%A3",
"%E5%A4%9A%E7%BE%8E%E6%BB%8B",
"%E8%B4%9D%E5%9B%A0%E7%BE%8E",
"%E4%BC%8A%E5%88%A9",
"%E9%9B%85%E5%9F%B9",
"%E6%83%A0%E6%B0%8F",
"%E6%83%A0%E6%B0%8F%E5%90%AF%E8%B5%8B",
"%E5%90%88%E7%94%9F%E5%85%83",
"%E8%AF%BA%E4%BC%98%E8%83%BD",
"%E9%A3%9E%E9%B9%A4",
"%E7%88%B1%E4%BB%96%E7%BE%8E",
"%E5%8F%AF%E7%91%9E%E5%BA%B7",
"%E7%BE%8E%E7%B4%A0%E4%BD%B3%E5%84%BF")
Sites <- data.frame()
for (i in 1:length(Keywords)) {
  for (j in 1:length(Pages)) {
    # Note the "=" after pn: the original "&pn" dropped it, so the page
    # offset was never applied
    Sites[i, j] <- paste("http://news.baidu.com/ns?word=",
                         Keywords[i],
                         "&pn=",
                         Pages[j],
                         "&cl=2&ct=1&tn=news&rn=20&ie=utf-8&bt=0&et=0",
                         sep = "")
  }
}
Sites_vector <- unlist(Sites)
News <- data.frame()
for (i in 1:length(Sites_vector)) {
  News <- rbind(News, scrap.f(Sites_vector[i]))
}
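# A more defensive variant of the loop above (a sketch; News_safe is an
# illustrative name): tryCatch() keeps one failed page from aborting the whole
# run, and Sys.sleep() spaces requests out so Baidu is less likely to block them.
News_safe <- data.frame()
for (i in 1:length(Sites_vector)) {
  page <- tryCatch(scrap.f(Sites_vector[i]), error = function(e) NULL)
  if (!is.null(page)) News_safe <- rbind(News_safe, page)
  Sys.sleep(1)  # one-second pause between requests
}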
# .keep_all = TRUE keeps every column while de-duplicating on the headline;
# distinct(Headline) alone would drop all the other columns
News <- News %>% distinct(Headline, .keep_all = TRUE)
# # # # # # # # # # # # # # # # # PART 4 WHERE TO SAVE # # # # # # # # # # # # # # # # # # # # #
# fileEncoding keeps the Chinese text intact when the CSV is written on Windows
write.csv(News, "E:/RCase/Scrap/DailyNews/IMF/New.csv", row.names = FALSE,
          fileEncoding = "UTF-8")
The code above scrapes Baidu industry news, but under rvest 0.3.0, even after switching html() to the new package's reader, Chinese pages still come back as garbled text.
Any guidance would be appreciated.
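A minimal diagnostic sketch for the garbled-text problem, assuming it comes from a mismatch between the page's actual encoding and the one passed to read_html(); rvest 0.3.0 provides guess_encoding() and repair_encoding() for exactly this case (url here stands for any one of the result pages built above):

url <- Sites_vector[1]
# 1. Read the page and inspect what encoding the scraped text looks like
txt <- read_html(url) %>% html_nodes("h3.c-title a") %>% html_text()
guess_encoding(txt)  # ranked candidate encodings, e.g. UTF-8 vs GB18030
# 2. If the top guess is not UTF-8, re-read with that encoding instead
web <- read_html(url, encoding = "GB18030")  # substitute the top guess
# 3. Or repair already-scraped text in place
txt_fixed <- repair_encoding(txt, from = "UTF-8")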