I have recently been using R to crawl Sina Weibo and found that the weibo search in the existing Rweibo package no longer works. This post records the parsing process, in the hope that it helps anyone continuing this line of work. Sina keeps updating its site, while Rweibo has not changed in a long time. At present, Sina has made the following changes:
1. Users who are not logged in to Weibo can no longer use advanced search, and can only view the first page of search results.
Solution: simulate a login with cookies.
2. Periodically, Sina inserts a redirect URL in front of the target page, so crawling with a fixed URL fails some fraction of the time.
Solution: when the response is a redirect page, parse out the redirect target address.
3. Some pages cannot be parsed with fromJSON (it complains about an invalid character; I examined the JSON for a long time without finding the error).
Solution: replace fromJSON with gsub.
The approach below draws on the "RCurl不务正业" tutorial and the official Rweibo documentation; I would suggest the following improvements to Rweibo.
1. Simulate a login with cookies.
Replace the request headers with the following:
myHttpheader <- c(
  "Connection" = "keep-alive",
  "Host" = "s.weibo.com",
  "User-Agent" = "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
  # "Accept-Encoding" = "gzip,deflate",  # add only if you need it; if unsure, leave it out -- a wrong value can even crash R
  # Paste in all the cookies of a logged-in session here, separated by ";".
  "Cookie" = "SUB=0033WrSXqPxfM725Ws9jq(omitted);UOR=ww(omitted);_ndefined"
)
# Set up the debug gatherer and the curl handle
d <- debugGatherer()
cHandle2 <- getCurlHandle(httpheader = myHttpheader, followlocation = 1,
                          debugfunction = d$update, verbose = TRUE)
# If nothing goes wrong, fetching a hand-built search URL
# (note: getURL()'s encoding argument is '.encoding', not 'encoding'):
temp <- getURL("http://s.weibo.com/weibo/%E5%9B%BD%E5%AE%B6%E5%BC%80%E5%8F%91%E9%93%B6%E8%A1%8C&xsort=time&Refer=g&nodup=1&page=2",
               curl = cHandle2, .encoding = 'UTF-8')
# returns the HTML of the corresponding search result page.
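As a quick sanity check (my own addition, not part of Rweibo): a logged-in search result contains the "pl_weibo_direct" pagelet that the parser below looks for, so its absence usually means the cookie is stale or we got a redirect page:

# Heuristic check: a stale cookie or a redirect typically yields a page
# without the search pagelet.
if (!grepl("\"pid\":\"pl_weibo_direct\"", temp)) {
  warning("No search pagelet found - check the Cookie header, or handle the redirect (see point 2).")
}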
2. The request may also return a redirect page instead, like the following:
[data]
"\n\t\t<html>\n\t\t<head>\n\t\t<title>\xd0\xc2\xc0\xcbͨ\xd0\xd0֤</title>\n\t\t<meta http-equiv=\"refresh\" content=\"0; url='http://s.weibo.com/weibo/%E5%9B%BD%E5%AE%B6%E5%BC%80%E5%8F%91%E9%93%B6%E8%A1%8C&xsort=time&Refer=g&nodup=1&page=1?retcode=6102'\"/>\n\t\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=GBK\" />\n\t\t</head>\n\t\t<body bgcolor=\"#ffffff\" text=\"#000000\" link=\"#0000cc\" vlink=\"#551a8b\" alink=\"#ff0000\">\n\t\t<script type=\"text/javascript\" language=\"javascript\">\n\t\tlocation.replace(\"http://s.weibo.com/weibo/%E5%9B%BD%E5%AE%B6%E5%BC%80%E5%8F%91%E9%93%B6%E8%A1%8C&xsort=time&Refer=g&nodup=1&page=1?retcode=6102\");\n\t\t</script>\n\t\t</body>\n\t\t</html>"
[/data]
The string inside location.replace() is the new address; fetching it returns the search page.
# If we fetched a redirect page, read out the redirect target and fetch it.
# grepl() replaces the original grep() == 1 test, which errors out with
# "argument is of length zero" whenever there is no match.
if (grepl("location.replace\\(\"", temp)) {
  strurl <- gsub(".*location.replace\\(\"(.*)\"\\);.*", "\\1", temp)
  temp <- getURL(strurl, curl = cHandle2, .encoding = 'UTF-8')
}
3. The Rweibo search function parses the pagelet JSON roughly like this:

weibolist <- .fromJSON(weibojson)
weibopage <- htmlParse(weibolist$html, asText = TRUE, encoding = "UTF-8")
In my tests, this parse fails on Sina Weibo's HTML roughly 5% of the time. Extracting the html field with gsub instead works around it:
weibojson1 <- gsub(".*\"html\":\"(.*)\"}", "\\1", weibojson)
weibojson2 <- gsub("\\\\", "", weibojson1)
# weibolist <- .fromJSON(weibojson)
weibopage <- htmlParse(weibojson2, asText = TRUE, encoding = "UTF-8")
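To see what the two gsub calls do, here is a minimal sketch on a made-up pagelet string (the toy value is mine, not real Weibo output):

# A made-up fragment in the same shape as Sina's pagelet JSON:
toy <- "{\"pid\":\"pl_weibo_direct\",\"html\":\"<div class=\\\"wb\\\">hi</div>\"}"
# Step 1: capture the value of the "html" field (still JSON-escaped)
step1 <- gsub(".*\"html\":\"(.*)\"}", "\\1", toy)
# Step 2: strip the escape backslashes, leaving plain HTML:
# <div class="wb">hi</div>
step2 <- gsub("\\\\", "", step1)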
Finally, here is the slightly modified Rweibo search-and-parse function, based on web.search.combinewith (with minor changes).
Usage example:
res12 <- wb("统计之都", page = 20, curl = cHandle2, sleepmean = 30)
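The result of one run can be fed back in for incremental crawling. A hypothetical follow-up call (the date is made up for illustration): combinewith deduplicates against the already-stored weibos by MID, and since discards anything older than the given day:

res13 <- wb("统计之都", page = 20, curl = cHandle2, sleepmean = 30,
            combinewith = res12, since = "2014-09-01")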
Preparation: the following packages must be loaded first:
library(Rcpp)
library(bitops)
library(RCurl)
library(rjson)
library(tools)
library(digest)
library(XML)
library(Rweibo)
You also need to load the functions from utils.R inside the Rweibo package separately.
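For example (a sketch; the path is an assumption -- point it at your local copy of the Rweibo source):

# Load the internal helpers (.cntoUTF8, .strextract, .strtrim, ...) used below;
# adjust the path to wherever the Rweibo source tree lives on your machine.
source("Rweibo/R/utils.R")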
wb <- function(sword = "国际开发银行", page = 1, curl = NULL, sleepmean = 20,
               combinewith = NULL, since = NULL) {
  # Sanity-check the requested page numbers (at most 50 pages)
  if (length(page) == 1) page <- 1:page
  page <- page[page > 0 & page <= 50]
  page <- sort(page)
  if (length(page) > 50) page <- page[1:50]
  # If 'combinewith' has the expected columns, start from it and deduplicate
  if (!is.null(combinewith)) {
    if (all(c("MID", "UID", "Author", "Weibo", "Forward", "Time_Weibo",
              "Time_Search", "Count_Forward", "Count_Reply") %in% names(combinewith))) {
      OUT <- combinewith[, c("MID", "UID", "Author", "Weibo", "Forward", "Time_Weibo",
                             "Time_Search", "Count_Forward", "Count_Reply")]
      maxid <- max(as.numeric(OUT$MID))
    } else {
      OUT <- data.frame(stringsAsFactors = FALSE)
      maxid <- 0
      warning("Ignored 'combinewith' because of wrong format!")
    }
  } else {
    OUT <- data.frame(stringsAsFactors = FALSE)
    maxid <- 0
  }
  # Check that the 'since' date is well-formed
  if (is.null(since)) {
    maxdate <- -Inf
  } else {
    if (inherits(since, "character")) {
      since <- strptime(since, format = "%Y-%m-%d")
      if (is.na(since)) {
        warning("Ignored 'since' because of the wrong format!")
        maxdate <- -Inf
      }
    }
    if (inherits(since, "POSIXlt")) maxdate <- since
  }
  # Loop over the requested pages
  for (ipage in page) {
    print(paste("Current page:", ipage))
    # Retry each page up to 5 times before giving up on it
    for (i in 1:5) {
      # Random pause between requests
      Sys.sleep(abs(rnorm(1, mean = sleepmean, sd = 2)))
      tmp.search <- try(lgsearch.content(sword, page = ipage, curl = curl), silent = TRUE)
      if (is.null(tmp.search)) {
        # Back off: the sleep grows with each failure
        Sys.sleep(abs(rnorm(1, mean = sleepmean * (1.5^i), sd = 2)))
        cat(paste(ipage - 1, " pages were stored!\n", sep = ""))
        Search <- FALSE  # flag retained from the original Rweibo code; unused in this loop
      } else if (inherits(tmp.search, "try-error")) {
        warning(paste("Error in page ", ipage, sep = ""))
      } else {
        # Keep only weibos newer than what is already stored
        if (min(as.numeric(tmp.search$MID)) <= maxid || min(tmp.search$Time_Weibo) < maxdate) {
          Search <- FALSE
          tmp.search <- tmp.search[as.numeric(tmp.search$MID) > maxid &
                                     tmp.search$Time_Weibo >= maxdate, ]
        }
        OUT <- rbind(tmp.search, OUT)
        break
      }
      print(paste("Failures so far on this page:", i, "(gives up after 5)"))
    }
  }
  return(OUT)
}
lgsearch.content <- function(sword = "国家开发银行", page = 1, curl = NULL) {
  # Build the search URL (sorted by time)
  requestURL <- "http://s.weibo.com/weibo/"
  sword <- curlEscape(.cntoUTF8(sword))
  strurl <- paste(requestURL, sword, "&xsort=time&Refer=g&nodup=1&page=", page, sep = "")
  temp <- getURL(strurl, curl = curl, .encoding = 'UTF-8')
  # If we fetched a redirect page, read out the redirect target and fetch it
  # (grepl instead of the original grep() == 1, which errors on no match)
  if (grepl("location.replace\\(\"", temp)) {
    strurl <- gsub(".*location.replace\\(\"(.*)\"\\);.*", "\\1", temp)
    temp <- getURL(strurl, curl = curl, .encoding = 'UTF-8')
  }
  resHTMLs <- .strextract(temp, "<script>.+?</script>")[[1]]
  resHTML <- resHTMLs[grep("\"pid\":\"pl_weibo_direct\"", resHTMLs)][1]
  if (is.na(resHTML)) {
    warning("Can not crawl any page now. May be forbidden by Sina temporarily.", call. = FALSE)
    return(NULL)
  }
  weibojson <- gsub("\\)</script>$", "", gsub("^.*STK.pageletM.view\\(", "", resHTML))
  # Extract the html field with regular expressions instead of .fromJSON
  weibojson1 <- gsub(".*\"html\":\"(.*)\"}", "\\1", weibojson)
  weibojson2 <- gsub("\\\\", "", weibojson1)
  # weibolist <- .fromJSON(weibojson)
  weibopage <- htmlParse(weibojson2, asText = TRUE, encoding = "UTF-8")
  weiboitem.attr <- getNodeSet(weibopage, "//dl[@action-type='feed_list_item']")
  weiboitem.con <- getNodeSet(weibopage, "//dd[@class='content']")
  weiboitem.nores <- getNodeSet(weibopage, "//div[@class='pl_noresult']")
  if (length(weiboitem.nores) == 0) {
    res.mid <- sapply(weiboitem.attr, function(X) xmlGetAttr(X, "mid"))
    res.con <- sapply(weiboitem.con, FUN = function(X) xmlValue(getNodeSet(X, "p[@node-type='feed_list_content']")[[1]]))
    res.uid <- sapply(weiboitem.con, FUN = function(X) xmlGetAttr(getNodeSet(X, "p[@node-type='feed_list_content']/a")[[1]], "usercard"))
    res.uid <- gsub("id=(.*)&.*", "\\1", res.uid)
    res.name <- sapply(weiboitem.con, FUN = function(X) xmlGetAttr(getNodeSet(X, "p[@node-type='feed_list_content']/a")[[1]], "nick-name"))
    res.date <- sapply(weiboitem.con, FUN = function(X) xmlGetAttr(getNodeSet(X, "p/a[@node-type='feed_list_item_date']")[[1]], "title"))
    res.stat <- lapply(weiboitem.con, FUN = function(X) sapply(getNodeSet(X, "p/span/a"), xmlValue))
    # The forwarded (reposted) content, if any
    res.forward <- sapply(weiboitem.con, FUN = function(X) {
      tmp.node <- getNodeSet(X, "dl/dt[@node-type='feed_list_forwardContent']")
      if (length(tmp.node) == 0) {
        NA
      } else {
        xmlValue(tmp.node[[1]])
      }
    })
    Encoding(res.name) <- "UTF-8"
    res.con <- .strtrim(res.con)
    res.forward <- .strtrim(res.forward)
    res.date <- strptime(res.date, format = "%Y-%m-%d %H:%M")
    # "\u8F6C\u53D1" = forward count, "\u8BC4\u8BBA" = reply count
    res.stat.f <- as.numeric(gsub("[^0-9]", "", sapply(res.stat, FUN = function(X) X[grep("\u8F6C\u53D1", X)])))
    res.stat.r <- as.numeric(gsub("[^0-9]", "", sapply(res.stat, FUN = function(X) X[grep("\u8BC4\u8BBA", X)])))
    res.stat.f[is.na(res.stat.f)] <- 0
    res.stat.r[is.na(res.stat.r)] <- 0
    OUT <- data.frame(MID = res.mid, UID = res.uid, Author = res.name, Weibo = res.con,
                      Forward = res.forward, Time_Weibo = res.date,
                      Time_Search = Sys.time(), Count_Forward = res.stat.f,
                      Count_Reply = res.stat.r, stringsAsFactors = FALSE)
    # Strip the leading "Author:" prefix from each weibo's text
    OUT$Weibo <- sapply(seq_along(OUT$Weibo), FUN = function(X)
      gsub(paste("^ *", OUT$Author[X], "\uFF1A *", sep = ""), "", OUT$Weibo[X]))
  } else {
    OUT <- NULL
  }
  return(OUT)
}
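Before launching a long wb() run, the page parser can be smoke-tested on its own, reusing the cHandle2 handle from above:

one.page <- lgsearch.content("国家开发银行", page = 1, curl = cHandle2)
str(one.page)  # a data.frame with MID, UID, Author, Weibo, ... or NULL if blocked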