<br />
library(stringr)<br />
library(XML)<br />
# Parse one profile page and extract personal details from its text.
#
# Args:
#   url: Address (or local path) of the HTML page to parse.
#
# Returns:
#   The list built by get_info() from the text of the `dd` nodes with class
#   "lr" found inside the `div` whose id is "introduction". When several
#   nodes match, each one contributes its own element to every field.
extract_from_url <- function(url) {
  # State the encoding explicitly, as is done for the index page in this
  # script, so the Chinese text is not decoded with a guessed charset.
  doc <- htmlParse(url, encoding = "utf-8")
  txt <- unlist(
    xpathApply(doc, "//div[@id='introduction']//dd[@class='lr']", xmlValue)
  )
  # unlist() of an empty node set is NULL; substitute one missing string so
  # that each extracted field is a single NA rather than an empty result.
  if (length(txt) == 0) {
    txt <- NA_character_
  }
  get_info(txt)
}
# Extract personal information from free text.
#
# Args:
#   str: Character vector of text to search.
#
# Returns:
#   A named list with the elements year, month, day, addr and graduate, each
#   being the result of the corresponding extractor applied to `str`.
get_info <- function(str) {
  extractors <- list(
    year = get_birth_year,
    month = get_birth_month,
    day = get_birth_day,
    addr = get_birth_addr,
    graduate = get_graduate_from
  )
  # lapply() keeps the names of `extractors`, giving the same named list.
  lapply(extractors, function(extract) extract(str))
}
# Extract the birth year from free text.
#
# Args:
#   str: Character vector of text to search.
#
# Returns:
#   Numeric vector with one element per input string: the first run of four
#   digits that is followed by "年" (year) and then, with none of the listed
#   punctuation marks in between, by "生" (born). NA where nothing matches.
get_birth_year <- function(str) {
  # regex() replaces perl(), which stringr deprecated in version 1.0.0; the
  # look-ahead used here is supported by regex().
  # NOTE(review): the character class lists "," twice; one of them may
  # originally have been a full-width comma -- confirm before changing it.
  year_pattern <- regex("\\d{4}(?=年[^。、,,.]*生)")
  as.numeric(str_extract(str, pattern = year_pattern))
}
# Extract the birth month from free text.
#
# Args:
#   str: Character vector of text to search.
#
# Returns:
#   Numeric vector with one element per input string: the first run of one or
#   two digits that is followed by "月" (month) and then, with none of the
#   listed punctuation marks in between, by "生" (born). NA where nothing
#   matches.
get_birth_month <- function(str) {
  # regex() replaces perl(), which stringr deprecated in version 1.0.0; the
  # look-ahead used here is supported by regex().
  # NOTE(review): the character class lists "," twice; one of them may
  # originally have been a full-width comma -- confirm before changing it.
  month_pattern <- regex("\\d{1,2}(?=月[^。、,,.]*生)")
  as.numeric(str_extract(str, pattern = month_pattern))
}
# Extract the day of birth from free text.
#
# Args:
#   str: Character vector of text to search.
#
# Returns:
#   Numeric vector with one element per input string: the first run of one or
#   two digits that is followed by "日" (day) and then, with none of the
#   listed punctuation marks in between, by "生" (born). NA where nothing
#   matches.
get_birth_day <- function(str) {
  # regex() replaces perl(), which stringr deprecated in version 1.0.0; the
  # look-ahead used here is supported by regex().
  # NOTE(review): the character class lists "," twice; one of them may
  # originally have been a full-width comma -- confirm before changing it.
  day_pattern <- regex("\\d{1,2}(?=日[^。、,,.]*生)")
  as.numeric(str_extract(str, pattern = day_pattern))
}
# Alternative patterns for a place of birth. Each accepts a run of characters
# containing no digit and none of the listed punctuation marks:
#   alt1: the run directly after "生于" (born in)
#   alt2: the run directly before "人。" (native of ..., end of sentence)
#   alt3: the run directly after "籍贯" (ancestral home)
# NOTE(review): the character classes list "," twice; one of them may
# originally have been a full-width comma -- confirm before changing it.
alt1 <- "(?<=生于)[^。、,,.[:digit:]]+"
alt2 <- "[^。、,,.[:digit:]]+(?=人。)"
alt3 <- "(?<=籍贯)[^。、,,.[:digit:]]+"
# regex() replaces perl(), which stringr deprecated in version 1.0.0; the
# look-around syntax used above is supported by regex().
place_expr <- regex(paste(alt1, alt2, alt3, sep = "|"))

# Extract the place of birth from free text.
#
# Args:
#   str: Character vector of text to search.
#
# Returns:
#   Character vector with one element per input string holding the first
#   match of `place_expr`, or NA where nothing matches.
get_birth_addr <- function(str) {
  str_extract(str, place_expr)
}
# Extract the institution a person graduated from.
#
# Args:
#   str: Character vector of text to search.
#
# Returns:
#   Character vector with one element per input string: the run of characters
#   directly after "毕业于" (graduated from) that contains no digit and none
#   of the listed punctuation marks. NA where nothing matches.
get_graduate_from <- function(str) {
  # regex() replaces perl(), which stringr deprecated in version 1.0.0; the
  # look-behind used here is supported by regex().
  # NOTE(review): the character class lists "," twice; one of them may
  # originally have been a full-width comma -- confirm before changing it.
  school_pattern <- regex("(?<=毕业于)[^。、,,.[:digit:]]+")
  str_extract(str, pattern = school_pattern)
}
## Collect the address of every page linked from the index page.
# The site root is defined once and used both for the index address and for
# resolving relative links below.
base_url <- "http://sourcedb.cas.cn/sourcedb_ad_cas/zw2/ysxx_xxbwz"
url <- paste0(base_url, "/qtysmd/index.html")
doc <- htmlParse(url, encoding = "utf-8")

# Build a character vector of href values named by the link text. The [@href]
# predicate skips anchors without an href, for which xmlGetAttr() returns
# NULL and the names<- assignment would fail.
links <- xpathSApply(doc, path = "//td/a[@href]", fun = function(node) {
  href <- xmlGetAttr(node, "href")
  names(href) <- xmlValue(node)
  href
})

# Replace relative links with absolute links. The pattern is anchored so that
# only a leading ".." is rewritten; sub() keeps the names of `links`.
links <- sub("^\\.\\.", base_url, links)
# Grab information from each URL.
# lapply() builds the list in one pass instead of growing it inside a loop.
result <- lapply(seq_along(links), function(i) {
  person <- names(links)[i]
  page_url <- links[[i]]
  cat(person, ":", page_url, "\n")
  Sys.sleep(0.3)  # pause between requests so the server is not flooded
  # A page that cannot be fetched or parsed is reported and skipped, so one
  # bad link does not discard everything collected so far.
  tryCatch(
    c(name = person, extract_from_url(page_url)),
    error = function(e) {
      warning("Skipping ", page_url, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
})

# Drop the skipped pages, then stack one data frame per page. Strings are
# kept as character columns regardless of the R version's default.
result <- Filter(Negate(is.null), result)
result <- do.call(
  rbind,
  lapply(result, as.data.frame, stringsAsFactors = FALSE)
)