# Scrape one figure from each report linked on the Fuzhou customs index pages.
#
# Steps:
#   1. Parse the three paginated index pages and collect the report links.
#   2. Download every linked report page concurrently.
#   3. From each report take the 7th right-aligned table cell and convert
#      the collected values to numbers.
#
# NOTE(review): the meaning of the 7th right-aligned cell is not stated in
# this script -- confirm against the live pages before relying on it.

library(XML)
library(RCurl)

base_url <- paste0(
  "http://fuzhou.customs.gov.cn/publish/portal123/",
  "tab39441/module21195"
)
index_pages <- sprintf("%s/page%d.htm", base_url, 1:3)

# Only the first 18 links of the last index page are used.
# NOTE(review): presumably the remaining links fall outside the period of
# interest -- confirm.
last_page_limit <- 18L

# Position (among the right-aligned cells of a report) of the wanted figure.
value_cell_index <- 7L

# Return the href of every link inside the 81%-wide cells of an index page.
extract_links <- function(index_url) {
  doc <- htmlTreeParse(index_url, useInternalNodes = TRUE)
  unlist(
    xpathApply(doc, path = "//td[@width='81%']//a", xmlGetAttr, "href")
  )
}

# Return the text of the wanted right-aligned cell of a parsed report page.
# `url` is only used to make the error message actionable.
extract_value <- function(doc, url) {
  cells <- getNodeSet(doc = doc, path = "//tbody//tr//td[@align='right']")
  if (length(cells) < value_cell_index) {
    stop(
      "Expected at least ", value_cell_index,
      " right-aligned cells but found ", length(cells), " in ", url,
      call. = FALSE
    )
  }
  xmlValue(cells[[value_cell_index]])
}

links_per_page <- lapply(index_pages, extract_links)

# head() truncates without padding with NA when fewer links are present.
n_pages <- length(links_per_page)
links_per_page[[n_pages]] <- head(links_per_page[[n_pages]], last_page_limit)

urls <- unlist(links_per_page)
if (length(urls) == 0L) {
  stop("No report links were found on the index pages.", call. = FALSE)
}

# Fetch all report pages concurrently, then parse each response as HTML text.
subpage <- getURIAsynchronous(urls)
subpage <- lapply(subpage, htmlParse, asText = TRUE, encoding = "utf-8")

# `num` holds the raw cell text (character); it is converted on output below.
num <- vapply(
  seq_along(urls),
  function(i) extract_value(subpage[[i]], urls[[i]]),
  character(1)
)

as.numeric(num)

# Output recorded in the original source for the call above:
# [1] 853748 902361 857044 790004 768103 739519 725246 649049 404036 759120 737251
# [12] 647507 580390 614395 612146 693975 622930 592834 571778 485672 445540 546648
# [23] 570929 473495 463263 520837 464895 493134 448495 411770 403945 387954 247721
# [34] 447646 463540 425932 493237 517111 538998 526606 481722 491564 508951 439320
# [45] 349813 463458 502048 469083 433273 451939 427186 415441 397339 318752 333626
# [56] 423604 483545 394661