爬虫很长时间没用过了,最近试着更新一些数据,结果发现rvest没法直接选到h3里面的内容,各位帮看看是哪里出了问题,谢谢!
# Reprex: the CSS child selector '.on > li' finds no <li> when the list items
# are written inside <h3 class="on">, but finds all seven once the <h3> tags
# are rewritten to <ul>. Lines starting with `#>` are the recorded output.
library(rvest)
#> Loading required package: xml2
# Record the R and package versions used for this run.
sessionInfo()
#> R version 3.5.3 (2019-03-11)
#> Platform: x86_64-apple-darwin15.6.0 (64-bit)
#> Running under: macOS High Sierra 10.13.6
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] rvest_0.3.5 xml2_1.3.2
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.1 digest_0.6.18 R6_2.4.0 magrittr_1.5
#> [5] evaluate_0.13 highr_0.8 httr_1.4.0 stringi_1.4.3
#> [9] rmarkdown_1.12 tools_3.5.3 stringr_1.4.0 xfun_0.5
#> [13] yaml_2.2.0 compiler_3.5.3 htmltools_0.3.6 knitr_1.22
# Test fragment: seven <li> elements written directly inside <h3 class="on">.
xx <- '<h3 class="on"><li style="width:25%"><em></em>北京市</li><li style="width:15%">通州区</li><li style="width:15%">1300</li><li style="width:13%">16412</li><li style="width:11%">110000</li><li style="width:8%"></li><li style="width:8%"></li></h3>'
yy <- read_html(xx)
# Print the structure of the parsed document.
html_structure(yy)
#> <html>
#> <body>
#> <h3.on>
#> <li [style]>
#> <em>
#> {text}
#> <li [style]>
#> {text}
#> <li [style]>
#> {text}
#> <li [style]>
#> {text}
#> <li [style]>
#> {text}
#> <li [style]>
#> <li [style]>
# Direct-child selector on the parsed <h3> fragment: matches nothing.
# NOTE(review): <li> is not valid content for <h3>, so the HTML parser
# presumably does not keep the <li> nodes as direct children of `.on`. The
# indentation of the recorded tree above is not preserved, so confirm the
# actual nesting by re-running html_structure().
html_nodes(yy, '.on > li')
#> {xml_nodeset (0)}
# Workaround: replace every "h3" in the string with "ul", then re-parse. This
# rewrites both the opening and the closing tag; note it would also replace
# any "h3" that happened to appear in the text or attributes.
xx <- gsub('h3', 'ul', xx)
yy <- read_html(xx)
html_structure(yy) ## changed
#> <html>
#> <body>
#> <ul.on>
#> <li [style]>
#> <em>
#> {text}
#> <li [style]>
#> {text}
#> <li [style]>
#> {text}
#> <li [style]>
#> {text}
#> <li [style]>
#> {text}
#> <li [style]>
#> <li [style]>
# The same selector now matches all seven <li> elements.
html_nodes(yy, '.on > li')
#> {xml_nodeset (7)}
#> [1] <li style="width:25%">\n<em></em>北京市</li>\n
#> [2] <li style="width:15%">通州区</li>\n
#> [3] <li style="width:15%">1300</li>\n
#> [4] <li style="width:13%">16412</li>\n
#> [5] <li style="width:11%">110000</li>\n
#> [6] <li style="width:8%">\n
#> [7] <li style="width:8%">
<sup>Created on 2020-05-22 by the reprex package (v0.2.1)</sup>