kongdd

  •  
  • 2019年9月11日
  • 注册于 2014年12月4日
  • 刚好500个

    > pkgs
      [1] "abind"                "ACWD"                 "ADGofTest"           
      [4] "airGR"                "alabama"              "amap"                
      [7] "AnnotationDbi"        "ape"                  "assertive"           
     [10] "assertive.base"       "assertive.code"       "assertive.data"      
     [13] "assertive.data.uk"    "assertive.data.us"    "assertive.datetimes" 
     [16] "assertive.files"      "assertive.matrices"   "assertive.models"    
     [19] "assertive.numbers"    "assertive.properties" "assertive.reflection"
     [22] "assertive.sets"       "assertive.strings"    "assertive.types"     
     [25] "assertthat"           "automap"              "babynames"           
     [28] "backports"            "BAMMtools"            "base"                
     [31] "base64"               "base64enc"            "BB"                  
     [34] "bcp"                  "bfast"                "BH"                  
     [37] "bindr"                "bindrcpp"             "Biobase"             
     [40] "BiocGenerics"         "BiocInstaller"        "BiocParallel"        
     [43] "BiocStyle"            "BioMark"              "bit"                 
     [46] "bit64"                "bitops"               "blob"                
     [49] "bmp"                  "bookdown"             "boot"                
     [52] "brew"                 "Cairo"                "cairoDevice"         
     [55] "callr"                "car"                  "caTools"             
     [58] "cellranger"           "changepoint"          "checkpoint"          
     [61] "chillR"               "chron"                "circlize"            
     [64] "circular"             "class"                "classInt"            
     [67] "climdex.pcic"         "clipr"                "cluster"             
     [70] "clv"                  "cmprsk"               "coda"                
     [73] "codetools"            "colorr"               "colorspace"          
     [76] "colortools"           "commonmark"           "compiler"            
     [79] "copBasic"             "copula"               "corpcor"             
     [82] "corrplot"             "crayon"               "crosstalk"           
     [85] "curl"                 "curlconverter"        "data.table"          
     [88] "datasets"             "date"                 "DBI"                 
     [91] "debug"                "deldir"               "DEoptim"             
     [94] "DEoptimR"             "deployrRserve"        "desc"                
     [97] "devtools"             "dfoptim"              "diagram"             
    [100] "DiagrammeR"           "dichromat"            "digest"              
    [103] "DMwR"                 "doParallel"           "doSNOW"              
    [106] "dotCall64"            "downloader"           "dplyr"               
    [109] "drat"                 "DT"                   "dtplyr"              
    [112] "e1071"                "EcoHydRology"         "EDAWR"               
    [115] "ellipse"              "emojifont"            "entropy"             
    [118] "evaluate"             "Evapotranspiration"   "evd"                 
    [121] "expm"                 "faahKO"               "faraway"             
    [124] "fastmatch"            "fasttime"             "fdrtool"             
    [127] "feather"              "feature"              "fields"              
    [130] "fingerprint"          "floodmap"             "FNN"                 
    [133] "forcats"              "foreach"              "forecast"            
    [136] "foreign"              "formatR"              "fracdiff"            
    [139] "fume"                 "futile.logger"        "futile.options"      
    [142] "gcookbook"            "gdalUtils"            "gdata"               
    [145] "GenomeInfoDb"         "geocors"              "geosphere"           
    [148] "ggm"                  "ggmap"                "ggplot2"             
    [151] "ggthemes"             "ggvis"                "gimms"               
    [154] "git2r"                "glmnet"               "GlobalOptions"       
    [157] "glue"                 "gmodels"              "goftest"             
    [160] "gpclib"               "gplots"               "gpuR"                
    [163] "graphics"             "grDevices"            "greenbrown"          
    [166] "grid"                 "gridBase"             "gridExtra"           
    [169] "gsl"                  "gstat"                "gsw"                 
    [172] "gtable"               "gtools"               "gWidgets"            
    [175] "gWidgetsRGtk2"        "haven"                "hexbin"              
    [178] "highr"                "hms"                  "htmltools"           
    [181] "htmlwidgets"          "httpuv"               "httr"                
    [184] "hyperSpec"            "igraph"               "imager"              
    [187] "influenceR"           "inline"               "installr"            
    [190] "intamap"              "intervals"            "Ipaper"              
    [193] "IRanges"              "irlba"                "iterators"           
    [196] "jpeg"                 "js"                   "jsonlite"            
    [199] "Kendall"              "KernSmooth"           "kml"                 
    [202] "knitr"                "kolmim"               "kongdd"              
    [205] "ks"                   "labeling"             "lambda.r"            
    [208] "landsat"              "landsat8"             "landscapeR"          
    [211] "lattice"              "latticeExtra"         "lazyeval"            
    [214] "lbfgs"                "leaflet"              "leaflet.extras"      
    [217] "leafletR"             "LEAPFrOG"             "LearnBayes"          
    [220] "lme4"                 "lmodel2"              "lmomco"              
    [223] "Lmoments"             "lmtest"               "locfit"              
    [226] "longitudinalData"     "lubridate"            "magrittr"            
    [229] "mailR"                "manipulate"           "mapdata"             
    [232] "mapedit"              "mapproj"              "maps"                
    [235] "maptools"             "mapview"              "marelac"             
    [238] "markdown"             "MASS"                 "MassSpecWavelet"     
    [241] "matlab"               "Matrix"               "MatrixModels"        
    [244] "maxent"               "mclust"               "memisc"              
    [247] "memoise"              "MEMSS"                "methods"             
    [250] "mgcv"                 "microbenchmark"       "mime"                
    [253] "miniUI"               "minqa"                "misc3d"              
    [256] "miscTools"            "MissInfo"             "mixOmics"            
    [259] "mlbench"              "mnormt"               "MODIS"               
    [262] "MODISTools"           "mss"                  "multicool"           
    [265] "MultinomialCI"        "multtest"             "muma"                
    [268] "munsell"              "mvbutils"             "mvna"                
    [271] "mvtnorm"              "mxnet"                "mzR"                 
    [274] "ncdf"                 "ncdf4"                "ncdf4.helpers"       
    [277] "nlme"                 "nloptr"               "NLP"                 
    [280] "NMF"                  "nnet"                 "numDeriv"            
    [283] "nycflights13"         "oce"                  "oldbookdown"         
    [286] "openssl"              "openxlsx"             "operators"           
    [289] "optextras"            "optimx"               "oz"                  
    [292] "packrat"              "parallel"             "pbdMPI"              
    [295] "pbkrtest"             "PBSmapping"           "pcaMethods"          
    [298] "pcaPP"                "pcaXcca"              "PCICt"               
    [301] "pdist"                "permute"              "phenopix"            
    [304] "pkgconfig"            "pkgmaker"             "PKI"                 
    [307] "playwith"             "plogr"                "plotly"              
    [310] "plotmap"              "plotrix"              "pls"                 
    [313] "plsdepot"             "plspm"                "plyr"                
    [316] "png"                  "polyclip"             "POT"                 
    [319] "ppcor"                "pracma"               "praise"              
    [322] "prettyunits"          "printr"               "profvis"             
    [325] "progress"             "ProtGenerics"         "proto"               
    [328] "pryr"                 "pspline"              "psych"               
    [331] "ptw"                  "purrr"                "quadprog"            
    [334] "quantmod"             "quantreg"             "R.matlab"            
    [337] "R.methodsS3"          "R.oo"                 "R.utils"             
    [340] "R6"                   "randomForest"         "randtoolbox"         
    [343] "RANN"                 "raster"               "rastermap"           
    [346] "rbenchmark"           "Rcgmin"               "rCharts"             
    [349] "RCMIP5"               "RColorBrewer"         "Rcpp"                
    [352] "RcppArmadillo"        "RcppEigen"            "RcppNumerical"       
    [355] "RcppParallel"         "RCurl"                "readbitmap"          
    [358] "readr"                "readxl"               "registry"            
    [361] "REmap"                "rematch"              "remote"              
    [364] "remoteParallel"       "repr"                 "reprex"              
    [367] "reshape"              "reshape2"             "rgdal"               
    [370] "rgeos"                "rgexf"                "rgl"                 
    [373] "RgoogleMaps"          "RGraphics"            "RGtk2"               
    [376] "rhdf5"                "RInside"              "rJava"               
    [379] "rjson"                "RJSONIO"              "rlang"               
    [382] "rlecuyer"             "rlist"                "rmarkdown"           
    [385] "RMAWGEN"              "Rmpi"                 "RMySQL"              
    [388] "rngtools"             "rngWELL"              "robustbase"          
    [391] "ROCR"                 "RODBC"                "Rook"                
    [394] "ropls"                "roxygen2"             "rpart"               
    [397] "rprojroot"            "rrcov"                "rsconnect"           
    [400] "rscPDSSI"             "RSQLite"              "rstudio"             
    [403] "rstudioapi"           "rticles"              "rversions"           
    [406] "rvest"                "RViennaCL"            "Rvmmin"              
    [409] "S4Vectors"            "sandwich"             "satellite"           
    [412] "scales"               "sda"                  "SDMTools"            
    [415] "seacarb"              "segmented"            "selectr"             
    [418] "servr"                "setRNG"               "sf"                  
    [421] "shape"                "shiny"                "shinydashboard"      
    [424] "showtext"             "showtextdb"           "slam"                
    [427] "snow"                 "snowfall"             "sourcetools"         
    [430] "sp"                   "spacetime"            "spam"                
    [433] "SparseM"              "spatial"              "SpatMCA"             
    [436] "spatstat"             "spatstat.utils"       "spdep"               
    [439] "SPEI"                 "splines"              "st"                  
    [442] "stabledist"           "stationaRy"           "stats"               
    [445] "stats4"               "stepPlr"              "stringi"             
    [448] "stringr"              "strucchange"          "survival"            
    [451] "svUnit"               "swirl"                "sysfonts"            
    [454] "tcltk"                "TeachingDemos"        "tensor"              
    [457] "tester"               "testRcppPackage"      "testthat"            
    [460] "tibble"               "tidyr"                "timeDate"            
    [463] "titanic"              "tm"                   "tools"               
    [466] "topmodel"             "translations"         "tseries"             
    [469] "TTR"                  "turner"               "ucminf"              
    [472] "udunits2"             "units"                "urca"                
    [475] "utils"                "V8"                   "vars"                
    [478] "viridis"              "viridisLite"          "visNetwork"          
    [481] "VSURF"                "water"                "waveslim"            
    [484] "webshot"              "whisker"              "withr"               
    [487] "wq"                   "xaringan"             "xcms"                
    [490] "XML"                  "xml2"                 "xtable"              
    [493] "xts"                  "yacca"                "yaml"                
    [496] "yhat"                 "yhatr"                "zlibbioc"            
    [499] "zoo"                  "zyp"
  • 可以尝试在R里面shell调用cmd吧
  • 给个完整的数据测试一下呗。一般在xml head中定义编码格式,或者用R语言的iconv,
    iconv(value, "utf-8", "gbk")
  • 建议看一下hadley的advanced R中的style guide, 如下是formatR的输出
    # lmwK: computes a per-column weight for `x` from comparisons among the
    # values of `y`, keeps the first 30 columns of the sorted weights, fits a
    # linear model on those columns and prints the squared correlation between
    # `y` and the fitted values.
    #
    # x: predictor matrix; its column names are used to label the weights.
    # y: response, used with matrix operations (t(), %*%, ncol()).
    # Returns: nothing useful -- the last expression is the cat() call.
    #
    # NOTE(review): I1 has ncol(y) rows and one column, so `I1 %*% ty` only
    # conforms when ncol(y) == 1, and then IY2 - IY1 mixes an n x 1 with a
    # 1 x n matrix. Presumably I1 was meant to have one entry per observation
    # so that IY3 holds all pairwise differences of y -- TODO confirm.
    lmwK = function(x, y) {
        I1 = array(1, dim = c(ncol(y), 1))
        ty = t(y)
        IY1 = I1 %*% ty
        IY2 = y %*% t(I1)
        IY3 = IY2 - IY1
        # (IY3 + |IY3|) / 2 keeps the positive part of IY3 and zeroes the rest.
        JIY3 = abs(IY3)
        IY5 = IY3 + JIY3
        IY6 = IY5/2
        # x / x is 1 for finite non-zero entries and NaN for zeros; replacing the
        # NaN with 0 turns IY7 into a 0/1 indicator of IY3 > 0.
        IY7 = IY6/IY6
        IY7[is.na(IY7)] = 0
        IY = IY7
        # Weight of each column of x: column mean of the squared, scaled product.
        w9 = IY %*% x
        w8 = w9/ncol(y)
        w7 = w8^2
        w = apply(w7, 2, mean)
        # Reshape the weight vector into a one-row data frame named after x.
        w1 = as.data.frame(w)
        w2 = t(w1)
        # The next two results are discarded (no auto-printing inside a function).
        dim(w2)
        class(w2)
        w3 = as.data.frame(w2)
        # Result discarded as well.
        rownames(w3)
        colnames(w3) = colnames(x)
        # NOTE(review): sort() is applied to a one-row data frame here; how that
        # orders the columns depends on sort/order handling of data frames --
        # verify. Also prefer TRUE over the reassignable `T`.
        w4 = sort(w3, decreasing = T)
        # Hard-coded 30: fails when x has fewer than 30 columns.
        wk = w4[1:30]
        l30 = colnames(wk)
        x1.select30 = as.matrix(x[, l30])
        x.30 = as.data.frame(x1.select30)
        Y = as.data.frame(y)
        MY = cbind(x.30, Y)
        # NOTE(review): `y` in the formula is found in MY only if as.data.frame(y)
        # names its column `y`; otherwise the function argument is used and the
        # response column in MY would also be picked up by `.` -- verify.
        fit = lm(formula = y ~ ., data = MY)
        r2 <- cor(y, fit$fitted.values)^2
        cat("Original R-square =", r2, "\n")
    }
    # NOTE(review): `yX17.AAG` is not defined anywhere in this snippet; it may be
    # a garbled `y$X17.AAG` -- confirm against the original data.
    y = as.matrix(yX17.AAG)
    # R is case-sensitive: the function is defined as `lmwK`, so calling `lmwk`
    # fails with "could not find function".
    lmwK(x, y)
    
  • 1. 首先加载包,创建getParams函数(用于从带参数的url中提取postForm的参数,带参数的url可以从Firefox的firebug中复制下来)
    library(RCurl)
    library(XML)
    library(magrittr)
    #' Parse a query string into a named list of form parameters
    #'
    #' @param url Character scalar: either a full URL with a query part, or just
    #'   the "name=value&name2=value2" string.
    #' @param containURL If TRUE, everything up to and including the first "?"
    #'   is dropped before parsing (the string is kept whole when there is no
    #'   "?"). Defaults to TRUE when `url` starts with "http".
    #' @return A named list with one character element per parameter. A
    #'   parameter without "=" gets the value "". An empty query gives an
    #'   empty list.
    getParams <- function(url,
                          containURL = substr(url, 1, 4) == "http"){
      # Decode a single percent-encoded component. The UTF-8 -> GBK conversion is
      # kept from the original; NOTE(review): presumably meant for a GBK-locale
      # session -- confirm it is still wanted elsewhere.
      decode <- function(s) iconv(URLdecode(s), "utf-8", "gbk")

      if (containURL){
        qmark <- regexpr("\\?", url)
        if (qmark > 0) url <- substr(url, qmark + 1, nchar(url))
      }

      # Split on the raw separators BEFORE decoding: decoding first would turn an
      # encoded "%26" / "%3D" inside a value into a real "&" / "=" and corrupt
      # the split (e.g. a redirect URL that carries its own query string).
      pairs <- strsplit(url, "&")[[1]]
      eq <- regexpr("=", pairs, fixed = TRUE)
      has_eq <- eq > 0
      keys <- ifelse(has_eq, substr(pairs, 1, eq - 1), pairs)
      vals <- ifelse(has_eq, substr(pairs, eq + 1, nchar(pairs)), "")

      keys <- vapply(keys, decode, character(1), USE.NAMES = FALSE)
      vals <- vapply(vals, decode, character(1), USE.NAMES = FALSE)
      setNames(as.list(vals), keys)
    }
    2. 然后带上百宝箱curl handle上路,行走江湖必备
    ## Credentials, request headers and a shared curl handle for the login flow.
    ## please input your password here
    user <- "kongdd"
    pwd <- "****"
    # Headers sent with every request made through the handle below.
    myHttpheader<- c(
      "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
      "Accept" = "text/html,application/xhtml+xml,application/xml,application/json;q=0.9,*/*;q=0.8",
      "Accept-Language" = "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
      "Connection"="keep-alive",
      "Host" = "cos.name")
    ch <- getCurlHandle()# one curl handle (the "toolbox") shared by the requests that follow
    # NOTE(review): ssl.verifypeer = FALSE turns off certificate verification;
    # cookiejar names the file curl writes cookies to.
    curlSetOpt(curl = ch, ssl.verifypeer = FALSE, 
               followlocation = TRUE,
               verbose = TRUE,
               cookiejar = "cookies_cnki.txt", #cookiefile = "cookies_cnki.txt", 
               httpheader = myHttpheader)
    url_login <- 'http://cos.name/cn/wp-login.php'
    # (1) first call to initialize the session; this is where the session cookie is obtained
    page <- getURL(url_login, curl = ch)
    3. 构造提交的表单的参数
    post1 <- "log=user&pwd=pwd&pTE-gB-H-f-O-c-V=WPZKteIRNC9n9nwYjh758ig-mtHo4KXjrryHb0Ag1SDIlN8TZsEWs7U6qLAUWxhUCPXuED8XoSLBtNqkQSln9ONpRCdW0YfjXEfbGUf-9Echd4sR6YIwQHLfWdLAFVra&wp-submit=%E7%99%BB%E5%BD%95&redirect_to=http%3A%2F%2Fcos.name%2Fcn%2F&testcookie=1&y-v-F-FV-A-MP-U-Rh=10809510.020100101"
    # Turn the captured POST body into a named list, then overwrite the
    # placeholder `log` / `pwd` values with the real credentials.
    params1 <- getParams(post1)
    params1$log <- user
    params1$pwd <- pwd
    # Other attempts (commented out below)
    # post2 <- "http://cos.name/cn/wp-admin/admin-ajax.php?action=gdbcRetrieveToken&browserInfo=%7B%22screenWidth%22%3A1920%2C%22screenHeight%22%3A1080%2C%22engine%22%3A24%2C%22features%22%3A95%2C%22mozilla%22%3A%225.0%22%2C%22windows_nt%22%3A%2210.0%22%2C%22wow64%22%3Atrue%2C%22rv%22%3A%2250.0%22%2C%22gecko%22%3A%2220100101%22%2C%22firefox%22%3A%2250.0%22%7D&pTE-gB-H-f-O-c-V=3759969693&requestTime=1480418883349"
    # params2 <- getParams(post2)
    # params2$requestTime <- as.character(floor(as.numeric(Sys.time())*1000))
    # 
    # postForm("http://cos.name/cn/wp-admin/admin-ajax.php", .params = params2, curl = ch, 
    #          .opt = list(verbose = TRUE), 
    #          Referer = "http://cos.name/cn/wp-login.php", style = "post")
    str(params1)
    List of 7
     $ log               : chr "kongdd"
     $ pwd               : chr "****"
     $ pTE-gB-H-f-O-c-V  : chr "WPZKteIRNC9n9nwYjh758ig-mtHo4KXjrryHb0Ag1SDIlN8TZsEWs7U6qLAUWxhUCPXuED8XoSLBtNqkQSln9ONpRCdW0YfjXEfbGUf-9Echd4sR6YIwQHLfWdLAFVr"| __truncated__
     $ wp-submit         : chr "登录"
     $ redirect_to       : chr "http://cos.name/cn/"
     $ testcookie        : chr "1"
     $ y-v-F-FV-A-MP-U-Rh: chr "10809510.020100101"
    4. 登录并打印作者信息,如果没报错的话,你的用户名就在info中了
    # Submit the login form on the shared curl handle `ch`.
    # postForm() reads curl options from `.opts`; arguments that follow `...`
    # are never partially matched, so the misspelt `.opt` and a bare
    # `Referer = ` would both be sent as extra form fields instead of taking
    # effect. The referer is therefore passed as a curl option.
    page <- postForm("http://cos.name/cn/wp-login.php", .params = params1, curl = ch,
                     .opts = list(verbose = TRUE,
                                  referer = "http://cos.name/cn/wp-login.php"),
                     style = "post")
    ## login succeeded if your username can be found in info
    info <- htmlParse(page, encoding = "utf-8") %>% {getNodeSet(., "//div[@class='bbp-logged-in']")[[1]]} %T>% print
    虽然能正常登录了,但是提交表单的参数不是很懂。
    欢迎相互讨论https://github.com/kongdd/RCurl_project/
  • 在利用parallel包,借助plink、openssh连接远程计算机进行并行运算时,连接、运算均正常,但是调用stopCluster时会导致服务端sshd服务终止,有人遇到过这种状况吗?
    多电脑并行运算,除了parallel with plink还有其他推荐的吗?
    以下是我的代码,连接及计算都没有问题,就是每次结束运算之后,服务器sshd服务都会终止。
    library(parallel)
    library(magrittr)
    # library(stringr)#stringr package need to be installed
    #' @param host remote computer host name or ip
    #' @param user remote computer login username
    #' @param rscript remote computer R software bin path
    #' @param rshcmd plink login command with password
    # Worker specifications: one list per machine, with the fields described in
    # the @param notes above.
    localhost <- list(host = "localhost")
    kong <- list(
      host = "KONG-pc",
      user = "KONG",
      rscript = "C:/Program Files/R/R-3.3.2/bin/Rscript.exe",
      rshcmd = "plink -pw password"
    )
    
    # Name each spec after its host. The extractor must be the backtick-quoted
    # `[[` function (HTML <code> tags are not R syntax); vapply() guarantees a
    # character vector of names.
    works <- list(kong, localhost) %>% set_names(vapply(., `[[`, character(1), "host"))
    
    ## test for KONG-pc
    # rep(works, c(5, 0)) repeats the first spec five times and drops the second.
    cl <- makePSOCKcluster(names = rep(works, c(5, 0)), outfile = "log.txt")
    stopCluster(cl)
  • 建议学习RCurl, XML, xml2, rvest, httr packages
  • setClass("oblig", slots = c(begin = "Date"), contains = "Date")
    new("oblig", as.Date("2014-01-01"))
    
    程序运行报错,请问这种情况应该如何处理
    Error in validObject(.Object) : 
      invalid class “oblig” object: invalid object for slot "begin" in class "oblig": got class "S4", should be or extend class "Date"
  • openxlsx这个包很好用,另外还有readxl
  • 打开任务管理器监控内存变化情况不就行了
  • 都不知道你要下的是什么数据
  • require(RCurl)
    require(XML)
    require(magrittr)
    ## Written by Dongdong Kong
    ## Downloads a page, converts it from GBK to UTF-8 and extracts one nested
    ## HTML table as a character matrix.
    url <- "http://www.china-customs.com/customs-tax/04/"
    # Read the page as one string, re-encode it, parse it, and keep the first
    # node matched by the XPath. NOTE(review): `table` shadows base::table().
    table <- iconv(paste(readLines(url), collapse = ""), "gbk", "utf-8") %>%
      htmlParse(., encoding = "utf-8") %>% getNodeSet(., "//table[3]/tr[2]/td/table") %>% extract2(1)
    
    # NOTE(review): "td" selects <td> elements that are direct children of the
    # table node; presumably the header cells were intended -- verify the XPath.
    tablehead <- xpathSApply(table, "td", xmlValue)
    # Every row except the first: collect its cell texts, then stack the rows.
    result<- lapply(getNodeSet(table, "tr")[-1], function(tr) xpathSApply(tr, "td", xmlValue)) %>% 
      do.call(rbind, .)
  • 我对quantmod包不是很了解,如下是一种粗暴的做法
    #' Register a Yahoo lookup for a symbol and download its history
    #'
    #' @param se.name Name under which the series is registered and stored.
    #' @param se.code Ticker code handed to the data source as `name`.
    #' @return Whatever getSymbols() returns.
    mystock <- function(se.name, se.code){
      # Remember who called us: getSymbols() auto-assigns the downloaded series
      # into `env`, and its default (the calling frame) would be this function's
      # own frame, which disappears on return.
      caller <- parent.frame()
      # Build the dynamically named argument as a list and use do.call() rather
      # than pasting code together for eval(parse()), which breaks (or executes
      # arbitrary code) when the inputs contain quotes.
      lookup <- setNames(list(list(name = se.code, src = "yahoo")), se.name)
      do.call(setSymbolLookup, lookup)
      getSymbols(se.name, from = "2006-01-01", env = caller)
    }
    mystock("SZ000001","000001.SZ")
  • [未知用户]
    你的包装对了吗?这么简单的爬虫都不会
  • require(RCurl)
    require(XML)
    require(rvest)
    
    # Parse the listing page and pull the table(s) inside the div with
    # id "dateList".
    url <- "http://www.sse.com.cn/assortment/stock/list/name/"
    page <- htmlParse(url, encoding = "utf-8")
    table_stock <- getNodeSet(page, "//div[@id='dateList']/table")
    # NOTE(review): html_table() comes from rvest while the nodes come from the
    # XML package; whether this combination works depends on the rvest version
    # -- verify. The first converted table is printed.
    html_table(table_stock)[[1]]
    
       证券代码 证券简称
    1    600000 浦发银行
    2    600004 白云机场
    3    600005 武钢股份
    4    600006 东风汽车
    5    600007 中国国贸
    6    600008 首创股份
    7    600009 上海机场
    8    600010 包钢股份
    9    600011 华能国际
    10   600012 皖通高速
    11   600015 华夏银行
    12   600016 民生银行
    13   600017   日照港
    14   600018 上港集团
    15   600019 宝钢股份
    16   600020 中原高速
    17   600021 上海电力
    18   600022 山东钢铁
    19   600023 浙能电力
    20   600026 中海发展
    21   600027 华电国际
    22   600028 中国石化
    23   600029 南方航空
    24   600030 中信证券
    25   600031 三一重工
    26   600033 福建高速
    27   600035 楚天高速
    28   600036 招商银行
    29   600037 歌华有线
    30   600038 中直股份
    31   600039 四川路桥
    32   600048 保利地产
    33   600050 中国联通
    34   600051 宁波联合
    35   600052 浙江广厦
    36   600053 中江地产
    37   600054 黄山旅游
    38   600055 华润万东
    39   600056 中国医药
    40   600057 象屿股份
    41   600058 五矿发展
    42   600059 古越龙山
    43   600060 海信电器
    44   600061 国投安信
    45   600062 华润双鹤
    46   600063 皖维高新
    47   600064 南京高科
    48   600066 宇通客车
    49   600067 冠城大通
    50   600068   葛洲坝
    不谢
  • 从国家地图审核系统抓取已审查结束的单号信息:国家地图审核系统
    require(RCurl)
    require(XML)
    
    # Queries a web-service endpoint with a date range: first fetches a landing
    # page on a shared curl handle, then POSTs the form parameters.
    url_head <- "http://dtsh.nasg.gov.cn/gjchjPub/Top.htm"
    url_data <- "http://dtsh.nasg.gov.cn/gjchjPub/WebService/MainService.asmx/GetXzxkjgList"
    
    # One handle for both requests, with a browser user agent and a 60 s timeout.
    ch <- getCurlHandle()
    curlSetOpt(curl = ch,
               useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36",
               timeout = 60,
               #host = "http://dtsh.nasg.gov.cn", 
               followlocation = TRUE)
    
    # Collects curl debug output; its update function is passed to both requests.
    d <- debugGatherer()
    
    # Form fields: empty id/name filters plus the request date range.
    params<-list(strIID = "",
                 strName = "",
                 strRequestStartDate = "2015-9-21",
                 strRequestEndDate = "2015-10-21")
    
    # Initial GET of the landing page on the shared handle.
    tmp <- getURL(url_head, curl = ch, verbose= TRUE, debugfunction = d$update)
    
    # NOTE(review): the reply recorded below is "Internal Server Error";
    # presumably the service expects a different body format or content type
    # than a plain form post -- unverified, compare with the browser's request.
    page <- postForm(url_data, .params = params, curl = ch, 
                     .opts = list(debugfunction = d$update,verbose = TRUE), style = "post")
    
    
    Internal Server Error
  • 您说的附送给我的资料,指的是什么呢?
    您说的这些东西我基本都掌握了,昨天学了一天正则表达式,综合来看,对于新浪微博的数据正则表达式更方便,数据基本都提取出来了。在写S4类的时候出了一点小问题