dhfly

  • 25 天前
  • 注册于 2018年5月29日
  • dhfly
    这样可以吗?

    lapply(1:length(list2), 
           FUN=function(x, datalist, indexlist) list2[[x]][list1[[x]][1], list1[[x]][2]], 
           datalist = list2, indexlist = list1)

    顺便蹲个简洁点的写法

  • 是有一些麻烦,特意分开了写,这样看的清楚些

    library(data.table)
    
    aa <- data.frame(num=c(50,110,120,110,105,101,103,30,40,101,102,110),str=c(rep("a",2),rep("b",2),rep("a",3),"b",rep("a",4) ))
    
    setDT(aa)
    
    # 根据条件,判断每一行是否符合条件
    aa[,grp:= (str=="a" & num>100)][]
    #>     num str   grp
    #>  1:  50   a FALSE
    #>  2: 110   a  TRUE
    #>  3: 120   b FALSE
    #>  4: 110   b FALSE
    #>  5: 105   a  TRUE
    #>  6: 101   a  TRUE
    #>  7: 103   a  TRUE
    #>  8:  30   b FALSE
    #>  9:  40   a FALSE
    #> 10: 101   a  TRUE
    #> 11: 102   a  TRUE
    #> 12: 110   a  TRUE
    
    # 判断是否为连续符合条件,连续符合即保持同一个index, 否则就递增
    aa[,grp_index:=rleid(grp)][]
    #>     num str   grp grp_index
    #>  1:  50   a FALSE         1
    #>  2: 110   a  TRUE         2
    #>  3: 120   b FALSE         3
    #>  4: 110   b FALSE         3
    #>  5: 105   a  TRUE         4
    #>  6: 101   a  TRUE         4
    #>  7: 103   a  TRUE         4
    #>  8:  30   b FALSE         5
    #>  9:  40   a FALSE         5
    #> 10: 101   a  TRUE         6
    #> 11: 102   a  TRUE         6
    #> 12: 110   a  TRUE         6
    
    # 判断连续有几个符合条件
    aa[,count:=.N,by= grp_index][]
    #>     num str   grp grp_index count
    #>  1:  50   a FALSE         1     1
    #>  2: 110   a  TRUE         2     1
    #>  3: 120   b FALSE         3     2
    #>  4: 110   b FALSE         3     2
    #>  5: 105   a  TRUE         4     3
    #>  6: 101   a  TRUE         4     3
    #>  7: 103   a  TRUE         4     3
    #>  8:  30   b FALSE         5     2
    #>  9:  40   a FALSE         5     2
    #> 10: 101   a  TRUE         6     3
    #> 11: 102   a  TRUE         6     3
    #> 12: 110   a  TRUE         6     3
    
    # 筛选3个以上且符合条件的每一组的最后一行
    aa[grp==TRUE & count>=3,.I[.N],by = grp_index]$V1
    #> [1]  7 12

    <sup>Created on 2021-04-05 by the reprex package (v0.3.0)</sup>

    • 这是我的解法,可能有点复杂,欢迎更好的解法,我的想法是将字符那一列转为字符串,然后借助正则表达式进行处理。

      aa<-data.frame(数值=c(50,110,120,110,105,101,103,30,40,101,102,110),
                    字符=c(rep("a",2),rep("b",2),rep("a",3),"b",rep("a",4)))
      
      library(stringr)
      b<-Reduce(paste0,unlist(aa$字符))
      locates<-str_locate_all(b,"a{3,}")[[1]]
      for(i in 1:nrow(locates)){
        start<-locates[i,][1]
        end<-locates[i,][2]
        
        flag<-rep('N',end-start+1)
        bigger<-aa$数值[start:end]>100
        for(j in 1:(end-start+1)){
          if(bigger[j]){flag[j]<-'Y'}
        }
        
        row_num<-start:end 
        d<-Reduce(paste0,unlist(flag))
        e<-str_locate_all(d,"Y{3,}")[[1]]
        cat(row_num[e[,2]],'\n')
      }
      #> 7 
      #> 12

      <sup>Created on 2021-04-05 by the reprex package (v2.0.0)</sup>

      • 或者考虑将向量改为字符串,利用正则表达式进行操作,代码如下:
        `
        qq<-c("a","b","b","a","b","b","b","b",
        "a","b","b","b","a")
        qq_str<-paste0(qq,collapse = "")

        library(stringr)
        three_b<-str_locate_all(qq_str,pattern='b{3,}')
        g<-function(x){return(seq(x[1],x[2],by=1))}
        apply(three_b[[1]],1,g)

        `
        结果为
        [[1]]
        [1] 5 6 7 8

        [[2]]
        [1] 10 11 12

      • 或者for loop

        qq=c("a","b","b","a","b","b","b","b","a","b","b","b","a")
        
        result <- integer(length(qq))
        for (i in 3:length(qq)) {
          if (all(qq[(i-2):i]=="b")) {
            result[(i-2):i] <- (i-2):i
          }
        }
        
        result[result!=0]
        #> [1]  5  6  7  8 10 11 12

        <sup>Created on 2020-05-10 by the reprex package (v0.3.0)</sup>

      • library(data.table)
        
        dt <- data.table(qq=c("a","b","b","a","b","b","b","b","a","b","b","b","a"))
        
        dt[,id:=rleid(qq)
           ][,count:=.N,by=.(id)
             ][count>=3 & qq=="b",which=TRUE]
        #> [1]  5  6  7  8 10 11 12

        <sup>Created on 2020-05-10 by the reprex package (v0.3.0)</sup>

      • # 源数据
        c1 <- c('a', 2015, 300)
        c2 <- c('a', 2016, 400)
        c3 <- c('b', 2015, 700)
        c4 <- c('b', 2016, 600)
        dt <- data.frame(rbind(c1, c2, c3, c4), stringsAsFactors = FALSE)
        names(dt) <- c('name', 'date', 'amount')
        
        library(dplyr)
        library(tidyr)
        dt %>% 
          group_by(name) %>% 
          filter(amount == max(amount)) %>% 
          unite("merged", date, amount, sep = "-") -> dt2
        dt2
        • dhfly

          去掉sep = "."

          separate(data = ex, col =id, into = c("q", "w")) 即可

          如果你一定要手动指定,

          separate(data = ex, col =id, into = c("q", "w"),sep="\\.")

          报错是因为你传进去的sep会被当做一个正则表达式,.在正则表达式里match任何字符(except new line)。

          默认的sep会match任何non-alphanumeric 的字符,绝大多数情况下使用默认的即可。

          sep
          Separator between columns.

          If character, is interpreted as a regular expression. The default value is a regular expression that matches any sequence of non-alphanumeric values.

          • 借用上面的例子

            library(tidyverse)
            
            c1 <- c('a', 2015, 300)
            c2 <- c('a', 2016, 400)
            c3 <- c('b', 2015, 700)
            c4 <- c('b', 2016, 600)
            dt <- data.frame(rbind(c1, c2, c3, c4), stringsAsFactors = FALSE)
            names(dt) <- c('name', 'date', 'amount')
            
            new.dt <- dt %>%
                group_by(name) %>%
                filter(amount == max(amount)) %>%
                mutate(merged = paste(date, amount, sep = "-")) %>%
                ungroup()
            • 来一个data.table的写法

              
              library(data.table)
              
              c1 <- c('a', 2015, 300)
              c2 <- c('a', 2016, 400)
              c3 <- c('b', 2015, 700)
              c4 <- c('b', 2016, 600)
              dt <- data.frame(rbind(c1, c2, c3, c4), stringsAsFactors = FALSE)
              names(dt) <- c('name', 'date', 'amount')
              
              setDT(dt)
              
              dt
              #>    name date amount
              #> 1:    a 2015    300
              #> 2:    a 2016    400
              #> 3:    b 2015    700
              #> 4:    b 2016    600
              
              # 一步到位
              dt2 <- dt[,.SD[amount==max(amount),.(merged=paste0(date,"-",amount))],by=.(name)]
              
              dt2
              #>    name   merged
              #> 1:    a 2016-400
              #> 2:    b 2015-700

              Created on 2020-01-02 by the reprex package (v0.3.0)

              • 为了避免中文支持的隐患,我把代码里的中文都改成英文,意思到了即可。

                没有使用其他包。

                # 源数据
                c1 <- c('a', 2015, 300)
                c2 <- c('a', 2016, 400)
                c3 <- c('b', 2015, 700)
                c4 <- c('b', 2016, 600)
                dt <- data.frame(rbind(c1, c2, c3, c4), stringsAsFactors = FALSE)
                names(dt) <- c('name', 'date', 'amount')
                dt$merged <- paste0(dt$date, '-', dt$amount)
                
                # 计算
                tb <- tapply(dt$merged, dt$name, function(x) x[which.max(substr(x, 6, nchar(x)))])
                tb
                dt2 <- data.frame(name = names(tb), merged = tb)
                dt2
                • beginr 包开发版 (github: pzhaonet/beginr) 里有个函数,批量读入指定文件夹里的所有结构相同的文本文件,作为 list 或 data.frame ,而源文件的文件名作为 list 里的 name 或者 data.frame 的新列来存放:

                  beginr::readdir()

                  并没有读成三维数组。我觉得作为 list 或 data.frame 保存更为直观一些,后续处理也更方便,尤其是存为 data.frame 时,源文件名作为新列,这样就可以直接用 tapply() 以及 tidyr 的 gather()spread() 函数折腾了。

                • 可以

                  手上有段现成代码:

                  # list files with pattern
                  filel = list.files("../dataset/out/", pattern = "fix3_1.61*", full.names=TRUE)
                  # read all files into a list, each element is a data.frame
                  data.l = lapply(filel, data.table::fread)