• R语言已解决
  • 新人求助,把数据框的每行按照某列拆分不同的行,只能用循环吗?

1、data.table包的tstrsplit函数是个好东西,按;分割时,长度不够的会用NA填充;
2、用melt()来个宽转长,对应baseR的reshape()

library("data.table")
x<-data.frame(year=c(2019,2019,2019,2020,2020),
              c1=c('a;b','c;d;e','f;g','h;i;j;k','l;m'))
setDT(x)
x[,tstrsplit(c1,";")]
#>    V1 V2   V3   V4
#> 1:  a  b <NA> <NA>
#> 2:  c  d    e <NA>
#> 3:  f  g <NA> <NA>
#> 4:  h  i    j    k
#> 5:  l  m <NA> <NA>

y<-x[,tstrsplit(c1,";")][,year:=x$year]
melt(y,id.vars="year",na.rm=TRUE)[order(year,value),.(c1=value,year)]
#>     c1 year
#>  1:  a 2019
#>  2:  b 2019
#>  3:  c 2019
#>  4:  d 2019
#>  5:  e 2019
#>  6:  f 2019
#>  7:  g 2019
#>  8:  h 2020
#>  9:  i 2020
#> 10:  j 2020
#> 11:  k 2020
#> 12:  l 2020
#> 13:  m 2020

<sup>Created on 2022-05-02 by the reprex package (v2.0.1)</sup>

    linzx

    library("data.table")
    
    x <- data.frame(
      year = c(2019, 2019, 2019, 2020, 2020),
      c1 = c("a;b", "c;d;e", "f;g", "h;i;j;k", "l;m")
    )
    x <- as.data.table(x)
    x[, lapply(.SD, function(x) unlist(strsplit(x, ";"))),
      .SDcols = "c1", by = c("year")
    ]
    #>     year c1
    #>  1: 2019  a
    #>  2: 2019  b
    #>  3: 2019  c
    #>  4: 2019  d
    #>  5: 2019  e
    #>  6: 2019  f
    #>  7: 2019  g
    #>  8: 2020  h
    #>  9: 2020  i
    #> 10: 2020  j
    #> 11: 2020  k
    #> 12: 2020  l
    #> 13: 2020  m

    <sup>Created on 2022-05-02 by the reprex package (v2.0.1)</sup>

      linzx
      数一数每行有几个“;”,把year相应扩长
      按“;”把数据切开、拉直

      x<-data.frame(year=c(2019,2019,2019,2020,2020),
                    c1=c('a;b','c;d;e','f;g','h;i;j;k','l;m'))
      data.frame(
        year = rep(x$year,times= sapply(gregexpr(";",x$c1),length)+1 ),
        value = unlist(strsplit(x$c1 , ";"))
      )

      我一直在考虑,这个数据这么处理后会不会有信息损失(如果还有第三个字段)?因为加工处理后的数据无法逆回去了。

        15 天 后

        tctcab 不光学会了 separate_row() 函数,还学到了 reprex 包的用法 😆

        chuxinyuan 操作前新增一列行号就可以逆回去

        library(tidyverse)
        
        df=data.table::fread(
          "
        year      c1
        2019     a;b
        2019   c;d;e
        2019     f;g
        2020 h;i;j;k
        2020     l;m
        "
        )
        
        df2 <- tidyr::separate_rows(df %>% rownames_to_column(),c1)
        df2
        #> # A tibble: 13 x 3
        #>    rowname  year c1   
        #>    <chr>   <int> <chr>
        #>  1 1        2019 a    
        #>  2 1        2019 b    
        #>  3 2        2019 c    
        #>  4 2        2019 d    
        #>  5 2        2019 e    
        #>  6 3        2019 f    
        #>  7 3        2019 g    
        #>  8 4        2020 h    
        #>  9 4        2020 i    
        #> 10 4        2020 j    
        #> 11 4        2020 k    
        #> 12 5        2020 l    
        #> 13 5        2020 m
        
        df2 %>% 
          dplyr::group_by(year, rowname) %>% 
          dplyr::summarise(c1 = str_flatten(c1, collapse = ";"))
        #> `summarise()` has grouped output by 'year'. You can override using the
        #> `.groups` argument.
        #> # A tibble: 5 x 3
        #> # Groups:   year [2]
        #>    year rowname c1     
        #>   <int> <chr>   <chr>  
        #> 1  2019 1       a;b    
        #> 2  2019 2       c;d;e  
        #> 3  2019 3       f;g    
        #> 4  2020 4       h;i;j;k
        #> 5  2020 5       l;m
        3 个月 后

        linzx
        这个dataMojo R 包可以一步到位。欢迎来逛逛: https://github.com/jienagu/dataMojo

        df=data.table::fread(
          "
        year      c1
        2019     a;b
        2019   c;d;e
        2019     f;g
        2020 h;i;j;k
        2020     l;m
        "
        )
        
        dataMojo::row_expand_pattern(df, "c1", ";", "c2")[]
            year c2
         1: 2019  a
         2: 2019  b
         3: 2019  c
         4: 2019  d
         5: 2019  e
         6: 2019  f
         7: 2019  g
         8: 2020  h
         9: 2020  i
        10: 2020  j
        11: 2020  k
        12: 2020  l
        13: 2020  m