• R语言已解决
  • 新人求助,把数据框的每行按照某列拆分不同的行,只能用循环吗?

试试tidyr::separate_rows()

# 读数据
df=data.table::fread(
"
year      c1
2019     a;b
2019   c;d;e
2019     f;g
2020 h;i;j;k
2020     l;m
"
)

tidyr::separate_rows(df,c1)
#> # A tibble: 13 x 2
#>     year c1   
#>    <int> <chr>
#>  1  2019 a    
#>  2  2019 b    
#>  3  2019 c    
#>  4  2019 d    
#>  5  2019 e    
#>  6  2019 f    
#>  7  2019 g    
#>  8  2020 h    
#>  9  2020 i    
#> 10  2020 j    
#> 11  2020 k    
#> 12  2020 l    
#> 13  2020 m

<sup>Created on 2022-05-02 by the reprex package (v2.0.1)</sup>

    1、data.table包的tstrsplit函数是个好东西,按;分割时,长度不够的会用NA填充;
    2、用melt()来个宽转长,对应baseR的reshape()

    library("data.table")
    x<-data.frame(year=c(2019,2019,2019,2020,2020),
                  c1=c('a;b','c;d;e','f;g','h;i;j;k','l;m'))
    setDT(x)
    x[,tstrsplit(c1,";")]
    #>    V1 V2   V3   V4
    #> 1:  a  b <NA> <NA>
    #> 2:  c  d    e <NA>
    #> 3:  f  g <NA> <NA>
    #> 4:  h  i    j    k
    #> 5:  l  m <NA> <NA>
    
    y<-x[,tstrsplit(c1,";")][,year:=x$year]
    melt(y,id.vars="year",na.rm=TRUE)[order(year,value),.(c1=value,year)]
    #>     c1 year
    #>  1:  a 2019
    #>  2:  b 2019
    #>  3:  c 2019
    #>  4:  d 2019
    #>  5:  e 2019
    #>  6:  f 2019
    #>  7:  g 2019
    #>  8:  h 2020
    #>  9:  i 2020
    #> 10:  j 2020
    #> 11:  k 2020
    #> 12:  l 2020
    #> 13:  m 2020

    <sup>Created on 2022-05-02 by the reprex package (v2.0.1)</sup>

      linzx

      library("data.table")
      
      x <- data.frame(
        year = c(2019, 2019, 2019, 2020, 2020),
        c1 = c("a;b", "c;d;e", "f;g", "h;i;j;k", "l;m")
      )
      x <- as.data.table(x)
      x[, lapply(.SD, function(x) unlist(strsplit(x, ";"))),
        .SDcols = "c1", by = c("year")
      ]
      #>     year c1
      #>  1: 2019  a
      #>  2: 2019  b
      #>  3: 2019  c
      #>  4: 2019  d
      #>  5: 2019  e
      #>  6: 2019  f
      #>  7: 2019  g
      #>  8: 2020  h
      #>  9: 2020  i
      #> 10: 2020  j
      #> 11: 2020  k
      #> 12: 2020  l
      #> 13: 2020  m

      <sup>Created on 2022-05-02 by the reprex package (v2.0.1)</sup>

        linzx
        数一数每行有几个“;”,把year相应扩长
        按“;”把数据切开、拉直

        x<-data.frame(year=c(2019,2019,2019,2020,2020),
                      c1=c('a;b','c;d;e','f;g','h;i;j;k','l;m'))
        data.frame(
          year = rep(x$year,times= sapply(gregexpr(";",x$c1),length)+1 ),
          value = unlist(strsplit(x$c1 , ";"))
        )

        我一直在考虑,这个数据这么处理后会不会有信息损失(如果还有第三个字段)?因为加工处理后的数据无法逆回去了。

          15 天 后

          tctcab 不光学会了 separate_row() 函数,还学到了 reprex 包的用法 😆

          chuxinyuan 操作前新增一列行号就可以逆回去

          library(tidyverse)
          
          df=data.table::fread(
            "
          year      c1
          2019     a;b
          2019   c;d;e
          2019     f;g
          2020 h;i;j;k
          2020     l;m
          "
          )
          
          df2 <- tidyr::separate_rows(df %>% rownames_to_column(),c1)
          df2
          #> # A tibble: 13 x 3
          #>    rowname  year c1   
          #>    <chr>   <int> <chr>
          #>  1 1        2019 a    
          #>  2 1        2019 b    
          #>  3 2        2019 c    
          #>  4 2        2019 d    
          #>  5 2        2019 e    
          #>  6 3        2019 f    
          #>  7 3        2019 g    
          #>  8 4        2020 h    
          #>  9 4        2020 i    
          #> 10 4        2020 j    
          #> 11 4        2020 k    
          #> 12 5        2020 l    
          #> 13 5        2020 m
          
          df2 %>% 
            dplyr::group_by(year, rowname) %>% 
            dplyr::summarise(c1 = str_flatten(c1, collapse = ";"))
          #> `summarise()` has grouped output by 'year'. You can override using the
          #> `.groups` argument.
          #> # A tibble: 5 x 3
          #> # Groups:   year [2]
          #>    year rowname c1     
          #>   <int> <chr>   <chr>  
          #> 1  2019 1       a;b    
          #> 2  2019 2       c;d;e  
          #> 3  2019 3       f;g    
          #> 4  2020 4       h;i;j;k
          #> 5  2020 5       l;m
          3 个月 后

          linzx
          这个dataMojo R 包可以一步到位。欢迎来逛逛: https://github.com/jienagu/dataMojo

          df=data.table::fread(
            "
          year      c1
          2019     a;b
          2019   c;d;e
          2019     f;g
          2020 h;i;j;k
          2020     l;m
          "
          )
          
          dataMojo::row_expand_pattern(df, "c1", ";", "c2")[]
              year c2
           1: 2019  a
           2: 2019  b
           3: 2019  c
           4: 2019  d
           5: 2019  e
           6: 2019  f
           7: 2019  g
           8: 2020  h
           9: 2020  i
          10: 2020  j
          11: 2020  k
          12: 2020  l
          13: 2020  m