barnett874 个人愚见:

  • 没有考虑跨年的情况,因此还需要增加 Year 变量,但是 Day 变量感觉没有必要。
  • 谨慎起见有必要对数据按照日期升序排列,然后按照 Year、Month分组。
  • 既然走 tidyverse 风格,可以考虑将do(tail(., 5))换成slice_tail(n = 5)
library(dplyr)
expand.grid(
  year = 2020:2021,
  month = 4:5,
  day = 14:19
  ) %>% 
  arrange(year, month, day) %>%
  mutate(value = 1:nrow(.)) %>%
  group_by(year, month) %>%
  slice_tail(n = 5) %>%
  summarise(
    mean = mean(value), 
    .groups = "keep"
  )
#> # A tibble: 4 x 3
#> # Groups:   year, month [4]
#>    year month  mean
#>   <int> <int> <dbl>
#> 1  2020     4     4
#> 2  2020     5    10
#> 3  2021     4    16
#> 4  2021     5    22

<sup>Created on 2021-05-17 by the reprex package (v0.3.0)</sup>

    来个base版:

    dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19)
    dat$value <- seq_len(nrow(dat))
    
    do.call(rbind, with(dat, by(
       dat[order(year, month, day), ],  interaction(year, month),
       function(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value, 5))))
    )))

      chuxinyuan
      增加一行能好,但是baseR功力太弱,不知道怎么改的更好看

      dat <- dat[order(dat$year, dat$month, dat$day),]

        更正base版:

        dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19)
        dat$value <- seq_len(nrow(dat))
         
        dat <- dat[order(dat$year, dat$month, dat$day), ]
        do.call(rbind, by(dat, interaction(dat$month, dat$year),
           function(i) c(year = i$year[1], month = i$month[1], value_mean = mean(tail(i$value, 5)))
        ))

        chuxinyuan barnett874 多谢二位的指正。

        dplyr解决方法

        library(tidyverse)
        #构造数据
        data <- data.frame(
          stringsAsFactors = FALSE,
          date = c("2020-4-5","2020-4-6",
                   "2020-4-7","2020-4-8","2020-4-9","2020-4-10","2020-5-15",
                   "2020-5-16","2020-5-17","2020-5-18","2020-5-19",
                   "2020-5-20","2020-5-21"),
          value = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L)
        )
        data$date = as.Date(data$date)
        
        # 解决方案
        data %>% 
          slice_max(date, n = 5) %>% # 选中后5天
          summarise(mean = mean(value)) # 计算value的平均值

        R 4.1.0 版

        dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19) |> 
          {\(i) within(i, value <- seq_len(nrow(i))) }() 
        
        dat |> 
          split(~ month + year) |>
          lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
          {\(i) do.call(rbind, i) }()

          wangbinzjcc wangbinzjcc

          ✓ [main*]> system.time({
               dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19) |> 
                   {\(i) within(i, value <- seq_len(nrow(i))) }();
               
               dat |> 
                   split(~ month + year) |>
                   lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
                   {\(i) do.call(rbind, i) }()  
           })
             user  system elapsed 
            0.009   0.000   0.009 
          ✓ [main*]> 
          ✓ [main*]> system.time({
               dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19);
               dat$value <- seq_len(nrow(dat));
               
               dat <- dat[order(dat$year, dat$month, dat$day), ];
               do.call(rbind, by(dat, interaction(dat$month, dat$year),
                                 function(i) c(year = i$year[1], month = i$month[1], value_mean = mean(tail(i$value, 5)))
               ))
           })
             user  system elapsed 
            0.034   0.093   0.125 
            test_4.1.0 <- function(){
              # R 4.1.0
              system.time({
                
                dat <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31) |> 
                  {\(i) within(i, value <- seq_len(nrow(i))) }()
                
                dat |> 
                  split(~ month + year) |>
                  lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
                  {\(i) do.call(rbind, i) }()  
                
                })
            }
            
            times_4.1.0 <- replicate(10, test_4.1.0()[1:3])
            times_4.1.0
            #>           [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
            #> user.self    0    0    0    0    0 0.02 0.01 0.02    0  0.01
            #> sys.self     0    0    0    0    0 0.00 0.00 0.00    0  0.00
            #> elapsed      0    0    0    0    0 0.01 0.01 0.02    0  0.01
            
            
            
            
            
            test_4.0.5 <- function(){
              # R 4.0.5
              system.time({
                
                dat <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31)
                dat <- within(dat, value <- seq_len(nrow(dat)))
                
                do.call(rbind, lapply(
                  split(dat, dat[, c("month", "year")]), 
                  function(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))
                  ))
                
              })
            }
            
            times_4.0.5 <- replicate(10, test_4.0.5()[1:3])
            times_4.0.5
            #>           [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
            #> user.self    0    0    0 0.01    0 0.01    0    0    0     0
            #> sys.self     0    0    0 0.00    0 0.00    0    0    0     0
            #> elapsed      0    0    0 0.01    0 0.01    0    0    0     0

            <sup>Created on 2021-05-21 by the reprex package (v2.0.0)</sup>

            运行速度上没有区别。

              wangbinzjcc ryo 多测算几次求平均这个思路没错,但是数据量太少了,说明不了问题。

              重新开个帖子讨论这个问题吧!楼主的问题早就干净利索的解决了。

                chuxinyuan 是的,正规测速不是这么测的。system.time() 可以用来粗测,但对运行时间短的代码仅仅跑 10 遍求平均是不太靠谱的。可以考虑 microbenchmark::microbenchmark() 等工具。

                ✓ 1.9 GiB [main*]> microbenchmark(
                     test_4.1.0 = {
                         dat1 <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31) |> 
                             {\(i) within(i, value <- seq_len(nrow(i))) }(); 
                         dat1 |> 
                             split(~ month + year) |> 
                             lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |> 
                             {\(i) do.call(rbind, i) }()}, 
                     test_4.0.5 = {
                         dat2 <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31)
                         dat2 <- within(dat2, value <- seq_len(nrow(dat2)))
                         
                         do.call(rbind, lapply(
                             split(dat2, dat2[, c("month", "year")]), 
                             function(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))
                         ))}, 
                     ## https://github.com/nmatzke/BioGeoBEARS
                     #test_BioGeoBEARS.alt = {
                     #  dat <- expand.grid.alt(year = 2015:2021, month = 1:12, day = 1:31) |> 
                     #    {\(i) within(i, value <- seq_len(nrow(i))) }(); 
                     #  dat |> 
                     #    split(~ month + year) |> 
                     #    lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |> 
                     #    {\(i) do.call(rbind, i) }()}, 
                     ## https://github.com/nmatzke/BioGeoBEARS
                     #test_BioGeoBEARS.jc = {
                     #  dat <- expand.grid.jc(year = 2015:2021, month = 1:12, day = 1:31) |> 
                     #    {\(i) within(i, value <- seq_len(nrow(i))) }(); 
                     #  dat |> 
                     #    split(~ month + year) |> 
                     #    lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |> 
                     #    {\(i) do.call(rbind, i) }()}, 
                     
                     test_dplyr = {
                         dat3 <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31) %>% 
                             mutate(value = 1:nrow(.))
                         dat3 %>% ddply(., .(year, month), summarise, mean_value = mean(tail(value, 5)))
                         #}, 
                         ## https://rdrr.io/cran/noncompliance/man/expand.grid.DT.html
                         ## https://atrebas.github.io/post/2019-03-03-datatable-dplyr
                         #test_exg.DT = {
                         #dat4 <- noncompliance::expand.grid.DT(2015:2021, 1:12, 1:31)
                         #setnames(dat4, old = c('seq1', 'seq2', 'seq3'), new = c('year', 'month', 'day'))
                         #dat4[, value := 1:.N][]
                        
                         ## https://s3.amazonaws.com/assets.datacamp.com/blog_assets/datatable_Cheat_Sheet_R.pdf
                         #dat4[order(year, month, day), .(mean_value = mean((value[.N-5:.N]))), by = c('year', 'month')]
                         
                         ## https://stackoverflow.com/questions/36526141/summarize-data-table-by-group
                         ## https://stackoverflow.com/a/16325932/3806250
                         ## https://stackoverflow.com/a/34753260/3806250
                         ## https://stackoverflow.com/a/53994503/3806250
                         ## https://stackoverflow.com/a/50093928/3806250
                         ## https://s3.amazonaws.com/assets.datacamp.com/blog_assets/datatable_Cheat_Sheet_R.pdf
                         #dat4[dat4[, .I[(.N-4):.N], by = c('year', 'month')]$V1]
                         #dat4[dat4[, .I[tail(seq_len(.N),5)], by = c('year', 'month')]$V1]
                             
                         #dat4[, {
                         #value = value
                         #value = value[order(day)]
                         #.SD[, .(mean_value = mean(tail(value, 5))), 
                         #    by=.(year, month)]}]
                         
                         #dat4[(.N-5):.N, 
                         #   .(mean_value = mean(value)), 
                         #   by = c('year', 'month')]
                         
                         #dat4[, mean(value[(.N-5):.N]), by = c('year', 'month')]
                         
                         #dat4[, mean(tail(value, 5)), by = c('year', 'month')]
                         
                         #dat4[, .(mean_value = mean(tail(value, 5))), by = c('year', 'month')]
                         
                         #dat4[, .(mean_value = mean(tail(value[order(day)], 5))), by = c('year', 'month')]
                
                         #dat4[, by = c('year', 'month'),
                         #   lapply(tail(.SD, 5), mean),
                        #    .SDcols = c('value')]
                
                         #dat4[, keyby = .(year, month),
                         #     .(mean_value = mean(tail(value, 5)))]
                         
                         ## https://stackoverflow.com/questions/33702470/aggregate-sum-and-mean-in-r-with-ddply
                         #dat4[, lapply(.SD[c(.N-5, .N)], mean), by = c('year', 'month')]
                         #},
                         ## https://stackoverflow.com/questions/12948686/why-is-expand-grid-faster-than-data-table-s-cj
                         #test_CJ = {
                         #dat5 <- CJ(2015:2021, 1:12, 1:31)
                         #setnames(dat4, old = c('seq1', 'seq2', 'seq3'), new = c('year', 'month', 'day'))
                         #dat4[, value := 1:.N][]
                     }
                 )
                Unit: milliseconds
                       expr    min     lq   mean median     uq   max neval
                 test_4.1.0  7.129  7.647  8.293  7.915  8.145 19.88   100
                 test_4.0.5  7.087  7.609  8.492  7.866  8.254 21.05   100
                 test_dplyr 15.888 16.695 17.533 17.089 17.528 28.48   100

                data.table怎么使用?搞了一整天都错,晕 😅

                ✓ 1.9 GiB [main*]> microbenchmark(
                     eg = expand.grid(1800:2050, 1:12, 1:31),
                     egDT = noncompliance::expand.grid.DT(expand.grid.DT(1800:2050, 1:12, 1:31)),
                     CJ = data.table::CJ(1800:2050, 1:12, 1:31),
                     eg.L = expand.grid(1800:2050L, 1:12, 1:31),
                     egDT.L = noncompliance::expand.grid.DT(expand.grid.DT(1800:2050L, 1:12, 1:31)),
                     CJ.L = data.table::CJ(1800:2050L, 1:12, 1:31),
                     eg.LL = expand.grid(1800L:2050L, 1:12, 1:31),
                     egDT.LL = noncompliance::expand.grid.DT(expand.grid.DT(1800L:2050L, 1:12, 1:31)),
                     CJ.LL = data.table::CJ(1800L:2050L, 1:12, 1:31))
                Unit: microseconds
                    expr    min     lq   mean median     uq    max neval
                      eg 2682.4 2939.9 3171.6 3002.4 3096.1  13861   100
                    egDT 4455.3 4652.5 5500.6 4785.3 5015.9  20647   100
                      CJ  459.9  531.6 4260.9  594.2  681.8 357043   100
                    eg.L 2737.2 2938.1 3345.6 3020.6 3191.0  14063   100
                  egDT.L 4378.7 4662.0 5497.7 4836.1 5238.5  16041   100
                    CJ.L  459.8  538.6  731.5  586.2  670.0   9731   100
                   eg.LL 2725.1 2949.2 3288.2 3022.1 3176.0  13908   100
                 egDT.LL 4463.2 4681.4 5278.5 4841.8 5090.6  16971   100
                   CJ.LL  464.8  554.1  695.0  612.4  679.5   5130   100