请问各位如何选取一个月中最后五天的数据,由于是交易日,所以每个月最后五天都不一样,取完后还需要对这五天的数据进行平均,谢谢各位

可以参考这个
https://stackoverflow.com/questions/53994497/how-to-select-last-n-observation-from-each-group-in-dplyr-dataframe

library(tidyverse)
library(lubridate)
#构造数据
data <- data.frame(
  stringsAsFactors = FALSE,
              date = c("2020-4-5","2020-4-6",
                       "2020-4-7","2020-4-8","2020-4-9","2020-4-10","2020-5-15",
                       "2020-5-16","2020-5-17","2020-5-18","2020-5-19",
                       "2020-5-20","2020-5-21"),
             value = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L)
)

data %>% 
  mutate(date = date(date),  #转换成date型
         Month = month(date),  #添加提取月份
         Day = day(date)) %>%  #添加日期
  group_by(Month) %>% #数据按照月份分组
  do(tail(., 5)) %>%  #选取每个月的最后五行,这里默认日期已经排序
  summarise(mean = mean(value)) #求均值

结果如下

# A tibble: 2 x 2
  Month  mean
* <dbl> <dbl>
1     4     4
2     5    11

    barnett874 个人愚见:

    • 没有考虑跨年的情况,因此还需要增加 Year 变量,但是 Day 变量感觉没有必要。
    • 谨慎起见有必要对数据按照日期升序排列,然后按照 Year、Month分组。
    • 既然走 tidyverse 风格,可以考虑将do(tail(., 5))换成slice_tail(n = 5)
    library(dplyr)
    expand.grid(
      year = 2020:2021,
      month = 4:5,
      day = 14:19
      ) %>% 
      arrange(year, month, day) %>%
      mutate(value = 1:nrow(.)) %>%
      group_by(year, month) %>%
      slice_tail(n = 5) %>%
      summarise(
        mean = mean(value), 
        .groups = "keep"
      )
    #> # A tibble: 4 x 3
    #> # Groups:   year, month [4]
    #>    year month  mean
    #>   <int> <int> <dbl>
    #> 1  2020     4     4
    #> 2  2020     5    10
    #> 3  2021     4    16
    #> 4  2021     5    22

    <sup>Created on 2021-05-17 by the reprex package (v0.3.0)</sup>

      来个base版:

      dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19)
      dat$value <- seq_len(nrow(dat))
      
      do.call(rbind, with(dat, by(
         dat[order(year, month, day), ],  interaction(year, month),
         function(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value, 5))))
      )))

        chuxinyuan
        增加一行能好,但是baseR功力太弱,不知道怎么改的更好看

        dat <- dat[order(dat$year, dat$month, dat$day),]

          更正base版:

          dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19)
          dat$value <- seq_len(nrow(dat))
           
          dat <- dat[order(dat$year, dat$month, dat$day), ]
          do.call(rbind, by(dat, interaction(dat$month, dat$year),
             function(i) c(year = i$year[1], month = i$month[1], value_mean = mean(tail(i$value, 5)))
          ))

          chuxinyuan barnett874 多谢二位的指正。

          dplyr解决方法

          library(tidyverse)
          #构造数据
          data <- data.frame(
            stringsAsFactors = FALSE,
            date = c("2020-4-5","2020-4-6",
                     "2020-4-7","2020-4-8","2020-4-9","2020-4-10","2020-5-15",
                     "2020-5-16","2020-5-17","2020-5-18","2020-5-19",
                     "2020-5-20","2020-5-21"),
            value = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L)
          )
          data$date = as.Date(data$date)
          
          # 解决方案
          data %>% 
            slice_max(date, n = 5) %>% # 选中后5天
            summarise(mean = mean(value)) # 计算value的平均值

          R 4.1.0 版

          dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19) |> 
            {\(i) within(i, value <- seq_len(nrow(i))) }() 
          
          dat |> 
            split(~ month + year) |>
            lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
            {\(i) do.call(rbind, i) }()

            wangbinzjcc wangbinzjcc

            [main*]> system.time({
                 dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19) |> 
                     {\(i) within(i, value <- seq_len(nrow(i))) }();
                 
                 dat |> 
                     split(~ month + year) |>
                     lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
                     {\(i) do.call(rbind, i) }()  
             })
               user  system elapsed 
              0.009   0.000   0.009[main*]>[main*]> system.time({
                 dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19);
                 dat$value <- seq_len(nrow(dat));
                 
                 dat <- dat[order(dat$year, dat$month, dat$day), ];
                 do.call(rbind, by(dat, interaction(dat$month, dat$year),
                                   function(i) c(year = i$year[1], month = i$month[1], value_mean = mean(tail(i$value, 5)))
                 ))
             })
               user  system elapsed 
              0.034   0.093   0.125 
              test_4.1.0 <- function(){
                # R 4.1.0
                system.time({
                  
                  dat <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31) |> 
                    {\(i) within(i, value <- seq_len(nrow(i))) }()
                  
                  dat |> 
                    split(~ month + year) |>
                    lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
                    {\(i) do.call(rbind, i) }()  
                  
                  })
              }
              
              times_4.1.0 <- replicate(10, test_4.1.0()[1:3])
              times_4.1.0
              #>           [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
              #> user.self    0    0    0    0    0 0.02 0.01 0.02    0  0.01
              #> sys.self     0    0    0    0    0 0.00 0.00 0.00    0  0.00
              #> elapsed      0    0    0    0    0 0.01 0.01 0.02    0  0.01
              
              
              
              
              
              test_4.0.5 <- function(){
                # R 4.0.5
                system.time({
                  
                  dat <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31)
                  dat <- within(dat, value <- seq_len(nrow(dat)))
                  
                  do.call(rbind, lapply(
                    split(dat, dat[, c("month", "year")]), 
                    function(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))
                    ))
                  
                })
              }
              
              times_4.0.5 <- replicate(10, test_4.0.5()[1:3])
              times_4.0.5
              #>           [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
              #> user.self    0    0    0 0.01    0 0.01    0    0    0     0
              #> sys.self     0    0    0 0.00    0 0.00    0    0    0     0
              #> elapsed      0    0    0 0.01    0 0.01    0    0    0     0

              <sup>Created on 2021-05-21 by the reprex package (v2.0.0)</sup>

              运行速度上没有区别。

                wangbinzjcc ryo 多测算几次求平均这个思路没错,但是数据量太少了,说明不了问题。

                重新开个帖子讨论这个问题吧!楼主的问题早就干净利索的解决了。

                  chuxinyuan 是的,正规测速不是这么测的。system.time() 可以用来粗测,但对运行时间短的代码仅仅跑 10 遍求平均是不太靠谱的。可以考虑 microbenchmark::microbenchmark() 等工具。

                  1.9 GiB [main*]> microbenchmark(
                       test_4.1.0 = {
                           dat1 <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31) |> 
                               {\(i) within(i, value <- seq_len(nrow(i))) }(); 
                           dat1 |> 
                               split(~ month + year) |> 
                               lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |> 
                               {\(i) do.call(rbind, i) }()}, 
                       test_4.0.5 = {
                           dat2 <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31)
                           dat2 <- within(dat2, value <- seq_len(nrow(dat2)))
                           
                           do.call(rbind, lapply(
                               split(dat2, dat2[, c("month", "year")]), 
                               function(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))
                           ))}, 
                       ## https://github.com/nmatzke/BioGeoBEARS
                       #test_BioGeoBEARS.alt = {
                       #  dat <- expand.grid.alt(year = 2015:2021, month = 1:12, day = 1:31) |> 
                       #    {\(i) within(i, value <- seq_len(nrow(i))) }(); 
                       #  dat |> 
                       #    split(~ month + year) |> 
                       #    lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |> 
                       #    {\(i) do.call(rbind, i) }()}, 
                       ## https://github.com/nmatzke/BioGeoBEARS
                       #test_BioGeoBEARS.jc = {
                       #  dat <- expand.grid.jc(year = 2015:2021, month = 1:12, day = 1:31) |> 
                       #    {\(i) within(i, value <- seq_len(nrow(i))) }(); 
                       #  dat |> 
                       #    split(~ month + year) |> 
                       #    lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |> 
                       #    {\(i) do.call(rbind, i) }()}, 
                       
                       test_dplyr = {
                           dat3 <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31) %>% 
                               mutate(value = 1:nrow(.))
                           dat3 %>% ddply(., .(year, month), summarise, mean_value = mean(tail(value, 5)))
                           #}, 
                           ## https://rdrr.io/cran/noncompliance/man/expand.grid.DT.html
                           ## https://atrebas.github.io/post/2019-03-03-datatable-dplyr
                           #test_exg.DT = {
                           #dat4 <- noncompliance::expand.grid.DT(2015:2021, 1:12, 1:31)
                           #setnames(dat4, old = c('seq1', 'seq2', 'seq3'), new = c('year', 'month', 'day'))
                           #dat4[, value := 1:.N][]
                          
                           ## https://s3.amazonaws.com/assets.datacamp.com/blog_assets/datatable_Cheat_Sheet_R.pdf
                           #dat4[order(year, month, day), .(mean_value = mean((value[.N-5:.N]))), by = c('year', 'month')]
                           
                           ## https://stackoverflow.com/questions/36526141/summarize-data-table-by-group
                           ## https://stackoverflow.com/a/16325932/3806250
                           ## https://stackoverflow.com/a/34753260/3806250
                           ## https://stackoverflow.com/a/53994503/3806250
                           ## https://stackoverflow.com/a/50093928/3806250
                           ## https://s3.amazonaws.com/assets.datacamp.com/blog_assets/datatable_Cheat_Sheet_R.pdf
                           #dat4[dat4[, .I[(.N-4):.N], by = c('year', 'month')]$V1]
                           #dat4[dat4[, .I[tail(seq_len(.N),5)], by = c('year', 'month')]$V1]
                               
                           #dat4[, {
                           #value = value
                           #value = value[order(day)]
                           #.SD[, .(mean_value = mean(tail(value, 5))), 
                           #    by=.(year, month)]}]
                           
                           #dat4[(.N-5):.N, 
                           #   .(mean_value = mean(value)), 
                           #   by = c('year', 'month')]
                           
                           #dat4[, mean(value[(.N-5):.N]), by = c('year', 'month')]
                           
                           #dat4[, mean(tail(value, 5)), by = c('year', 'month')]
                           
                           #dat4[, .(mean_value = mean(tail(value, 5))), by = c('year', 'month')]
                           
                           #dat4[, .(mean_value = mean(tail(value[order(day)], 5))), by = c('year', 'month')]
                  
                           #dat4[, by = c('year', 'month'),
                           #   lapply(tail(.SD, 5), mean),
                          #    .SDcols = c('value')]
                  
                           #dat4[, keyby = .(year, month),
                           #     .(mean_value = mean(tail(value, 5)))]
                           
                           ## https://stackoverflow.com/questions/33702470/aggregate-sum-and-mean-in-r-with-ddply
                           #dat4[, lapply(.SD[c(.N-5, .N)], mean), by = c('year', 'month')]
                           #},
                           ## https://stackoverflow.com/questions/12948686/why-is-expand-grid-faster-than-data-table-s-cj
                           #test_CJ = {
                           #dat5 <- CJ(2015:2021, 1:12, 1:31)
                           #setnames(dat4, old = c('seq1', 'seq2', 'seq3'), new = c('year', 'month', 'day'))
                           #dat4[, value := 1:.N][]
                       }
                   )
                  Unit: milliseconds
                         expr    min     lq   mean median     uq   max neval
                   test_4.1.0  7.129  7.647  8.293  7.915  8.145 19.88   100
                   test_4.0.5  7.087  7.609  8.492  7.866  8.254 21.05   100
                   test_dplyr 15.888 16.695 17.533 17.089 17.528 28.48   100

                  data.table怎么使用?搞了一整天都错,晕 😅

                   1.9 GiB [main*]> microbenchmark(
                       eg = expand.grid(1800:2050, 1:12, 1:31),
                       egDT = noncompliance::expand.grid.DT(expand.grid.DT(1800:2050, 1:12, 1:31)),
                       CJ = data.table::CJ(1800:2050, 1:12, 1:31),
                       eg.L = expand.grid(1800:2050L, 1:12, 1:31),
                       egDT.L = noncompliance::expand.grid.DT(expand.grid.DT(1800:2050L, 1:12, 1:31)),
                       CJ.L = data.table::CJ(1800:2050L, 1:12, 1:31),
                       eg.LL = expand.grid(1800L:2050L, 1:12, 1:31),
                       egDT.LL = noncompliance::expand.grid.DT(expand.grid.DT(1800L:2050L, 1:12, 1:31)),
                       CJ.LL = data.table::CJ(1800L:2050L, 1:12, 1:31))
                  Unit: microseconds
                      expr    min     lq   mean median     uq    max neval
                        eg 2682.4 2939.9 3171.6 3002.4 3096.1  13861   100
                      egDT 4455.3 4652.5 5500.6 4785.3 5015.9  20647   100
                        CJ  459.9  531.6 4260.9  594.2  681.8 357043   100
                      eg.L 2737.2 2938.1 3345.6 3020.6 3191.0  14063   100
                    egDT.L 4378.7 4662.0 5497.7 4836.1 5238.5  16041   100
                      CJ.L  459.8  538.6  731.5  586.2  670.0   9731   100
                     eg.LL 2725.1 2949.2 3288.2 3022.1 3176.0  13908   100
                   egDT.LL 4463.2 4681.4 5278.5 4841.8 5090.6  16971   100
                     CJ.LL  464.8  554.1  695.0  612.4  679.5   5130   100