请问各位如何选取一个月中最后五天的数据,由于是交易日,所以每个月最后五天都不一样,取完后还需要对这五天的数据进行平均,谢谢各位
求助:如何选取一个月中最后五天的数据
- 已编辑
# Load the packages used below: tidyverse for %>%/dplyr verbs,
# lubridate for the date helpers date()/month()/day().
library(tidyverse)
library(lubridate)
# Example data: trading dates (as strings) with one value per day.
data <- data.frame(
  date = c(
    "2020-4-5", "2020-4-6", "2020-4-7", "2020-4-8", "2020-4-9",
    "2020-4-10", "2020-5-15", "2020-5-16", "2020-5-17", "2020-5-18",
    "2020-5-19", "2020-5-20", "2020-5-21"
  ),
  value = 1:13,
  stringsAsFactors = FALSE
)
# Average the last five trading days of each month.
# Fixes over the original pipeline:
#  * arrange(date) — do not silently rely on the rows already being sorted;
#  * group by Year as well as Month, so the same month of different years
#    is not pooled into one group;
#  * slice_tail(n = 5) instead of the superseded do(tail(., 5)).
data %>%
  mutate(date = date(date)) %>%                        # parse strings into Date
  arrange(date) %>%                                    # guarantee chronological order
  group_by(Year = year(date), Month = month(date)) %>% # one group per year-month
  slice_tail(n = 5) %>%                                # last five trading days per month
  summarise(mean = mean(value), .groups = "drop")      # mean value of those days
结果如下
# A tibble: 2 x 2
Month mean
* <dbl> <dbl>
1 4 4
2 5 11
barnett874 个人愚见:
- 没有考虑跨年的情况,因此还需要增加 Year 变量,但是 Day 变量感觉没有必要。
- 谨慎起见有必要对数据按照日期升序排列,然后按照 Year、Month分组。
- 既然走 tidyverse 风格,可以考虑将
do(tail(., 5))
换成slice_tail(n = 5)
。
- 已编辑
library(dplyr)
# Toy data: every year/month/day combination, sorted chronologically and
# numbered 1..n so each group's expected mean is easy to check by hand.
toy <- expand.grid(year = 2020:2021, month = 4:5, day = 14:19)
toy %>%
  arrange(year, month, day) %>%
  mutate(value = seq_len(n())) %>%   # row number after sorting
  group_by(year, month) %>%
  slice_tail(n = 5) %>%              # last five days of each year-month
  summarise(
    mean = mean(value),
    .groups = "keep"                 # keep the grouping, as in the shown output
  )
#> # A tibble: 4 x 3
#> # Groups: year, month [4]
#> year month mean
#> <int> <int> <dbl>
#> 1 2020 4 4
#> 2 2020 5 10
#> 3 2021 4 16
#> 4 2021 5 22
<sup>Created on 2021-05-17 by the reprex package (v0.3.0)</sup>
chuxinyuan 谢谢兄弟,会了!
来个base版:
# Base-R version.
# Bug fix: the original passed the *sorted* data frame to by() but a grouping
# factor computed on the *unsorted* one (`with(dat, interaction(year, month))`
# while the data argument was `dat[order(...), ]`), so rows were attributed to
# the wrong groups.  Sorting `dat` first keeps data and grouping aligned.
dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19)
dat$value <- seq_len(nrow(dat))
dat <- dat[order(dat$year, dat$month, dat$day), ]
do.call(rbind, with(dat, by(
  dat, interaction(year, month),
  function(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value, 5))))
)))
wangbinzjcc 貌似结果不对。但是我也看不出来哪里不对,我这 base R 功力太弱了。
chuxinyuan
增加一行就能修好,但是 base R 功力太弱,不知道怎么改得更好看
# Sort by year/month/day first, so the grouping factor computed afterwards
# lines up with the rows that by() actually receives.
dat <- dat[order(dat$year, dat$month, dat$day),]
更正base版:
# Corrected base-R version: build the data, sort it chronologically, then
# summarise each (month, year) group.
dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19)
dat$value <- seq_len(nrow(dat))
dat <- dat[order(dat$year, dat$month, dat$day), ]
# One summary row per group: mean of the five chronologically latest values.
summarise_group <- function(g) {
  c(year = g$year[1], month = g$month[1], value_mean = mean(tail(g$value, 5)))
}
do.call(rbind, by(dat, interaction(dat$month, dat$year), summarise_group))
chuxinyuan barnett874 多谢二位的指正。
dplyr解决方法
library(tidyverse)
# Example data
data <- data.frame(
  stringsAsFactors = FALSE,
  date = c("2020-4-5","2020-4-6",
           "2020-4-7","2020-4-8","2020-4-9","2020-4-10","2020-5-15",
           "2020-5-16","2020-5-17","2020-5-18","2020-5-19",
           "2020-5-20","2020-5-21"),
  value = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L)
)
data$date <- as.Date(data$date)  # `<-`, not `=`, for assignment
# Solution.  Fix: without grouping, slice_max(date, n = 5) keeps only the five
# latest dates of the WHOLE data set; the question asks for the last five
# trading days of EACH month, so group by a year-month key first.
data %>%
  group_by(ym = format(date, "%Y-%m")) %>%  # year-month key, e.g. "2020-04"
  slice_max(date, n = 5) %>%                # latest five days per month
  summarise(mean = mean(value))             # mean value of those days
R 4.1.0 版
# R >= 4.1 version: native pipe |> plus the \() lambda shorthand.
dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19) |>
  (\(d) transform(d, value = seq_len(nrow(d))))()   # number the rows 1..n
dat |>
  split(~ month + year) |>                          # one data frame per group
  lapply(\(g) with(g, c(year = year[1], month = month[1],
                        value_mean = mean(tail(value[order(day)], 5))))) |>
  (\(rows) do.call(rbind, rows))()                  # stack summaries into a matrix
✓ [main*]> system.time({
dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19) |>
{\(i) within(i, value <- seq_len(nrow(i))) }();
dat |>
split(~ month + year) |>
lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
{\(i) do.call(rbind, i) }()
})
user system elapsed
0.009 0.000 0.009
✓ [main*]>
✓ [main*]> system.time({
dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19);
dat$value <- seq_len(nrow(dat));
dat <- dat[order(dat$year, dat$month, dat$day), ];
do.call(rbind, by(dat, interaction(dat$month, dat$year),
function(i) c(year = i$year[1], month = i$month[1], value_mean = mean(tail(i$value, 5)))
))
})
user system elapsed
0.034 0.093 0.125
- 已编辑
test_4.1.0 <- function() {
  # Time one run of the R >= 4.1 idiom (native pipe + lambda shorthand).
  # Returns the proc_time object from system.time().
  system.time({
    dat <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31) |>
      (\(d) transform(d, value = seq_len(nrow(d))))()
    dat |>
      split(~ month + year) |>
      lapply(\(g) with(g, c(year = year[1], month = month[1],
                            value_mean = mean(tail(value[order(day)], 5))))) |>
      (\(rows) do.call(rbind, rows))()
  })
}
# Run the timing ten times; keep user.self/sys.self/elapsed from each run
# (columns of the resulting matrix are the ten repetitions).
times_4.1.0 <- replicate(10, test_4.1.0()[1:3])
times_4.1.0
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> user.self 0 0 0 0 0 0.02 0.01 0.02 0 0.01
#> sys.self 0 0 0 0 0 0.00 0.00 0.00 0 0.00
#> elapsed 0 0 0 0 0 0.01 0.01 0.02 0 0.01
test_4.0.5 <- function() {
  # Time one run of the R 4.0-compatible version (no native pipe).
  # Returns the proc_time object from system.time().
  system.time({
    dat <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31)
    dat$value <- seq_len(nrow(dat))
    groups <- split(dat, dat[, c("month", "year")])
    rows <- lapply(groups, function(g) {
      with(g, c(year = year[1], month = month[1],
                value_mean = mean(tail(value[order(day)], 5))))
    })
    do.call(rbind, rows)
  })
}
# Same measurement for the R 4.0.5 variant: ten repetitions,
# user.self/sys.self/elapsed per column.
times_4.0.5 <- replicate(10, test_4.0.5()[1:3])
times_4.0.5
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> user.self 0 0 0 0.01 0 0.01 0 0 0 0
#> sys.self 0 0 0 0.00 0 0.00 0 0 0 0
#> elapsed 0 0 0 0.01 0 0.01 0 0 0 0
<sup>Created on 2021-05-21 by the reprex package (v2.0.0)</sup>
运行速度上没有区别。
- 已编辑
wangbinzjcc ryo 多测算几次求平均这个思路没错,但是数据量太少了,说明不了问题。
重新开个帖子讨论这个问题吧!楼主的问题早就干净利索的解决了。
chuxinyuan 是的,正规测速不是这么测的。system.time()
可以用来粗测,但对运行时间短的代码仅仅跑 10 遍求平均是不太靠谱的。可以考虑 microbenchmark::microbenchmark()
等工具。
✓ 1.9 GiB [main*]> microbenchmark(
test_4.1.0 = {
dat1 <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31) |>
{\(i) within(i, value <- seq_len(nrow(i))) }();
dat1 |>
split(~ month + year) |>
lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
{\(i) do.call(rbind, i) }()},
test_4.0.5 = {
dat2 <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31)
dat2 <- within(dat2, value <- seq_len(nrow(dat2)))
do.call(rbind, lapply(
split(dat2, dat2[, c("month", "year")]),
function(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))
))},
## https://github.com/nmatzke/BioGeoBEARS
#test_BioGeoBEARS.alt = {
# dat <- expand.grid.alt(year = 2015:2021, month = 1:12, day = 1:31) |>
# {\(i) within(i, value <- seq_len(nrow(i))) }();
# dat |>
# split(~ month + year) |>
# lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
# {\(i) do.call(rbind, i) }()},
## https://github.com/nmatzke/BioGeoBEARS
#test_BioGeoBEARS.jc = {
# dat <- expand.grid.jc(year = 2015:2021, month = 1:12, day = 1:31) |>
# {\(i) within(i, value <- seq_len(nrow(i))) }();
# dat |>
# split(~ month + year) |>
# lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
# {\(i) do.call(rbind, i) }()},
test_dplyr = {
dat3 <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31) %>%
mutate(value = 1:nrow(.))
dat3 %>% ddply(., .(year, month), summarise, mean_value = mean(tail(value, 5)))
#},
## https://rdrr.io/cran/noncompliance/man/expand.grid.DT.html
## https://atrebas.github.io/post/2019-03-03-datatable-dplyr
#test_exg.DT = {
#dat4 <- noncompliance::expand.grid.DT(2015:2021, 1:12, 1:31)
#setnames(dat4, old = c('seq1', 'seq2', 'seq3'), new = c('year', 'month', 'day'))
#dat4[, value := 1:.N][]
## https://s3.amazonaws.com/assets.datacamp.com/blog_assets/datatable_Cheat_Sheet_R.pdf
#dat4[order(year, month, day), .(mean_value = mean((value[.N-5:.N]))), by = c('year', 'month')]
## https://stackoverflow.com/questions/36526141/summarize-data-table-by-group
## https://stackoverflow.com/a/16325932/3806250
## https://stackoverflow.com/a/34753260/3806250
## https://stackoverflow.com/a/53994503/3806250
## https://stackoverflow.com/a/50093928/3806250
## https://s3.amazonaws.com/assets.datacamp.com/blog_assets/datatable_Cheat_Sheet_R.pdf
#dat4[dat4[, .I[(.N-4):.N], by = c('year', 'month')]$V1]
#dat4[dat4[, .I[tail(seq_len(.N),5)], by = c('year', 'month')]$V1]
#dat4[, {
#value = value
#value = value[order(day)]
#.SD[, .(mean_value = mean(tail(value, 5))),
# by=.(year, month)]}]
#dat4[(.N-5):.N,
# .(mean_value = mean(value)),
# by = c('year', 'month')]
#dat4[, mean(value[(.N-5):.N]), by = c('year', 'month')]
#dat4[, mean(tail(value, 5)), by = c('year', 'month')]
#dat4[, .(mean_value = mean(tail(value, 5))), by = c('year', 'month')]
#dat4[, .(mean_value = mean(tail(value[order(day)], 5))), by = c('year', 'month')]
#dat4[, by = c('year', 'month'),
# lapply(tail(.SD, 5), mean),
# .SDcols = c('value')]
#dat4[, keyby = .(year, month),
# .(mean_value = mean(tail(value, 5)))]
## https://stackoverflow.com/questions/33702470/aggregate-sum-and-mean-in-r-with-ddply
#dat4[, lapply(.SD[c(.N-5, .N)], mean), by = c('year', 'month')]
#},
## https://stackoverflow.com/questions/12948686/why-is-expand-grid-faster-than-data-table-s-cj
#test_CJ = {
#dat5 <- CJ(2015:2021, 1:12, 1:31)
#setnames(dat4, old = c('seq1', 'seq2', 'seq3'), new = c('year', 'month', 'day'))
#dat4[, value := 1:.N][]
}
)
Unit: milliseconds
expr min lq mean median uq max neval
test_4.1.0 7.129 7.647 8.293 7.915 8.145 19.88 100
test_4.0.5 7.087 7.609 8.492 7.866 8.254 21.05 100
test_dplyr 15.888 16.695 17.533 17.089 17.528 28.48 100
这data.table
怎么使用?搞了一整天都错,晕 ~
✓ 1.9 GiB [main*]> microbenchmark(
eg = expand.grid(1800:2050, 1:12, 1:31),
egDT = noncompliance::expand.grid.DT(expand.grid.DT(1800:2050, 1:12, 1:31)),
CJ = data.table::CJ(1800:2050, 1:12, 1:31),
eg.L = expand.grid(1800:2050L, 1:12, 1:31),
egDT.L = noncompliance::expand.grid.DT(expand.grid.DT(1800:2050L, 1:12, 1:31)),
CJ.L = data.table::CJ(1800:2050L, 1:12, 1:31),
eg.LL = expand.grid(1800L:2050L, 1:12, 1:31),
egDT.LL = noncompliance::expand.grid.DT(expand.grid.DT(1800L:2050L, 1:12, 1:31)),
CJ.LL = data.table::CJ(1800L:2050L, 1:12, 1:31))
Unit: microseconds
expr min lq mean median uq max neval
eg 2682.4 2939.9 3171.6 3002.4 3096.1 13861 100
egDT 4455.3 4652.5 5500.6 4785.3 5015.9 20647 100
CJ 459.9 531.6 4260.9 594.2 681.8 357043 100
eg.L 2737.2 2938.1 3345.6 3020.6 3191.0 14063 100
egDT.L 4378.7 4662.0 5497.7 4836.1 5238.5 16041 100
CJ.L 459.8 538.6 731.5 586.2 670.0 9731 100
eg.LL 2725.1 2949.2 3288.2 3022.1 3176.0 13908 100
egDT.LL 4463.2 4681.4 5278.5 4841.8 5090.6 16971 100
CJ.LL 464.8 554.1 695.0 612.4 679.5 5130 100