chuxinyuan 谢谢兄弟,会了!
求助:如何选取一个月中最后五天的数据
来个base版:
dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19)
dat$value <- seq_len(nrow(dat))
do.call(rbind, with(dat, by(
dat[order(year, month, day), ], interaction(year, month),
function(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value, 5))))
)))
wangbinzjcc 貌似结果不对。但是我也看不出来哪里不对,我这 base R 功力太弱了。
chuxinyuan
增加一行能好,但是baseR功力太弱,不知道怎么改的更好看
dat <- dat[order(dat$year, dat$month, dat$day),]
更正base版:
dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19)
dat$value <- seq_len(nrow(dat))
dat <- dat[order(dat$year, dat$month, dat$day), ]
do.call(rbind, by(dat, interaction(dat$month, dat$year),
function(i) c(year = i$year[1], month = i$month[1], value_mean = mean(tail(i$value, 5)))
))
chuxinyuan barnett874 多谢二位的指正。
dplyr解决方法
library(tidyverse)
#构造数据
data <- data.frame(
stringsAsFactors = FALSE,
date = c("2020-4-5","2020-4-6",
"2020-4-7","2020-4-8","2020-4-9","2020-4-10","2020-5-15",
"2020-5-16","2020-5-17","2020-5-18","2020-5-19",
"2020-5-20","2020-5-21"),
value = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L)
)
data$date = as.Date(data$date)
# 解决方案
data %>%
slice_max(date, n = 5) %>% # 选中后5天
summarise(mean = mean(value)) # 计算value的平均值
R 4.1.0 版
dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19) |>
{\(i) within(i, value <- seq_len(nrow(i))) }()
dat |>
split(~ month + year) |>
lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
{\(i) do.call(rbind, i) }()
✓ [main*]> system.time({
dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19) |>
{\(i) within(i, value <- seq_len(nrow(i))) }();
dat |>
split(~ month + year) |>
lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
{\(i) do.call(rbind, i) }()
})
user system elapsed
0.009 0.000 0.009
✓ [main*]>
✓ [main*]> system.time({
dat <- expand.grid(year = 2015:2021, month = 1:5, day = 10:19);
dat$value <- seq_len(nrow(dat));
dat <- dat[order(dat$year, dat$month, dat$day), ];
do.call(rbind, by(dat, interaction(dat$month, dat$year),
function(i) c(year = i$year[1], month = i$month[1], value_mean = mean(tail(i$value, 5)))
))
})
user system elapsed
0.034 0.093 0.125
- 已编辑
test_4.1.0 <- function(){
# R 4.1.0
system.time({
dat <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31) |>
{\(i) within(i, value <- seq_len(nrow(i))) }()
dat |>
split(~ month + year) |>
lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
{\(i) do.call(rbind, i) }()
})
}
times_4.1.0 <- replicate(10, test_4.1.0()[1:3])
times_4.1.0
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> user.self 0 0 0 0 0 0.02 0.01 0.02 0 0.01
#> sys.self 0 0 0 0 0 0.00 0.00 0.00 0 0.00
#> elapsed 0 0 0 0 0 0.01 0.01 0.02 0 0.01
test_4.0.5 <- function(){
# R 4.0.5
system.time({
dat <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31)
dat <- within(dat, value <- seq_len(nrow(dat)))
do.call(rbind, lapply(
split(dat, dat[, c("month", "year")]),
function(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))
))
})
}
times_4.0.5 <- replicate(10, test_4.0.5()[1:3])
times_4.0.5
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> user.self 0 0 0 0.01 0 0.01 0 0 0 0
#> sys.self 0 0 0 0.00 0 0.00 0 0 0 0
#> elapsed 0 0 0 0.01 0 0.01 0 0 0 0
<sup>Created on 2021-05-21 by the reprex package (v2.0.0)</sup>
运行速度上没有区别。
- 已编辑
wangbinzjcc ryo 多测算几次求平均这个思路没错,但是数据量太少了,说明不了问题。
重新开个帖子讨论这个问题吧!楼主的问题早就干净利索的解决了。
chuxinyuan 是的,正规测速不是这么测的。system.time()
可以用来粗测,但对运行时间短的代码仅仅跑 10 遍求平均是不太靠谱的。可以考虑 microbenchmark::microbenchmark()
等工具。
✓ 1.9 GiB [main*]> microbenchmark(
test_4.1.0 = {
dat1 <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31) |>
{\(i) within(i, value <- seq_len(nrow(i))) }();
dat1 |>
split(~ month + year) |>
lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
{\(i) do.call(rbind, i) }()},
test_4.0.5 = {
dat2 <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31)
dat2 <- within(dat2, value <- seq_len(nrow(dat2)))
do.call(rbind, lapply(
split(dat2, dat2[, c("month", "year")]),
function(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))
))},
## https://github.com/nmatzke/BioGeoBEARS
#test_BioGeoBEARS.alt = {
# dat <- expand.grid.alt(year = 2015:2021, month = 1:12, day = 1:31) |>
# {\(i) within(i, value <- seq_len(nrow(i))) }();
# dat |>
# split(~ month + year) |>
# lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
# {\(i) do.call(rbind, i) }()},
## https://github.com/nmatzke/BioGeoBEARS
#test_BioGeoBEARS.jc = {
# dat <- expand.grid.jc(year = 2015:2021, month = 1:12, day = 1:31) |>
# {\(i) within(i, value <- seq_len(nrow(i))) }();
# dat |>
# split(~ month + year) |>
# lapply(\(i) with(i, c(year = year[1], month = month[1], value_mean = mean(tail(value[order(day)], 5))))) |>
# {\(i) do.call(rbind, i) }()},
test_dplyr = {
dat3 <- expand.grid(year = 2015:2021, month = 1:12, day = 1:31) %>%
mutate(value = 1:nrow(.))
dat3 %>% ddply(., .(year, month), summarise, mean_value = mean(tail(value, 5)))
#},
## https://rdrr.io/cran/noncompliance/man/expand.grid.DT.html
## https://atrebas.github.io/post/2019-03-03-datatable-dplyr
#test_exg.DT = {
#dat4 <- noncompliance::expand.grid.DT(2015:2021, 1:12, 1:31)
#setnames(dat4, old = c('seq1', 'seq2', 'seq3'), new = c('year', 'month', 'day'))
#dat4[, value := 1:.N][]
## https://s3.amazonaws.com/assets.datacamp.com/blog_assets/datatable_Cheat_Sheet_R.pdf
#dat4[order(year, month, day), .(mean_value = mean((value[.N-5:.N]))), by = c('year', 'month')]
## https://stackoverflow.com/questions/36526141/summarize-data-table-by-group
## https://stackoverflow.com/a/16325932/3806250
## https://stackoverflow.com/a/34753260/3806250
## https://stackoverflow.com/a/53994503/3806250
## https://stackoverflow.com/a/50093928/3806250
## https://s3.amazonaws.com/assets.datacamp.com/blog_assets/datatable_Cheat_Sheet_R.pdf
#dat4[dat4[, .I[(.N-4):.N], by = c('year', 'month')]$V1]
#dat4[dat4[, .I[tail(seq_len(.N),5)], by = c('year', 'month')]$V1]
#dat4[, {
#value = value
#value = value[order(day)]
#.SD[, .(mean_value = mean(tail(value, 5))),
# by=.(year, month)]}]
#dat4[(.N-5):.N,
# .(mean_value = mean(value)),
# by = c('year', 'month')]
#dat4[, mean(value[(.N-5):.N]), by = c('year', 'month')]
#dat4[, mean(tail(value, 5)), by = c('year', 'month')]
#dat4[, .(mean_value = mean(tail(value, 5))), by = c('year', 'month')]
#dat4[, .(mean_value = mean(tail(value[order(day)], 5))), by = c('year', 'month')]
#dat4[, by = c('year', 'month'),
# lapply(tail(.SD, 5), mean),
# .SDcols = c('value')]
#dat4[, keyby = .(year, month),
# .(mean_value = mean(tail(value, 5)))]
## https://stackoverflow.com/questions/33702470/aggregate-sum-and-mean-in-r-with-ddply
#dat4[, lapply(.SD[c(.N-5, .N)], mean), by = c('year', 'month')]
#},
## https://stackoverflow.com/questions/12948686/why-is-expand-grid-faster-than-data-table-s-cj
#test_CJ = {
#dat5 <- CJ(2015:2021, 1:12, 1:31)
#setnames(dat4, old = c('seq1', 'seq2', 'seq3'), new = c('year', 'month', 'day'))
#dat4[, value := 1:.N][]
}
)
Unit: milliseconds
expr min lq mean median uq max neval
test_4.1.0 7.129 7.647 8.293 7.915 8.145 19.88 100
test_4.0.5 7.087 7.609 8.492 7.866 8.254 21.05 100
test_dplyr 15.888 16.695 17.533 17.089 17.528 28.48 100
这data.table
怎么使用?搞了一整天都错,晕 ~
✓ 1.9 GiB [main*]> microbenchmark(
eg = expand.grid(1800:2050, 1:12, 1:31),
egDT = noncompliance::expand.grid.DT(expand.grid.DT(1800:2050, 1:12, 1:31)),
CJ = data.table::CJ(1800:2050, 1:12, 1:31),
eg.L = expand.grid(1800:2050L, 1:12, 1:31),
egDT.L = noncompliance::expand.grid.DT(expand.grid.DT(1800:2050L, 1:12, 1:31)),
CJ.L = data.table::CJ(1800:2050L, 1:12, 1:31),
eg.LL = expand.grid(1800L:2050L, 1:12, 1:31),
egDT.LL = noncompliance::expand.grid.DT(expand.grid.DT(1800L:2050L, 1:12, 1:31)),
CJ.LL = data.table::CJ(1800L:2050L, 1:12, 1:31))
Unit: microseconds
expr min lq mean median uq max neval
eg 2682.4 2939.9 3171.6 3002.4 3096.1 13861 100
egDT 4455.3 4652.5 5500.6 4785.3 5015.9 20647 100
CJ 459.9 531.6 4260.9 594.2 681.8 357043 100
eg.L 2737.2 2938.1 3345.6 3020.6 3191.0 14063 100
egDT.L 4378.7 4662.0 5497.7 4836.1 5238.5 16041 100
CJ.L 459.8 538.6 731.5 586.2 670.0 9731 100
eg.LL 2725.1 2949.2 3288.2 3022.1 3176.0 13908 100
egDT.LL 4463.2 4681.4 5278.5 4841.8 5090.6 16971 100
CJ.LL 464.8 554.1 695.0 612.4 679.5 5130 100