大晚上的,看到这帖子想了下:
- 题目简化为,把底层表汇总成 表
DT
,字段为购买人、购买产品、购买记录、总金额
- 构建一个算法
no_of_letters(x)
,可以根据一个数值向量输出一个list,list的长度就是函件数量,list的内容就是哪几个购买记录对应一个函件
- 按购买人、购买产品为组别,对总金额的向量应用算法
no_of_letters(x)
算法no_of_letters(x)
:
- 统计x>=50的个数,每一个记录一个函件
- 对与x<50的部分的算法,取决于目标是什么,从题主的描述来看,是想最大化函件数量,但不知道有没有直接的算法。最简单的就是先找一个最大的购买记录,再从最小的往上加,加到50了就算一个。再不断重复。这应该不是最优解,但也不是最差的。
上代码:
library(data.table)
data = data.table(
id = c(1:14),
customer_id = c(rep(1, 2), rep(2, 3), rep(3, 5), rep(4, 4)),
policy_id = c(123:124, rep(125, 3), 126:134),
product = c(rep('A', 2), rep('B', 3), rep('C', 4), 'D', rep('E', 4)),
fee = c(51, 51, 71, 31, 21, 28, 28, 28, 20, 51, 41, 31, 21, 11))
DT = data[, .(fee = sum(fee), ids = list(id)), keyby = .(customer_id, product, policy_id)]
cal = function(x, ids, out = NULL) {
# x must be in accending order and all smaller than 50
# x and ids must share the same length
v_cumsum = cumsum(x)
if (length(x) && v_cumsum[length(v_cumsum)] >= 50) {
v_cumsum2 = v_cumsum + x[length(x)]
i = which(v_cumsum2 >= 50)[1L]
i = c(seq_len(i), length(x))
out = list(unlist(ids[i]))
ids = ids[-i]
x = x[-i]
if (sum(x) >= 50) {
out = c(out, cal(x, ids))
} else {
out
}
}
}
no_of_letters = function(x, ids) {
# x must be accending order
# ids must be a list with the same length as x
stopifnot(is.list(ids))
out = ids[x >= 50]
ids = ids[x < 50]
x = x[x < 50]
c(out, cal(x, ids))
}
# calculate the letter info
setorder(DT, fee)
out = DT[, .(letter_info = no_of_letters(fee, ids)), keyby = .(customer_id, product)]
out[, letter_id := seq_len(.N)]
print(out)
#> customer_id product letter_info letter_id
#> 1: 1 A 1 1
#> 2: 1 A 2 2
#> 3: 2 B 3,4,5 3
#> 4: 3 C 9,6,8 4
#> 5: 3 D 10 5
#> 6: 4 E 14,11 6
#> 7: 4 E 13,12 7
# merge back to the dataset
out2 = out[, .(id = unlist(letter_info)), keyby = .(customer_id, product, letter_id)]
data = out2[data, on = c("customer_id", "product", "id")]
print(data)
#> customer_id product letter_id id policy_id fee
#> 1: 1 A 1 1 123 51
#> 2: 1 A 2 2 124 51
#> 3: 2 B 3 3 125 71
#> 4: 2 B 3 4 125 31
#> 5: 2 B 3 5 125 21
#> 6: 3 C 4 6 126 28
#> 7: 3 C NA 7 127 28
#> 8: 3 C 4 8 128 28
#> 9: 3 C 4 9 129 20
#> 10: 3 D 5 10 130 51
#> 11: 4 E 6 11 131 41
#> 12: 4 E 7 12 132 31
#> 13: 4 E 7 13 133 21
#> 14: 4 E 6 14 134 11
# check the result
data[!is.na(letter_id), .(fee = sum(fee)), keyby = .(customer_id, product, letter_id)] |> print()
#> customer_id product letter_id fee
#> 1: 1 A 1 51
#> 2: 1 A 2 51
#> 3: 2 B 3 123
#> 4: 3 C 4 76
#> 5: 3 D 5 51
#> 6: 4 E 6 52
#> 7: 4 E 7 52
<sup>Created on 2022-12-05 with reprex v2.0.2</sup>