dhfly
- 25 天前
- 注册于 2018年5月29日
- 已编辑
是有一些麻烦,特意分开了写,这样看的清楚些
library(data.table) aa <- data.frame(num=c(50,110,120,110,105,101,103,30,40,101,102,110),str=c(rep("a",2),rep("b",2),rep("a",3),"b",rep("a",4) )) setDT(aa) # 根据条件,判断每一行是否符合条件 aa[,grp:= (str=="a" & num>100)][] #> num str grp #> 1: 50 a FALSE #> 2: 110 a TRUE #> 3: 120 b FALSE #> 4: 110 b FALSE #> 5: 105 a TRUE #> 6: 101 a TRUE #> 7: 103 a TRUE #> 8: 30 b FALSE #> 9: 40 a FALSE #> 10: 101 a TRUE #> 11: 102 a TRUE #> 12: 110 a TRUE # 判断是否为连续符合条件,连续符合即保持同一个index, 否则就递增 aa[,grp_index:=rleid(grp)][] #> num str grp grp_index #> 1: 50 a FALSE 1 #> 2: 110 a TRUE 2 #> 3: 120 b FALSE 3 #> 4: 110 b FALSE 3 #> 5: 105 a TRUE 4 #> 6: 101 a TRUE 4 #> 7: 103 a TRUE 4 #> 8: 30 b FALSE 5 #> 9: 40 a FALSE 5 #> 10: 101 a TRUE 6 #> 11: 102 a TRUE 6 #> 12: 110 a TRUE 6 # 判断连续有几个符合条件 aa[,count:=.N,by= grp_index][] #> num str grp grp_index count #> 1: 50 a FALSE 1 1 #> 2: 110 a TRUE 2 1 #> 3: 120 b FALSE 3 2 #> 4: 110 b FALSE 3 2 #> 5: 105 a TRUE 4 3 #> 6: 101 a TRUE 4 3 #> 7: 103 a TRUE 4 3 #> 8: 30 b FALSE 5 2 #> 9: 40 a FALSE 5 2 #> 10: 101 a TRUE 6 3 #> 11: 102 a TRUE 6 3 #> 12: 110 a TRUE 6 3 # 筛选3个以上且符合条件的每一组的最后一行 aa[grp==TRUE & count>=3,.I[.N],by = grp_index]$V1 #> [1] 7 12
<sup>Created on 2021-04-05 by the reprex package (v0.3.0)</sup>
- 已编辑
这是我的解法,可能有点复杂,欢迎更好的解法,我的想法是将
字符
那一列转为字符串,然后借助正则表达式进行处理。aa<-data.frame(数值=c(50,110,120,110,105,101,103,30,40,101,102,110), 字符=c(rep("a",2),rep("b",2),rep("a",3),"b",rep("a",4))) library(stringr) b<-Reduce(paste0,unlist(aa$字符)) locates<-str_locate_all(b,"a{3,}")[[1]] for(i in 1:nrow(locates)){ start<-locates[i,][1] end<-locates[i,][2] flag<-rep('N',end-start+1) bigger<-aa$数值[start:end]>100 for(j in 1:(end-start+1)){ if(bigger[j]){flag[j]<-'Y'} } row_num<-start:end d<-Reduce(paste0,unlist(flag)) e<-str_locate_all(d,"Y{3,}")[[1]] cat(row_num[e[,2]],'\n') } #> 7 #> 12
<sup>Created on 2021-04-05 by the reprex package (v2.0.0)</sup>
或者考虑将向量改为字符串,利用正则表达式进行操作,代码如下:
`
qq<-c("a","b","b","a","b","b","b","b",
"a","b","b","b","a")
qq_str<-paste0(qq,collapse = "")library(stringr)
three_b<-str_locate_all(qq_str,pattern='b{3,}')
g<-function(x){return(seq(x[1],x[2],by=1))}
apply(three_b[[1]],1,g)`
结果为
[[1]]
[1] 5 6 7 8[[2]]
[1] 10 11 12或者
for
loopqq=c("a","b","b","a","b","b","b","b","a","b","b","b","a") result <- integer(length(qq)) for (i in 3:length(qq)) { if (all(qq[(i-2):i]=="b")) { result[(i-2):i] <- (i-2):i } } result[result!=0] #> [1] 5 6 7 8 10 11 12
<sup>Created on 2020-05-10 by the reprex package (v0.3.0)</sup>
- 已编辑
library(data.table) dt <- data.table(qq=c("a","b","b","a","b","b","b","b","a","b","b","b","a")) dt[,id:=rleid(qq) ][,count:=.N,by=.(id) ][count>=3 & qq=="b",which=TRUE] #> [1] 5 6 7 8 10 11 12
<sup>Created on 2020-05-10 by the reprex package (v0.3.0)</sup>
- 已编辑
df1$id <- ifelse(df1$age > 30, "old", "yun") df2 <- transform(df1, id = paste(id, sex, sep = ""))
- 于 请教数据整形
# 源数据 c1 <- c('a', 2015, 300) c2 <- c('a', 2016, 400) c3 <- c('b', 2015, 700) c4 <- c('b', 2016, 600) dt <- data.frame(rbind(c1, c2, c3, c4), stringsAsFactors = FALSE) names(dt) <- c('name', 'date', 'amount') library(dplyr) library(tidyr) dt %>% group_by(name) %>% filter(amount == max(amount)) %>% unite("merged", date, amount, sep = "-") -> dt2 dt2
- 于 请教数据整形
去掉
sep = "."
separate(data = ex, col =id, into = c("q", "w"))
即可如果你一定要手动指定,
separate(data = ex, col =id, into = c("q", "w"),sep="\\.")
报错是因为你传进去的
sep
会被当做一个正则表达式,.
在正则表达式里match任何字符(except new line)。默认的
sep
会match任何non-alphanumeric 的字符,绝大多数情况下使用默认的即可。sep
Separator between columns.If character, is interpreted as a regular expression. The default value is a regular expression that matches any sequence of non-alphanumeric values.
- 于 请教数据整形
- 已编辑
借用上面的例子
library(tidyverse) c1 <- c('a', 2015, 300) c2 <- c('a', 2016, 400) c3 <- c('b', 2015, 700) c4 <- c('b', 2016, 600) dt <- data.frame(rbind(c1, c2, c3, c4), stringsAsFactors = FALSE) names(dt) <- c('name', 'date', 'amount') new.dt <- dt %>% group_by(name) %>% filter(amount == max(amount)) %>% mutate(merged = paste(date, amount, sep = "-")) %>% ungroup()
- 于 请教数据整形
- 已编辑
来一个
data.table
的写法library(data.table) c1 <- c('a', 2015, 300) c2 <- c('a', 2016, 400) c3 <- c('b', 2015, 700) c4 <- c('b', 2016, 600) dt <- data.frame(rbind(c1, c2, c3, c4), stringsAsFactors = FALSE) names(dt) <- c('name', 'date', 'amount') setDT(dt) dt #> name date amount #> 1: a 2015 300 #> 2: a 2016 400 #> 3: b 2015 700 #> 4: b 2016 600 # 一步到位 dt2 <- dt[,.SD[amount==max(amount),.(merged=paste0(date,"-",amount))],by=.(name)] dt2 #> name merged #> 1: a 2016-400 #> 2: b 2015-700
Created on 2020-01-02 by the reprex package (v0.3.0)
- 于 请教数据整形
- 已编辑
为了避免中文支持的隐患,我把代码里的中文都改成英文,意思到了即可。
没有使用其他包。
# 源数据 c1 <- c('a', 2015, 300) c2 <- c('a', 2016, 400) c3 <- c('b', 2015, 700) c4 <- c('b', 2016, 600) dt <- data.frame(rbind(c1, c2, c3, c4), stringsAsFactors = FALSE) names(dt) <- c('name', 'date', 'amount') dt$merged <- paste0(dt$date, '-', dt$amount) # 计算 tb <- tapply(dt$merged, dt$name, function(x) x[which.max(substr(x, 6, nchar(x)))]) tb dt2 <- data.frame(name = names(tb), merged = tb) dt2
beginr 包开发版 (github: pzhaonet/beginr) 里有个函数,批量读入指定文件夹里的所有结构相同的文本文件,作为 list 或 data.frame ,而源文件的文件名作为 list 里的 name 或者 data.frame 的新列来存放:
beginr::readdir()
并没有读成三维数组。我觉得作为 list 或 data.frame 保存更为直观一些,后续处理也更方便,尤其是存为 data.frame 时,源文件名作为新列,这样就可以直接用
tapply()
以及 tidyr 的gather()
和spread()
函数折腾了。可以
手上有段现成代码:
# list files with pattern filel = list.files("../dataset/out/", pattern = "fix3_1.61*", full.names=TRUE) # read all files into a list, each element is a data.frame data.l = lapply(filel, data.table::fread)