dhfly

sleepstan · 2021年4月28日

dhfly
这样可以吗？

# For each position x, extract the single cell of datalist[[x]] addressed by
# the (row, column) pair stored in indexlist[[x]].
# The callback reads its own `datalist` / `indexlist` arguments rather than
# reaching for the globals, and seq_along() avoids iterating over 1:0 when the
# list is empty.
lapply(seq_along(list2),
       FUN = function(x, datalist, indexlist) {
         cell <- indexlist[[x]]
         datalist[[x]][cell[1], cell[2]]
       },
       datalist = list2, indexlist = list1)

顺便蹲个简洁点的写法

frankzhang21 · 2021年4月5日

是有一些麻烦，特意分开了写，这样看的清楚些

library(data.table)

# Example data: a numeric column and a label column; converted to a data.table
# below so the columns can be added by reference with `:=`.
aa <- data.frame(num=c(50,110,120,110,105,101,103,30,40,101,102,110),str=c(rep("a",2),rep("b",2),rep("a",3),"b",rep("a",4) ))

setDT(aa)

# Flag each row that satisfies the condition (str is "a" and num > 100).
# The trailing [] makes the updated table print.
aa[,grp:= (str=="a" & num>100)][]
#>     num str   grp
#>  1:  50   a FALSE
#>  2: 110   a  TRUE
#>  3: 120   b FALSE
#>  4: 110   b FALSE
#>  5: 105   a  TRUE
#>  6: 101   a  TRUE
#>  7: 103   a  TRUE
#>  8:  30   b FALSE
#>  9:  40   a FALSE
#> 10: 101   a  TRUE
#> 11: 102   a  TRUE
#> 12: 110   a  TRUE

# Number the runs of identical flags: consecutive rows with the same flag
# share one index, and the index increments whenever the flag changes.
aa[,grp_index:=rleid(grp)][]
#>     num str   grp grp_index
#>  1:  50   a FALSE         1
#>  2: 110   a  TRUE         2
#>  3: 120   b FALSE         3
#>  4: 110   b FALSE         3
#>  5: 105   a  TRUE         4
#>  6: 101   a  TRUE         4
#>  7: 103   a  TRUE         4
#>  8:  30   b FALSE         5
#>  9:  40   a FALSE         5
#> 10: 101   a  TRUE         6
#> 11: 102   a  TRUE         6
#> 12: 110   a  TRUE         6

# Count how many rows each run contains.
aa[,count:=.N,by= grp_index][]
#>     num str   grp grp_index count
#>  1:  50   a FALSE         1     1
#>  2: 110   a  TRUE         2     1
#>  3: 120   b FALSE         3     2
#>  4: 110   b FALSE         3     2
#>  5: 105   a  TRUE         4     3
#>  6: 101   a  TRUE         4     3
#>  7: 103   a  TRUE         4     3
#>  8:  30   b FALSE         5     2
#>  9:  40   a FALSE         5     2
#> 10: 101   a  TRUE         6     3
#> 11: 102   a  TRUE         6     3
#> 12: 110   a  TRUE         6     3

# Keep only flagged runs of at least 3 rows and report, per run, the row
# number of its last row.
aa[grp & count >= 3, .I[.N], by = grp_index]$V1
#> [1]  7 12

Created on 2021-04-05 by the reprex package (v0.3.0)

9-5-2-7 · 2021年4月5日

这是我的解法，可能有点复杂，欢迎更好的解法，我的想法是将字符那一列转为字符串，然后借助正则表达式进行处理。

aa <- data.frame(
  数值 = c(50, 110, 120, 110, 105, 101, 103, 30, 40, 101, 102, 110),
  字符 = c(rep("a", 2), rep("b", 2), rep("a", 3), "b", rep("a", 4))
)

library(stringr)
# Collapse the label column into one string so that runs of three or more
# "a" can be located with a regular expression.
b <- paste0(aa$字符, collapse = "")
locates <- str_locate_all(b, "a{3,}")[[1]]
# seq_len() makes the loop a no-op when no run is found; 1:nrow() would
# iterate over c(1, 0) and fail on the subscript.
for (i in seq_len(nrow(locates))) {
  start <- locates[i, 1]
  end <- locates[i, 2]

  # Within this run, mark each row "Y" when its value exceeds 100, else "N".
  flag <- ifelse(aa$数值[start:end] > 100, "Y", "N")

  # Look for three or more consecutive "Y" and print the original row number
  # at which each such stretch ends.
  row_num <- start:end
  d <- paste0(flag, collapse = "")
  e <- str_locate_all(d, "Y{3,}")[[1]]
  cat(row_num[e[, 2]], "\n")
}
#> 7 
#> 12

Created on 2021-04-05 by the reprex package (v2.0.0)

9-5-2-7 · 2020年5月10日

或者考虑将向量改为字符串，利用正则表达式进行操作，代码如下：
`
qq <- c("a", "b", "b", "a", "b", "b", "b", "b",
        "a", "b", "b", "b", "a")
qq_str <- paste0(qq, collapse = "")

library(stringr)
# Locate every run of three or more consecutive "b" in the collapsed string.
three_b <- str_locate_all(qq_str, pattern = "b{3,}")
# Expand one (start, end) pair into the full sequence of positions.
g <- function(x) {
  seq(x[1], x[2], by = 1)
}
# Iterate over row numbers with lapply() so the result is always a list;
# apply() would simplify to a matrix whenever all runs have the same length.
runs <- three_b[[1]]
lapply(seq_len(nrow(runs)), function(i) g(runs[i, ]))

`
结果为
[[1]]
[1] 5 6 7 8

[[2]]
[1] 10 11 12

frankzhang21 · 2020年5月10日

或者for loop

qq <- c("a", "b", "b", "a", "b", "b", "b", "b", "a", "b", "b", "b", "a")

# result[k] holds k when position k belongs to a run of three or more "b",
# and 0 otherwise.
result <- integer(length(qq))
# Slide a window of width 3 whose right edge is i. Dropping the first two
# positions from seq_along() leaves an empty loop for inputs shorter than 3,
# whereas 3:length(qq) would count backwards and index out of range.
for (i in seq_along(qq)[-(1:2)]) {
  window <- (i - 2):i
  if (all(qq[window] == "b")) {
    result[window] <- window
  }
}

result[result != 0]
#> [1]  5  6  7  8 10 11 12

Created on 2020-05-10 by the reprex package (v0.3.0)

frankzhang21 · 2020年5月10日

library(data.table)

dt <- data.table(
  qq = c("a", "b", "b", "a", "b", "b", "b", "b", "a", "b", "b", "b", "a")
)

# Number each run of identical values, then record the length of every run.
dt[, id := rleid(qq)]
dt[, count := .N, by = .(id)]
# Row numbers that belong to a run of at least three "b".
dt[count >= 3 & qq == "b", which = TRUE]
#> [1]  5  6  7  8 10 11 12

Created on 2020-05-10 by the reprex package (v0.3.0)

Liechi · 2020年2月7日

# Label each row by age bracket ("old" when age > 30, otherwise "yun") ...
df1$id <- ifelse(df1$age > 30, "old", "yun")
# ... then build df2 with the sex column appended to that label.
df2 <- transform(df1, id = paste0(id, sex))

chuxinyuan · 2020年1月2日

# 源数据
c1 <- c('a', 2015, 300)
c2 <- c('a', 2016, 400)
c3 <- c('b', 2015, 700)
c4 <- c('b', 2016, 600)
dt <- data.frame(rbind(c1, c2, c3, c4), stringsAsFactors = FALSE)
names(dt) <- c('name', 'date', 'amount')

library(dplyr)
library(tidyr)
dt %>% 
  group_by(name) %>% 
  filter(amount == max(amount)) %>% 
  unite("merged", date, amount, sep = "-") -> dt2
dt2

frankzhang21 · 2020年1月2日

dhfly

去掉sep = "."

separate(data = ex, col =id, into = c("q", "w")) 即可

如果你一定要手动指定，

# Split `id` into columns q and w on a literal dot (escaped because `sep` is
# interpreted as a regular expression).
separate(ex, id, into = c("q", "w"), sep = "\\.")

报错是因为传入的 `sep` 会被当作正则表达式，而 `.` 在正则表达式中匹配任意字符（换行符除外）。

默认的sep会match任何non-alphanumeric 的字符，绝大多数情况下使用默认的即可。

sep
Separator between columns.

If character, is interpreted as a regular expression. The default value is a regular expression that matches any sequence of non-alphanumeric values.

zpmu · 2020年1月2日

借用上面的例子

library(tidyverse)

c1 <- c("a", 2015, 300)
c2 <- c("a", 2016, 400)
c3 <- c("b", 2015, 700)
c4 <- c("b", 2016, 600)
dt <- data.frame(rbind(c1, c2, c3, c4), stringsAsFactors = FALSE)
names(dt) <- c("name", "date", "amount")
# c() coerced every field to character, so max(amount) would compare strings
# lexicographically (e.g. "1000" < "700"). Restore numeric columns first.
dt <- type.convert(dt, as.is = TRUE)

# Per name, keep the row(s) with the largest amount and add a "date-amount"
# column alongside the original ones.
new.dt <- dt %>%
  group_by(name) %>%
  filter(amount == max(amount)) %>%
  mutate(merged = paste(date, amount, sep = "-")) %>%
  ungroup()

frankzhang21 · 2020年1月1日

来一个data.table的写法


library(data.table)

c1 <- c("a", 2015, 300)
c2 <- c("a", 2016, 400)
c3 <- c("b", 2015, 700)
c4 <- c("b", 2016, 600)
dt <- data.frame(rbind(c1, c2, c3, c4), stringsAsFactors = FALSE)
names(dt) <- c("name", "date", "amount")
# c() coerced every field to character, so max(amount) would compare strings
# lexicographically (e.g. "1000" < "700"). Restore numeric columns first.
dt <- type.convert(dt, as.is = TRUE)

setDT(dt)

dt
#>    name date amount
#> 1:    a 2015    300
#> 2:    a 2016    400
#> 3:    b 2015    700
#> 4:    b 2016    600

# In one step: within each name, keep the row(s) with the largest amount and
# return date and amount pasted together as `merged`.
dt2 <- dt[, .SD[amount == max(amount), .(merged = paste0(date, "-", amount))],
          by = .(name)]

dt2
#>    name   merged
#> 1:    a 2016-400
#> 2:    b 2015-700

Created on 2020-01-02 by the reprex package (v0.3.0)

dapengde · 2020年1月1日

为了避免中文支持的隐患，我把代码里的中文都改成英文，意思到了即可。

没有使用其他包。

# 源数据
c1 <- c('a', 2015, 300)
c2 <- c('a', 2016, 400)
c3 <- c('b', 2015, 700)
c4 <- c('b', 2016, 600)
dt <- data.frame(rbind(c1, c2, c3, c4), stringsAsFactors = FALSE)
names(dt) <- c('name', 'date', 'amount')
dt$merged <- paste0(dt$date, '-', dt$amount)

# 计算
tb <- tapply(dt$merged, dt$name, function(x) x[which.max(substr(x, 6, nchar(x)))])
tb
dt2 <- data.frame(name = names(tb), merged = tb)
dt2

dapengde · 2019年1月5日

beginr 包开发版 (github: pzhaonet/beginr) 里有个函数，批量读入指定文件夹里的所有结构相同的文本文件，作为 list 或 data.frame ，而源文件的文件名作为 list 里的 name 或者 data.frame 的新列来存放：

beginr::readdir()

并没有读成三维数组。我觉得作为 list 或 data.frame 保存更为直观一些，后续处理也更方便，尤其是存为 data.frame 时，源文件名作为新列，这样就可以直接用 tapply() 以及 tidyr 的 gather() 和 spread() 函数折腾了。

tctcab · 2019年1月5日

可以

手上有段现成代码：

# list files with pattern
# list.files() interprets `pattern` as a regular expression, not a shell glob:
# as a regex, "fix3_1.61*" means "fix3_1", any one character, "6", then zero
# or more "1". glob2rx() converts the wildcard form into the equivalent regex.
# NOTE(review): assumes the pattern was meant as a glob (names starting with
# "fix3_1.61") -- confirm against the files in the directory.
filel <- list.files("../dataset/out/",
                    pattern = utils::glob2rx("fix3_1.61*"),
                    full.names = TRUE)
# read all files into a list, each element is a data.frame
data.l <- lapply(filel, data.table::fread)