重新组合列值低于 x 的数据框的行

Regroup rows of a data frame for which a column value is less than x

我有这个数据框:

> df
    Z freq        proba
1  17    1 0.0033289263
2  18    4 0.0055569026
3  19    2 0.0087878028
4  20    3 0.0132023556
5  21   16 0.0188900561
6  22   12 0.0257995234
7  23   30 0.0337042731
8  24   41 0.0421963455
9  25   56 0.0507149437
10 26   65 0.0586089198
11 27   65 0.0652230449
12 28   93 0.0699913154
13 29   82 0.0725182432
14 30   94 0.0726318551
15 31   72 0.0703990113
16 32   74 0.0661024717
17 33   58 0.0601873020
18 34   66 0.0531896431
19 35   38 0.0456625487
20 36   45 0.0381117389
21 37   27 0.0309498221
22 38   17 0.0244723502
23 39   15 0.0188543771
24 40   13 0.0141629367
25 41    4 0.0103793600
26 42    1 0.0074254435
27 43    2 0.0051886582
28 45    1 0.0023658767
29 46    1 0.0015453804
30 49    2 0.0003792308

# Here is my data:
> dput(df)
structure(list(Z = c(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 
43, 45, 46, 49), freq = c(1, 4, 2, 3, 16, 12, 30, 41, 56, 65, 
65, 93, 82, 94, 72, 74, 58, 66, 38, 45, 27, 17, 15, 13, 4, 1, 
2, 1, 1, 2), proba = c(0.0033289262662263, 0.00555690264007235, 
0.00878780282243439, 0.0132023555702843, 0.0188900560866825, 
0.0257995234198431, 0.0337042730520012, 0.0421963455163949, 0.0507149437492447, 
0.0586089198012906, 0.0652230449359029, 0.0699913153996099, 0.0725182432348992, 
0.0726318551493006, 0.0703990113442269, 0.0661024716831246, 0.0601873020200862, 
0.0531896430528685, 0.045662548708844, 0.0381117389181843, 0.030949822142559, 
0.0244723501557229, 0.01885437705459, 0.0141629366839816, 0.0103793599644779, 
0.00742544354411115, 0.00518865818999788, 0.00236587669133322, 
0.00154538036835848, 0.000379230768851682)), .Names = c("Z", 
"freq", "proba"), row.names = c(NA, -30L), class = "data.frame")

我想把 "freq" 值小于 5 的行与其后的行合并,只要后面的行的 "freq" 也小于 5 就继续合并。不确定我是否说清楚了,所以下面是我期望的输出:

> df2
   labels effectifs         pi
1   17;20        10 0.03087599
2      21        16 0.01889006
3      22        12 0.02579952
4      23        30 0.03370427
5      24        41 0.04219635
6      25        56 0.05071494
7      26        65 0.05860892
8      27        65 0.06522304
9      28        93 0.06999132
10     29        82 0.07251824
11     30        94 0.07263186
12     31        72 0.07039901
13     32        74 0.06610247
14     33        58 0.06018730
15     34        66 0.05318964
16     35        38 0.04566255
17     36        45 0.03811174
18     37        27 0.03094982
19     38        17 0.02447235
20     39        15 0.01885438
21     40        13 0.01416294
22  41;49        11 0.02728395

我用嵌套 while 做到了,但我发现这个解决方案非常痛苦,而且没有经过优化。

# Collapse every run of consecutive rows whose `freq` is below `min_freq` into
# a single row (summing `freq` and `proba`); rows at or above the threshold are
# kept one-to-one. The result is stored in `df2` with columns `labels`,
# `freqs` and `pi`.
#
# Unlike an index loop bounded by `i < nrow(df)`, this version also keeps the
# last row when its `freq` is >= `min_freq`, and works for a one-row `df`.
min_freq <- 5
is_small <- df$freq < min_freq

# A row opens a new group when it is not small itself, or when the row before
# it was not small (i.e. it is the first row of a run of small rows).
prev_small <- c(FALSE, head(is_small, -1L))
group_id <- cumsum(!is_small | !prev_small)

# Index of the first and last row of each group, in group order.
first_row <- which(!duplicated(group_id))
last_row <- which(!duplicated(group_id, fromLast = TRUE))

# Merged runs are labelled "first;last" (also when the run holds one row);
# rows that are not merged keep their own Z value as label.
labels <- ifelse(
  is_small[first_row],
  paste0(df$Z[first_row], ";", df$Z[last_row]),
  as.character(df$Z[first_row])
)

# group_id is increasing, so split() returns the groups in row order.
freqs <- unname(vapply(split(df$freq, group_id), sum, numeric(1)))
probas <- unname(vapply(split(df$proba, group_id), sum, numeric(1)))

# The output column is still called `pi`, but no variable named `pi` is
# created, so the built-in constant is not masked.
df2 <- data.frame(labels = labels, freqs = freqs, pi = probas)

我敢肯定还有更好的,也许是 dplyr。如果您有更好的解决方案..谢谢!

我们可以使用 "data.table" 的 "devel" 版本,因为它引入了一个新函数 (rleid)。在这里,我们将 "data.frame" 转换为 "data.table" (setDT(df)),并使用 rleid 根据逻辑索引 (freq < 5) 创建一个分组变量 ("gr")。"Z" 列是 numeric/integer 类型,因此先从 "Z" 创建一个字符列 ("Z1")。按 "gr" 分组后,如果该组所有行的 "freq" 都小于 5,就取各列的第一条观测 (.SD[1L]) 把这些行汇总为一行,去掉不需要的列(因为 .SD 中包含 "Z1",否则会导致列重复),再附加上 "Z1",其值是把该组 "Z" 的最小值和最大值粘贴在一起得到的。否则保持不变 (else .SD)。最后,通过将不需要的列赋值为 NULL 来删除它们。
# Built against the development version data.table 1.9.5, which provides
# rleid().
library(data.table) #data.table_1.9.5
# Steps of the chain below, in order:
#   1. setDT(df) turns df into a data.table.
#   2. gr := rleid(freq<5) adds a run id: consecutive rows with the same value
#      of (freq < 5) share one gr.
#   3. Z1 := as.character(Z) adds a character copy of Z.
#   4. Grouped by gr: when every freq in the group is < 5, take the first row
#      of .SD, drop its 4th column (Z1) and append a new Z1 holding
#      "min(Z), max(Z)"; otherwise the group's rows are returned unchanged.
#   5. 1:2 := NULL drops the first two columns of the result (gr and Z).
# NOTE(review): the collapsed row keeps the FIRST row's freq and proba (the
# printed result shows freq 1 for "17, 20"); it does not sum them, which is
# what the question's expected output shows. Confirm this is intended.
res <- setDT(df)[, gr:=rleid(freq<5)][, Z1:= as.character(Z)][, 
        if(all(freq<5)) c(.SD[1L][,-4, with=FALSE], 
          list(Z1=toString(c(min(Z), max(Z)))))
      else .SD, gr][,1:2 :=NULL][]
# Show the first three rows of the result.
head(res,3)
#   freq       proba     Z1
#1:    1 0.003328926 17, 20
#2:   16 0.018890056     21
#3:   12 0.025799523     22

由于问题中提到了 dplyr,这里有一个 dplyr 解决方案。首先,我使用一个分组函数来定义组(类似于 data.table 中的 rleid 函数)。然后进行汇总,相当简单。

#' Build group ids that merge consecutive `TRUE` elements
#'
#' Every run of consecutive `TRUE` values in `condition` receives one shared
#' id, while each `FALSE` element receives an id of its own. Ids are
#' consecutive integers starting at 1, in order of appearance.
#'
#' @param condition A logical vector.
#' @return An integer vector the same length as `condition`.
grouping <- function(condition) {
  # TRUE elements become 0 and FALSE elements become their own position, so
  # only runs of TRUE produce runs of equal values.
  run <- rle((!condition) * seq_along(condition))
  # Renumber the runs 1, 2, 3, ...
  run$values <- seq_along(run$values)
  # Expand the runs back to one id per element.
  inverse.rle(run)
}
# Load dplyr; library() stops with an error if the package is missing, whereas
# require() would only return FALSE and let the pipeline fail later.
library(dplyr)
df %>%
  mutate(group = grouping(freq < 5)) %>%         # one id per run of freq < 5
  group_by(group) %>%                            # group data
  summarize(freq = sum(freq),                    # total freq of the group
            proba = sum(proba),                  # total proba of the group
            Z = toString(unique(range(Z)))) %>%  # "min, max" of Z, or Z itself
  mutate(group = NULL)                           # drop the helper column
## Source: local data table [22 x 3]
## 
##    freq      proba      Z
## 1    10 0.03087599 17, 20
## 2    16 0.01889006     21
## 3    12 0.02579952     22
## 4    30 0.03370427     23
## 5    41 0.04219635     24
## 6    56 0.05071494     25
## 7    65 0.05860892     26
## 8    65 0.06522304     27
## 9    93 0.06999132     28
## 10   82 0.07251824     29
## ..  ...        ...    ...