在 R 中按特定的行间隔根据列值删除重复项

Question

我有如下多列数据。我想删除 depth 列中值重复的行。

      Date     Levels values depth
1   2005-12-31      1 182.80     0
2   2005-12-31      2 182.80     0
3   2005-12-31      5 182.80     2
4   2005-12-31      6 182.80     2
5   2005-12-31      7 182.80     2
6   2005-12-31      8 182.80     3
7   2005-12-31      9 182.80     4
8   2005-12-31     10 182.80     4
9   2005-12-31     11 182.80     5
10  2005-12-31     13 182.70     7
11  2005-12-31     14 182.70     8
12  2005-12-31     16 182.60    10
13  2005-12-31     17 182.50    12
14  2005-12-31     20 181.50    17
15  2005-12-31     23 177.50    23
16  2005-12-31     26 165.90    31
17  2005-12-31     28 155.00    36
18  2005-12-31     29 149.20    40
19  2005-12-31     31 136.90    46
20  2005-12-31     33 126.10    53
21  2005-12-31     35 112.70    60
22  2005-12-31     38  88.23    70
23  2005-12-31     41  67.99    79
24  2005-12-31     44  54.63    87
25  2005-12-31     49  45.40    98
26  2006-12-31      1 182.80     0
27  2006-12-31      2 182.80     0
28  2006-12-31      5 182.80     2
29  2006-12-31      6 182.80     2
30  2006-12-31      7 182.80     2
31  2006-12-31      8 182.80     3
32  2006-12-31      9 182.80     4
33  2006-12-31     10 182.80     4
34  2006-12-31     11 182.70     5
35  2006-12-31     13 182.70     7
36  2006-12-31     14 182.70     8
37  2006-12-31     16 182.60    10
38  2006-12-31     17 182.50    12
39  2006-12-31     20 181.50    17
40  2006-12-31     23 178.60    23
41  2006-12-31     26 168.70    31
42  2006-12-31     28 156.90    36
43  2006-12-31     29 150.40    40
44  2006-12-31     31 137.10    46
45  2006-12-31     33 126.00    53
46  2006-12-31     35 112.70    60
47  2006-12-31     38  91.80    70
48  2006-12-31     41  75.91    79
49  2006-12-31     44  65.17    87
50  2006-12-31     49  58.33    98

我知道如何根据某一列删除重复项，如下所示：

# Keep the first row for each distinct value of `column`, retaining all columns.
nodup <- df %>%
  distinct(column, .keep_all = TRUE)

但是我怎样才能以每 25 行为一组，分别对每一组执行此代码？

Answer 1

基础 R

# Cut the rows into consecutive 25-row blocks (block ids 0, 1, ...), keep the
# first row for each depth inside every block, then stack the blocks again.
do.call(
  rbind,
  lapply(
    split(dat, (seq_len(nrow(dat)) - 1) %/% 25),
    function(chunk) chunk[!duplicated(chunk$depth), ]
  )
)
#            Date Levels values depth
# 0.1  2005-12-31      1  182.8     0
# 0.3  2005-12-31      5  182.8     2
# 0.6  2005-12-31      8  182.8     3
# 0.7  2005-12-31      9  182.8     4
# 0.9  2005-12-31     11  182.8     5
# 0.10 2005-12-31     13  182.7     7
# 0.11 2005-12-31     14  182.7     8
# 0.12 2005-12-31     16  182.6    10
# 0.13 2005-12-31     17  182.5    12
# 0.14 2005-12-31     20  181.5    17
# 0.15 2005-12-31     23  177.5    23
# 0.16 2005-12-31     26  165.9    31
# 0.17 2005-12-31     28  155.0    36
# 0.18 2005-12-31     29  149.2    40
# 0.19 2005-12-31     31  136.9    46
# 0.20 2005-12-31     33  126.1    53
# 0.21 2005-12-31     35  112.7    60
# 0.22 2005-12-31     38   88.2    70
# 0.23 2005-12-31     41   68.0    79
# 0.24 2005-12-31     44   54.6    87
# 0.25 2005-12-31     49   45.4    98
# 1.26 2006-12-31      1  182.8     0
# 1.28 2006-12-31      5  182.8     2
# 1.31 2006-12-31      8  182.8     3
# 1.32 2006-12-31      9  182.8     4
# 1.34 2006-12-31     11  182.7     5
# 1.35 2006-12-31     13  182.7     7
# 1.36 2006-12-31     14  182.7     8
# 1.37 2006-12-31     16  182.6    10
# 1.38 2006-12-31     17  182.5    12
# 1.39 2006-12-31     20  181.5    17
# 1.40 2006-12-31     23  178.6    23
# 1.41 2006-12-31     26  168.7    31
# 1.42 2006-12-31     28  156.9    36
# 1.43 2006-12-31     29  150.4    40
# 1.44 2006-12-31     31  137.1    46
# 1.45 2006-12-31     33  126.0    53
# 1.46 2006-12-31     35  112.7    60
# 1.47 2006-12-31     38   91.8    70
# 1.48 2006-12-31     41   75.9    79
# 1.49 2006-12-31     44   65.2    87
# 1.50 2006-12-31     49   58.3    98

或

# ave() applies duplicated() within each 25-row block and hands the flags back
# as 0/1 in the same positions as depth; rows flagged 0 are first occurrences.
dat[
  ave(dat$depth, (seq_len(nrow(dat)) - 1) %/% 25, FUN = duplicated) == 0,
]

dplyr

library(dplyr)
# Tag each row with its 25-row block, keep the first row per depth within each
# block, then drop the grouping and the helper column.
dat %>%
  mutate(grp = (row_number() - 1) %/% 25) %>%
  group_by(grp) %>%
  filter(!duplicated(depth)) %>%
  ungroup() %>%
  select(-grp)
# # A tibble: 42 x 4
#    Date       Levels values depth
#    <chr>       <int>  <dbl> <int>
#  1 2005-12-31      1   183.     0
#  2 2005-12-31      5   183.     2
#  3 2005-12-31      8   183.     3
#  4 2005-12-31      9   183.     4
#  5 2005-12-31     11   183.     5
#  6 2005-12-31     13   183.     7
#  7 2005-12-31     14   183.     8
#  8 2005-12-31     16   183.    10
#  9 2005-12-31     17   182.    12
# 10 2005-12-31     20   182.    17
# # ... with 32 more rows

data.table

library(data.table)
# Per 25-row block, keep the first row for each depth. The by= expression adds
# the block counter as the first column, which the trailing [, -1] removes.
as.data.table(dat)[
  ,
  unique(.SD, by = "depth"),
  by = .((seq_len(nrow(dat)) - 1) %/% 25)
][, -1]

（最后的 [,-1] 是因为 by= 分组操作隐式地将 seq_len(.)... 计数器作为其第一列。）

（注意到其中共同的套路了吗？:-)）

数据

# Example data: two 25-row blocks dated 2005-12-31 and 2006-12-31.
# Levels and depth are identical in both blocks; only values differs.
dat <- local({
  lvl <- c(1L, 2L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 16L, 17L, 20L,
           23L, 26L, 28L, 29L, 31L, 33L, 35L, 38L, 41L, 44L, 49L)
  dep <- c(0L, 0L, 2L, 2L, 2L, 3L, 4L, 4L, 5L, 7L, 8L, 10L, 12L, 17L,
           23L, 31L, 36L, 40L, 46L, 53L, 60L, 70L, 79L, 87L, 98L)
  val_2005 <- c(rep(182.8, 9), 182.7, 182.7, 182.6, 182.5, 181.5, 177.5,
                165.9, 155, 149.2, 136.9, 126.1, 112.7, 88.23, 67.99,
                54.63, 45.4)
  val_2006 <- c(rep(182.8, 8), rep(182.7, 3), 182.6, 182.5, 181.5, 178.6,
                168.7, 156.9, 150.4, 137.1, 126, 112.7, 91.8, 75.91,
                65.17, 58.33)
  structure(
    list(
      Date = rep(c("2005-12-31", "2006-12-31"), each = 25L),
      Levels = rep(lvl, times = 2L),
      values = c(val_2005, val_2006),
      depth = rep(dep, times = 2L)
    ),
    class = "data.frame",
    # Character row names "1".."50", as in a dput() of the original frame.
    row.names = as.character(seq_len(50L))
  )
})

Answer 2

我们可以使用 order 和 !duplicated:

# Sort the rows by depth, then keep only the first row seen for each depth.
# `[[` extracts the column as a plain vector for both data frames and tibbles,
# so order() always receives a vector rather than a one-column table.
# NOTE(review): this de-duplicates across the whole table, not inside each
# 25-row block, so a depth that recurs in a later block is dropped as well.
df <- df[order(df[["depth"]]), ]
df <- df[!duplicated(df[["depth"]]), ]
df

   Date       Levels values depth
   <date>      <dbl>  <dbl> <dbl>
 1 2005-12-31      1  183.      0
 2 2005-12-31      5  183.      2
 3 2005-12-31      8  183.      3
 4 2005-12-31      9  183.      4
 5 2005-12-31     11  183.      5
 6 2005-12-31     13  183.      7
 7 2005-12-31     14  183.      8
 8 2006-12-31     49   58.3     9
 9 2005-12-31     16  183.     10
10 2005-12-31     17  182.     12
# … with 12 more rows

在 R 中按特定的行间隔根据列值删除重复项

Remove duplicates based on column values in specific intervals in R

r

duplicates

intervals

基础 R

dplyr

data.table