根据 R 中特定时间间隔内的列值删除重复项
Remove duplicates based on column values in specific intervals in R
我有如下多列数据。我想删除深度列中具有重复值的行。
Date Levels values depth
1 2005-12-31 1 182.80 0
2 2005-12-31 2 182.80 0
3 2005-12-31 5 182.80 2
4 2005-12-31 6 182.80 2
5 2005-12-31 7 182.80 2
6 2005-12-31 8 182.80 3
7 2005-12-31 9 182.80 4
8 2005-12-31 10 182.80 4
9 2005-12-31 11 182.80 5
10 2005-12-31 13 182.70 7
11 2005-12-31 14 182.70 8
12 2005-12-31 16 182.60 10
13 2005-12-31 17 182.50 12
14 2005-12-31 20 181.50 17
15 2005-12-31 23 177.50 23
16 2005-12-31 26 165.90 31
17 2005-12-31 28 155.00 36
18 2005-12-31 29 149.20 40
19 2005-12-31 31 136.90 46
20 2005-12-31 33 126.10 53
21 2005-12-31 35 112.70 60
22 2005-12-31 38 88.23 70
23 2005-12-31 41 67.99 79
24 2005-12-31 44 54.63 87
25 2005-12-31 49 45.40 98
26 2006-12-31 1 182.80 0
27 2006-12-31 2 182.80 0
28 2006-12-31 5 182.80 2
29 2006-12-31 6 182.80 2
30 2006-12-31 7 182.80 2
31 2006-12-31 8 182.80 3
32 2006-12-31 9 182.80 4
33 2006-12-31 10 182.80 4
34 2006-12-31 11 182.70 5
35 2006-12-31 13 182.70 7
36 2006-12-31 14 182.70 8
37 2006-12-31 16 182.60 10
38 2006-12-31 17 182.50 12
39 2006-12-31 20 181.50 17
40 2006-12-31 23 178.60 23
41 2006-12-31 26 168.70 31
42 2006-12-31 28 156.90 36
43 2006-12-31 29 150.40 40
44 2006-12-31 31 137.10 46
45 2006-12-31 33 126.00 53
46 2006-12-31 35 112.70 60
47 2006-12-31 38 91.80 70
48 2006-12-31 41 75.91 79
49 2006-12-31 44 65.17 87
50 2006-12-31 49 58.33 98
我知道如何根据列删除重复项,如下所示;
nodup<- distinct(df, column, .keep_all = TRUE)
但是我怎样才能每隔 25 行间隔执行一次此代码?
基础 R
do.call(rbind, by(dat, (seq_len(nrow(dat))-1) %/% 25,
function(z) z[!duplicated(z$depth),]))
# Date Levels values depth
# 0.1 2005-12-31 1 182.8 0
# 0.3 2005-12-31 5 182.8 2
# 0.6 2005-12-31 8 182.8 3
# 0.7 2005-12-31 9 182.8 4
# 0.9 2005-12-31 11 182.8 5
# 0.10 2005-12-31 13 182.7 7
# 0.11 2005-12-31 14 182.7 8
# 0.12 2005-12-31 16 182.6 10
# 0.13 2005-12-31 17 182.5 12
# 0.14 2005-12-31 20 181.5 17
# 0.15 2005-12-31 23 177.5 23
# 0.16 2005-12-31 26 165.9 31
# 0.17 2005-12-31 28 155.0 36
# 0.18 2005-12-31 29 149.2 40
# 0.19 2005-12-31 31 136.9 46
# 0.20 2005-12-31 33 126.1 53
# 0.21 2005-12-31 35 112.7 60
# 0.22 2005-12-31 38 88.2 70
# 0.23 2005-12-31 41 68.0 79
# 0.24 2005-12-31 44 54.6 87
# 0.25 2005-12-31 49 45.4 98
# 1.26 2006-12-31 1 182.8 0
# 1.28 2006-12-31 5 182.8 2
# 1.31 2006-12-31 8 182.8 3
# 1.32 2006-12-31 9 182.8 4
# 1.34 2006-12-31 11 182.7 5
# 1.35 2006-12-31 13 182.7 7
# 1.36 2006-12-31 14 182.7 8
# 1.37 2006-12-31 16 182.6 10
# 1.38 2006-12-31 17 182.5 12
# 1.39 2006-12-31 20 181.5 17
# 1.40 2006-12-31 23 178.6 23
# 1.41 2006-12-31 26 168.7 31
# 1.42 2006-12-31 28 156.9 36
# 1.43 2006-12-31 29 150.4 40
# 1.44 2006-12-31 31 137.1 46
# 1.45 2006-12-31 33 126.0 53
# 1.46 2006-12-31 35 112.7 60
# 1.47 2006-12-31 38 91.8 70
# 1.48 2006-12-31 41 75.9 79
# 1.49 2006-12-31 44 65.2 87
# 1.50 2006-12-31 49 58.3 98
或
dat[!ave(dat$depth, (seq_len(nrow(dat))-1) %/% 25, FUN = duplicated),]
dplyr
library(dplyr)
dat %>%
group_by(grp = (seq_len(n())-1) %/% 25) %>%
distinct(depth, .keep_all = TRUE) %>%
ungroup() %>%
select(-grp)
# # A tibble: 42 x 4
# Date Levels values depth
# <chr> <int> <dbl> <int>
# 1 2005-12-31 1 183. 0
# 2 2005-12-31 5 183. 2
# 3 2005-12-31 8 183. 3
# 4 2005-12-31 9 183. 4
# 5 2005-12-31 11 183. 5
# 6 2005-12-31 13 183. 7
# 7 2005-12-31 14 183. 8
# 8 2005-12-31 16 183. 10
# 9 2005-12-31 17 182. 12
# 10 2005-12-31 20 182. 17
# # ... with 32 more rows
data.table
library(data.table)
as.data.table(dat)[, .SD[!duplicated(depth),], by=.( (seq_len(nrow(dat))-1) %/% 25 ) ][,-1]
(最后的 [,-1]
是因为 by=
分组操作隐式地将 seq_len(.)...
计数器作为其第一列。)
(注意到主题了吗?:-)
数据
dat <- structure(list(Date = c("2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31"), Levels = c(1L, 2L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 16L, 17L, 20L, 23L, 26L, 28L, 29L, 31L, 33L, 35L, 38L, 41L, 44L, 49L, 1L, 2L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 16L, 17L, 20L, 23L, 26L, 28L, 29L, 31L, 33L, 35L, 38L, 41L, 44L, 49L), values = c(182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.7, 182.7, 182.6, 182.5, 181.5, 177.5, 165.9, 155, 149.2, 136.9, 126.1, 112.7, 88.23, 67.99, 54.63, 45.4, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.7, 182.7, 182.7, 182.6, 182.5, 181.5, 178.6, 168.7, 156.9, 150.4, 137.1, 126, 112.7, 91.8, 75.91, 65.17, 58.33), depth = c(0L, 0L, 2L, 2L, 2L, 3L, 4L, 4L, 5L, 7L, 8L, 10L, 12L, 17L, 23L, 31L, 36L, 40L, 46L, 53L, 60L, 70L, 79L, 87L, 98L, 0L, 0L, 2L, 2L, 2L, 3L, 4L, 4L, 5L, 7L, 8L, 10L, 12L, 17L, 23L, 31L, 36L, 40L, 46L, 53L, 60L, 70L, 79L, 87L, 98L)), class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", "50"))
我们可以使用 order
和 !duplicated
:
df = df[order(df[,'depth']),]
df = df[!duplicated(df$depth),]
df
Date Levels values depth
<date> <dbl> <dbl> <dbl>
1 2005-12-31 1 183. 0
2 2005-12-31 5 183. 2
3 2005-12-31 8 183. 3
4 2005-12-31 9 183. 4
5 2005-12-31 11 183. 5
6 2005-12-31 13 183. 7
7 2005-12-31 14 183. 8
8 2006-12-31 49 58.3 9
9 2005-12-31 16 183. 10
10 2005-12-31 17 182. 12
# … with 12 more rows
我有如下多列数据。我想删除深度列中具有重复值的行。
Date Levels values depth
1 2005-12-31 1 182.80 0
2 2005-12-31 2 182.80 0
3 2005-12-31 5 182.80 2
4 2005-12-31 6 182.80 2
5 2005-12-31 7 182.80 2
6 2005-12-31 8 182.80 3
7 2005-12-31 9 182.80 4
8 2005-12-31 10 182.80 4
9 2005-12-31 11 182.80 5
10 2005-12-31 13 182.70 7
11 2005-12-31 14 182.70 8
12 2005-12-31 16 182.60 10
13 2005-12-31 17 182.50 12
14 2005-12-31 20 181.50 17
15 2005-12-31 23 177.50 23
16 2005-12-31 26 165.90 31
17 2005-12-31 28 155.00 36
18 2005-12-31 29 149.20 40
19 2005-12-31 31 136.90 46
20 2005-12-31 33 126.10 53
21 2005-12-31 35 112.70 60
22 2005-12-31 38 88.23 70
23 2005-12-31 41 67.99 79
24 2005-12-31 44 54.63 87
25 2005-12-31 49 45.40 98
26 2006-12-31 1 182.80 0
27 2006-12-31 2 182.80 0
28 2006-12-31 5 182.80 2
29 2006-12-31 6 182.80 2
30 2006-12-31 7 182.80 2
31 2006-12-31 8 182.80 3
32 2006-12-31 9 182.80 4
33 2006-12-31 10 182.80 4
34 2006-12-31 11 182.70 5
35 2006-12-31 13 182.70 7
36 2006-12-31 14 182.70 8
37 2006-12-31 16 182.60 10
38 2006-12-31 17 182.50 12
39 2006-12-31 20 181.50 17
40 2006-12-31 23 178.60 23
41 2006-12-31 26 168.70 31
42 2006-12-31 28 156.90 36
43 2006-12-31 29 150.40 40
44 2006-12-31 31 137.10 46
45 2006-12-31 33 126.00 53
46 2006-12-31 35 112.70 60
47 2006-12-31 38 91.80 70
48 2006-12-31 41 75.91 79
49 2006-12-31 44 65.17 87
50 2006-12-31 49 58.33 98
我知道如何根据列删除重复项,如下所示;
nodup<- distinct(df, column, .keep_all = TRUE)
但是我怎样才能每隔 25 行间隔执行一次此代码?
基础 R
do.call(rbind, by(dat, (seq_len(nrow(dat))-1) %/% 25,
function(z) z[!duplicated(z$depth),]))
# Date Levels values depth
# 0.1 2005-12-31 1 182.8 0
# 0.3 2005-12-31 5 182.8 2
# 0.6 2005-12-31 8 182.8 3
# 0.7 2005-12-31 9 182.8 4
# 0.9 2005-12-31 11 182.8 5
# 0.10 2005-12-31 13 182.7 7
# 0.11 2005-12-31 14 182.7 8
# 0.12 2005-12-31 16 182.6 10
# 0.13 2005-12-31 17 182.5 12
# 0.14 2005-12-31 20 181.5 17
# 0.15 2005-12-31 23 177.5 23
# 0.16 2005-12-31 26 165.9 31
# 0.17 2005-12-31 28 155.0 36
# 0.18 2005-12-31 29 149.2 40
# 0.19 2005-12-31 31 136.9 46
# 0.20 2005-12-31 33 126.1 53
# 0.21 2005-12-31 35 112.7 60
# 0.22 2005-12-31 38 88.2 70
# 0.23 2005-12-31 41 68.0 79
# 0.24 2005-12-31 44 54.6 87
# 0.25 2005-12-31 49 45.4 98
# 1.26 2006-12-31 1 182.8 0
# 1.28 2006-12-31 5 182.8 2
# 1.31 2006-12-31 8 182.8 3
# 1.32 2006-12-31 9 182.8 4
# 1.34 2006-12-31 11 182.7 5
# 1.35 2006-12-31 13 182.7 7
# 1.36 2006-12-31 14 182.7 8
# 1.37 2006-12-31 16 182.6 10
# 1.38 2006-12-31 17 182.5 12
# 1.39 2006-12-31 20 181.5 17
# 1.40 2006-12-31 23 178.6 23
# 1.41 2006-12-31 26 168.7 31
# 1.42 2006-12-31 28 156.9 36
# 1.43 2006-12-31 29 150.4 40
# 1.44 2006-12-31 31 137.1 46
# 1.45 2006-12-31 33 126.0 53
# 1.46 2006-12-31 35 112.7 60
# 1.47 2006-12-31 38 91.8 70
# 1.48 2006-12-31 41 75.9 79
# 1.49 2006-12-31 44 65.2 87
# 1.50 2006-12-31 49 58.3 98
或
dat[!ave(dat$depth, (seq_len(nrow(dat))-1) %/% 25, FUN = duplicated),]
dplyr
library(dplyr)
dat %>%
group_by(grp = (seq_len(n())-1) %/% 25) %>%
distinct(depth, .keep_all = TRUE) %>%
ungroup() %>%
select(-grp)
# # A tibble: 42 x 4
# Date Levels values depth
# <chr> <int> <dbl> <int>
# 1 2005-12-31 1 183. 0
# 2 2005-12-31 5 183. 2
# 3 2005-12-31 8 183. 3
# 4 2005-12-31 9 183. 4
# 5 2005-12-31 11 183. 5
# 6 2005-12-31 13 183. 7
# 7 2005-12-31 14 183. 8
# 8 2005-12-31 16 183. 10
# 9 2005-12-31 17 182. 12
# 10 2005-12-31 20 182. 17
# # ... with 32 more rows
data.table
library(data.table)
as.data.table(dat)[, .SD[!duplicated(depth),], by=.( (seq_len(nrow(dat))-1) %/% 25 ) ][,-1]
(最后的 [,-1]
是因为 by=
分组操作隐式地将 seq_len(.)...
计数器作为其第一列。)
(注意到主题了吗?:-)
数据
dat <- structure(list(Date = c("2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31"), Levels = c(1L, 2L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 16L, 17L, 20L, 23L, 26L, 28L, 29L, 31L, 33L, 35L, 38L, 41L, 44L, 49L, 1L, 2L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 16L, 17L, 20L, 23L, 26L, 28L, 29L, 31L, 33L, 35L, 38L, 41L, 44L, 49L), values = c(182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.7, 182.7, 182.6, 182.5, 181.5, 177.5, 165.9, 155, 149.2, 136.9, 126.1, 112.7, 88.23, 67.99, 54.63, 45.4, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.7, 182.7, 182.7, 182.6, 182.5, 181.5, 178.6, 168.7, 156.9, 150.4, 137.1, 126, 112.7, 91.8, 75.91, 65.17, 58.33), depth = c(0L, 0L, 2L, 2L, 2L, 3L, 4L, 4L, 5L, 7L, 8L, 10L, 12L, 17L, 23L, 31L, 36L, 40L, 46L, 53L, 60L, 70L, 79L, 87L, 98L, 0L, 0L, 2L, 2L, 2L, 3L, 4L, 4L, 5L, 7L, 8L, 10L, 12L, 17L, 23L, 31L, 36L, 40L, 46L, 53L, 60L, 70L, 79L, 87L, 98L)), class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", "50"))
我们可以使用 order
和 !duplicated
:
df = df[order(df[,'depth']),]
df = df[!duplicated(df$depth),]
df
Date Levels values depth
<date> <dbl> <dbl> <dbl>
1 2005-12-31 1 183. 0
2 2005-12-31 5 183. 2
3 2005-12-31 8 183. 3
4 2005-12-31 9 183. 4
5 2005-12-31 11 183. 5
6 2005-12-31 13 183. 7
7 2005-12-31 14 183. 8
8 2006-12-31 49 58.3 9
9 2005-12-31 16 183. 10
10 2005-12-31 17 182. 12
# … with 12 more rows