使用 group by 或聚合函数删除变量
Removing variables with group by or aggregate functions
我正在尝试从按分类变量分组的观察子集中删除异常值。这样我就可以绘制没有异常值的箱线图,并获得新数据集的 t-stat。
我尝试了 'group by' 和 data.table 以及列表聚合。但是,考虑到整个数据集,总是会删除异常值。不是来自每个子集。
这是数据集的一部分。有 40 个列变量和 62 个观测值
> dput(head(dat, 30))
structure(list(Treatment = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("SHAM+vehicle", "TAC+vehicle",
"TAC+relaxin", "TAC+Enalapril"), class = "factor"), Comparison = c("TAC(4hrs)+vehicle",
"TAC(4hrs)+vehicle", "TAC(4hrs)+vehicle", "TAC(4hrs)+vehicle",
"TAC(4hrs)+vehicle", "TAC(4hrs)+vehicle", "TAC(4hrs)+vehicle",
"TAC(4hrs)+vehicle", "TAC(4hrs)+vehicle", "TAC(4hrs)+relaxin",
"TAC(4hrs)+relaxin", "TAC(4hrs)+relaxin", "TAC(4hrs)+relaxin",
"TAC(4hrs)+relaxin", "TAC(4hrs)+relaxin", "TAC(4hrs)+relaxin",
"TAC(4hrs)+relaxin", "TAC(4hrs)+relaxin", "SHAM(10hrs)+vehicle",
"SHAM(10hrs)+vehicle", "SHAM(10hrs)+vehicle", "SHAM(10hrs)+vehicle",
"SHAM(10hrs)+vehicle", "SHAM(10hrs)+vehicle", "SHAM(10hrs)+vehicle",
"SHAM(10hrs)+vehicle", "SHAM(10hrs)+vehicle", "TAC(10hrs)+vehicle",
"TAC(10hrs)+vehicle", "TAC(10hrs)+vehicle"), Mode = c("Prevention",
"Prevention", "Prevention", "Prevention", "Prevention", "Prevention",
"Prevention", "Prevention", "Prevention", "Prevention", "Prevention",
"Prevention", "Prevention", "Prevention", "Prevention", "Prevention",
"Prevention", "Prevention", "Intervention", "Intervention", "Intervention",
"Intervention", "Intervention", "Intervention", "Intervention",
"Intervention", "Intervention", "Intervention", "Intervention",
"Intervention"), `Adiponectin/Acrp30` = c(1300000, 650000, 650000,
650000, 1300000, 1300000, 1300000, 1300000, 1300000, 650000,
650000, 650000, 650000, 650000, 1300000, 1300000, 1300000, 1300000,
650000, 650000, 650000, 650000, 1300000, 650000, 650000, 1300000,
1300000, 650000, 1300000, 650000), CRP = c(10666575, 3785850,
3876595, 6287075, 5612955, 4544670, 9467470, 5632695, 8817655,
4273610, 3560300, 10077690, 6504345, 4233480, 5425300, 2193250,
6704455, 7838805, 5144890, 3636160, 4183640, 8913940, 3345130,
4063455, 3823415, 8426135, 5877360, 5499595, 6996230, 2830510
), `Cystatin C` = c(565000, 565000, 565000, 565000, 565000, 565000,
565000, 565000, 565000, 565000, 565000, 565000, 565000, 565000,
565000, 565000, 565000, 565000, 565000, 565000, 565000, 565000,
565000, 565000, 565000, 565000, 565000, 565000, 565000, 565000
), `Endoglin/CD105` = c(5460.36, 2405.94, 2613.33, 1249.04, 3545.37,
2152.72, 1769.2, 695.94, 956.65, 1958.48, 3842.39, 3963.14, 1288.27,
1046.94, 1097.09, 2377.61, 1858.56, 513.67, 1200.51, 2246.9,
2907.68, 1632.56, 892.39, 988.96, 746.25, 682.59, 327.2, 1601.98,
361.54, 692.6), Endostatin = c(29667.6, 22750.32, 21733.44, 23829.04,
20203.12, 14614.88, 17822.56, 23132.24, 20265.84, 17495.76, 27424.16,
17635.44, 22257.68, 34155.44, 16857.52, 18949.6, 25434.64, 22701.36,
18186.16, 24013.12, 14673.92, 14092.4, 26438.4, 18384.4, 19220.96,
18781.52, 19844.08, 23242.96, 23037.2, 22040.24), `FABP4/A-FABP` = c(2389.37,
1143.58, 862.57, 376.15, 1368.68, 649.46, 370.47, 243.43, 378.48,
605.82, 1458.3, 588.77, 616.45, 390.36, 403.54, 603.54, 804.06,
244.41, 1025.16, 602.67, 948.18, 292.27, 260.56, 259.61, 243.58,
240.89, 314.22, 395.73, 304.18, 836.27), `Fas (APO-1)` = c(24.57,
10.13, 11.63, 1.25, 14.74, 1.25, 1.25, 1.25, 1.25, 1.25, 14.63,
6.95, 1.25, 1.25, 1.25, 1.25, 2.5, 1.25, 15.27, 5.68, 8.22, 1.25,
1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 4.42), `FGF-21` = c(136.07,
233.66, 63.28, 99.6, 190.43, 54.54, 141.27, 104.86, 136.07, 131.03,
155.04, 75.54, 130.17, 191.02, 264.49, 97.75, 216.12, 204.42,
431.37, 62.15, 90.38, 47.5, 74.84, 144.45, 88.4, 181.26, 232.14,
128.01, 129.74, 771.73), `FGF-23` = c(244.06, 108.41, 140.06,
168.71, 113.96, 129.91, 274.24, 135.03, 277.9, 168.71, 216.2,
220.28, 207.95, 216.2, 129.91, 164.1, 111.2, 228.33, 276.07,
159.42, 199.54, 145.01, 263.1, 238.22, 195.27, 124.7, 207.95,
145.01, 51.94, 212.09)........
代码如下
dat_o = dat
setDT(dat_o)
for (j in col_names){
dat_o[, (j) := lapply(.SD, function(x) ifelse(!x %in% boxplot.stats(dat_o[[j]])$out, x, NA)),
by = Comparison, .SDcols = j]
}
#aggregate function
aggregate(dat_o[[j]], by=list(dat_o$Comparison),
FUN= function(x) ifelse(!x %in% boxplot.stats(dat_o[[j]])$out, x, NA))
问题出在哪里?感谢您解决此问题的见解和新颖想法。
您在 function(x)
中使用 dat_o[[..]]
始终使用整个框架,而不仅仅是您打算使用的 subset/group。另外,不需要使用for
循环,我们可以使用.SDcols
。我将用 mtcars
:
进行演示
library(data.table)
MT <- as.data.table(mtcars)
cols <- c("hp", "wt", "qsec")
MT[, (cols) := lapply(.SD, function(z) fifelse(z %in% boxplot.stats(z)$out, z[NA], z)),
.SDcols = cols][]
# mpg cyl disp hp drat wt qsec vs am gear carb
# <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num>
# 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
# 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
# 3: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
# 4: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
# 5: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
# 6: 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
# 7: 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
# 8: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
# 9: 22.8 4 140.8 95 3.92 3.150 NA 1 0 4 2
# 10: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
# 11: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
# 12: 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
# 13: 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
# 14: 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
# 15: 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
# 16: 10.4 8 460.0 215 3.00 NA 17.82 0 0 3 4
# 17: 14.7 8 440.0 230 3.23 NA 17.42 0 0 3 4
# 18: 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
# 19: 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
# 20: 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
# 21: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
# 22: 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
# 23: 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
# 24: 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
# 25: 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
# 26: 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
# 27: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
# 28: 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
# 29: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
# 30: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
# 31: 15.0 8 301.0 NA 3.54 3.570 14.60 0 1 5 8
# 32: 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# mpg cyl disp hp drat wt qsec vs am gear carb
仅供参考:我使用 z[NA]
而不是 NA
因为 fifelse
强制 yes=
和 no=
参数必须严格相同class
;孤立的 NA
在技术上属于 class logical
(至少有六种 NA
,仅供参考),但 z[NA]
将始终 return NA
中的适当 class 需要满足 fifelse
。 (dplyr::if_else
是同样的方式。我认为 base::ifelse
一个 小 草率......也许更宽容......因为不强制执行这一点,尽管它可能导致如果您没有预料到或没有做好准备,就会有惊喜。)
(此方法也可应用于基本方法或 dplyr
方法。)
基础 R
这是 ave
的一种方式。请注意,ave
returns 与被分组的向量具有相同 class 的向量,在本例中为数字向量,因此在子集化中它被强制为逻辑。
i <- with(dat, ave(j, Comparison, FUN = function(x){
!x %in% boxplot.stats(x)$out
}))
dat[as.logical(i), ]
套餐data.table
技巧是,像上面一样,在 j
上创建一个逻辑索引,按 Comparison
分组,然后在该索引上创建子集。但是索引是以不同的方式创建的。
library(data.table)
dat_o <- dat
setDT(dat_o)
# This returns a logical index
dat_o[, sapply(.SD, function(x) !x %in% boxplot.stats(x)$out),
by = Comparison, .SDcols = 'j'][[2]]
现在子集使用索引。
dat_o[dat_o[, sapply(.SD, function(x) !x %in% boxplot.stats(x)$out),
by = Comparison, .SDcols = 'j'][[2]], ]
nrow(dat_o)
#[1] 200
但它并没有改变 data.table,它只选择了 TRUE
行。结果必须赋值回dat_o
.
dat_o <- dat_o[dat_o[, sapply(.SD, function(x) !x %in% boxplot.stats(x)$out),
by = Comparison, .SDcols = 'j'][[2]], ]
nrow(dat_o)
#[1] 192
测试数据创建代码。
set.seed(2021)
n <- 100
x <- rnorm(n)
y <- rnorm(n, mean = 20)
x[sample(n, 3)] <- 11:13
y[sample(n, 3)] <- 101:103
boxplot.stats(x)$out
#[1] 13 12 11
boxplot.stats(y)$out
#[1] 17.29928 17.31704 102.00000 101.00000 103.00000
Comparison <- rep(c("A", "B"), each = n)
j <- c(x, y)
dat <- data.frame(Comparison, j)
也许我也会添加我的解决方案。
首先,让我们生成带有异常值的数据。
library(tidyverse)
nrow=100
ncol=10
df = tibble(group = rep(1:ncol, each=nrow) %>% factor(),
x = sample(c(-20:20, rnorm(nrow*ncol)), nrow*ncol))
df %>% ggplot(aes(group, x, fill=group))+
geom_boxplot()
现在让我们做一个小聪明的 f2
函数,将异常值数据转换为 NA
值
f2 = function(data) ifelse(data$x %in% boxplot.stats(data$x)$out, NA, data$x)
是时候使用我们聪明的 f2
功能了
df %>% group_by(group) %>%
nest() %>%
mutate(data = map(data, f2)) %>%
unnest(data) %>%
ggplot(aes(group, data, fill=group))+
geom_boxplot()
看起来非常优雅和简单。
或者,也许您想计算这些准备好的数据的统计数据(没有异常值)?没有比这更简单的了。见下文。
fstat = function(x) tibble(
mean = mean(x, na.rm = TRUE),
sd = sd(x, na.rm = TRUE),
median = median(x, na.rm = TRUE)
)
df %>% group_by(group) %>%
nest() %>%
mutate(data = map(data, f2),
stat = map(data, fstat)) %>%
unnest(stat)
输出
# A tibble: 10 x 5
# Groups: group [10]
group data mean sd median
<fct> <list> <dbl> <dbl> <dbl>
1 1 <dbl [100]> 0.0140 0.886 0.0513
2 2 <dbl [100]> 0.0398 1.11 -0.00458
3 3 <dbl [100]> -0.00975 1.22 0.00258
4 4 <dbl [100]> 0.0179 1.01 -0.0242
5 5 <dbl [100]> 0.0859 0.928 0.160
6 6 <dbl [100]> -0.0374 1.01 -0.00938
7 7 <dbl [100]> -0.0451 0.945 -0.0277
8 8 <dbl [100]> 0.0330 1.06 -0.0535
9 9 <dbl [100]> 0.103 0.964 0.0577
10 10 <dbl [100]> 0.112 1.08 0.0610
我正在尝试从按分类变量分组的观察子集中删除异常值。这样我就可以绘制没有异常值的箱线图,并获得新数据集的 t-stat。
我尝试了 'group by' 和 data.table 以及列表聚合。但是,考虑到整个数据集,总是会删除异常值。不是来自每个子集。
这是数据集的一部分。有 40 个列变量和 62 个观测值
> dput(head(dat, 30))
structure(list(Treatment = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("SHAM+vehicle", "TAC+vehicle",
"TAC+relaxin", "TAC+Enalapril"), class = "factor"), Comparison = c("TAC(4hrs)+vehicle",
"TAC(4hrs)+vehicle", "TAC(4hrs)+vehicle", "TAC(4hrs)+vehicle",
"TAC(4hrs)+vehicle", "TAC(4hrs)+vehicle", "TAC(4hrs)+vehicle",
"TAC(4hrs)+vehicle", "TAC(4hrs)+vehicle", "TAC(4hrs)+relaxin",
"TAC(4hrs)+relaxin", "TAC(4hrs)+relaxin", "TAC(4hrs)+relaxin",
"TAC(4hrs)+relaxin", "TAC(4hrs)+relaxin", "TAC(4hrs)+relaxin",
"TAC(4hrs)+relaxin", "TAC(4hrs)+relaxin", "SHAM(10hrs)+vehicle",
"SHAM(10hrs)+vehicle", "SHAM(10hrs)+vehicle", "SHAM(10hrs)+vehicle",
"SHAM(10hrs)+vehicle", "SHAM(10hrs)+vehicle", "SHAM(10hrs)+vehicle",
"SHAM(10hrs)+vehicle", "SHAM(10hrs)+vehicle", "TAC(10hrs)+vehicle",
"TAC(10hrs)+vehicle", "TAC(10hrs)+vehicle"), Mode = c("Prevention",
"Prevention", "Prevention", "Prevention", "Prevention", "Prevention",
"Prevention", "Prevention", "Prevention", "Prevention", "Prevention",
"Prevention", "Prevention", "Prevention", "Prevention", "Prevention",
"Prevention", "Prevention", "Intervention", "Intervention", "Intervention",
"Intervention", "Intervention", "Intervention", "Intervention",
"Intervention", "Intervention", "Intervention", "Intervention",
"Intervention"), `Adiponectin/Acrp30` = c(1300000, 650000, 650000,
650000, 1300000, 1300000, 1300000, 1300000, 1300000, 650000,
650000, 650000, 650000, 650000, 1300000, 1300000, 1300000, 1300000,
650000, 650000, 650000, 650000, 1300000, 650000, 650000, 1300000,
1300000, 650000, 1300000, 650000), CRP = c(10666575, 3785850,
3876595, 6287075, 5612955, 4544670, 9467470, 5632695, 8817655,
4273610, 3560300, 10077690, 6504345, 4233480, 5425300, 2193250,
6704455, 7838805, 5144890, 3636160, 4183640, 8913940, 3345130,
4063455, 3823415, 8426135, 5877360, 5499595, 6996230, 2830510
), `Cystatin C` = c(565000, 565000, 565000, 565000, 565000, 565000,
565000, 565000, 565000, 565000, 565000, 565000, 565000, 565000,
565000, 565000, 565000, 565000, 565000, 565000, 565000, 565000,
565000, 565000, 565000, 565000, 565000, 565000, 565000, 565000
), `Endoglin/CD105` = c(5460.36, 2405.94, 2613.33, 1249.04, 3545.37,
2152.72, 1769.2, 695.94, 956.65, 1958.48, 3842.39, 3963.14, 1288.27,
1046.94, 1097.09, 2377.61, 1858.56, 513.67, 1200.51, 2246.9,
2907.68, 1632.56, 892.39, 988.96, 746.25, 682.59, 327.2, 1601.98,
361.54, 692.6), Endostatin = c(29667.6, 22750.32, 21733.44, 23829.04,
20203.12, 14614.88, 17822.56, 23132.24, 20265.84, 17495.76, 27424.16,
17635.44, 22257.68, 34155.44, 16857.52, 18949.6, 25434.64, 22701.36,
18186.16, 24013.12, 14673.92, 14092.4, 26438.4, 18384.4, 19220.96,
18781.52, 19844.08, 23242.96, 23037.2, 22040.24), `FABP4/A-FABP` = c(2389.37,
1143.58, 862.57, 376.15, 1368.68, 649.46, 370.47, 243.43, 378.48,
605.82, 1458.3, 588.77, 616.45, 390.36, 403.54, 603.54, 804.06,
244.41, 1025.16, 602.67, 948.18, 292.27, 260.56, 259.61, 243.58,
240.89, 314.22, 395.73, 304.18, 836.27), `Fas (APO-1)` = c(24.57,
10.13, 11.63, 1.25, 14.74, 1.25, 1.25, 1.25, 1.25, 1.25, 14.63,
6.95, 1.25, 1.25, 1.25, 1.25, 2.5, 1.25, 15.27, 5.68, 8.22, 1.25,
1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 4.42), `FGF-21` = c(136.07,
233.66, 63.28, 99.6, 190.43, 54.54, 141.27, 104.86, 136.07, 131.03,
155.04, 75.54, 130.17, 191.02, 264.49, 97.75, 216.12, 204.42,
431.37, 62.15, 90.38, 47.5, 74.84, 144.45, 88.4, 181.26, 232.14,
128.01, 129.74, 771.73), `FGF-23` = c(244.06, 108.41, 140.06,
168.71, 113.96, 129.91, 274.24, 135.03, 277.9, 168.71, 216.2,
220.28, 207.95, 216.2, 129.91, 164.1, 111.2, 228.33, 276.07,
159.42, 199.54, 145.01, 263.1, 238.22, 195.27, 124.7, 207.95,
145.01, 51.94, 212.09)........
代码如下
dat_o = dat
setDT(dat_o)
for (j in col_names){
dat_o[, (j) := lapply(.SD, function(x) ifelse(!x %in% boxplot.stats(dat_o[[j]])$out, x, NA)),
by = Comparison, .SDcols = j]
}
#aggregate function
aggregate(dat_o[[j]], by=list(dat_o$Comparison),
FUN= function(x) ifelse(!x %in% boxplot.stats(dat_o[[j]])$out, x, NA))
问题出在哪里?感谢您解决此问题的见解和新颖想法。
您在 function(x)
中使用 dat_o[[..]]
始终使用整个框架,而不仅仅是您打算使用的 subset/group。另外,不需要使用for
循环,我们可以使用.SDcols
。我将用 mtcars
:
library(data.table)
MT <- as.data.table(mtcars)
cols <- c("hp", "wt", "qsec")
MT[, (cols) := lapply(.SD, function(z) fifelse(z %in% boxplot.stats(z)$out, z[NA], z)),
.SDcols = cols][]
# mpg cyl disp hp drat wt qsec vs am gear carb
# <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num>
# 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
# 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
# 3: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
# 4: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
# 5: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
# 6: 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
# 7: 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
# 8: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
# 9: 22.8 4 140.8 95 3.92 3.150 NA 1 0 4 2
# 10: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
# 11: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
# 12: 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
# 13: 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
# 14: 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
# 15: 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
# 16: 10.4 8 460.0 215 3.00 NA 17.82 0 0 3 4
# 17: 14.7 8 440.0 230 3.23 NA 17.42 0 0 3 4
# 18: 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
# 19: 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
# 20: 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
# 21: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
# 22: 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
# 23: 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
# 24: 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
# 25: 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
# 26: 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
# 27: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
# 28: 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
# 29: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
# 30: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
# 31: 15.0 8 301.0 NA 3.54 3.570 14.60 0 1 5 8
# 32: 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# mpg cyl disp hp drat wt qsec vs am gear carb
仅供参考:我使用 z[NA]
而不是 NA
因为 fifelse
强制 yes=
和 no=
参数必须严格相同class
;孤立的 NA
在技术上属于 class logical
(至少有六种 NA
,仅供参考),但 z[NA]
将始终 return NA
中的适当 class 需要满足 fifelse
。 (dplyr::if_else
是同样的方式。我认为 base::ifelse
一个 小 草率......也许更宽容......因为不强制执行这一点,尽管它可能导致如果您没有预料到或没有做好准备,就会有惊喜。)
(此方法也可应用于基本方法或 dplyr
方法。)
基础 R
这是 ave
的一种方式。请注意,ave
returns 与被分组的向量具有相同 class 的向量,在本例中为数字向量,因此在子集化中它被强制为逻辑。
i <- with(dat, ave(j, Comparison, FUN = function(x){
!x %in% boxplot.stats(x)$out
}))
dat[as.logical(i), ]
套餐data.table
技巧是,像上面一样,在 j
上创建一个逻辑索引,按 Comparison
分组,然后在该索引上创建子集。但是索引是以不同的方式创建的。
library(data.table)
dat_o <- dat
setDT(dat_o)
# This returns a logical index
dat_o[, sapply(.SD, function(x) !x %in% boxplot.stats(x)$out),
by = Comparison, .SDcols = 'j'][[2]]
现在子集使用索引。
dat_o[dat_o[, sapply(.SD, function(x) !x %in% boxplot.stats(x)$out),
by = Comparison, .SDcols = 'j'][[2]], ]
nrow(dat_o)
#[1] 200
但它并没有改变 data.table,它只选择了 TRUE
行。结果必须赋值回dat_o
.
dat_o <- dat_o[dat_o[, sapply(.SD, function(x) !x %in% boxplot.stats(x)$out),
by = Comparison, .SDcols = 'j'][[2]], ]
nrow(dat_o)
#[1] 192
测试数据创建代码。
set.seed(2021)
n <- 100
x <- rnorm(n)
y <- rnorm(n, mean = 20)
x[sample(n, 3)] <- 11:13
y[sample(n, 3)] <- 101:103
boxplot.stats(x)$out
#[1] 13 12 11
boxplot.stats(y)$out
#[1] 17.29928 17.31704 102.00000 101.00000 103.00000
Comparison <- rep(c("A", "B"), each = n)
j <- c(x, y)
dat <- data.frame(Comparison, j)
也许我也会添加我的解决方案。 首先,让我们生成带有异常值的数据。
library(tidyverse)
nrow=100
ncol=10
df = tibble(group = rep(1:ncol, each=nrow) %>% factor(),
x = sample(c(-20:20, rnorm(nrow*ncol)), nrow*ncol))
df %>% ggplot(aes(group, x, fill=group))+
geom_boxplot()
f2
函数,将异常值数据转换为 NA
值
f2 = function(data) ifelse(data$x %in% boxplot.stats(data$x)$out, NA, data$x)
是时候使用我们聪明的 f2
功能了
df %>% group_by(group) %>%
nest() %>%
mutate(data = map(data, f2)) %>%
unnest(data) %>%
ggplot(aes(group, data, fill=group))+
geom_boxplot()
看起来非常优雅和简单。 或者,也许您想计算这些准备好的数据的统计数据(没有异常值)?没有比这更简单的了。见下文。
fstat = function(x) tibble(
mean = mean(x, na.rm = TRUE),
sd = sd(x, na.rm = TRUE),
median = median(x, na.rm = TRUE)
)
df %>% group_by(group) %>%
nest() %>%
mutate(data = map(data, f2),
stat = map(data, fstat)) %>%
unnest(stat)
输出
# A tibble: 10 x 5
# Groups: group [10]
group data mean sd median
<fct> <list> <dbl> <dbl> <dbl>
1 1 <dbl [100]> 0.0140 0.886 0.0513
2 2 <dbl [100]> 0.0398 1.11 -0.00458
3 3 <dbl [100]> -0.00975 1.22 0.00258
4 4 <dbl [100]> 0.0179 1.01 -0.0242
5 5 <dbl [100]> 0.0859 0.928 0.160
6 6 <dbl [100]> -0.0374 1.01 -0.00938
7 7 <dbl [100]> -0.0451 0.945 -0.0277
8 8 <dbl [100]> 0.0330 1.06 -0.0535
9 9 <dbl [100]> 0.103 0.964 0.0577
10 10 <dbl [100]> 0.112 1.08 0.0610