使用 R 从 data.frame 中删除离群值
Remove outlier values from a data.frame using R
我有一个 data.frame,其中各列包含水质测量值。
我想剔除每一列中的离群值,并在原位置填入 NA(表示无数据)。
编辑:
我想按如下规则剔除离群值:
大于第 95 百分位数的值
和
小于第 5 百分位数的值
我该怎么做?
我有一个例子
# Example data: a label column (st) followed by four measurement columns.
# The first line of the text holds the column names; fields are separated
# by whitespace (sep = "").
df <- read.table(
  header = TRUE,
  sep = "",
  text = "st PH OD COD N
A 7.3 1.26301094 1.112359589 0.295842925
B 12.69875867 5.670646078 4.841748321 0.096958426
C 9.613564343 1.706277385 7.952266541 0.102672152
D 9.693461149 7.075560183 0.283503075 0.302494648
A 11.2031501 5.444756127 3.133271063 0.421172108
B 9.288552402 4.169068095 10.54049312 0.122900615
C 4.207333379 6.717653051 10.49073885 0.085634135
D 10.98593946 2.352068972 8.468436777 0.142284793
A 8.20679887 7.826764274 4.464242367 0.211200956
B 12.9165421 0.909886436 1.488358471 0.001640961
C 3.971088246 8.500668307 6.315208679 0.319835127
D 4.821068685 3.871082236 8.669284239 0.349317325
A 0.431563127 0.978922921 10.53756208 0.111929377
B 7.546887828 9.946840115 1.584013576 0.426681716
C 4.689617182 8.717656795 7.474709944 0.473463497
D 9.730568456 1.134763618 4.679810195 0.215744107
A 12.06381259 6.862549062 0.559497593 0.231984105
"
)
结合使用 apply、quantile 和 dplyr::na_if,您可以这样做:
# Coerce every column except the first (the `st` label) to numeric.
df[-1] <- apply(df[-1], 2, as.numeric)

# Pass 1: set values below each column's 5% quantile to NA.
df[-1] <- apply(df[-1], 2, function(x) {
  lower <- quantile(x, probs = 0.05, na.rm = TRUE)
  # na_if() requires `y` to have length 1 or length(x). Passing only the
  # outlying values breaks when a column has zero or several of them, so
  # build a full-length `y` that equals `x` only at the outliers.
  dplyr::na_if(x, ifelse(x < lower, x, NA_real_))
})

# Pass 2: set values above each column's 95% quantile to NA. The quantile is
# computed on what is left after pass 1, hence na.rm = TRUE.
df[-1] <- apply(df[-1], 2, function(x) {
  upper <- quantile(x, probs = 0.95, na.rm = TRUE)
  dplyr::na_if(x, ifelse(x > upper, x, NA_real_))
})
df
st PH OD COD N
1 A 7.300000 1.2630109 1.1123596 0.29584292
2 B 12.698759 5.6706461 4.8417483 0.09695843
3 C 9.613564 1.7062774 7.9522665 0.10267215
4 D 9.693461 7.0755602 NA 0.30249465
5 A 11.203150 5.4447561 3.1332711 0.42117211
6 B 9.288552 4.1690681 NA 0.12290062
7 C 4.207333 6.7176531 10.4907388 0.08563414
8 D 10.985939 2.3520690 8.4684368 0.14228479
9 A 8.206799 7.8267643 4.4642424 0.21120096
10 B NA NA 1.4883585 NA
11 C 3.971088 8.5006683 6.3152087 0.31983513
12 D 4.821069 3.8710822 8.6692842 0.34931733
13 A NA 0.9789229 10.5375621 0.11192938
14 B 7.546888 NA 1.5840136 0.42668172
15 C 4.689617 8.7176568 7.4747099 NA
16 D 9.730568 1.1347636 4.6798102 0.21574411
17 A 12.063813 6.8625491 0.5594976 0.23198410
#' Replace values outside a quantile band with NA
#'
#' @param x Numeric vector; may already contain NA.
#' @param lq Probability of the lower quantile (default 5/100).
#' @param uq Probability of the upper quantile (default 95/100).
#' @return `x` with every value strictly below the `lq` quantile or strictly
#'   above the `uq` quantile replaced by NA. Length and existing NAs are kept.
rm_outlier <- function(x, lq = 5 / 100, uq = 95 / 100) {
  # na.rm = TRUE: without it quantile() stops with an error as soon as `x`
  # contains a missing value.
  qnts <- quantile(x, probs = c(lq, uq), na.rm = TRUE, names = FALSE)
  # which() drops the NA comparisons produced by missing values in `x`.
  x[which(x < qnts[1] | x > qnts[2])] <- NA
  x
}
# Run rm_outlier() on every column except the first and bind the results
# into a new data frame; `df` itself is not modified by this call.
do.call(
  what = cbind.data.frame,
  args = lapply(X = df[, -1], FUN = rm_outlier)
)
PH OD COD N
1 7.300000 1.2630109 1.1123596 0.29584292
2 12.698759 5.6706461 4.8417483 0.09695843
3 9.613564 1.7062774 7.9522665 0.10267215
4 9.693461 7.0755602 NA 0.30249465
5 11.203150 5.4447561 3.1332711 0.42117211
6 9.288552 4.1690681 NA 0.12290062
7 4.207333 6.7176531 10.4907388 0.08563414
8 10.985939 2.3520690 8.4684368 0.14228479
9 8.206799 7.8267643 4.4642424 0.21120096
10 NA NA 1.4883585 NA
11 3.971088 8.5006683 6.3152087 0.31983513
12 4.821069 3.8710822 8.6692842 0.34931733
13 NA 0.9789229 10.5375621 0.11192938
14 7.546888 NA 1.5840136 0.42668172
15 4.689617 8.7176568 7.4747099 NA
16 9.730568 1.1347636 4.6798102 0.21574411
17 12.063813 6.8625491 0.5594976 0.23198410
要覆盖数字列同时保持非数字列完整,我们可以这样做:
# Overwrite every column except the first with its rm_outlier() result;
# the first column (st) is left as it is.
df[, -1] <- lapply(X = df[, -1], FUN = rm_outlier)
df
st PH OD COD N
1 A 7.300000 1.2630109 1.1123596 0.29584292
2 B 12.698759 5.6706461 4.8417483 0.09695843
3 C 9.613564 1.7062774 7.9522665 0.10267215
4 D 9.693461 7.0755602 NA 0.30249465
5 A 11.203150 5.4447561 3.1332711 0.42117211
6 B 9.288552 4.1690681 NA 0.12290062
7 C 4.207333 6.7176531 10.4907388 0.08563414
8 D 10.985939 2.3520690 8.4684368 0.14228479
9 A 8.206799 7.8267643 4.4642424 0.21120096
10 B NA NA 1.4883585 NA
11 C 3.971088 8.5006683 6.3152087 0.31983513
12 D 4.821069 3.8710822 8.6692842 0.34931733
13 A NA 0.9789229 10.5375621 0.11192938
14 B 7.546888 NA 1.5840136 0.42668172
15 C 4.689617 8.7176568 7.4747099 NA
16 D 9.730568 1.1347636 4.6798102 0.21574411
17 A 12.063813 6.8625491 0.5594976 0.23198410
我有一个 data.frame,其中各列包含水质测量值。
我想剔除每一列中的离群值,并在原位置填入 NA(表示无数据)。
编辑:
我想按如下规则剔除离群值:
大于第 95 百分位数的值
和
小于第 5 百分位数的值
我该怎么做?
我有一个例子
# Example data: a label column (st) followed by four measurement columns.
# The first line of the text holds the column names; fields are separated
# by whitespace (sep = "").
df <- read.table(
  header = TRUE,
  sep = "",
  text = "st PH OD COD N
A 7.3 1.26301094 1.112359589 0.295842925
B 12.69875867 5.670646078 4.841748321 0.096958426
C 9.613564343 1.706277385 7.952266541 0.102672152
D 9.693461149 7.075560183 0.283503075 0.302494648
A 11.2031501 5.444756127 3.133271063 0.421172108
B 9.288552402 4.169068095 10.54049312 0.122900615
C 4.207333379 6.717653051 10.49073885 0.085634135
D 10.98593946 2.352068972 8.468436777 0.142284793
A 8.20679887 7.826764274 4.464242367 0.211200956
B 12.9165421 0.909886436 1.488358471 0.001640961
C 3.971088246 8.500668307 6.315208679 0.319835127
D 4.821068685 3.871082236 8.669284239 0.349317325
A 0.431563127 0.978922921 10.53756208 0.111929377
B 7.546887828 9.946840115 1.584013576 0.426681716
C 4.689617182 8.717656795 7.474709944 0.473463497
D 9.730568456 1.134763618 4.679810195 0.215744107
A 12.06381259 6.862549062 0.559497593 0.231984105
"
)
结合使用 apply、quantile 和 dplyr::na_if,您可以这样做:
# Coerce every column except the first (the `st` label) to numeric.
df[-1] <- apply(df[-1], 2, as.numeric)

# Pass 1: set values below each column's 5% quantile to NA.
df[-1] <- apply(df[-1], 2, function(x) {
  lower <- quantile(x, probs = 0.05, na.rm = TRUE)
  # na_if() requires `y` to have length 1 or length(x). Passing only the
  # outlying values breaks when a column has zero or several of them, so
  # build a full-length `y` that equals `x` only at the outliers.
  dplyr::na_if(x, ifelse(x < lower, x, NA_real_))
})

# Pass 2: set values above each column's 95% quantile to NA. The quantile is
# computed on what is left after pass 1, hence na.rm = TRUE.
df[-1] <- apply(df[-1], 2, function(x) {
  upper <- quantile(x, probs = 0.95, na.rm = TRUE)
  dplyr::na_if(x, ifelse(x > upper, x, NA_real_))
})
df
st PH OD COD N
1 A 7.300000 1.2630109 1.1123596 0.29584292
2 B 12.698759 5.6706461 4.8417483 0.09695843
3 C 9.613564 1.7062774 7.9522665 0.10267215
4 D 9.693461 7.0755602 NA 0.30249465
5 A 11.203150 5.4447561 3.1332711 0.42117211
6 B 9.288552 4.1690681 NA 0.12290062
7 C 4.207333 6.7176531 10.4907388 0.08563414
8 D 10.985939 2.3520690 8.4684368 0.14228479
9 A 8.206799 7.8267643 4.4642424 0.21120096
10 B NA NA 1.4883585 NA
11 C 3.971088 8.5006683 6.3152087 0.31983513
12 D 4.821069 3.8710822 8.6692842 0.34931733
13 A NA 0.9789229 10.5375621 0.11192938
14 B 7.546888 NA 1.5840136 0.42668172
15 C 4.689617 8.7176568 7.4747099 NA
16 D 9.730568 1.1347636 4.6798102 0.21574411
17 A 12.063813 6.8625491 0.5594976 0.23198410
#' Replace values outside a quantile band with NA
#'
#' @param x Numeric vector; may already contain NA.
#' @param lq Probability of the lower quantile (default 5/100).
#' @param uq Probability of the upper quantile (default 95/100).
#' @return `x` with every value strictly below the `lq` quantile or strictly
#'   above the `uq` quantile replaced by NA. Length and existing NAs are kept.
rm_outlier <- function(x, lq = 5 / 100, uq = 95 / 100) {
  # na.rm = TRUE: without it quantile() stops with an error as soon as `x`
  # contains a missing value.
  qnts <- quantile(x, probs = c(lq, uq), na.rm = TRUE, names = FALSE)
  # which() drops the NA comparisons produced by missing values in `x`.
  x[which(x < qnts[1] | x > qnts[2])] <- NA
  x
}
# Run rm_outlier() on every column except the first and bind the results
# into a new data frame; `df` itself is not modified by this call.
do.call(
  what = cbind.data.frame,
  args = lapply(X = df[, -1], FUN = rm_outlier)
)
PH OD COD N
1 7.300000 1.2630109 1.1123596 0.29584292
2 12.698759 5.6706461 4.8417483 0.09695843
3 9.613564 1.7062774 7.9522665 0.10267215
4 9.693461 7.0755602 NA 0.30249465
5 11.203150 5.4447561 3.1332711 0.42117211
6 9.288552 4.1690681 NA 0.12290062
7 4.207333 6.7176531 10.4907388 0.08563414
8 10.985939 2.3520690 8.4684368 0.14228479
9 8.206799 7.8267643 4.4642424 0.21120096
10 NA NA 1.4883585 NA
11 3.971088 8.5006683 6.3152087 0.31983513
12 4.821069 3.8710822 8.6692842 0.34931733
13 NA 0.9789229 10.5375621 0.11192938
14 7.546888 NA 1.5840136 0.42668172
15 4.689617 8.7176568 7.4747099 NA
16 9.730568 1.1347636 4.6798102 0.21574411
17 12.063813 6.8625491 0.5594976 0.23198410
要覆盖数字列同时保持非数字列完整,我们可以这样做:
# Overwrite every column except the first with its rm_outlier() result;
# the first column (st) is left as it is.
df[, -1] <- lapply(X = df[, -1], FUN = rm_outlier)
df
st PH OD COD N
1 A 7.300000 1.2630109 1.1123596 0.29584292
2 B 12.698759 5.6706461 4.8417483 0.09695843
3 C 9.613564 1.7062774 7.9522665 0.10267215
4 D 9.693461 7.0755602 NA 0.30249465
5 A 11.203150 5.4447561 3.1332711 0.42117211
6 B 9.288552 4.1690681 NA 0.12290062
7 C 4.207333 6.7176531 10.4907388 0.08563414
8 D 10.985939 2.3520690 8.4684368 0.14228479
9 A 8.206799 7.8267643 4.4642424 0.21120096
10 B NA NA 1.4883585 NA
11 C 3.971088 8.5006683 6.3152087 0.31983513
12 D 4.821069 3.8710822 8.6692842 0.34931733
13 A NA 0.9789229 10.5375621 0.11192938
14 B 7.546888 NA 1.5840136 0.42668172
15 C 4.689617 8.7176568 7.4747099 NA
16 D 9.730568 1.1347636 4.6798102 0.21574411
17 A 12.063813 6.8625491 0.5594976 0.23198410