R:用百分位数替换范围
R: Replacing Ranges with their Percentiles
我正在使用 R 编程语言。假设,我有以下数据框:
var_1 = rnorm(100,10,10)
var_2 = rnorm(100,10,10)
var_3 = rnorm(100,10,10)
d = data.frame(var_1, var_2, var_3)
head(d)
var_1 var_2 var_3
1 14.251923 14.877801 22.636207
2 7.325137 8.513718 21.021522
3 3.400001 -3.400397 11.274797
4 16.400597 8.623980 9.366115
5 7.065583 13.155570 17.891432
6 21.297912 4.341385 -11.337330
我的问题:对于每个变量中的每个元素,我想用它所属的百分位数替换该元素。
例如:
a = quantile(d$var_1, c( 0.15, 0.3, 0.35, 0.45, 0.5, 0.65, 0.7, 0.8, 0.85, 0.9, 0.95, 1))
b = quantile(d$var_2, c(0.16, 0.23, 0.65, 0.71, 0.95))
c = quantile(d$var_3, c(0.15, 0.28, 0.7, 0.73, 0.87))
> a
5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70% 75%
-0.8806901 0.3595086 1.1201300 3.0581928 5.0901641 7.0056228 7.6089831 8.9853805 9.9264540 10.2235212 11.5707533 13.2422940 15.1076889 16.5354881 17.9336020
80% 85% 90% 95% 100%
19.5312682 21.9264905 24.4511364 26.6820271 41.4419744
> b
16% 23% 65% 71% 95%
-2.795294 1.430715 11.070815 12.688064 25.270823
> c
15% 28% 70% 73% 87%
0.958404 5.767591 15.258532 16.013648 20.467892
例如:
- 如果
d$var_2 < -2.795294
,则d$var_2 = 16th percentile
- 如果
d$var_3 between (5.767591 , 15.258532)
,则d$var_3 = 70th percentile
我可以手动编写多个“if 语句”来执行此操作,但是有没有更快的方法来执行此操作?
谢谢!
您可以通过应用自定义函数来执行类似的操作:
library(tidyverse)
ApplyQuantiles <- function(x, y) {
cut(
x,
breaks = c(quantile(x, probs = y)),
labels = c(names(quantile(x, probs = y))[-1]),
include.lowest = TRUE
)
}
output <- d %>%
mutate(var_1 = ApplyQuantiles(var_1, c(0, 0.15, 0.3, 0.35, 0.45, 0.5, 0.65, 0.7, 0.8, 0.85, 0.9, 0.95, 1)),
var_2 = ApplyQuantiles(var_2, c(0, 0.16, 0.23, 0.65, 0.71, 0.95, 1.0)),
var_3 = ApplyQuantiles(var_3, c(0, 0.15, 0.28, 0.7, 0.73, 0.87, 1.0))) %>%
mutate(across(everything(), str_replace, pattern = "%", replacement = "th percentile"))
输出
head(output, 10)
var_1 var_2 var_3
1 45th percentile 95th percentile 87th percentile
2 35th percentile 100th percentile 70th percentile
3 70th percentile 95th percentile 70th percentile
4 80th percentile 65th percentile 70th percentile
5 30th percentile 16th percentile 28th percentile
6 15th percentile 95th percentile 28th percentile
7 30th percentile 16th percentile 15th percentile
8 45th percentile 16th percentile 70th percentile
9 65th percentile 95th percentile 70th percentile
10 45th percentile 65th percentile 70th percentile
将 d
ata 和 list
的 q
antile 概率放在 mapply
和 cut
ting 的 quantile
s 那里。
q <- list(c(0.15, 0.3, 0.35, 0.45, 0.5, 0.65, 0.7, 0.8, 0.85, 0.9, 0.95, 1),
c(0.16, 0.23, 0.65, 0.71, 0.95),
c(0.15, 0.28, 0.7, 0.73, 0.87))
r <-
mapply(\(x, y) {y <- union(0:1, y); cut(x, quantile(x, y), labels=y[-1])}, d, q)
给出:
head(as.data.frame(r))
# var_1 var_2 var_3
# 1 0.85 0.71 1
# 2 0.15 0.71 0.28
# 3 0.5 0.16 0.73
# 4 0.65 0.95 0.87
# 5 0.5 0.23 1
# 6 0.35 0.23 1
注: R >= 4.1
数据:
set.seed(42)
d <- data.frame(var_1=rnorm(100, 10, 10), var_2=rnorm(100, 10, 10),
var_3=rnorm(100, 10, 10))
用我的三德包:
library(santoku)
d$q_1 <- chop_quantiles(d$var_1, 0:100/100, labels = lbl_endpoint("%s"))
d$q_2 <- chop_quantiles(d$var_2, 0:100/100, labels = lbl_endpoint("%s"))
d$q_3 <- chop_quantiles(d$var_3, 0:100/100, labels = lbl_endpoint("%s"))
head(d)
var_1 var_2 var_3 q_1 q_2 q_3
1 10.9747361 7.463509 24.13691 0.55 0.44 0.9
2 24.8326562 -17.530453 10.83047 0.94 0 0.54
3 12.5138699 7.945799 19.37541 0.6 0.45 0.83
4 14.1343011 2.135220 7.09554 0.65 0.24 0.35
5 4.2622584 -13.138526 27.96278 0.26 0.02 0.95
6 0.8394213 9.369224 18.18695 0.17 0.58 0.8
这创造了因素。使用 lbl_endpoint()
给出左侧百分位数。如果您将其遗漏,您将获得类似 [2%-3%)
.
的标签
我正在使用 R 编程语言。假设,我有以下数据框:
var_1 = rnorm(100,10,10)
var_2 = rnorm(100,10,10)
var_3 = rnorm(100,10,10)
d = data.frame(var_1, var_2, var_3)
head(d)
var_1 var_2 var_3
1 14.251923 14.877801 22.636207
2 7.325137 8.513718 21.021522
3 3.400001 -3.400397 11.274797
4 16.400597 8.623980 9.366115
5 7.065583 13.155570 17.891432
6 21.297912 4.341385 -11.337330
我的问题:对于每个变量中的每个元素,我想用它所属的百分位数替换该元素。
例如:
a = quantile(d$var_1, c( 0.15, 0.3, 0.35, 0.45, 0.5, 0.65, 0.7, 0.8, 0.85, 0.9, 0.95, 1))
b = quantile(d$var_2, c(0.16, 0.23, 0.65, 0.71, 0.95))
c = quantile(d$var_3, c(0.15, 0.28, 0.7, 0.73, 0.87))
> a
5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70% 75%
-0.8806901 0.3595086 1.1201300 3.0581928 5.0901641 7.0056228 7.6089831 8.9853805 9.9264540 10.2235212 11.5707533 13.2422940 15.1076889 16.5354881 17.9336020
80% 85% 90% 95% 100%
19.5312682 21.9264905 24.4511364 26.6820271 41.4419744
> b
16% 23% 65% 71% 95%
-2.795294 1.430715 11.070815 12.688064 25.270823
> c
15% 28% 70% 73% 87%
0.958404 5.767591 15.258532 16.013648 20.467892
例如:
- 如果
d$var_2 < -2.795294
,则d$var_2 = 16th percentile
- 如果
d$var_3 between (5.767591 , 15.258532)
,则d$var_3 = 70th percentile
我可以手动编写多个“if 语句”来执行此操作,但是有没有更快的方法来执行此操作?
谢谢!
您可以通过应用自定义函数来执行类似的操作:
library(tidyverse)
ApplyQuantiles <- function(x, y) {
cut(
x,
breaks = c(quantile(x, probs = y)),
labels = c(names(quantile(x, probs = y))[-1]),
include.lowest = TRUE
)
}
output <- d %>%
mutate(var_1 = ApplyQuantiles(var_1, c(0, 0.15, 0.3, 0.35, 0.45, 0.5, 0.65, 0.7, 0.8, 0.85, 0.9, 0.95, 1)),
var_2 = ApplyQuantiles(var_2, c(0, 0.16, 0.23, 0.65, 0.71, 0.95, 1.0)),
var_3 = ApplyQuantiles(var_3, c(0, 0.15, 0.28, 0.7, 0.73, 0.87, 1.0))) %>%
mutate(across(everything(), str_replace, pattern = "%", replacement = "th percentile"))
输出
head(output, 10)
var_1 var_2 var_3
1 45th percentile 95th percentile 87th percentile
2 35th percentile 100th percentile 70th percentile
3 70th percentile 95th percentile 70th percentile
4 80th percentile 65th percentile 70th percentile
5 30th percentile 16th percentile 28th percentile
6 15th percentile 95th percentile 28th percentile
7 30th percentile 16th percentile 15th percentile
8 45th percentile 16th percentile 70th percentile
9 65th percentile 95th percentile 70th percentile
10 45th percentile 65th percentile 70th percentile
将 d
ata 和 list
的 q
antile 概率放在 mapply
和 cut
ting 的 quantile
s 那里。
q <- list(c(0.15, 0.3, 0.35, 0.45, 0.5, 0.65, 0.7, 0.8, 0.85, 0.9, 0.95, 1),
c(0.16, 0.23, 0.65, 0.71, 0.95),
c(0.15, 0.28, 0.7, 0.73, 0.87))
r <-
mapply(\(x, y) {y <- union(0:1, y); cut(x, quantile(x, y), labels=y[-1])}, d, q)
给出:
head(as.data.frame(r))
# var_1 var_2 var_3
# 1 0.85 0.71 1
# 2 0.15 0.71 0.28
# 3 0.5 0.16 0.73
# 4 0.65 0.95 0.87
# 5 0.5 0.23 1
# 6 0.35 0.23 1
注: R >= 4.1
数据:
set.seed(42)
d <- data.frame(var_1=rnorm(100, 10, 10), var_2=rnorm(100, 10, 10),
var_3=rnorm(100, 10, 10))
用我的三德包:
library(santoku)
d$q_1 <- chop_quantiles(d$var_1, 0:100/100, labels = lbl_endpoint("%s"))
d$q_2 <- chop_quantiles(d$var_2, 0:100/100, labels = lbl_endpoint("%s"))
d$q_3 <- chop_quantiles(d$var_3, 0:100/100, labels = lbl_endpoint("%s"))
head(d)
var_1 var_2 var_3 q_1 q_2 q_3
1 10.9747361 7.463509 24.13691 0.55 0.44 0.9
2 24.8326562 -17.530453 10.83047 0.94 0 0.54
3 12.5138699 7.945799 19.37541 0.6 0.45 0.83
4 14.1343011 2.135220 7.09554 0.65 0.24 0.35
5 4.2622584 -13.138526 27.96278 0.26 0.02 0.95
6 0.8394213 9.369224 18.18695 0.17 0.58 0.8
这创造了因素。使用 lbl_endpoint()
给出左侧百分位数。如果您将其遗漏,您将获得类似 [2%-3%)
.