根据人口计算一组分数的百分位等级

Question

我有一组关于一个人 31 项能力的分数

data <- data.frame(scores=sample(100,31,replace=T),row.names=paste0("X",1:31))

我有一个基准文件，其中包含 100 名（或任意数量）个人在相同 31 项能力上的分数

bmark <- data.frame(replicate(31,sample(0:100,100,rep=TRUE)))

我需要计算个人的每个能力分数相对于该能力基准的百分位排名，输出应如下所示（随机值，只是为了演示所需的格式）

data <- data.frame(scores=sample(100,31,replace=T),percentile=sample(100,31,replace=T),row.names=paste0("X",1:31))

我可以使用基本百分位公式计算每个分数的百分位排名：

length(bmark$X31[bmark$X31<data$scores[rownames(data)=="X31"]])/length(bmark$X31)*100

但我不知道如何让这项工作同时适用于整个数据集，以便 data$percentile 中的每一行都具有基准文件中相应能力的适当值。我可以用循环来做到这一点，但 R 向量化仍然不是很好。

编辑：这是下面后续 comments/questions 我的数据帧的输入。请注意，这是 bmark 数据的一个片段（只有几列/'competencies'）。数据采用长格式（因此具有所有能力）并且具有主题的列标识符 (data$Split)。 bmark

    bmark <- structure(list(FinPerf = c(2.41333333333333, 2.94047619047619, 
2.87538940809969, 2.16666666666667, 2.34146341463415, 2.425, 
2.58148148148148, 2.97297297297297, 2.325, 2.52542372881356, 
2.35593220338983, 1.69105691056911, 2.73493975903614, 2.01666666666667, 
1.94010416666667, 1.94603174603175, 2.56666666666667, 2.48550724637681, 
2.61846110878369, 2.15873015873016, 2.56565656565657, 2.23529411764706, 
2.98974358974359, 3.09195402298851, 2.59289617486339, 2.44306418219462, 
2.72345679012346, 2.85714285714286, 2.62962962962963, 2.90833333333333
), Opt = c(2.74, 2.83928571428571, 2.87383177570093, 2.56692506459948, 
2.64634146341463, 2.6625, 2.53333333333333, 3.31081081081081, 
2.4, 2.50847457627119, 2.55932203389831, 1.89024390243902, 2.60240963855422, 
2.3, 2.3203125, 2.24761904761905, 2.85, 2.76086956521739, 2.78064516129032, 
2.30952380952381, 2.65151515151515, 2.29411764705882, 3.06923076923077, 
3.12931034482759, 2.64754098360656, 2.08695652173913, 2.41111111111111, 
2.78571428571429, 2.88888888888889, 2.9625), SatExp = c(2.44, 
2.58928571428571, 2.70093457943925, 2.11111111111111, 2.29268292682927, 
2.525, 2.33888888888889, 3.10810810810811, 2.375, 2.73728813559322, 
2.6864406779661, 1.91463414634146, 2.65060240963855, 2.225, 2.01171875, 
1.94285714285714, 2.6, 2.51086956521739, 2.83225806451613, 2.28571428571429, 
2.84848484848485, 2.02941176470588, 3.13076923076923, 3.22413793103448, 
2.45491803278689, 2.5, 2.78518518518518, 2.85714285714286, 2.75925925925926, 
2.975)), .Names = c("FinPerf", "Opt", "SatExp"), class = "data.frame", row.names = c(NA, 
-30L))

和数据

    data <- structure(list(Area.short = structure(c(13L, 25L, 28L, 6L, 3L, 
16L, 12L, 9L, 7L, 20L, 21L, 14L, 5L, 4L, 26L, 8L, 11L, 18L, 2L, 
27L, 31L, 30L, 24L, 19L, 1L, 23L, 22L, 10L, 15L, 29L, 17L, 13L, 
25L, 28L, 6L, 3L, 16L, 12L, 9L, 7L, 20L, 21L, 14L, 5L, 4L, 26L, 
8L, 11L, 18L, 2L, 27L, 31L, 30L, 24L, 19L, 1L, 23L, 22L, 10L, 
15L, 29L, 17L, 13L, 25L, 28L, 6L, 3L, 16L, 12L, 9L, 7L, 20L, 
21L, 14L, 5L, 4L, 26L, 8L, 11L, 18L, 2L, 27L, 31L, 30L, 24L, 
19L, 1L, 23L, 22L, 10L, 15L, 29L, 17L, 13L, 25L, 28L, 6L, 3L, 
16L, 12L, 9L, 7L, 20L, 21L, 14L, 5L, 4L, 26L, 8L, 11L, 18L, 2L, 
27L, 31L, 30L, 24L, 19L, 1L, 23L, 22L, 10L, 15L, 29L, 17L, 13L, 
25L, 28L, 6L, 3L, 16L, 12L, 9L, 7L, 20L, 21L, 14L, 5L, 4L, 26L, 
8L, 11L, 18L, 2L, 27L, 31L, 30L, 24L, 19L, 1L, 23L, 22L, 10L, 
15L, 29L, 17L, 13L, 25L, 28L, 6L, 3L, 16L, 12L, 9L, 7L, 20L, 
21L, 14L, 5L, 4L, 26L, 8L, 11L, 18L, 2L, 27L, 31L, 30L, 24L, 
19L, 1L, 23L, 22L, 10L, 15L, 29L, 17L, 13L, 25L, 28L, 6L, 3L, 
16L, 12L, 9L, 7L, 20L, 21L, 14L, 5L, 4L, 26L, 8L, 11L, 18L, 2L, 
27L, 31L, 30L, 24L, 19L, 1L, 23L, 22L, 10L, 15L, 29L, 17L, 13L, 
25L, 28L, 6L, 3L, 16L, 12L, 9L, 7L, 20L, 21L, 14L, 5L, 4L, 26L, 
8L, 11L, 18L, 2L, 27L, 31L, 30L, 24L, 19L, 1L, 23L, 22L, 10L, 
15L, 29L, 17L), .Label = c("ACHIEVEMENT", "Apprec", "Bal", "Belong", 
"BrandPass", "Burn", "Care", "Comms", "Comp", "CONNECTION", "Consul", 
"Fam", "FinPerf", "Forw", "FRANCHISE PARTNERSHIP", "Ful", "GENSAT", 
"Harm", "Innov", "Integ", "LeadCom", "LEADERSHIP", "LIFESTYLE", 
"MarkSup", "Opt", "Part", "PracSup", "SatExp", "SUPPORT", "Syst", 
"Train"), class = "factor"), MM = c(22.8758169934641, 29.4117647058824, 
7.84313725490196, -29.4117647058824, 11.7647058823529, -3.26797385620914, 
7.84313725490196, 11.1111111111111, 15.6862745098039, 9.80392156862744, 
9.80392156862744, 5.88235294117648, 33.3333333333333, 17.6470588235294, 
13.7254901960784, 19.6078431372549, 11.7647058823529, 1.96078431372548, 
13.7254901960784, 21.5686274509804, 15.6862745098039, 11.1111111111111, 
21.5686274509804, 1.96078431372548, 20.4481792717087, -3.26797385620914, 
10.1307189542484, 20.0980392156863, 10.9803921568627, 14.7058823529412, 
11.7647058823529, 22.2222222222222, 16.6666666666667, 8.33333333333333, 
-25, 0, -11.1111111111111, 0, -5.55555555555557, 16.6666666666667, 
58.3333333333333, 16.6666666666667, 0, 25, 11.1111111111111, 
0, 33.3333333333333, 8.33333333333333, -11.1111111111111, 11.1111111111111, 
50, 0, 11.1111111111111, 16.6666666666667, 25, 16.6666666666667, 
-9.25925925925926, 13.8888888888889, 10.4166666666667, 8.33333333333333, 
19.4444444444444, 10.3448275862069, 100, 66.6666666666667, 66.6666666666667, 
-33.3333333333333, 100, 55.5555555555556, 100, 55.5555555555556, 
100, 33.3333333333333, 66.6666666666667, -11.1111111111111, 100, 
77.7777777777778, 33.3333333333333, 66.6666666666667, 66.6666666666667, 
11.1111111111111, 77.7777777777778, 66.6666666666667, 33.3333333333333, 
33.3333333333333, 100, 33.3333333333333, 80.952380952381, 55.5555555555556, 
44.4444444444444, 66.6666666666667, 53.3333333333333, 55.5555555555556, 
57.4712643678161, 7.40740740740741, 33.3333333333333, -5.55555555555557, 
-5.55555555555557, -5.55555555555557, -14.8148148148148, -22.2222222222222, 
-3.70370370370369, -22.2222222222222, -27.7777777777778, -16.6666666666667, 
7.40740740740741, 22.2222222222222, 7.40740740740741, 3.70370370370369, 
5.55555555555557, 5.55555555555557, 14.8148148148148, -3.70370370370369, 
5.55555555555557, 11.1111111111111, 0, 18.5185185185185, -11.1111111111111, 
11.1111111111111, -12.3456790123457, -10.1851851851852, 9.72222222222223, 
5.55555555555557, 5.55555555555557, 0.766283524904221, 25.9259259259259, 
27.7777777777778, 11.1111111111111, -55.5555555555556, 22.2222222222222, 
3.70370370370369, 27.7777777777778, 29.6296296296296, 38.8888888888889, 
11.1111111111111, 22.2222222222222, 11.1111111111111, 38.8888888888889, 
22.2222222222222, 29.6296296296296, 16.6666666666667, 11.1111111111111, 
-3.70370370370369, 22.2222222222222, 11.1111111111111, 27.7777777777778, 
18.5185185185185, 14.8148148148148, -5.55555555555557, 22.2222222222222, 
0, 22.2222222222222, 29.1666666666667, 11.1111111111111, 13.8888888888889, 
16.0919540229885, -6.17283950617284, 11.1111111111111, -18.5185185185185, 
14.8148148148148, -18.5185185185185, -28.3950617283951, -44.4444444444444, 
-16.0493827160494, -29.6296296296296, -25.9259259259259, -33.3333333333333, 
3.70370370370369, -3.70370370370369, -18.5185185185185, -18.5185185185185, 
-7.40740740740741, -14.8148148148148, -6.17283950617284, -28.3950617283951, 
3.70370370370369, -3.70370370370369, -16.0493827160494, -11.1111111111111, 
-7.40740740740741, -4.76190476190478, -20.1646090534979, -17.9012345679012, 
-14.8148148148148, -14.8148148148148, -8.02469135802468, -13.6653895274585, 
42.2222222222222, 53.3333333333333, 33.3333333333333, -86.6666666666667, 
26.6666666666667, 2.22222222222221, 46.6666666666667, 20, 60, 
46.6666666666667, 53.3333333333333, 6.66666666666667, 66.6666666666667, 
51.1111111111111, 42.2222222222222, 46.6666666666667, 33.3333333333333, 
11.1111111111111, 55.5555555555556, 33.3333333333333, 33.3333333333333, 
33.3333333333333, 55.5555555555556, 13.3333333333333, 42.8571428571429, 
-2.22222222222221, 33.3333333333333, 51.6666666666667, 36, 35.5555555555555, 
32.4137931034483, 77.7777777777778, 44.4444444444444, 44.4444444444444, 
-66.6666666666667, 77.7777777777778, 62.962962962963, 100, 77.7777777777778, 
77.7777777777778, 55.5555555555556, 66.6666666666667, 11.1111111111111, 
88.8888888888889, 70.3703703703704, 62.962962962963, 55.5555555555556, 
55.5555555555556, 11.1111111111111, 70.3703703703704, 55.5555555555556, 
44.4444444444444, 55.5555555555556, 62.962962962963, 11.1111111111111, 
58.7301587301587, 45.679012345679, 55.5555555555556, 72.2222222222222, 
46.6666666666667, 48.1481481481482, 53.639846743295), Split = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L), .Label = c("Success4u", "NSW/ACT", 
"QLD", "SA/WA", "VIC/TAS", "Type 1", "Type 2", "Type 3", "Franchise Sector Benchmark"
), class = "factor")), .Names = c("Area.short", "MM", "Split"
), row.names = c(NA, 248L), class = "data.frame")

Answer 1

这是获取一个主题的每个基准的百分位数的方法。我们使用经验累积分布函数来计算百分位数：

# Fake data
set.seed(595)
dat <- data.frame(scores=sample(100,31,replace=T),row.names=paste0("X",1:31))
bmark <- data.frame(replicate(31,sample(100,100,rep=TRUE)))

# Get percentile on each benchmark for one subject
dat$percentile = mapply(function(ref, subj) {
  ecdf(ref)(subj)*100
}, ref=bmark, subj=dat$scores)

dat

    scores percentile
X1      28         25
X2      25         30
X3      91         92
...
X29     42         46
X30     76         71
X31      1          2

这是每个能力 bmark 分布的箱线图，以及显示受试者在每个能力上得分的红点：

boxplot(bmark)
points(1:31, data$scores, pch=16, col="red")

如果你有多个科目，你可以一次得到他们的百分位数。我们从上面获取 mapply 代码，它计算单个主题的百分位数，并将其包装在 sapply 中，它连续为每个主题提供 mapply 代码，并且 returns 所有单个矩阵中的结果数：

# Scores on 31 benchmarks for 20 subjects. Each column is a subject.
set.seed(58)
subjects = as.data.frame(replicate(20, sample(100, 31, replace=TRUE)))

# Get percentile on each benchmark for each subject
percentile.score = sapply(subjects, function(s) {
  mapply(function(ref, subj) {
    ecdf(ref)(subj)*100
  }, ref=bmark, subj=s)
})

percentile.score

    V1 V2 V3  V4 V5  V6 V7  V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
X1  28 95  2  24 95  18 61   6 44  11  82  50  28  37  59  41  91  51  35  76
X2  20 49 62   3 24  24 54  23 50  50  60  95  25  63  57  78  72  89  79  66
X3  69 37 40  49 59  30 91   5 92  43  56  22  34  33  28  54  35   1  59  29
...
X29 34 42 10  76 40  48 13  36 76  13  88  91  99  31  13  76  93  42  31  91
X30 16 66 86  56 21  67 86  45 81  16  70  66  24  11  23  18  32  53  31  32
X31 81 81 52   2  3  32 64  36 33  39  92 100  80   2  44  63  59   2  34  99

根据人口计算一组分数的百分位等级

Calculate the percentile rank of a set of scores based on a population

r

vectorization

ranking

percentile