一个观察值与其他观察值的因子水平比较
Comparison between one observation and the others by level of a factor
抱歉标题含糊不清,我实在不知道这种操作应该叫什么。我猜它可能算是一种 leave-one-out(留一法)过程,下面具体说明。我有这样一份数据:
# Example data frame written out as a literal (apparently dput() output).
# The `.Names` attribute renames the third list element (`Dhp`) to `score`,
# so the resulting columns are Ident, iduni, score and ratio.
# `Ident` and `score` are factors: `score` must be converted with
# as.numeric(as.character(...)) before doing arithmetic on it.
# The value is not assigned here; bind it to a name to use it.
structure(list(Ident = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A_1_2", "A_1_4"), class = "factor"),
iduni = c("A_1_2_231", "A_1_2_233", "A_1_2_234", "A_1_2_235",
"A_1_2_236", "A_1_2_237", "A_1_4_200", "A_1_4_201", "A_1_4_202",
"A_1_4_203", "A_1_4_204", "A_1_4_205", "A_1_4_206"), Dhp = structure(c(1L,
3L, 13L, 7L, 11L, 12L, 8L, 9L, 10L, 2L, 6L, 4L, 5L), .Label = c("92",
"100", "102", "118", "126", "139", "155", "176", "196", "220",
"234", "241", "263"), class = "factor"), ratio = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("Ident",
"iduni", "score", "ratio"), row.names = c(1L, 3L, 4L, 5L, 6L, 7L,
70L, 71L, 72L, 73L, 74L, 75L, 76L), class = "data.frame")
基本上就是这个小df:
Ident iduni score ratio
1 A_1_2 A_1_2_231 92 NA
3 A_1_2 A_1_2_233 102 NA
4 A_1_2 A_1_2_234 263 NA
5 A_1_2 A_1_2_235 155 NA
6 A_1_2 A_1_2_236 234 NA
7 A_1_2 A_1_2_237 241 NA
70 A_1_4 A_1_4_200 176 NA
71 A_1_4 A_1_4_201 196 NA
72 A_1_4 A_1_4_202 220 NA
73 A_1_4 A_1_4_203 100 NA
74 A_1_4 A_1_4_204 139 NA
75 A_1_4 A_1_4_205 118 NA
76 A_1_4 A_1_4_206 126 NA
我希望能够在 "Ident" 的某个水平内选取一个观察值,并将其分数与该水平内所有其他观察值的分数进行比较。例如,我想得到同一 Ident 水平内某一个分数与其他每个分数之比的总和:
ratio for i = Σ[ SCORE(line i)/SCORE(lines 1:i-1,i+1:n)]
或者更简单的例子,例如:
comparison for i = Σ[SCORE(lines 1:i-1,i+1:n)]
理想情况下,我还希望能够对参与比较的其他观察值设置筛选条件(例如,只与高于第 i 个分数的那些分数进行比较)。
先感谢您的帮助,也为问题表述得不够清楚表示歉意。
如果您将 data.frame 命名为 df
那么这就是您想要的:
# Leave-one-out summaries of `score` within each level of `Ident`.
#
# In the example data `score` is a factor, and sum() is not defined for
# factors, so work on a numeric copy (converted through character so that
# the labels, not the internal integer codes, are used). The original
# `df$score` column is left untouched.
score_num <- if (is.numeric(df$score)) {
  df$score
} else {
  as.numeric(as.character(df$score))
}

# ratios: this score / (sum of all the other scores in the same group).
# Note: this is a single ratio against the summed remainder, not a sum of
# pairwise ratios; the latter would be x * (sum(1 / x) - 1 / x).
df$ratios <- ave(
  score_num, # the variable to summarise
  df$Ident,  # the grouping variable(s)
  FUN = function(x) x / (sum(x) - x) # applied to the scores of each group
)

# comparison_values: sum of all the other scores in the same group.
df$comparison_values <- ave(
  score_num, # the variable to summarise
  df$Ident,  # the grouping variable(s)
  FUN = function(x) sum(x) - x # applied to the scores of each group
)
您可以使用 data.table.
library(data.table)
# Turn the factor labels into numbers (via character, so that the labels
# rather than the internal integer codes are used).
df$score <- as.numeric(as.character(df$score))
# Convert to a data.table, then add the leave-one-out ratio
# score / (sum of the other scores), computed within each Ident group.
df <- as.data.table(df)
df[, ratio1 := score / (sum(score) - score), by = Ident]
df
Ident iduni score ratio ratio1
1: A_1_2 A_1_2_231 92 NA 0.09246231
2: A_1_2 A_1_2_233 102 NA 0.10355330
3: A_1_2 A_1_2_234 263 NA 0.31917476
4: A_1_2 A_1_2_235 155 NA 0.16630901
5: A_1_2 A_1_2_236 234 NA 0.27432591
6: A_1_2 A_1_2_237 241 NA 0.28486998
7: A_1_4 A_1_4_200 176 NA 0.19577308
8: A_1_4 A_1_4_201 196 NA 0.22298066
9: A_1_4 A_1_4_202 220 NA 0.25730994
10: A_1_4 A_1_4_203 100 NA 0.10256410
11: A_1_4 A_1_4_204 139 NA 0.14850427
12: A_1_4 A_1_4_205 118 NA 0.12330199
13: A_1_4 A_1_4_206 126 NA 0.13277134
下面在 Ident 列的每个取值(分组)内,计算每个分数与第 3 项之后各位置分数的平均值之比。也可以采用其他筛选标准,但您在这一点上说得不够明确。
# First convert the factor to numeric (through character, so that the
# labels rather than the internal integer codes are used).
dat$score <- as.numeric(as.character(dat$score))
i <- 3 # pick a location
# Ratio of each score to the mean of the scores located after position `i`
# within the same Ident group. tail(x, -i) drops the first `i` elements and
# returns an empty vector (so the result is NaN) when a group has `i` or
# fewer rows; (1 + i):length(x) would instead count backwards and index
# past the end of the group.
dat$rat_scr <- ave(dat$score, dat$Ident, FUN = function(x) {
  x / mean(tail(x, -i))
})
dat
Ident iduni score ratio rat_scr
1 A_1_2 A_1_2_231 92 NA 0.4380952
3 A_1_2 A_1_2_233 102 NA 0.4857143
4 A_1_2 A_1_2_234 263 NA 1.2523810
5 A_1_2 A_1_2_235 155 NA 0.7380952
6 A_1_2 A_1_2_236 234 NA 1.1142857
7 A_1_2 A_1_2_237 241 NA 1.1476190
70 A_1_4 A_1_4_200 176 NA 1.4575569
71 A_1_4 A_1_4_201 196 NA 1.6231884
72 A_1_4 A_1_4_202 220 NA 1.8219462
73 A_1_4 A_1_4_203 100 NA 0.8281573
74 A_1_4 A_1_4_204 139 NA 1.1511387
75 A_1_4 A_1_4_205 118 NA 0.9772257
76 A_1_4 A_1_4_206 126 NA 1.0434783
抱歉标题含糊不清,我实在不知道这种操作应该叫什么。我猜它可能算是一种 leave-one-out(留一法)过程,下面具体说明。我有这样一份数据:
# Example data frame written out as a literal (apparently dput() output).
# The `.Names` attribute renames the third list element (`Dhp`) to `score`,
# so the resulting columns are Ident, iduni, score and ratio.
# `Ident` and `score` are factors: `score` must be converted with
# as.numeric(as.character(...)) before doing arithmetic on it.
# The value is not assigned here; bind it to a name to use it.
structure(list(Ident = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A_1_2", "A_1_4"), class = "factor"),
iduni = c("A_1_2_231", "A_1_2_233", "A_1_2_234", "A_1_2_235",
"A_1_2_236", "A_1_2_237", "A_1_4_200", "A_1_4_201", "A_1_4_202",
"A_1_4_203", "A_1_4_204", "A_1_4_205", "A_1_4_206"), Dhp = structure(c(1L,
3L, 13L, 7L, 11L, 12L, 8L, 9L, 10L, 2L, 6L, 4L, 5L), .Label = c("92",
"100", "102", "118", "126", "139", "155", "176", "196", "220",
"234", "241", "263"), class = "factor"), ratio = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("Ident",
"iduni", "score", "ratio"), row.names = c(1L, 3L, 4L, 5L, 6L, 7L,
70L, 71L, 72L, 73L, 74L, 75L, 76L), class = "data.frame")
基本上就是这个小df:
Ident iduni score ratio
1 A_1_2 A_1_2_231 92 NA
3 A_1_2 A_1_2_233 102 NA
4 A_1_2 A_1_2_234 263 NA
5 A_1_2 A_1_2_235 155 NA
6 A_1_2 A_1_2_236 234 NA
7 A_1_2 A_1_2_237 241 NA
70 A_1_4 A_1_4_200 176 NA
71 A_1_4 A_1_4_201 196 NA
72 A_1_4 A_1_4_202 220 NA
73 A_1_4 A_1_4_203 100 NA
74 A_1_4 A_1_4_204 139 NA
75 A_1_4 A_1_4_205 118 NA
76 A_1_4 A_1_4_206 126 NA
我希望能够在 "Ident" 的某个水平内选取一个观察值,并将其分数与该水平内所有其他观察值的分数进行比较。例如,我想得到同一 Ident 水平内某一个分数与其他每个分数之比的总和:
ratio for i = Σ[ SCORE(line i)/SCORE(lines 1:i-1,i+1:n)]
或者更简单的例子,例如:
comparison for i = Σ[SCORE(lines 1:i-1,i+1:n)]
理想情况下,我还希望能够对参与比较的其他观察值设置筛选条件(例如,只与高于第 i 个分数的那些分数进行比较)。
先感谢您的帮助,也为问题表述得不够清楚表示歉意。
如果您将 data.frame 命名为 df
那么这就是您想要的:
# Leave-one-out summaries of `score` within each level of `Ident`.
#
# In the example data `score` is a factor, and sum() is not defined for
# factors, so work on a numeric copy (converted through character so that
# the labels, not the internal integer codes, are used). The original
# `df$score` column is left untouched.
score_num <- if (is.numeric(df$score)) {
  df$score
} else {
  as.numeric(as.character(df$score))
}

# ratios: this score / (sum of all the other scores in the same group).
# Note: this is a single ratio against the summed remainder, not a sum of
# pairwise ratios; the latter would be x * (sum(1 / x) - 1 / x).
df$ratios <- ave(
  score_num, # the variable to summarise
  df$Ident,  # the grouping variable(s)
  FUN = function(x) x / (sum(x) - x) # applied to the scores of each group
)

# comparison_values: sum of all the other scores in the same group.
df$comparison_values <- ave(
  score_num, # the variable to summarise
  df$Ident,  # the grouping variable(s)
  FUN = function(x) sum(x) - x # applied to the scores of each group
)
您可以使用 data.table.
library(data.table)
# Turn the factor labels into numbers (via character, so that the labels
# rather than the internal integer codes are used).
df$score <- as.numeric(as.character(df$score))
# Convert to a data.table, then add the leave-one-out ratio
# score / (sum of the other scores), computed within each Ident group.
df <- as.data.table(df)
df[, ratio1 := score / (sum(score) - score), by = Ident]
df
Ident iduni score ratio ratio1
1: A_1_2 A_1_2_231 92 NA 0.09246231
2: A_1_2 A_1_2_233 102 NA 0.10355330
3: A_1_2 A_1_2_234 263 NA 0.31917476
4: A_1_2 A_1_2_235 155 NA 0.16630901
5: A_1_2 A_1_2_236 234 NA 0.27432591
6: A_1_2 A_1_2_237 241 NA 0.28486998
7: A_1_4 A_1_4_200 176 NA 0.19577308
8: A_1_4 A_1_4_201 196 NA 0.22298066
9: A_1_4 A_1_4_202 220 NA 0.25730994
10: A_1_4 A_1_4_203 100 NA 0.10256410
11: A_1_4 A_1_4_204 139 NA 0.14850427
12: A_1_4 A_1_4_205 118 NA 0.12330199
13: A_1_4 A_1_4_206 126 NA 0.13277134
下面在 Ident 列的每个取值(分组)内,计算每个分数与第 3 项之后各位置分数的平均值之比。也可以采用其他筛选标准,但您在这一点上说得不够明确。
# First convert the factor to numeric (through character, so that the
# labels rather than the internal integer codes are used).
dat$score <- as.numeric(as.character(dat$score))
i <- 3 # pick a location
# Ratio of each score to the mean of the scores located after position `i`
# within the same Ident group. tail(x, -i) drops the first `i` elements and
# returns an empty vector (so the result is NaN) when a group has `i` or
# fewer rows; (1 + i):length(x) would instead count backwards and index
# past the end of the group.
dat$rat_scr <- ave(dat$score, dat$Ident, FUN = function(x) {
  x / mean(tail(x, -i))
})
dat
Ident iduni score ratio rat_scr
1 A_1_2 A_1_2_231 92 NA 0.4380952
3 A_1_2 A_1_2_233 102 NA 0.4857143
4 A_1_2 A_1_2_234 263 NA 1.2523810
5 A_1_2 A_1_2_235 155 NA 0.7380952
6 A_1_2 A_1_2_236 234 NA 1.1142857
7 A_1_2 A_1_2_237 241 NA 1.1476190
70 A_1_4 A_1_4_200 176 NA 1.4575569
71 A_1_4 A_1_4_201 196 NA 1.6231884
72 A_1_4 A_1_4_202 220 NA 1.8219462
73 A_1_4 A_1_4_203 100 NA 0.8281573
74 A_1_4 A_1_4_204 139 NA 1.1511387
75 A_1_4 A_1_4_205 118 NA 0.9772257
76 A_1_4 A_1_4_206 126 NA 1.0434783