R:找到最接近平均值的观察值
R: find the closest observation to averages
我有一个表,其中包含每个类型的描述性统计信息(a、b 和 c 的平均值):
### stats
# Per-type summary table: one row for each type "a" through "l", holding the
# mean of parameters a, b and c for that type.
type <- letters[1:12]
mean_a <- c(0, 1, 1, 0, 2, 2, 0, 4, 4, 0, 5, 5)
mean_b <- c(4, 7, 8, 0, 3, 10, 5, 4, 7, 0, 1, 6)
mean_c <- c(1, 2, 0, 3, 4, 5, 1, 24, 3, 0, 4, 5)
stats <- data.frame(
  type = type,
  mean_a = mean_a,
  mean_b = mean_b,
  mean_c = mean_c
)
我还有一个数据集,其中包含各个样本在参数 a、b 和 c 上的观测值。
每个样本都属于一个特定的类型:
# data
# Specimen observations: an identifier, the specimen's type, and its values
# for parameters a, b and c.
Id <- c(
  "ted", "bert", "test", "john", "elf", "fea",
  "goul", "houl", "ili", "jok", "ko", "lol"
)
type <- c("a", "a", "b", "d", "f", "f", "c", "d", "a", "b", "k", "l")
a <- c(2, 1, 3, 2, 1, 2, 0, 1, 2, 1, 5, 5)
b <- c(1, 3, 4, 7, 5, 4, 5, 6, 5, 0, 1, 6)
c <- c(3, 5, 2, 6, 8, 5, 1, 5, 3, 1, 6, 6)
data <- data.frame(Id = Id, type = type, a = a, b = b, c = c)
根据这两个表,我想依据 `stats` 中的统计数据,从 `data` 中为每个类型选出最具代表性的样本。
所谓“最具代表性”,是指其 a、b 和 c 的值最接近该类型各自平均值的那个样本。
我在网上没有找到同时考虑 3 个平均值(a、b 和 c)的做法,欢迎提供帮助!期望的输出如下(但我不确定 ted、test 和 john 是否真的最接近类型 a、b 和 c 的平均值):
# output wanted
# Hand-written example of the desired result: one specimen per type with its
# a, b and c values.
Id <- c("ted", "test", "john")
type <- c("a", "b", "c")
a <- c(2, 3, 2)
b <- c(1, 4, 7)
c <- c(3, 2, 6)
data2 <- data.frame(Id = Id, type = type, a = a, b = b, c = c)
我选用了平方距离(各参数差值的平方和)作为距离度量,您可以根据需要将其换成任何您喜欢的距离度量:
# For every observation, find the row of `stats` that describes its type.
# Indexing `stats` directly with `data$type` does not do this: a character
# index is matched against row names ("1", "2", ...) and a factor index is
# used as integer codes, neither of which lines up with `stats$type`.
stats_row <- match(data$type, stats$type)

# Squared Euclidean distance between each observation and the means of its
# type (NA when the type has no row in `stats`).
data$dist <- (data$a - stats$mean_a[stats_row])^2 +
  (data$b - stats$mean_b[stats_row])^2 +
  (data$c - stats$mean_c[stats_row])^2

# Single closest observation over the whole data set.
closest <- which.min(data$dist)
print(paste0("Closest is number ", closest, ": ", data[closest, "Id"]))

# Closest observation(s) within each type; ties are all kept. `which()` drops
# rows whose distance is NA instead of returning all-NA rows.
closest_by_type <- data[
  which(data$dist == ave(data$dist, data$type, FUN = min)),
]
您所说的“最具代表性”(most representative)含义相当模糊,不过下面的做法是:计算 `data` 中的值与 `stats` 中对应平均值(mean_a、mean_b、mean_c)之间的差值,并在每个类型中保留差值平均数(取绝对值)最小的那个样本。
由于我事先合并了两个数据框,您可以在代码末尾使用 `select()` 函数,按需要保留或删除变量(keep/drop)。
library(dplyr)

# Observed-value columns and the matching per-type mean columns. Selecting
# them by name keeps the comparison correct when `data` carries extra columns;
# positional indices such as 3:5 / 6:8 would silently shift in that case.
obs_cols <- c("a", "b", "c")
mean_cols <- c("mean_a", "mean_b", "mean_c")

# Attach the per-type means to every observation of that type.
df1 <- merge(data, stats, by = "type")

# Signed differences (observation minus type mean), one column per parameter.
signed_diff <- as.matrix(df1[obs_cols]) - as.matrix(df1[mean_cols])

# Score each row by the absolute value of its mean signed difference, then
# keep the lowest-scoring row(s) of each type and drop the helper column.
df1 %>%
  mutate(new = abs(unname(rowMeans(signed_diff)))) %>%
  group_by(type) %>%
  filter(new == min(new)) %>%
  select(-new)
# Printed result from the original answer (exact formatting depends on the
# R/dplyr versions in use):
#Source: local data frame [7 x 8]
#Groups: type [7]
# type Id a b c mean_a mean_b mean_c
# <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 a ted 2 1 3 0 4 1
#2 b test 3 4 2 1 7 2
#3 c goul 0 5 1 1 8 0
#4 d houl 1 6 5 0 0 3
#5 f elf 1 5 8 2 10 5
#6 k ko 5 1 6 5 1 4
#7 l lol 5 6 6 5 6 5
# library() stops with an error when dplyr is missing; require() would only
# return FALSE and let the script fail later with a less obvious message.
library(dplyr)

# Pair every observation with the means of its type. Naming the key makes the
# join explicit instead of relying on whichever column names happen to match.
inner_join(stats, data, by = "type") %>%
  # Squared Euclidean distance to the type means, computed column-wise
  # (arithmetic is vectorized, so no rowwise() step is needed).
  mutate(diff = (a - mean_a)^2 + (b - mean_b)^2 + (c - mean_c)^2) %>%
  group_by(type) %>%
  # Keep the closest observation(s) of each type; ties are all kept.
  filter(diff == min(diff)) %>%
  select(Id, type, a, b, c)
# Id type a b c
# <fctr> <chr> <dbl> <dbl> <dbl>
# 1 ili a 2 5 3
# 2 test b 3 4 2
# 3 goul c 0 5 1
# 4 houl d 1 6 5
# 5 elf f 1 5 8
# 6 ko k 5 1 6
# 7 lol l 5 6 6
我有一个表,其中包含每个类型的描述性统计信息(a、b 和 c 的平均值):
### stats
# Per-type summary table: one row for each type "a" through "l", holding the
# mean of parameters a, b and c for that type.
type <- letters[1:12]
mean_a <- c(0, 1, 1, 0, 2, 2, 0, 4, 4, 0, 5, 5)
mean_b <- c(4, 7, 8, 0, 3, 10, 5, 4, 7, 0, 1, 6)
mean_c <- c(1, 2, 0, 3, 4, 5, 1, 24, 3, 0, 4, 5)
stats <- data.frame(
  type = type,
  mean_a = mean_a,
  mean_b = mean_b,
  mean_c = mean_c
)
我还有一个数据集,其中包含各个样本在参数 a、b 和 c 上的观测值。每个样本都属于一个特定的类型:
# data
# Specimen observations: an identifier, the specimen's type, and its values
# for parameters a, b and c.
Id <- c(
  "ted", "bert", "test", "john", "elf", "fea",
  "goul", "houl", "ili", "jok", "ko", "lol"
)
type <- c("a", "a", "b", "d", "f", "f", "c", "d", "a", "b", "k", "l")
a <- c(2, 1, 3, 2, 1, 2, 0, 1, 2, 1, 5, 5)
b <- c(1, 3, 4, 7, 5, 4, 5, 6, 5, 0, 1, 6)
c <- c(3, 5, 2, 6, 8, 5, 1, 5, 3, 1, 6, 6)
data <- data.frame(Id = Id, type = type, a = a, b = b, c = c)
根据这两个表,我想依据 `stats` 中的统计数据,从 `data` 中为每个类型选出最具代表性的样本。
所谓“最具代表性”,是指其 a、b 和 c 的值最接近该类型各自平均值的那个样本。
我在网上没有找到同时考虑 3 个平均值(a、b 和 c)的做法,欢迎提供帮助!期望的输出如下(但我不确定 ted、test 和 john 是否真的最接近类型 a、b 和 c 的平均值):
# output wanted
# Hand-written example of the desired result: one specimen per type with its
# a, b and c values.
Id <- c("ted", "test", "john")
type <- c("a", "b", "c")
a <- c(2, 3, 2)
b <- c(1, 4, 7)
c <- c(3, 2, 6)
data2 <- data.frame(Id = Id, type = type, a = a, b = b, c = c)
我选用了平方距离(各参数差值的平方和)作为距离度量,您可以根据需要将其换成任何您喜欢的距离度量:
# For every observation, find the row of `stats` that describes its type.
# Indexing `stats` directly with `data$type` does not do this: a character
# index is matched against row names ("1", "2", ...) and a factor index is
# used as integer codes, neither of which lines up with `stats$type`.
stats_row <- match(data$type, stats$type)

# Squared Euclidean distance between each observation and the means of its
# type (NA when the type has no row in `stats`).
data$dist <- (data$a - stats$mean_a[stats_row])^2 +
  (data$b - stats$mean_b[stats_row])^2 +
  (data$c - stats$mean_c[stats_row])^2

# Single closest observation over the whole data set.
closest <- which.min(data$dist)
print(paste0("Closest is number ", closest, ": ", data[closest, "Id"]))

# Closest observation(s) within each type; ties are all kept. `which()` drops
# rows whose distance is NA instead of returning all-NA rows.
closest_by_type <- data[
  which(data$dist == ave(data$dist, data$type, FUN = min)),
]
您所说的“最具代表性”(most representative)含义相当模糊,不过下面的做法是:计算 `data` 中的值与 `stats` 中对应平均值(mean_a、mean_b、mean_c)之间的差值,并在每个类型中保留差值平均数(取绝对值)最小的那个样本。
由于我事先合并了两个数据框,您可以在代码末尾使用 `select()` 函数,按需要保留或删除变量(keep/drop)。
library(dplyr)

# Observed-value columns and the matching per-type mean columns. Selecting
# them by name keeps the comparison correct when `data` carries extra columns;
# positional indices such as 3:5 / 6:8 would silently shift in that case.
obs_cols <- c("a", "b", "c")
mean_cols <- c("mean_a", "mean_b", "mean_c")

# Attach the per-type means to every observation of that type.
df1 <- merge(data, stats, by = "type")

# Signed differences (observation minus type mean), one column per parameter.
signed_diff <- as.matrix(df1[obs_cols]) - as.matrix(df1[mean_cols])

# Score each row by the absolute value of its mean signed difference, then
# keep the lowest-scoring row(s) of each type and drop the helper column.
df1 %>%
  mutate(new = abs(unname(rowMeans(signed_diff)))) %>%
  group_by(type) %>%
  filter(new == min(new)) %>%
  select(-new)
# Printed result from the original answer (exact formatting depends on the
# R/dplyr versions in use):
#Source: local data frame [7 x 8]
#Groups: type [7]
# type Id a b c mean_a mean_b mean_c
# <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 a ted 2 1 3 0 4 1
#2 b test 3 4 2 1 7 2
#3 c goul 0 5 1 1 8 0
#4 d houl 1 6 5 0 0 3
#5 f elf 1 5 8 2 10 5
#6 k ko 5 1 6 5 1 4
#7 l lol 5 6 6 5 6 5
# library() stops with an error when dplyr is missing; require() would only
# return FALSE and let the script fail later with a less obvious message.
library(dplyr)

# Pair every observation with the means of its type. Naming the key makes the
# join explicit instead of relying on whichever column names happen to match.
inner_join(stats, data, by = "type") %>%
  # Squared Euclidean distance to the type means, computed column-wise
  # (arithmetic is vectorized, so no rowwise() step is needed).
  mutate(diff = (a - mean_a)^2 + (b - mean_b)^2 + (c - mean_c)^2) %>%
  group_by(type) %>%
  # Keep the closest observation(s) of each type; ties are all kept.
  filter(diff == min(diff)) %>%
  select(Id, type, a, b, c)
# Id type a b c
# <fctr> <chr> <dbl> <dbl> <dbl>
# 1 ili a 2 5 3
# 2 test b 3 4 2
# 3 goul c 0 5 1
# 4 houl d 1 6 5
# 5 elf f 1 5 8
# 6 ko k 5 1 6
# 7 lol l 5 6 6