从一个 table 中取一组并使用其他 table 来计算欧氏距离

Take a groups from one table and use other tables to calculate euclidean distance

我想计算特定配置文件之间的欧氏距离。最大的问题是如何将特定行放在一起以计算它们之间的距离。在第一个 table 中,我存储了包含来自不同 table 的行名称的组,应该将其用于距离计算。 首先 table 看起来像这样:

        Activity     Person ValueOfComp
    1   Football Mark_1_OUT           4
    2   Football Greg_1_OUT           4
    3   Football Mark_1_INT           4
    4   Football Greg_1_INT           4
    5 Volleyball  Tim_1_INT          6
    6 Volleyball  Tim_1_OUT          6
    7 Volleyball  Tom_1_INT          6
    8 Volleyball  Tom_1_OUT          6
    9 Volleyball  Sim_1_INT          6
    10 Volleyball  Sim_1_OUT          6
    11 Handball  Karl_1_OUT          8
    12 Handball  Karl_1_INT          8
    13 Handball  Matt_1_OUT          8
    14 Handball  Matt_1_INT          8
    15 Handball  Jake_1_INT          8
    16 Handball  Jake_1_OUT          8
    17 Handball  Sonya_1_OUT          8
    18 Handball  Sonya_1_INT          8

有两个 table 存储了提到的变量的配置文件,这些变量应该被用于欧氏距离计算。

Table 1 假设一个用于以 INT:

结尾的变量
                10         34        59 84        110       134       165       199
Mark_1 0.000000000 0.00000000 0.0000000  1 0.12345123 0.1160406 0.2847189 0.4636836
Greg_1 0.000000000 0.00000000 0.1719200  1 0.68940000 0.2087267 0.2469333 0.2358933
Tim_1 0.000000000 0.00000000 0.0000000  1 0.123415551 0.55321234 0.0000000 0.0000000
Tom_1 0.000000000 0.00000000 0.0000000  0 1 0.11234120 0.1755712 0.2344607
Sim_1 0.000000000 0.00000000 0.0000000  1 0.324532121 0.123412666 0.0000000 0.0000000
Karl_1 1 0.123256312 0.34312334  0 0.00000000 0.0000000 0.0000000 0.0000000
Matt_1 0.000000000 0.03978242 0.1272671  1 0.00000000 0.0000000 0.0000000 0.0000000
Moham_1 0.5123412423 0.12423561 0.1775713  1 0.01186404 0.0000000 0.0000000 0.0000000
Teraq_1 0.009915695 0.13451256 0.2211453  1 0.01186404 0.0000000 0.0000000 0.0000000
Jake_1 0.066915225 0.20623498 0.53215713  1 0.01186404 0.0000000 0.0000000 0.0000000
Sonya_1 0.000000000 0.21341411 0.5323123  1 0.00000000 0.0000000 0.0000000 0.0000000
Monique_1 1 0.4311223 0.22343212  0 0.00000000 0.0000000 0.0000000 0.0000000

Table 2 假设一个用于以 OUT:

结尾的变量
                10         34        59 84        110       134       165       199
Mark_1 0.000000000 0.00000000 0.0000000  1 0.33345123 0.2530406 0.2147189 0.4636836
Greg_1 0.000000000 0.00000000 0.1719200  1 0.48240000 0.22345726 0.2122233 0.2358933
Tim_1 0.000000000 0.00000000 0.0000000  1 0.623415551 0.35321234 0.0000000 0.0000000
Tom_1 0.000000000 0.00000000 0.0000000  0 1 0.4122120 0.3755712 0.2324607
Sim_1 0.000000000 0.00000000 0.0000000  1 0.33352121 0.223412666 0.0000000 0.0000000
Karl_1 1 0.553256312 0.24312334  0 0.00000000 0.0000000 0.0000000 0.0000000
Matt_1 0.000000000 0.11978242 0.1272671  1 0.00000000 0.0000000 0.0000000 0.0000000
Moham_1 0.5123412423 0.52423561 0.6775713  1 0.31186404 0.0000000 0.0000000 0.0000000
Teraq_1 0.119915695 0.16451256 0.2433253  1 0.09186404 0.0000000 0.0000000 0.0000000
Jake_1 0.264915225 0.33123498 0.39215713  1 0.11186404 0.0000000 0.0000000 0.0000000
Sonya_1 0.000000000 0.33341411 0.4323123  1 0.00000000 0.0000000 0.0000000 0.0000000
Monique_1 1 0.5511223 0.44343212  0 0.00000000 0.0000000 0.0000000 0.0000000

因此,基于第一个 table FootballVolleyballetc 的组,我想获取该组的所有配置文件并计算欧几里德他们之间的距离。这些配置文件可以在其他 table 中找到。即使取自同一个 table,也应计算该组所有配置文件之间的距离。

如果将结果存储为单独的 table 并带有对 activity 和计算的距离,那就太好了。

我的真实数据包含几千行,但我也有 CPU 能力 运行 循环。

有人可以帮我解答吗?

编辑:可重现的例子:

> dput(repr_data)
structure(list(Activity = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 
3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Football", 
"Handball", "Volleyball"), class = "factor"), Person = structure(c(8L, 
7L, 2L, 1L, 15L, 16L, 17L, 18L, 11L, 12L, 6L, 5L, 10L, 9L, 3L, 
4L, 14L, 13L), .Label = c("Greg_1_INT", "Greg_1_OUT", "Jake_1_INT", 
"Jake_1_OUT", "Karl_1_INT", "Karl_1_OUT", "Mark_1_INT", "Mark_1_OUT", 
"Matt_1_INT", "Matt_1_OUT", "Sim_1_INT", "Sim_1_OUT", "Sonya_1_INT", 
"Sonya_1_OUT", "Tim_1_INT", "Tim_1_OUT", "Tom_1_INT", "Tom_1_OUT"
), class = "factor"), ValueOfComp = c(4, 4, 4, 4, 6, 6, 6, 6, 
6, 6, 8, 8, 8, 8, 8, 8, 8, 8)), .Names = c("Activity", "Person", 
"ValueOfComp"), row.names = c(NA, -18L), class = "data.frame")

Table 1:

> dput(INT_tbl)
structure(c(0, 0, 0, 0, 0, 1, 0.22123412423, 0.0123915695, 0.0126915225, 
0.4312, 1, 0, 0, 0, 0, 0, 0.323256312, 0.32423561, 0.44451256, 
0.33623498, 0.21341411, 0.321223, 0.232, 0.57192, 0, 0, 0, 0.31312334, 
0.2775713, 0.1311453, 0.63215713, 0.4423123, 0.132212, 1, 1, 
1, 0, 1, 0, 1, 1, 1, 1, 0, 0.55345123, 0.689875, 0.423415551, 
1, 0.444532121, 0, 0.01186404, 0.22132204, 0.21186404, 0, 0, 
0.234126, 0.33347267, 0.35321234, 0.4123412, 0.333412666, 0, 
0, 0, 0.3123, 0, 0, 0.1147189, 0.12343, 0.3155, 0.2755712, 0.123, 
0, 0, 0, 0, 0, 0, 0.1236836, 0.0058933, 0, 0.1344607, 0, 0, 0, 
0, 0, 0, 0), .Dim = c(11L, 8L), .Dimnames = list(c("Mark_1", 
"Greg_1", "Tim_1", "Tom_1", "Sim_1", "Karl_1", "Moham_1", "Teraq_1", 
"Jake_1", "Sonya_1", "Monique_1"), c("10", "34", "59", "84", 
"110", "134", "165", "199")))

Table 2:

> dput(OUT_tbl)
structure(c(0.236915225, 0, 0, 0, 0, 0, 1, 1, 0.22123412423, 
0.0123915695, 0.0126915225, 0.4312, 1, 0.26666498, 0, 0, 0, 0, 
0, 0.323256312, 0.52356312, 0.32423561, 0.44451256, 0.33623498, 
0.21341411, 0.321223, 0.123415713, 0.232, 0.57192, 0, 0, 0, 0.31312334, 
0.12342332, 0.2775713, 0.1311453, 0.63215713, 0.4423123, 0.132212, 
1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0.2235404, 0.55345123, 
0.689875, 0.423415551, 1, 0.444532121, 0, 0, 0.01186404, 0.22132204, 
0.21186404, 0, 0, 0.123, 0.234126, 0.33347267, 0.35321234, 0.4123412, 
0.333412666, 0, 0, 0, 0, 0.3123, 0, 0, 0, 0.1147189, 0.12343, 
0.3155, 0.2755712, 0.123, 0, 0, 0, 0, 0, 0, 0, 0, 0.1236836, 
0.0058933, 0, 0.1344607, 0, 0, 0, 0, 0, 0, 0, 0), .Dim = c(13L, 
8L), .Dimnames = list(c("Karsten_1", "Mark_1", "Greg_1", "Tim_1", 
"Tom_1", "Sim_1", "Karl_1", "Johan_1", "Moham_1", "Teraq_1", 
"Jake_1", "Sonya_1", "Monique_1"), c("10", "34", "59", "84", 
"110", "134", "165", "199")))

期望的输出:

        Activity     Person 1   Person 2    EUC.DIST
    1   Football Mark_1_OUT    Greg_1_OUT      XX
    2   Football Mark_1_OUT    Mark_1_INT      XX
    3   Football Mark_1_OUT    Greg_1_INT      XX
    4   Football Greg_1_INT    Greg_1_OUT      XX
    5   Football Greg_1_INT    Mark_1_INT      XX
    6   Football Greg_1_OUT    Mark_1_INT      XX
    ........
    and so on with other combinations withing rest of the groups.

好的,这可能会有点乱,但请耐心等待。

首先,我们采用 INT_tbl & OUT_tbl 并对其进行一些处理。我们制作它们数据框,将行名添加为一列,并在每个条目中添加一个后缀。这样做是为了 rbind Out 和 Int 表都变成一个完整的数据框,即

library(dplyr)
library(tidyr)

out <- setNames(data.frame(paste0(rownames(OUT_tbl), '_OUT'), OUT_tbl, 
                row.names = NULL, stringsAsFactors = FALSE), c('Person', colnames(OUT_tbl)))

int <- setNames(data.frame(paste0(rownames(INT_tbl), '_INT'), INT_tbl, 
                row.names = NULL, stringsAsFactors = FALSE), c('Person', colnames(INT_tbl)))

full_d <- rbind(out, int)

#which gives,
rbind(head(full_d, 3), tail(full_d, 3))
#          Person         10        34        59 84       110       134       165       199
#1  Karsten_1_OUT 0.23691523 0.2666650 0.1234157  1 0.2235404 0.1230000 0.0000000 0.0000000
#2     Mark_1_OUT 0.00000000 0.0000000 0.2320000  1 0.5534512 0.2341260 0.1147189 0.1236836
#3     Greg_1_OUT 0.00000000 0.0000000 0.5719200  1 0.6898750 0.3334727 0.1234300 0.0058933
#22    Jake_1_INT 0.01269152 0.3362350 0.6321571  1 0.2118640 0.3123000 0.0000000 0.0000000
#23   Sonya_1_INT 0.43120000 0.2134141 0.4423123  1 0.0000000 0.0000000 0.0000000 0.0000000
#24 Monique_1_INT 1.00000000 0.3212230 0.1322120  0 0.0000000 0.0000000 0.0000000 0.0000000

然后我们创建一个函数来计算所有可能的人对之间的距离,即

#define the Euclidean distance first
euc.dist <- function(i, j) {sqrt(sum((i - j) ^ 2))}

#Create the function
Get_dist <- function(x){
    d12 <- setNames(as.data.frame(cbind(as.character(x$Activity[1]), t(combn(as.character(x$Person), 2))), 
                                  stringsAsFactors = FALSE), c('Activity', 'Person1', 'Person2')) 
    new_d <- d12 %>% 
      gather(new, label, -Activity) %>% 
      left_join(., full_d, by = c('label' = 'Person'))
    l1 <- split(new_d, new_d$new)
    d12$EUC.DIST <- as.numeric(mapply(euc.dist, as.data.frame(t(l1[[1]][-c(1:3)])), 
                                      as.data.frame(t(l1[[2]][-c(1:3)]))))
    return(d12)
}

应用函数

我们将数据框拆分为 Activity,应用该函数并使用 bind_rows 将其(从列表)转换为数据框。即

final_d <- bind_rows(lapply(split(df, df$Activity), Get_dist))

final_d
#     Activity     Person1     Person2  EUC.DIST
#1    Football  Mark_1_OUT  Mark_1_INT 0.0000000
#2    Football  Mark_1_OUT  Greg_1_OUT 0.3974635
#3    Football  Mark_1_OUT  Greg_1_INT 0.3974635
#4    Football  Mark_1_INT  Greg_1_OUT 0.3974635
#5    Football  Mark_1_INT  Greg_1_INT 0.3974635
#6    Football  Greg_1_OUT  Greg_1_INT 0.0000000
#7    Handball  Karl_1_OUT  Karl_1_INT 0.0000000
#8    Handball  Karl_1_OUT  Matt_1_OUT        NA
#9    Handball  Karl_1_OUT  Matt_1_INT        NA
#10   Handball  Karl_1_OUT  Jake_1_INT 1.4896801

如果您想从最终数据框中排除 NA 值,那么只需

final_d <- final_d[!is.na(final_d$EUC.DIST),]

请检查这个。

#Convert to data.frame and cleanup
INT_tbl = as.data.frame(INT_tbl)
OUT_tbl = as.data.frame(OUT_tbl)
INT_tbl$Remarks = "INT"
OUT_tbl$Remarks = "OUT"
INT_tbl$Names = rownames(INT_tbl)
OUT_tbl$Names = rownames(OUT_tbl)
rownames(INT_tbl) = NULL
rownames(OUT_tbl) = NULL

# Initiate empty lists
Name_Pair1 = list()
Name_Pair2 = list()
EDistance = list()

m = 1

#Compute distance between all names in INT and OUT and add to lists
while (m < nrow(INT_tbl)*nrow(OUT_tbl)){
    for (i in 1:nrow(INT_tbl)){
        for (j in 1:nrow(OUT_tbl)){
            Name_Pair1[m] = paste(INT_tbl$Names[i],"_INT-",OUT_tbl$Names[j],"_OUT",sep="")
            Name_Pair2[m] = paste(OUT_tbl$Names[j],"_OUT-",INT_tbl$Names[i],"_INT",sep="")
            EDistance[m] = sqrt((INT_tbl$`10`[i]-OUT_tbl$`10`[i])^2+
                                    (INT_tbl$`34`[i]-OUT_tbl$`34`[i])^2+
                                    (INT_tbl$`59`[i]-OUT_tbl$`59`[i])^2+
                                    (INT_tbl$`84`[i]-OUT_tbl$`84`[i])^2+
                                    (INT_tbl$`110`[i]-OUT_tbl$`110`[i])^2+
                                    (INT_tbl$`134`[i]-OUT_tbl$`134`[i])^2+
                                    (INT_tbl$`165`[i]-OUT_tbl$`165`[i])^2+
                                    (INT_tbl$`199`[i]-OUT_tbl$`199`[i])^2)
            m = m+1
        }
    }
}

#COmbine lists into data.frame and cleanup 
DDistance = data.frame(cbind(Name_Pair1,Name_Pair2,EDistance))
DDistance$Name_Pair1 = as.character(DDistance$Name_Pair1)
DDistance$Name_Pair2 = as.character(DDistance$Name_Pair2)
DDistance$EDistance = as.numeric(DDistance$EDistance)

#Initiate OUTPUT data.frame 
Out.put = data.frame(V1 = NA,V2=NA,Name_Pair=NA,EDistance=NA,Activity=NA)

#Obtain list of unique Activity 
Activity = as.character(unique(repr_data$Activity))

for (i in 1:length(Activity)){
    df = repr_data[repr_data$Activity == Activity[i],] #Subset for unique activity
    x = as.data.frame(combn(df$Person,2,simplify = FALSE)) #Get all combination of names in the subset
    x= t(x)
    rownames(x) = NULL
    x= as.data.frame(x)

    #Lookup distance for each row based on Name1(V1) and Name2(V2)
    for (j in 1:nrow(x)){
        x$Name_Pair[j] = paste(x$V1[j],x$V2[j],sep="-")
        for (k in 1:nrow(DDistance)){
            if (x$Name_Pair[j] == DDistance$Name_Pair1[k] | x$Name_Pair[j] == DDistance$Name_Pair2[k])
                x$EDistance = DDistance$EDistance[k]
            next
        }
        x$Activity = Activity[i]
    }
    Out.put = rbind(Out.put,x) #Append to Out.put
}

Out.put = Out.put[2:nrow(Out.put),] #Cleanup

这是使用 dplyr 的替代方法。我认为在相应地更新行名称后组合 INT_tblOUT_tbl 效果更好(也许更容易理解):

rownames(INT_tbl) <- paste0(rownames(INT_tbl), "_INT")
rownames(OUT_tbl) <- paste0(rownames(OUT_tbl), "_OUT")
BOTH_tbl <- rbind(INT_tbl, OUT_tbl)

您在 repr_data 中有一个数据中不存在的名称。如果您需要两人保持 NA 距离,请使用第一种解决方案;如果您不 want/need 数据中的对,请使用第二个解决方案。 (性能本质上是一样的。)无论如何,要处理它,我们需要提前知道所有可能的情况:

allpeople <- rownames(BOTH_tbl)
library(dplyr)

解决方案一

如果您需要在 NA 距离内保持失踪人员可见:

repr_data %>%
  group_by(Activity) %>%
  do({
    people <- as.character(unique(.$Person))
    peoplei <- match(people, allpeople)
    d <- dist(BOTH_tbl[peoplei,])
    n <- length(people) - 1
    data.frame(
      Person1 = rev(people[-1])[unlist(mapply(`:`, n:1, 1))],
      Person2 = rep(people, times = n:0),
      Dist = unclass(d),
      stringsAsFactors = FALSE
    )
  }) %>%
  ungroup()
# # A tibble: 49 × 4
#    Activity    Person1    Person2      Dist
#      <fctr>      <chr>      <chr>     <dbl>
# 1  Football Mark_1_INT Mark_1_OUT 0.0000000
# 2  Football Greg_1_OUT Mark_1_OUT 0.3974635
# 3  Football Greg_1_INT Mark_1_OUT 0.3974635
# 4  Football Greg_1_OUT Mark_1_INT 0.3974635
# 5  Football Greg_1_INT Mark_1_INT 0.3974635
# 6  Football Greg_1_INT Greg_1_OUT 0.0000000
# 7  Handball Karl_1_INT Karl_1_OUT 0.0000000
# 8  Handball Matt_1_OUT Karl_1_OUT        NA
# 9  Handball Matt_1_INT Karl_1_OUT        NA
# 10 Handball Jake_1_INT Karl_1_OUT 1.4896801
# # ... with 39 more rows

方案二

如果缺少人员可以从结果中省略。

repr_data %>%
  group_by(Activity) %>%
  do({
    people <- intersect(as.character(unique(.$Person)), allpeople)
    d <- dist(BOTH_tbl[people,])
    n <- length(people) - 1
    data.frame(
      Person1 = rev(people[-1])[unlist(mapply(`:`, n:1, 1))],
      Person2 = rep(people, times = n:0),
      Dist = unclass(d),
      stringsAsFactors = FALSE
    )
  }) %>%
  ungroup()
# # A tibble: 36 × 4
#    Activity     Person1    Person2      Dist
#      <fctr>       <chr>      <chr>     <dbl>
# 1  Football  Mark_1_INT Mark_1_OUT 0.0000000
# 2  Football  Greg_1_OUT Mark_1_OUT 0.3974635
# 3  Football  Greg_1_INT Mark_1_OUT 0.3974635
# 4  Football  Greg_1_OUT Mark_1_INT 0.3974635
# 5  Football  Greg_1_INT Mark_1_INT 0.3974635
# 6  Football  Greg_1_INT Greg_1_OUT 0.0000000
# 7  Handball  Karl_1_INT Karl_1_OUT 0.0000000
# 8  Handball  Jake_1_INT Karl_1_OUT 1.4896801
# 9  Handball  Jake_1_OUT Karl_1_OUT 1.4896801
# 10 Handball Sonya_1_OUT Karl_1_OUT 1.1628794
# # ... with 26 more rows