在 R 中按 group_by 分组后,计算两个数据框所有对应列之间的相关性

Calculate correlation for two data frames for all columns after group_by in R

示例数据:

# Sample data: two data frames with the same layout, two rows per region.
A <- data.frame(
  region = rep(c("US", "UK", "AUS"), each = 2),
  a = c(1, 2, 3, 4, 5, 8),
  b = c(4, 5, 6, 7, 8, 2),
  c = c(9, 6, 5, 43, 2, 5)
)
B <- data.frame(
  region = rep(c("US", "UK", "AUS"), each = 2),
  a = c(7, 4, 3, 6, 9, 81),
  b = c(9, 4, 3, 7, 0, 35),
  c = c(22, 5, 6, 2, 9, 33)
)

预期输出: 每个 region 一行、每个数值列(a、b、c)一列的表格,其中每个单元格 x 是该 region 内两个数据框对应列之间的相关系数。

我试过: 将两个数据框合并为一个数据框,再逐对计算其中两列之间的相关性。但逐个键入列名很繁琐,而且会产生太多列。有没有更简单的方法?

如果我的理解没有错,那么这是使用 dplyr 和 tidyr 的解决方案。

library(dplyr)
library(tidyr)

# Stack A and B with a `set` label, reshape to long form, and correlate the
# A values against the B values within each region/column pair. The final
# pivot puts the original column names back as columns.
list(A = A, B = B) %>%
  bind_rows(.id = "set") %>%
  pivot_longer(
    cols = -c(set, region),
    names_to = "column",
    values_to = "obs"
  ) %>%
  group_by(region, column) %>%
  summarise(
    r = cor(obs[set == "A"], obs[set == "B"]),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = column, values_from = r)

输出

# A tibble: 3 x 4
  region     a     b     c
  <chr>  <dbl> <dbl> <dbl>
1 AUS        1    -1     1
2 UK         1     1    -1
3 US        -1    -1     1

这有点复杂,但它是一种替代方法。

library(tidyverse)

# Sample data: two data frames with the same layout, two rows per region.
A <- data.frame(
  region = rep(c("US", "UK", "AUS"), each = 2),
  a = c(1, 2, 3, 4, 5, 8),
  b = c(4, 5, 6, 7, 8, 2),
  c = c(9, 6, 5, 43, 2, 5)
)
B <- data.frame(
  region = rep(c("US", "UK", "AUS"), each = 2),
  a = c(7, 4, 3, 6, 9, 81),
  b = c(9, 4, 3, 7, 0, 35),
  c = c(22, 5, 6, 2, 9, 33)
)

# Nest each data frame by region (one row per region, the remaining columns
# packed into a list-column), then join the two nested tables on region.
# The join suffixes the list-columns: `data.x` comes from A, `data.y` from B.
df <- list(A, B) %>%
  map(function(d) nest_by(d, region)) %>%
  reduce(inner_join, by = "region")
df
#> # A tibble: 3 × 3
#> # Rowwise:  region
#>   region             data.x             data.y
#>   <chr>  <list<tibble[,3]>> <list<tibble[,3]>>
#> 1 AUS               [2 × 3]            [2 × 3]
#> 2 UK                [2 × 3]            [2 × 3]
#> 3 US                [2 × 3]            [2 × 3]

# Correlate matching columns of A and B within each region.
# `df` is still rowwise after nest_by(); ungroup() first so the result is a
# plain tibble (same shape as the pivot-based solution) and later verbs do not
# silently operate row by row.
# For each region, map2_dbl() pairs up the columns of the two nested tibbles
# and returns a named vector of correlations; as.list() turns it into one row
# that bind_rows() can stack. This avoids the superseded map2_dfr()/map2_dfc().
df %>%
  ungroup() %>%
  select(region) %>%
  bind_cols(
    map2(
      df$data.x, df$data.y,
      function(x, y) as.list(map2_dbl(x, y, cor))
    ) %>%
      bind_rows()
  )
#> # A tibble: 3 × 4
#>   region     a     b     c
#>   <chr>  <dbl> <dbl> <dbl>
#> 1 AUS        1    -1     1
#> 2 UK         1     1    -1
#> 3 US        -1    -1     1

由 reprex 包 (v2.0.1) 创建于 2022-01-06