Running a correlation for 300 users and subsetting based on significant correlations

Apologies in advance for this question; I only have a vague idea of what I'm trying to do, so searching for help hasn't turned up anything very useful.

Basically my problem is this. I have a data frame that looks like the one below, with 12 rows for each of 300 hh_ids, one per month:

 hh_ids      date    income consumption alcohol cleaning_materials  clothing
1  KELDK01 2012-11-1  62.70588    40.52941       0           0.000000  0.000000
2  KELDK01 2012-12-1  17.64706    42.43530       0           1.058824  7.058824
3  KELDK01 2013-01-1  91.76471    48.23529       0           0.000000  0.000000
4  KELDK01 2013-02-1  91.76470   107.52940       0           0.000000  0.000000
5  KELDK01 2013-03-1 116.47060   114.47060       0           0.000000  0.000000
6  KELDK01 2013-04-1 124.41180   118.29410       0           2.705882 17.647060
7  KELDK01 2013-05-1 137.23530   105.00000       0           1.411765  1.882353
8  KELDK01 2013-06-1 131.52940   109.54120       0           4.352942  2.941176
9  KELDK01 2013-07-1 121.52940   113.47060       0           2.352941 25.882350
10 KELDK01 2013-08-1 123.32940    86.50588       0           2.588235  2.941176

I want to see whether, over the year, there is any correlation between the expenditure category "clothing" and each of the other expenditure categories (about 10 of them) for each household. I then want to create a new data frame containing only the households where "clothing" has a significant correlation with another expenditure category.

Any ideas on how I would go about this?

(P.S. I'm trying to investigate whether there is any cross-price substitution between "clothing" and the other expenditure categories, and to isolate the households that do show that behaviour. If I'm going about this like an idiot and there's a better way, I'd love to hear your thoughts!)

Edit: in response to the request to see my work so far: it's fairly embarrassing, but I've been doing it manually - I figured it would take me about as long as working out how to do it properly.

I subset df into df_cloth (households with cloth expenditure > 0 over the whole year), which left 140 households.

Then I did:

df_cloth_cor <- select(df_cloth, income, consumption, alcohol, cleaning_materials, clothing)
cor(df_cloth_cor)

Then I recorded the correlation coefficients in Excel, household by household, with one column for each variable that cloth is correlated with.

I changed your example slightly to include 2 different ids. Also, I'm not sure what you mean by "significant correlation": large values, or statistically significant ones? I've covered both cases below.

1. Correlation values and p-values

library(dplyr)

# example dataset
dt = read.table(text="hh_ids      date    income consumption alcohol cleaning_materials  clothing
                KELDK01 2012-11-1  62.70588    40.52941       0           0.000000  0.000000
                KELDK01 2012-12-1  17.64706    42.43530       0           1.058824  7.058824
                KELDK01 2013-01-1  91.76471    48.23529       0           0.000000  0.000000
                KELDK01 2013-02-1  91.76470   107.52940       0           0.000000  0.000000
                KELDK01 2013-03-1 116.47060   114.47060       0           0.000000  0.000000
                KELDK01 2013-04-1 124.41180   118.29410       0           2.705882 17.647060
                KELDK02 2013-05-1 137.23530   105.00000       0           1.411765  1.882353
                KELDK02 2013-06-1 131.52940   109.54120       0           4.352942  2.941176
                KELDK02 2013-07-1 121.52940   113.47060       0           2.352941 25.882350
                KELDK02 2013-08-1 123.32940    86.50588       0           2.588235  2.941176", 
                sep="", header=T, stringsAsFactors = F)

dt

#     hh_ids      date    income consumption alcohol cleaning_materials  clothing
# 1  KELDK01 2012-11-1  62.70588    40.52941       0           0.000000  0.000000
# 2  KELDK01 2012-12-1  17.64706    42.43530       0           1.058824  7.058824
# 3  KELDK01 2013-01-1  91.76471    48.23529       0           0.000000  0.000000
# 4  KELDK01 2013-02-1  91.76470   107.52940       0           0.000000  0.000000
# 5  KELDK01 2013-03-1 116.47060   114.47060       0           0.000000  0.000000
# 6  KELDK01 2013-04-1 124.41180   118.29410       0           2.705882 17.647060
# 7  KELDK02 2013-05-1 137.23530   105.00000       0           1.411765  1.882353
# 8  KELDK02 2013-06-1 131.52940   109.54120       0           4.352942  2.941176
# 9  KELDK02 2013-07-1 121.52940   113.47060       0           2.352941 25.882350
# 10 KELDK02 2013-08-1 123.32940    86.50588       0           2.588235  2.941176


# create a function that calculates correlation and p value given 2 vectors
Get_cor_and_pval = function(d,n1,n2,id){

  # create 2 vectors based on names of variables and the id
  x = d[,n1][d$hh_ids==id]
  y = d[,n2][d$hh_ids==id]

  # calculate correlation and p value
  test = cor.test(x,y)
  c = test$estimate   # keep correlation value
  p = test$p.value    # keep p value

  return(data.frame(c = c, p = p, row.names = NULL))
}


# specify combinations of variables to calculate correlation
names1 = "clothing"
names2 = c("income","consumption","alcohol","cleaning_materials")

dt_combs = expand.grid(names1=names1, names2=names2, stringsAsFactors = F)

dt_combs

#     names1             names2
# 1 clothing             income
# 2 clothing        consumption
# 3 clothing           alcohol
# 4 clothing cleaning_materials


# process to get correlations and p values for each variable combination and each id 
dt %>%
  select(hh_ids) %>% distinct() %>%                                       # select unique ids
  group_by(hh_ids) %>%                                                    # for each id
  do(data.frame(.,dt_combs)) %>%                                          # get all combinations of interest
  rowwise() %>%                                                           # for each id and combination
  do(data.frame(.,                                                        # keep id and combination
                Get_cor_and_pval(dt,.$names1,.$names2,.$hh_ids),          # get correlation and p value
                stringsAsFactors=F)) %>%                                  # factor variables as character
  ungroup()                                                               # forget groupings

# # A tibble: 8 x 5
#    hh_ids   names1             names2          c            p
# *   <chr>   <fctr>              <chr>      <dbl>        <dbl>
# 1 KELDK01 clothing             income  0.1713298 7.455198e-01
# 2 KELDK01 clothing        consumption  0.3220463 5.336309e-01
# 3 KELDK01 clothing            alcohol         NA           NA
# 4 KELDK01 clothing cleaning_materials  0.9999636 1.989337e-09
# 5 KELDK02 clothing             income -0.6526867 3.473133e-01
# 6 KELDK02 clothing        consumption  0.5376850 4.623150e-01
# 7 KELDK02 clothing            alcohol         NA           NA
# 8 KELDK02 clothing cleaning_materials -0.1416633 8.583367e-01

This final data frame shows the correlation for every pair of interest for each id. The alcohol variable is always 0, which is what produces the NA values. You can apply whatever filter you like to keep the rows you're interested in.
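For example, to keep only the household/category pairs with a statistically significant correlation, you could filter on the p-value column. A minimal sketch, assuming the pipeline's output tibble has been stored in a variable called `results` (the name and the conventional 0.05 cutoff are my choices; with many tests you may also want a multiple-comparison correction):

```r
library(dplyr)

# hypothetical: 'results' stands in for the output of the pipeline above,
# with columns hh_ids, names2, c (correlation) and p (p value)
results = data.frame(
  hh_ids = c("KELDK01", "KELDK01", "KELDK01", "KELDK02"),
  names2 = c("income", "alcohol", "cleaning_materials", "income"),
  c      = c(0.1713298, NA, 0.9999636, -0.6526867),
  p      = c(7.455198e-01, NA, 1.989337e-09, 3.473133e-01),
  stringsAsFactors = FALSE
)

# keep pairs with p < 0.05; NA p values (e.g. from the constant
# alcohol column) are dropped automatically by filter()
significant = results %>% filter(p < 0.05)

# the households worth keeping are then
unique(significant$hh_ids)
```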

Note that this will work fine for 300 ids and 6 variables. For a much larger number of ids (millions) and many variables it may become slow, and there are probably more efficient ways to do it.

2. Correlation values only

If you're only interested in the correlation values and not the p-values, the code is shorter:

library(dplyr)

# example dataset
dt = read.table(text="hh_ids      date    income consumption alcohol cleaning_materials  clothing
                KELDK01 2012-11-1  62.70588    40.52941       0           0.000000  0.000000
                KELDK01 2012-12-1  17.64706    42.43530       0           1.058824  7.058824
                KELDK01 2013-01-1  91.76471    48.23529       0           0.000000  0.000000
                KELDK01 2013-02-1  91.76470   107.52940       0           0.000000  0.000000
                KELDK01 2013-03-1 116.47060   114.47060       0           0.000000  0.000000
                KELDK01 2013-04-1 124.41180   118.29410       0           2.705882 17.647060
                KELDK02 2013-05-1 137.23530   105.00000       0           1.411765  1.882353
                KELDK02 2013-06-1 131.52940   109.54120       0           4.352942  2.941176
                KELDK02 2013-07-1 121.52940   113.47060       0           2.352941 25.882350
                KELDK02 2013-08-1 123.32940    86.50588       0           2.588235  2.941176", 
                sep="", header=T, stringsAsFactors = F)


dt %>% 
  group_by(hh_ids) %>%                  # for each id
  do(data.frame(cor(.[,3:7]))[5,]) %>%  # keep columns 3 to 7 (the numeric columns), get the correlation matrix and keep row 5 (the row for clothing vs. all other variables)
  ungroup()

# # A tibble: 2 x 6
#    hh_ids     income consumption alcohol cleaning_materials clothing
#     <chr>      <dbl>       <dbl>   <dbl>              <dbl>    <dbl>
# 1 KELDK01  0.1713298   0.3220463      NA          0.9999636        1
# 2 KELDK02 -0.6526867   0.5376850      NA         -0.1416633        1

And an alternative using the corrr package:

library(dplyr)
library(corrr)

# example dataset
dt = read.table(text="hh_ids      date    income consumption alcohol cleaning_materials  clothing
                KELDK01 2012-11-1  62.70588    40.52941       0           0.000000  0.000000
                KELDK01 2012-12-1  17.64706    42.43530       0           1.058824  7.058824
                KELDK01 2013-01-1  91.76471    48.23529       0           0.000000  0.000000
                KELDK01 2013-02-1  91.76470   107.52940       0           0.000000  0.000000
                KELDK01 2013-03-1 116.47060   114.47060       0           0.000000  0.000000
                KELDK01 2013-04-1 124.41180   118.29410       0           2.705882 17.647060
                KELDK02 2013-05-1 137.23530   105.00000       0           1.411765  1.882353
                KELDK02 2013-06-1 131.52940   109.54120       0           4.352942  2.941176
                KELDK02 2013-07-1 121.52940   113.47060       0           2.352941 25.882350
                KELDK02 2013-08-1 123.32940    86.50588       0           2.588235  2.941176", 
                sep="", header=T, stringsAsFactors = F)


dt %>% 
  group_by(hh_ids) %>%                               # for each id
  do( correlate(.[,3:7]) %>% focus(clothing) ) %>%   # keep columns 3 to 7, get correlations but return ones that have to do with variable "clothing"
  ungroup()

# # A tibble: 8 x 3
#    hh_ids            rowname   clothing
#     <chr>              <chr>      <dbl>
# 1 KELDK01             income  0.1713298
# 2 KELDK01        consumption  0.3220463
# 3 KELDK01            alcohol         NA
# 4 KELDK01 cleaning_materials  0.9999636
# 5 KELDK02             income -0.6526867
# 6 KELDK02        consumption  0.5376850
# 7 KELDK02            alcohol         NA
# 8 KELDK02 cleaning_materials -0.1416633
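To get back to the original goal of a data frame containing only the households of interest, you can filter the per-household results and join back to the raw data. A sketch under stated assumptions: `res` stands for the corrr output above, `dt` for the raw data, and the |r| > 0.9 cutoff is arbitrary (use a p-value filter from part 1 instead if you want statistical significance):

```r
library(dplyr)

# hypothetical: 'res' stands in for the corrr result above
res = data.frame(
  hh_ids   = c("KELDK01", "KELDK01", "KELDK02", "KELDK02"),
  rowname  = c("income", "cleaning_materials", "income", "cleaning_materials"),
  clothing = c(0.1713298, 0.9999636, -0.6526867, -0.1416633),
  stringsAsFactors = FALSE
)

# hypothetical: a slim stand-in for the raw monthly data
dt = data.frame(hh_ids = c("KELDK01", "KELDK01", "KELDK02"),
                income = c(62.70588, 17.64706, 137.23530),
                stringsAsFactors = FALSE)

# ids where clothing correlates strongly with at least one category
keep_ids = res %>%
  filter(abs(clothing) > 0.9) %>%
  distinct(hh_ids)

# subset the original data to just those households
dt_subset = dt %>% semi_join(keep_ids, by = "hh_ids")
```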