匹配两个不同数据集的响应并将一个数据集的一部分添加到另一个数据集

Matching up responses of two different data sets and adding parts of one to the other

我在 R 中有两个不相关的数据集。

数据集 A 包含数百个作为实验响应给出的单词。有些词出现了好几次,每个参与者有 25 个响应,有 112 个参与者。它的格式如下:

ID  Duration  Gender  Age  response1  response1_rt  response2  response2_rt  response3  response3_rt ...
1     2910      2     26     word        3.55          apple      2.89          house       4.21     ...
2     1873      1     20     paper       2.53          hungry     3.25          tent        2.55     ...
...   ...         ...           ...        ...           ...         ...      ...
112   3420      2     18     pen         1.83          salad      4.26          family      3.22     ...

数据集 B 包含数千个已在不同维度上进行评级的单词。它的格式如下:

        Word        measure1  measure2  measure3 ...
1       aardvark    6.26       2.21      19.00   ...
2       abalone     5.3        1.59      20.00   ...
...     ...         ...        ...       ...     ...
8690    paper       5.42       1.62      40      ...
...     ...         ...        ...       ...     ... 
13915   zucchini    6.3        2.36      20      ...

我想在每个响应旁边向数据集 A 添加新列,显示赋予数据集 B measure1 中单词的值,如果该单词未出现在数据集中,则显示 NA...

例如:

ID   Duration  Gender  Age  response1   measure1.r1   response1_rt  response2  measure1.r2  response2_rt  response3...
1     2910       2     26    word         5.05         3.55          apple        NA        2.89          house
2     1873       1     20    paper        5.42         2.53          hungry       3.54      3.25          tent
...
112   3420       2     18    pen          5.63         1.83          salad        6.35      4.26          family

关于如何执行此操作的任何建议?

编辑:示例数据

dataset_a <- structure(list(Duration = c("2910", "3515", "5096", 
"2702", "2175", "2162"), Gender = c("2", "2", "2", "2", "2", "3"
), Age = c("19", "18", "38", "27", "20", "20"), response1 = c("Clock", 
"clock", "clock", "space", "clock", "clock"), response1_rt = c("4.781", 
"4.24", "12.351", "3.841", "5.912", "6.658"), response2 = c("Dress", 
"human", "cat", "woman", "stick figure", "hood"), response2_rt = c("4.77", 
"4.476", "4.214", "4.494", "3.735", "4.998"), response3 = c("2002", 
"date", "party", "2022", "calendar", "month"), response3_rt = c("3.557", 
"2.82", "7.517", "4.371", "4.868", "4.219")), row.names = c(NA, 
-6L), class = c("tbl_df", "tbl", "data.frame"))


dataset_b <- structure(list(Word = c("abandon", "alarm_clock", "calendar", "chip", 
"doll", "dress", "fig", "hotel", "human", "kayak", "light", "part", 
"string", "woman", "zone"), measure1 = c("6.26", 
"5.3", "2.84", "2.63", "5.85", "5.43", "4.48", "2.42", "2.05", 
"5.52", "5.57", "4", "5.15", "3.53", "3.05"), measure2 = c(2.21, 
1.59, 1.54, 1.74, 1.69, 1.75, 1.59, 1.61, 1.31, 1.75, 1.75, 1.29, 
1.79, 1.22, 1.81), measure3 = c(19, 20, 19, 30, 20, 21, 23, 
19, 36, 21, 23, 19, 20, 40, 19)), row.names = c(NA, -15L), class = c("tbl_df", 
"tbl", "data.frame"), problems = structure(list(expected = c("a double", "1/0/T/F/TRUE/FALSE", "a double", 
"1/0/T/F/TRUE/FALSE", "a double", "1/0/T/F/TRUE/FALSE", "a double", 
"1/0/T/F/TRUE/FALSE"), actual = c("entering", "12", "auto", "10", 
"B", "8", "State", "13")), row.names = c(NA, -8L
), class = c("tbl_df", "tbl", "data.frame")))

这是 tidyverse 中的解决方案。为了避免一些曲折的解析,我保留了名称略有不同的列,并且顺序与您描述的不同。但是,数据的基本结构和内容符合要求。

解决方案

library(tidyverse)


# ...
# Code to generate your 'responses' and 'words' datasets.
# ...


# Filter the desired measures.
words <- words %>%
  # Pivot the 'measure*' columns into the two columns:
  # - 'm': which measure for that word
  # - 'measure': the value measured
  pivot_longer(
    cols = matches("measure\d+"),
    names_to = c(".value", "m"),
    names_pattern = "(measure)(\d+)",
    # Homogenize everything within the new columns:
    values_transform = list(
      # Decimals within 'measure'
      measure = as.double,
      # Integers within 'm'
      m = as.integer
    )
  ) %>%
  # Filter down to desired measures by number.
  filter(m %in% c(
    # You want only measure #1.
    1 #, 2, 3, ...
  )) %>%
  # Pivot back to the original 'measure*' format, now with only the desired
  # columns.
  pivot_wider(
    names_from = m,
    values_from = measure,
    names_glue = "{.value}{.name}"
  )


# Match the measures into the responses.
measured_responses <- responses %>%
  # Pivot the 'response*' columns into two columns:
  # - 'n': which response for that ID
  # - 'response': the word itself
  pivot_longer(
    cols = matches("^response\d+$"),
    names_to = c(".value", "n"),
    names_pattern = "^(response)(\d+)$",
    # Homogenize everything within the new columns:
    values_transform = list(
      # Strings within 'response'
      response = as.character,
      # Integers within 'n'
      n = as.integer
    )
  ) %>%
  # Match by word the responses with their measures; and retain (via LEFT join)
  # the responses that lack matches.
  left_join(
    words,
    by = c("response" = "Word")
  ) %>%
  # Pivot the dataset back into its original 'response*' format, now with the
  # new 'measure*.*' columns included.
  pivot_wider(
    names_from = n,
    values_from = c(response, matches("measure\d+")),
    names_sep = "."
  )


# View the results.
measured_responses

结果

给定一个这样的 responses 数据集

responses <- tibble::tribble(
  ~ID, ~Duration, ~Gender, ~Age, ~response1, ~response1_rt, ~response2, ~response2_rt, ~response3, ~response3_rt, # ...
  1,   2910,      2,       26,   "word",     3.55,          "apple",    2.89,          "house",    4.21,          # ...
  2,   1873,      1,       20,   "paper",    2.53,          "hungry",   3.25,          "tent",     2.55,          # ...
  112, 3420,      2,       18,   "pen",      1.83,          "salad",    4.26,          "family",   3.22           # ...
)

和这样的 words 数据集

words <- tibble::tribble(
  ~Word,      ~measure1, ~measure2, ~measure3, # ...
  "apple",    5.19,      1.84,      25.00,     # ...
  "paper",    5.42,      1.62,      40,        # ...
  "pen",      5.63,      1.71,      20,        # ...
  "salad",    6.07,      2.19,      40,        # ...
  "tent",     4.96,      1.55,      30,        # ...
  "word",     5.05,      2.26,      19.00,     # ...
)

此解决方案应生成如下 measured_responses 数据集:

# A tibble: 3 x 13
     ID Duration Gender   Age response1_rt response2_rt response3_rt response.1 response.2 response.3 measure1.1 measure1.2 measure1.3
  <dbl>    <dbl>  <dbl> <dbl>        <dbl>        <dbl>        <dbl> <chr>      <chr>      <chr>           <dbl>      <dbl>      <dbl>
1     1     2910      2    26         3.55         2.89         4.21 word       apple      house            5.05       5.19      NA   
2     2     1873      1    20         2.53         3.25         2.55 paper      hungry     tent             5.42      NA          4.96
3   112     3420      2    18         1.83         4.26         3.22 pen        salad      family           5.63       6.07      NA