将一个数据框中的一些列和行插入到另一个具有更多列和行的数据框中

Insert a few columns and rows from one dataframe into another dataframe with many more columns and rows

我有一个非常大的数据框(超过 50 列、数千行),下面的 df1 给出了其中的一个片段;另外还有一个非常小的数据框 df2,只有很少的列和行,其中包含对 df1 的更正数据:

大:

# Excerpt of the large data frame: 7 rows and 17 columns, built entirely from
# the literal vectors below (dput-style). The rows with Sequ == 28 hold the
# Aaoi/Baoi/Caoi and Adur/Bdur/Cdur values that df2 is meant to correct.
df1 <- structure(list(File = c("F01", "F01", "F01", "F01", "F01", "F01", 
"F01"), Line = c(991L, 992L, 993L, 950L, 951L, 952L, 953L), Speaker = c("ID01.B", 
NA, "ID01.A", "ID01.C", NA, "ID01.A", "ID01.B"), Sequ = c(30L, 
30L, 30L, 28L, 28L, 28L, 28L), Q = c("q_pol", "q_pol", "q_pol", 
"q_pol", "q_pol", "q_pol", "q_pol"), N_ipu = c(1L, 0L, 1L, 1L, 
0L, 1L, 1L), Utterance = c("=do you guys need to go back to Ikea anytime soon¿ (.) I don't want=", 
"(0.038)", "=always=", "=so Monday?=", "(0.008)", "=hol[ler:::] if you want a ride", 
"[yeah]"), Timestamp = c("00:21:20.235 - 00:21:22.879", "00:21:22.879 - 00:21:22.917", 
"00:21:22.917 - 00:21:23.498", "00:20:31.650 - 00:20:32.222", 
"00:20:32.222 - 00:20:32.230", "00:20:32.230 - 00:20:34.527", 
"00:20:32.785 - 00:20:33.205"), Q_by = c("B", NA, NA, "C", NA, 
NA, NA), Answ_by = c("A", NA, NA, "B", NA, NA, NA), Aaoi = c("*B*B*B*", 
"*", "*", "*", "*", "*C*", "*"), Baoi = c("A*A", "A", "A", "*C", 
"C", "C*A", "C"), Caoi = c("A*B*B*", "*", "*A", "*", "*", "*A", 
"*"), Adur = c("732, 166, 884, 149, 451, 149, 113", "38", "581", 
"572", "8", "24, 102, 2171", "420"), Bdur = c("3, 321, 2320", 
"38", "581", "359, 213", "8", "1308, 571, 418", "420"), Cdur = c("400, 256, 244, 689, 728, 327", 
"38", "5, 576", "572", "8", "1428, 869", "420"), A_Area_av = c("601.4205,602.9385,608.225,611.9615,610.5655,606.266,613.946,604.808,606.0685,595.0545,588.8595,597.4305,590.1285,584.708,585.726,592.8125,586.041,587.092,585.0985,590.9365,588.0725,585.5485,579.7915,585.4685,580.6295,577.608,572.518,575.467,569.742,570.089,563.1095,559.8635,564.6895,563.046,562.6775,561.488,562.8215,567.387,562.438,559.921,571.306,564.5025,565.853,565.1765,541.471,558.825,598.7505,610.0795,593.092,599.506,591.9885,599.718,602.8275,608.196,603.5275,598.9005,607.75,612.698,609.5625,604.1775,601.688,609.648,615.5035,614.1635,615.255,626.5105,624.352,629.6365,628.1505,625.4695,635.789,635.255,651.533,652.318,654.3625,655.559,664.403,658.9375,660.656,659.857,662.136,675.203,674.083,687.4895,683.8915,685.74,698.915,702.4895,701.7415,697.51,703.551,708.525,703.544,706.546,702.0095,708.532,717.171,715.0815,712.4205,711.6725,722.6945,718.984,719.2275,725.508,732.165,723.4745,726.265,737.4115,730.967,728.307,733.776,720.9105,729.172,723.2725,738.4195,732.045,729.2465,735.0775,729.828,728.5605,727.8755,737.312,737.9625,738.217,731.503,736.6975,725.4055,737.313,731.536,725.0055,727.677,727.411,733.272,734.7145,734.8845,730.7515,741.0095,746.2205,743.1225,747.0055,735.6235,733.425,739.0075,739.7375,735.5125,737.8905,731.887,727.981,731.4055,727.752,719.715,713.2015,718.707,720.7465,713.2125,715.263,718.9595,717.134,716.4635,714.182", 
"714.182,702.4825,680.9495", "680.9495,694.641,697.939,692.9615,706.298,707.3705,716.398,723.045,720.074,727.4415,719.681,724.5255,727.8885,738.541,744.5745,747.894,765.7335,760.4365,757.6545,751.15,750.697,749.252,745.6615,736.7915,742.468,729.083,732.048,720.187,733.2215,712.975,731.7685,724.7865,725.428,734.7755,727.465,728.6105", 
"574.3445,577.8675,568.954,570.1815,568.3655,571.756,578.5075,570.2235,577.112,575.8575,580.1065,582.1745,576.067,582.5905,582.9315,581.9255,581.888,584.157,581.059,587.708,590.8625,592.9645,593.0105,591.879,597.643,594.915,601.693,601.9125,603.235,558.439,585.057,636.6355,688.616,726.5445,726.7685,732.7715", 
"732.7715", "732.7715,732.2835,742.525,732.129,736.7395,742.4785,741.538,741.134,740.0665,704.622,621.436,618.107,633.682,641.7885,633.3825,628.869,628.3315,624.074,619.971,621.133,614.118,608.018,602.63,598.8605,594.5405,589.108,591.7935,585.108,575.0235,582.0335,588.871,575.2915,587.99,581.7825,582.4725,576.851,585.7895,576.763,581.4595,585.989,581.46,587.1015,590.708,584.07,582.2705,589.4475,591.706,590.8365,592.9135,600.371,593.7015,601.208,594.07,605.5185,600.6945,609.5385,607.438,607.159,608.1535,608.954,601.5205,605.5925,607.205,603.8995,598.885,601.657,604.3665,608.3525,608.4205,610.66,604.0435,598.986,600.9705,596.2165,603.1145,598.789,594.9195,593.6315,593.909,595.245,591.667,593.7845,589.164,589.863,590.8055,596.932,593.0385,595.458,592.276,595.4075,598.09,600.6265,596.3265,594.757,594.7905,594.5465,588.441,591.6315,588.924,593.9225,585.254,591.2645,594.7905,591.9845,597.202,593.0845,596.483,592.9395,597.4135,591.844,592.969,589.7095,599.2825,595.434,593.1245,588.5705,597.613,592.668,591.312,594.684,590.88,598.1365,595.126,591.6835,598.502,603.444,601.417,608.247,602.8635,613.3065,604.812,610.6205,607.2485,612.363,604.8595,602.965,609.627,601.151,605.183", 
"581.7825,582.4725,576.851,585.7895,576.763,581.4595,585.989,581.46,587.1015,590.708,584.07,582.2705,589.4475,591.706,590.8365,592.9135,600.371,593.7015,601.208,594.07,605.5185,600.6945,609.5385,607.438,607.159,608.1535,608.954"
)), class = "data.frame", row.names = c(NA, -7L))

小:

# Small data frame holding the corrected Aaoi/Baoi/Caoi and Adur/Bdur/Cdur
# values, keyed by Utterance and Sequ.
# Every Utterance literal is kept on a single line: a line break inside the
# quotes would embed "\n" in the value, so the key would no longer equal the
# corresponding Utterance in df1 and that row would silently stay uncorrected.
df2 <- structure(list(Utterance = c("=so Monday?=", "(0.008)",
"=hol[ler:::] if you want a ride", "[yeah]"), Aaoi = c("B*C", "C", "C*B", "B"),
Baoi = c("*C", "C", "C*A", "C"), Caoi = c("B", "B", "B*A", "B*"),
Adur = c("463, 52, 57", "8", "91, 43, 2163", "420"),
Bdur = c("356, 216", "8", "1256, 322, 719", "420"),
Cdur = c("572", "8", "635, 410, 1252", "80, 340"),
Sequ = c(28, 28, 28, 28)), row.names = c(NA, -4L), class = "data.frame")

我需要做的是把 df2 中修正后的数据插入到 df1 中。df2 中的所有列在 df1 中都存在。df1 和 df2 中取值相同的列是 Utterance 和 Sequ;包含更正数据的列是:Aaoi、Baoi、Caoi、Adur、Bdur 和 Cdur。如何在不逐一手动修改的情况下,把这些列从 df2 合并到 df1 中?

考虑到您处理的数据比较复杂,让我用一个更简单的例子来回答这个问题。考虑下面这两个数据框 d1 和 d2:

d1
#   id1 id2 X1 X2 X3 X4 X5
# 1   A   B  4  4  4  4  4
# 2   E   B  1  1  1  1  1
# 3   D   B  5  5  5  5  5
# 4   C   A  2  2  2  2  2
# 5   B   B  3  3  3  3  3

d2
#   id1 id2  X1  X2  X3  X4
# 1   A   B 200 300 100 400
# 3   D   B 100 400 200 300
# 2   E   B 400 200 300 100

您想用 d2 中的数据替换 d1 中的数据,同时识别正确的行。

为了实现这一点,可以先用 paste 把每个数据框的标识列拼接成一个键,再用 match 对这两组键进行匹配,从而得到 d1 中与 d2 各行相对应的行号。

# Key columns that identify a row in both data frames.
bys <- c("id1", "id2")
# Build one composite key per row. An explicit separator that is unlikely to
# occur in the data is used instead of paste()'s default " ", so that distinct
# key tuples such as ("a b", "c") and ("a", "b c") cannot collapse into one key.
key_d1 <- do.call(paste, c(unname(d1[bys]), sep = "\037"))
key_d2 <- do.call(paste, c(unname(d2[bys]), sep = "\037"))
# For each row of d2, the index of the first row of d1 with the same key
# (NA when d1 has no such row).
(rows1 <- match(key_d2, key_d1))
# [1] 1 3 2

之后,只需用要替换的列名向量对两个数据框取子集;对 d1 还要同时按第一步中得到的行号取子集。

# Columns whose values are taken over from d2.
cols <- c("X1", "X2", "X3")
# match() yields NA for d2 rows that have no counterpart in d1, and NA row
# indices are not allowed in a data-frame assignment, so copy matched rows only.
found <- !is.na(rows1)
d1[rows1[found], cols] <- d2[found, cols]

给出:

d1
#   id1 id2  X1  X2  X3 X4 X5
# 1   A   B 200 300 100  4  4
# 2   E   B 400 200 300  1  1
# 3   D   B 100 400 200  5  5
# 4   C   A   2   2   2  2  2
# 5   B   B   3   3   3  3  3

数据:

# Example target data frame: two character key columns (id1, id2) and five
# integer value columns (X1..X5); row names are the default 1..5.
d1 <- data.frame(
  id1 = c("A", "E", "D", "C", "B"),
  id2 = c("B", "B", "B", "A", "B"),
  X1 = c(4L, 1L, 5L, 2L, 3L),
  X2 = c(4L, 1L, 5L, 2L, 3L),
  X3 = c(4L, 1L, 5L, 2L, 3L),
  X4 = c(4L, 1L, 5L, 2L, 3L),
  X5 = c(4L, 1L, 5L, 2L, 3L),
  stringsAsFactors = FALSE
)

# Example source data frame with the replacement values: same key columns as
# d1, double value columns X1..X4, and explicit row names 1, 3, 2.
d2 <- data.frame(
  id1 = c("A", "D", "E"),
  id2 = c("B", "B", "B"),
  X1 = c(200, 100, 400),
  X2 = c(300, 400, 200),
  X3 = c(100, 200, 300),
  X4 = c(400, 300, 100),
  row.names = c(1L, 3L, 2L),
  stringsAsFactors = FALSE
)

一个可能的解决方案,基于 tidyverse。我的解决思路如下:

  • 绑定两个dataframes,添加新列id区分两个dataframes。

  • 然后按 Utterance 和 Sequ 分组,对相关列取每组的 last 值(使用 mutate 和 across)。

  • 引用的last值包含df2的信息。

library(tidyverse)

# Columns whose values are to be taken from df2.
fix_cols <- c("Aaoi", "Baoi", "Caoi", "Adur", "Bdur", "Cdur")

df1 %>%
  # Append df2's rows after df1's and tag them with id = 2; df1's rows get
  # id = NA because df1 has no id column.
  bind_rows(c(df2, id = 2)) %>%
  group_by(Utterance, Sequ) %>%
  # Only groups that actually contain a df2 row take the last (= df2) value.
  # Groups without a df2 row keep their own values, so rows of df1 that merely
  # share a key with each other are not overwritten by the last of them.
  mutate(across(
    all_of(fix_cols),
    ~ if (any(!is.na(id))) last(.x) else .x
  )) %>%
  ungroup() %>%
  # Drop the appended df2 rows and the helper column again.
  filter(is.na(id)) %>%
  select(-id)

#> # A tibble: 7 × 17
#>   File   Line Speaker  Sequ Q     N_ipu Utterance  Timestamp Q_by  Answ_by Aaoi 
#>   <chr> <int> <chr>   <dbl> <chr> <int> <chr>      <chr>     <chr> <chr>   <chr>
#> 1 F01     991 ID01.B     30 q_pol     1 =do you g… 00:21:20… B     A       *B*B…
#> 2 F01     992 <NA>       30 q_pol     0 (0.038)    00:21:22… <NA>  <NA>    *    
#> 3 F01     993 ID01.A     30 q_pol     1 =always=   00:21:22… <NA>  <NA>    *    
#> 4 F01     950 ID01.C     28 q_pol     1 =so Monda… 00:20:31… C     B       B*C  
#> 5 F01     951 <NA>       28 q_pol     0 (0.008)    00:20:32… <NA>  <NA>    C    
#> 6 F01     952 ID01.A     28 q_pol     1 =hol[ler:… 00:20:32… <NA>  <NA>    *C*  
#> 7 F01     953 ID01.B     28 q_pol     1 [yeah]     00:20:32… <NA>  <NA>    B    
#> # … with 6 more variables: Baoi <chr>, Caoi <chr>, Adur <chr>, Bdur <chr>,
#> #   Cdur <chr>, A_Area_av <chr>