通过将来自不同组的行附加到同一行来重塑数据

Reshaping data by appending rows from different groups to the same row

我有数据如下:

# Example data: one row per Area / Year / Group with three count columns.
# The object is a plain list carrying tibble class attributes, so it can be
# created without loading any package.
DT <- structure(
  list(
    Area             = c("A", "A", "A", "A", "B", "B", "B", "B"),
    Year             = c(1, 1, 2, 2, 1, 1, 2, 2),
    Group            = c(1, 2, 1, 2, 1, 2, 1, 2),
    Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11),
    Male_Count       = c(5, 7, 5, 4, 5, 8, 5, 6),
    Female_Count     = c(5, 5, 5, 8, 5, 5, 5, 5)
  ),
  row.names = c(NA, -8L),
  class = c("tbl_df", "tbl", "data.frame")
)

# A tibble: 8 x 6
  Area   Year Group Population_Count Male_Count Female_Count
  <chr> <dbl> <dbl>            <dbl>      <dbl>        <dbl>
1 A         1     1               10          5            5
2 A         1     2               12          7            5
3 A         2     1               10          5            5
4 A         2     2               12          4            8
5 B         1     1               10          5            5
6 B         1     2               13          8            5
7 B         2     1               10          5            5
8 B         2     2               11          6            5

我想在不丢失任何信息的情况下,让每个 Area-Year 组合只保留一行观测。我尝试了:

# Attempt from the question (kept as-is to reproduce the problem).
# All count columns are placed on the right-hand side of the formula, so every
# distinct Group/count combination becomes its own output column (see the
# "1_10_5_5"-style names in the result below). No value.var is supplied, so
# dcast() picks a value column on its own.
DTcast <- dcast(DT, Area + Year ~ Group + Population_Count + Male_Count + Female_Count)

但这会导致大量垃圾:

  Area Year 1_10_5_5 2_11_6_5 2_12_4_8 2_12_7_5 2_13_8_5
1    A    1        5       NA       NA        5       NA
2    A    2        5       NA        8       NA       NA
3    B    1        5       NA       NA       NA        5
4    B    2        5        5       NA       NA       NA

此外,当我将其应用于实际数据时,我得到:

Using 'H_FEMALE' as value column. Use 'value.var' to override
Error in CJ(1:72284, 1:1333365) : 
  Cross product of elements provided to CJ() would result in 96380955660 rows which exceeds .Machine$integer.max == 2147483647

所以我觉得我做错了什么。我认为这可能与我不知道如何选择 value.var 有关。

想要的结果:

# A tibble: 4 x 9
  Area   Year Group `Population_Count_ Group_1` `Male_Count_ Group_1` `Female_Count_ Group_1` `Population_Count_ Group_2` `Male_Count_ Group_2` `Female_Count_ Group_2`
  <chr> <dbl> <dbl>                       <dbl>                 <dbl>                   <dbl>                       <dbl>                 <dbl>                   <dbl>
1 A         1     1                          10                     5                       5                          12                     7                       5
2 A         2     1                          10                     5                       5                          12                     4                       8
3 B         1     1                          10                     5                       5                          13                     8                       5
4 B         2     1                          10                     5                       5                          11                     6                       5
library(tidyverse)

# Spread every *_Count column into one column per Group, leaving a single row
# per Area-Year. The value columns are selected by name instead of by
# position (4:6), so the call keeps working if columns are added or reordered.
DT %>%
  pivot_wider(
    id_cols = c(Area, Year),
    names_from = Group,
    values_from = ends_with("_Count")
  )

> DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
# A tibble: 4 x 8
  Area   Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2 Female_Count_1 Female_Count_2
  <chr> <dbl>              <dbl>              <dbl>        <dbl>        <dbl>          <dbl>          <dbl>
1 A         1                 10                 12            5            7              5              5
2 A         2                 10                 12            5            4              5              8
3 B         1                 10                 13            5            8              5              5
4 B         2                 10                 11            5            6              5              5

下面的写法会按照您想要的方式命名各列:

# Same reshape, but names_sep inserts "_Group_" between the value column name
# and the Group value (e.g. Population_Count_Group_1). The value columns are
# selected by name instead of by position (4:6), so the call keeps working if
# columns are added or reordered.
DT %>%
  pivot_wider(
    id_cols = c(Area, Year),
    names_from = Group,
    values_from = ends_with("_Count"),
    names_sep = "_Group_"
  )

使用data.table

library(data.table)
# Same example data as in the question: two Areas x two Years x two Groups,
# plus three count columns. Key columns are generated with rep(); the count
# columns are spelled out literally.
dt <- structure(
  list(
    Area = rep(c("A", "B"), each = 4),
    Year = rep(c(1, 1, 2, 2), times = 2),
    Group = rep(c(1, 2), times = 4),
    Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11),
    Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6),
    Female_Count = c(5, 5, 5, 8, 5, 5, 5, 5)
  ),
  row.names = c(NA, -8L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Turn dt into a data.table in place before casting.
setDT(dt)

# Cast to one row per Area-Year with one column per Group. All columns whose
# name contains "_Count" are passed as value.var, which produces
# <column>_<Group> output columns (see the printed result below).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dcast(
  dt,
  formula = Area + Year ~ Group,
  value.var = grep("_Count", names(dt), value = TRUE)
)
#>    Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2
#> 1:    A    1                 10                 12            5            7
#> 2:    A    2                 10                 12            5            4
#> 3:    B    1                 10                 13            5            8
#> 4:    B    2                 10                 11            5            6
#>    Female_Count_1 Female_Count_2
#> 1:              5              5
#> 2:              5              8
#> 3:              5              5
#> 4:              5              5

于 2020-12-18 由 reprex package (v0.3.0) 创建