更改数据帧结构(两个数据帧到一个)
Change dataframe structure (two dataframes to one)
我有两个 data.frames(data1
和 data2
)。
# Parent table: one row per id.
data1 <- data.frame(
  id = c(1, 2, 3),
  var1 = rep("x", 3),
  var2 = rep("y", 3)
)

# Child table: data1Id refers to data1$id; each data1 row has several rows here.
data2 <- data.frame(
  id = c(1, 2, 3, 4, 5, 6, 7, 8),
  data1Id = c(1, 1, 2, 2, 2, 3, 3, 3),
  var3 = c(0, 3, 5, 2, 5, 2, 9, 8),
  var4 = c(1, 6, 3, 6, 2, 8, 7, 5)
)
data1
中的每个条目在 data2
中都有多个条目。两个表都通过变量 data1Id
链接在一起。我的目标是得到一个单一的 data.frame。所以我想将 data2
的列添加到 data1
并为 data2
中的每个条目将列名称递增 1。
在上面的例子中
id var1 var2 var3_1 var3_2 var3_3 var4_1 var4_2 var4_3
----------------------------------------------------------------------------
1 x y 0 3 NA 1 6 NA
2 x y 5 2 5 3 6 2
3 x y 2 9 8 8 7 5
任何人都可以告诉我如何实现这一目标吗?
你必须重塑你的 data2
然后与 data1 连接
:
library(tidyverse)
# Reshape data2 so that it has one row per data1Id, with columns named
# <variable>_<row index within data1Id>, then join that wide table to data1.
data2 %>%
select(-id) %>% # drop data2's own id column; data1Id is the only key used below
group_by(data1Id) %>% # for each data1Id
mutate(indx = row_number()) %>% # number the rows within each data1Id (used as the column suffix)
ungroup() %>%
gather(var, value, -data1Id, -indx) %>% # reshape to long format: one row per (data1Id, indx, var)
unite(var, var, indx) %>% # combine var and indx into one column holding the new column names (e.g. var3_1)
spread(var, value) %>% # reshape back to wide format: one column per var_indx name
right_join(data1, by = c("data1Id" = "id")) %>% # join to data1
select(id = data1Id, var1, var2, everything()) # rename data1Id to id and put data1's columns first
# # A tibble: 3 x 9
# id var1 var2 var3_1 var3_2 var3_3 var4_1 var4_2 var4_3
# <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 x y 0 3 NA 1 6 NA
# 2 2 x y 5 2 5 3 6 2
# 3 3 x y 2 9 8 8 7 5
这是另一种 tidyverse
使用 inner_join
的方法
library(tidyverse)
# Join first, then reshape. The suffix is numbered per id only (not per id and
# key), so the counter keeps running across variables; this is what produces
# the misnumbered var4_* columns in the printed result below.
inner_join(data1, data2, by = c("id" = "data1Id")) %>%
gather(key, value, -(1:3)) %>% # gather every column except the first three (id, var1, var2)
filter(key != "id.y") %>% # drop the rows that come from data2's own id column (named id.y after the join)
group_by(id) %>%
mutate(key = paste(key, row_number(), sep = "_")) %>% # row_number() counts over all keys within an id
spread(key, value)
# A tibble: 3 x 10
# Groups: id [3]
# id var1 var2 var3_1 var3_2 var3_3 var4_3 var4_4 var4_5 var4_6
# <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 x y 0 3 NA 1 6 NA NA
#2 2 x y 5 2 5 NA 3 6 2
#3 3 x y 2 9 8 NA 8 7 5
上面弄乱了列名,以后可以手动更改或使用以下方法(按照@AntoniosK 的建议)
# Join first, then reshape. The suffix is numbered within each (id, key) pair,
# so the columns of every variable are numbered starting from 1.
inner_join(data1, data2, by = c("id" = "data1Id")) %>%
gather(key, value, -(1:3)) %>% # gather every column except the first three (id, var1, var2)
filter(key != "id.y") %>% # drop the rows that come from data2's own id column (named id.y after the join)
group_by(id, key) %>% # count rows separately for each variable within each id
mutate(key1 = paste(key, row_number(), sep = "_")) %>% # build the new column names, e.g. var3_1
ungroup() %>%
select(-key) %>% # remove the old key so that spread() returns one row per id
spread(key1, value)
# A tibble: 3 x 9
# id var1 var2 var3_1 var3_2 var3_3 var4_1 var4_2 var4_3
# <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 x y 0 3 NA 1 6 NA
#2 2 x y 5 2 5 3 6 2
#3 3 x y 2 9 8 8 7 5
我们还可以使用 data.table
连接和 melt/dcast
library(data.table)
# Read the expression from the inside out:
#   1. setDT(data2)[, id := rowid(data1Id)] replaces data2's id column with a
#      running counter within each data1Id.
#   2. melt(..., id.var = c('id', 'data1Id')) stacks the other columns (var3,
#      var4) into long format with the columns variable and value.
#   3. [, variable := paste(variable, id, sep="_")] turns variable into the new
#      column names (var3_1, var3_2, ...).
#   4. dcast(..., data1Id ~ variable, value.var = 'value') reshapes back to wide
#      format with one row per data1Id.
#   5. setDT(data1)[<wide table>, on = .(id = data1Id)] joins data1 to the wide
#      table, matching data1's id against data1Id.
# NOTE: setDT() and := work by reference, so data1 and data2 are converted to
# data.tables and data2's id column is overwritten as a side effect.
setDT(data1)[dcast(melt(setDT(data2)[, id := rowid(data1Id)],
id.var = c('id', 'data1Id'))[, variable := paste(variable, id,
sep="_")], data1Id ~ variable, value.var = 'value'), on = .(id = data1Id)]
# id var1 var2 var3_1 var3_2 var3_3 var4_1 var4_2 var4_3
#1: 1 x y 0 3 NA 1 6 NA
#2: 2 x y 5 2 5 3 6 2
#3: 3 x y 2 9 8 8 7 5
我有两个 data.frames(data1
和 data2
)。
# Parent table: one row per id.
data1 <- data.frame(
  id = c(1, 2, 3),
  var1 = rep("x", 3),
  var2 = rep("y", 3)
)

# Child table: data1Id refers to data1$id; each data1 row has several rows here.
data2 <- data.frame(
  id = c(1, 2, 3, 4, 5, 6, 7, 8),
  data1Id = c(1, 1, 2, 2, 2, 3, 3, 3),
  var3 = c(0, 3, 5, 2, 5, 2, 9, 8),
  var4 = c(1, 6, 3, 6, 2, 8, 7, 5)
)
data1
中的每个条目在 data2
中都有多个条目。两个表都通过变量 data1Id
链接在一起。我的目标是得到一个单一的 data.frame。所以我想将 data2
的列添加到 data1
并为 data2
中的每个条目将列名称递增 1。
在上面的例子中
id var1 var2 var3_1 var3_2 var3_3 var4_1 var4_2 var4_3
----------------------------------------------------------------------------
1 x y 0 3 NA 1 6 NA
2 x y 5 2 5 3 6 2
3 x y 2 9 8 8 7 5
任何人都可以告诉我如何实现这一目标吗?
你必须重塑你的 data2
然后与 data1 连接
:
library(tidyverse)
# Reshape data2 so that it has one row per data1Id, with columns named
# <variable>_<row index within data1Id>, then join that wide table to data1.
data2 %>%
select(-id) %>% # drop data2's own id column; data1Id is the only key used below
group_by(data1Id) %>% # for each data1Id
mutate(indx = row_number()) %>% # number the rows within each data1Id (used as the column suffix)
ungroup() %>%
gather(var, value, -data1Id, -indx) %>% # reshape to long format: one row per (data1Id, indx, var)
unite(var, var, indx) %>% # combine var and indx into one column holding the new column names (e.g. var3_1)
spread(var, value) %>% # reshape back to wide format: one column per var_indx name
right_join(data1, by = c("data1Id" = "id")) %>% # join to data1
select(id = data1Id, var1, var2, everything()) # rename data1Id to id and put data1's columns first
# # A tibble: 3 x 9
# id var1 var2 var3_1 var3_2 var3_3 var4_1 var4_2 var4_3
# <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 x y 0 3 NA 1 6 NA
# 2 2 x y 5 2 5 3 6 2
# 3 3 x y 2 9 8 8 7 5
这是另一种 tidyverse
使用 inner_join
的方法
library(tidyverse)
# Join first, then reshape. The suffix is numbered per id only (not per id and
# key), so the counter keeps running across variables; this is what produces
# the misnumbered var4_* columns in the printed result below.
inner_join(data1, data2, by = c("id" = "data1Id")) %>%
gather(key, value, -(1:3)) %>% # gather every column except the first three (id, var1, var2)
filter(key != "id.y") %>% # drop the rows that come from data2's own id column (named id.y after the join)
group_by(id) %>%
mutate(key = paste(key, row_number(), sep = "_")) %>% # row_number() counts over all keys within an id
spread(key, value)
# A tibble: 3 x 10
# Groups: id [3]
# id var1 var2 var3_1 var3_2 var3_3 var4_3 var4_4 var4_5 var4_6
# <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 x y 0 3 NA 1 6 NA NA
#2 2 x y 5 2 5 NA 3 6 2
#3 3 x y 2 9 8 NA 8 7 5
上面弄乱了列名,以后可以手动更改或使用以下方法(按照@AntoniosK 的建议)
# Join first, then reshape. The suffix is numbered within each (id, key) pair,
# so the columns of every variable are numbered starting from 1.
inner_join(data1, data2, by = c("id" = "data1Id")) %>%
gather(key, value, -(1:3)) %>% # gather every column except the first three (id, var1, var2)
filter(key != "id.y") %>% # drop the rows that come from data2's own id column (named id.y after the join)
group_by(id, key) %>% # count rows separately for each variable within each id
mutate(key1 = paste(key, row_number(), sep = "_")) %>% # build the new column names, e.g. var3_1
ungroup() %>%
select(-key) %>% # remove the old key so that spread() returns one row per id
spread(key1, value)
# A tibble: 3 x 9
# id var1 var2 var3_1 var3_2 var3_3 var4_1 var4_2 var4_3
# <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 x y 0 3 NA 1 6 NA
#2 2 x y 5 2 5 3 6 2
#3 3 x y 2 9 8 8 7 5
我们还可以使用 data.table
连接和 melt/dcast
library(data.table)
# Read the expression from the inside out:
#   1. setDT(data2)[, id := rowid(data1Id)] replaces data2's id column with a
#      running counter within each data1Id.
#   2. melt(..., id.var = c('id', 'data1Id')) stacks the other columns (var3,
#      var4) into long format with the columns variable and value.
#   3. [, variable := paste(variable, id, sep="_")] turns variable into the new
#      column names (var3_1, var3_2, ...).
#   4. dcast(..., data1Id ~ variable, value.var = 'value') reshapes back to wide
#      format with one row per data1Id.
#   5. setDT(data1)[<wide table>, on = .(id = data1Id)] joins data1 to the wide
#      table, matching data1's id against data1Id.
# NOTE: setDT() and := work by reference, so data1 and data2 are converted to
# data.tables and data2's id column is overwritten as a side effect.
setDT(data1)[dcast(melt(setDT(data2)[, id := rowid(data1Id)],
id.var = c('id', 'data1Id'))[, variable := paste(variable, id,
sep="_")], data1Id ~ variable, value.var = 'value'), on = .(id = data1Id)]
# id var1 var2 var3_1 var3_2 var3_3 var4_1 var4_2 var4_3
#1: 1 x y 0 3 NA 1 6 NA
#2: 2 x y 5 2 5 3 6 2
#3: 3 x y 2 9 8 8 7 5