合并和转换两组数据
Merging and transforming two sets of data
我有两个相当大的数据文件,需要通过以下方式合并为一个:
# Lookup table: one row per person.
A <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob")
)
# Contact table: an id may appear several times, once per e-mail address.
B <- tibble(
  id = c(1, 1, 2),
  # First address spelled "alice@wonderland.com" so that the input agrees
  # with the expected output in desiredResult below.
  email = c(
    "alice@wonderland.com",
    "alice2@wonderland.com",
    "bob@builder.com"
  )
)
# Target shape: one row per id; the n-th address of an id goes to column
# emailN, padded with NA when an id has fewer addresses.
desiredResult <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob"),
  email1 = c("alice@wonderland.com", "bob@builder.com"),
  email2 = c("alice2@wonderland.com", NA)
)
如何有效地完成这项工作?我尝试使用 spread()
但没有成功,只能拼凑出一个糟糕的解决方案:
# Workaround from the question, which its author describes as unsatisfactory.
# NOTE(review): as.tibble() is deprecated in recent tibble releases in favour
# of as_tibble() -- confirm before reusing this snippet.
notGood <-
inner_join(A, B, by = "id") %>%
# One data frame per id value.
split(., .$id) %>%
# Flatten each per-id frame into a single row, then row-bind the results.
# Presumably ids with several rows get numbered names (id1, id2, ...) from
# unlist(), while single-row ids keep the plain names -- verify by running.
map_dfr(function(x) as.tibble(t(unlist(x)))) %>%
# Replace NA with "" so the unite() calls below concatenate cleanly.
replace(is.na(.), "") %>%
# Merge each numbered column with its un-numbered counterpart.
unite(id, id1, id, sep = "") %>%
unite(firstName, firstName1, firstName, sep = "") %>%
unite(email, email1, email, sep = "") %>%
# Keep the key, the name and every column whose name matches "email".
select(id, firstName, matches("email"))
编辑:
建议的解决方案效果很好,但我如何将它们应用于多个列?就像这个例子:
# Lookup table: one row per person.
A <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob")
)
# Contact table with two value columns; an id may appear several times.
B <- tibble(
  id = c(1, 1, 2),
  # First address spelled "alice@wonderland.com" so that the input agrees
  # with the expected output in desiredResult below.
  email = c(
    "alice@wonderland.com",
    "alice2@wonderland.com",
    "bob@builder.com"
  ),
  phone = c("123", "456", "789")
)
# Target shape: one row per id; the n-th email/phone of an id goes to
# emailN/phoneN, padded with NA when an id has fewer entries.
desiredResult <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob"),
  email1 = c("alice@wonderland.com", "bob@builder.com"),
  email2 = c("alice2@wonderland.com", NA),
  phone1 = c("123", "789"),
  phone2 = c("456", NA)
)
简单地向建议的答案添加更多的列名称并不完全有效:
# Attempt from the question that, according to the accompanying text, does
# not fully work. Kept unchanged as the failing example.
A %>%
left_join(B, by='id') %>%
group_by(id)%>%
# Per-id sequence labels: email1, email2, ... and phone1, phone2, ...
mutate(rn=paste0('email',row_number())) %>%
mutate(rn2=paste0('phone',row_number())) %>%
# NOTE(review): the two spread() calls run one after the other; presumably
# the rows are therefore not collapsed to one per id -- confirm by running.
spread(rn, email) %>%
spread(rn2, phone)
可以试试这个解决方案:
# Number each id's e-mail addresses (email1, email2, ...), spread them into
# one column per number, then attach the names from A.
B %>%
  group_by(id) %>%
  mutate(rn = paste0("email", row_number())) %>%
  # Drop the grouping once the per-id numbering is done, so the final result
  # is a plain (ungrouped) tibble.
  ungroup() %>%
  spread(rn, email) %>%
  # Explicit key: no natural-join message and no accidental matching on any
  # other column name the two tables might share.
  right_join(A, by = "id") %>%
  # Put the key and the name first, followed by the emailN columns.
  select(id, firstName, everything())
针对补充问题(多列情况)的答案:
# Multi-column version: reshape B to long form so that every value column
# (email, phone, ...) is numbered per id in one pass, spread back to wide
# form, then join the result onto A.
A %>%
  left_join(
    B %>%
      # Long form: one row per (id, source column, value).
      gather(key, val, -id) %>%
      group_by(id, key) %>%
      # key2 is the output column name: source column + position within id.
      mutate(key2 = paste0(key, row_number())) %>%
      ungroup() %>%
      select(-key) %>%
      spread(key2, val),
    # Explicit key instead of a natural join on all shared column names.
    by = "id"
  )
# Build the wide e-mail table from B first (one emailN column per address of
# an id), then join it onto A as the right-hand table.
desiredResult <- B %>%
  group_by(id) %>%
  mutate(ColName = paste0("email", row_number())) %>%
  ungroup() %>%
  spread(ColName, email) %>%
  # The dot places the wide table as the second argument, so A stays on the
  # left of the join.
  inner_join(A, ., by = "id")
我有两个相当大的数据文件,需要通过以下方式合并为一个:
# Lookup table: one row per person.
A <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob")
)
# Contact table: an id may appear several times, once per e-mail address.
B <- tibble(
  id = c(1, 1, 2),
  # First address spelled "alice@wonderland.com" so that the input agrees
  # with the expected output in desiredResult below.
  email = c(
    "alice@wonderland.com",
    "alice2@wonderland.com",
    "bob@builder.com"
  )
)
# Target shape: one row per id; the n-th address of an id goes to column
# emailN, padded with NA when an id has fewer addresses.
desiredResult <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob"),
  email1 = c("alice@wonderland.com", "bob@builder.com"),
  email2 = c("alice2@wonderland.com", NA)
)
如何有效地完成这项工作?我尝试使用 spread()
但没有成功,只能拼凑出一个糟糕的解决方案:
# Workaround from the question, which its author describes as unsatisfactory.
# NOTE(review): as.tibble() is deprecated in recent tibble releases in favour
# of as_tibble() -- confirm before reusing this snippet.
notGood <-
inner_join(A, B, by = "id") %>%
# One data frame per id value.
split(., .$id) %>%
# Flatten each per-id frame into a single row, then row-bind the results.
# Presumably ids with several rows get numbered names (id1, id2, ...) from
# unlist(), while single-row ids keep the plain names -- verify by running.
map_dfr(function(x) as.tibble(t(unlist(x)))) %>%
# Replace NA with "" so the unite() calls below concatenate cleanly.
replace(is.na(.), "") %>%
# Merge each numbered column with its un-numbered counterpart.
unite(id, id1, id, sep = "") %>%
unite(firstName, firstName1, firstName, sep = "") %>%
unite(email, email1, email, sep = "") %>%
# Keep the key, the name and every column whose name matches "email".
select(id, firstName, matches("email"))
编辑:
建议的解决方案效果很好,但我如何将它们应用于多个列?就像这个例子:
# Lookup table: one row per person.
A <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob")
)
# Contact table with two value columns; an id may appear several times.
B <- tibble(
  id = c(1, 1, 2),
  # First address spelled "alice@wonderland.com" so that the input agrees
  # with the expected output in desiredResult below.
  email = c(
    "alice@wonderland.com",
    "alice2@wonderland.com",
    "bob@builder.com"
  ),
  phone = c("123", "456", "789")
)
# Target shape: one row per id; the n-th email/phone of an id goes to
# emailN/phoneN, padded with NA when an id has fewer entries.
desiredResult <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob"),
  email1 = c("alice@wonderland.com", "bob@builder.com"),
  email2 = c("alice2@wonderland.com", NA),
  phone1 = c("123", "789"),
  phone2 = c("456", NA)
)
简单地向建议的答案添加更多的列名称并不完全有效:
# Attempt from the question that, according to the accompanying text, does
# not fully work. Kept unchanged as the failing example.
A %>%
left_join(B, by='id') %>%
group_by(id)%>%
# Per-id sequence labels: email1, email2, ... and phone1, phone2, ...
mutate(rn=paste0('email',row_number())) %>%
mutate(rn2=paste0('phone',row_number())) %>%
# NOTE(review): the two spread() calls run one after the other; presumably
# the rows are therefore not collapsed to one per id -- confirm by running.
spread(rn, email) %>%
spread(rn2, phone)
可以试试这个解决方案:
# Number each id's e-mail addresses (email1, email2, ...), spread them into
# one column per number, then attach the names from A.
B %>%
  group_by(id) %>%
  mutate(rn = paste0("email", row_number())) %>%
  # Drop the grouping once the per-id numbering is done, so the final result
  # is a plain (ungrouped) tibble.
  ungroup() %>%
  spread(rn, email) %>%
  # Explicit key: no natural-join message and no accidental matching on any
  # other column name the two tables might share.
  right_join(A, by = "id") %>%
  # Put the key and the name first, followed by the emailN columns.
  select(id, firstName, everything())
针对补充问题(多列情况)的答案:
# Multi-column version: reshape B to long form so that every value column
# (email, phone, ...) is numbered per id in one pass, spread back to wide
# form, then join the result onto A.
A %>%
  left_join(
    B %>%
      # Long form: one row per (id, source column, value).
      gather(key, val, -id) %>%
      group_by(id, key) %>%
      # key2 is the output column name: source column + position within id.
      mutate(key2 = paste0(key, row_number())) %>%
      ungroup() %>%
      select(-key) %>%
      spread(key2, val),
    # Explicit key instead of a natural join on all shared column names.
    by = "id"
  )
# Build the wide e-mail table from B first (one emailN column per address of
# an id), then join it onto A as the right-hand table.
desiredResult <- B %>%
  group_by(id) %>%
  mutate(ColName = paste0("email", row_number())) %>%
  ungroup() %>%
  spread(ColName, email) %>%
  # The dot places the wide table as the second argument, so A stays on the
  # left of the join.
  inner_join(A, ., by = "id")