id 重复时按行绑定数据框
Binds dataframe in rows when duplicated id
我在 R 中有一个具有重复 ID 的数据框,我想将其对齐到同一行。
# Example input: one row per observation; an id may occur several times.
# Built with base data.frame() (data_frame() is deprecated and needs an
# attached package); stringsAsFactors = FALSE keeps every column character.
df <- data.frame(
  id = c("A1", "A2", "C2", "A2", "C2", "A2"),
  date = c("2010-01-15", "2016-03-05", "2017-05-21",
           "2013-09-03", "2015-11-25", "2011-07-07"),
  iT = c("z", "z", "v", "w", "z", "v"),
  stringsAsFactors = FALSE
)
我想用 for 循环将每个 ID 对齐到同一行上,但一次只能对一个变量进行子集化(实际上我有大约 10 个变量)。
# Gather, for every distinct id, the dates and the iT codes recorded for it.
unique_id <- unique(df$id)
datalist <- list()
datalist1 <- list()
for (i in unique_id) {
  # which() ignores NA comparisons, matching what subset() does on a vector.
  subdf <- df$date[which(df$id == i)]
  subdf1 <- df$iT[which(df$id == i)]
  datalist[[i]] <- subdf
  datalist1[[i]] <- subdf1
}

# Turn each per-id list into a wide table (ldply stores the list names in
# ".id"), then join the two tables on that column.
df1 <- plyr::ldply(datalist, rbind)
df2 <- plyr::ldply(datalist1, rbind)
df3 <- merge(df1, df2, by = ".id")
我有数千行这样的数据,每个 ID 有 2 到 7 个重复。
这是我想要的:
# Desired result: one row per id, repeated observations spread into numbered
# columns. Cells with no observation are real missing values (NA), not the
# string "NA".
df3 <- data.frame(
  id = c("A1", "A2", "C2"),
  date1 = c("2010-01-15", "2016-03-05", "2017-05-21"),
  iT = c("z", "z", "v"),
  date2 = c(NA, "2013-09-03", "2015-11-25"),
  iT.2 = c(NA, "w", "z"),
  date3 = c(NA, "2011-07-07", NA),
  iT.3 = c(NA, "v", NA),
  stringsAsFactors = FALSE
)
"align on the same row" = 重塑。 ;)
Base R(reshape)
# reshape() is used on a plain data.frame, sorted so that each id's rows
# are contiguous.
df <- as.data.frame(df)
df <- df[order(df$id), ]

# Running counter within each id; it becomes the suffix of the wide columns.
df$time <- ave(df$id, df$id, FUN = seq_along)

# Spread date and iT into one column pair per counter value
# (idvar and timevar are spelled out; they equal reshape()'s defaults).
stats::reshape(
  df,
  idvar = "id",
  timevar = "time",
  v.names = c("date", "iT"),
  direction = "wide"
)
id date.1 iT.1 date.2 iT.2 date.3 iT.3
1 A1 2010-01-15 z <NA> <NA> <NA> <NA>
2 A2 2016-03-05 z 2013-09-03 w 2011-07-07 v
3 C2 2017-05-21 v 2015-11-25 z <NA> <NA>
使用 dplyr 和 tidyr,您可以先把数据转换为长格式,为每个 id 和列名的组合创建唯一的列名,然后再把数据恢复为宽格式。
library(dplyr)
library(tidyr)
# Long format first: one row per (id, original column, value).
df %>%
  pivot_longer(cols = -id, names_to = "name", values_to = "value") %>%
  # Number the repeats of each original column within an id, e.g. date1, date2.
  group_by(id, name) %>%
  mutate(wide_name = paste0(name, row_number())) %>%
  ungroup() %>%
  select(-name) %>%
  # Back to wide format: one column per numbered name, one row per id.
  pivot_wider(names_from = wide_name, values_from = value)
# id date1 iT1 date2 iT2 date3 iT3
# <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 A1 2010-01-15 z NA NA NA NA
#2 A2 2016-03-05 z 2013-09-03 w 2011-07-07 v
#3 C2 2017-05-21 v 2015-11-25 z NA NA
data.table 的 dcast() 可以一次重塑多个值列:
library(data.table)
# Convert df to a data.table by reference, then cast date and iT to wide
# format in one call; rowid(id) numbers the repeats within each id.
setDT(df)
dcast(df, id ~ rowid(id), value.var = c("date", "iT"))
id date_1 date_2 date_3 iT_1 iT_2 iT_3
1: A1 2010-01-15 <NA> <NA> z <NA> <NA>
2: A2 2016-03-05 2013-09-03 2011-07-07 z w v
3: C2 2017-05-21 2015-11-25 <NA> v z <NA>
考虑使用基础 R(base R)中的 transform、ave、seq_along、reshape、grep、merge、paste0 和 outer:
# CREATE A RUNNING GROUP NUMBER FOR RESHAPING
# Work on a plain data.frame: stats::reshape() and the df[c(...)] column
# selection below are not reliable on tibbles or data.tables.
df <- as.data.frame(df)

# CREATE A RUNNING GROUP NUMBER FOR RESHAPING
df$id_num <- with(transform(df, n = 1), ave(n, id, FUN = seq_along))
# Largest number of rows sharing one id; used for the column re-ordering
# below instead of a hard-coded 1:3.
n_max <- max(df$id_num)

# MERGE TWO WIDE FORMAT SETS FOR date and iT
# (the argument is `suffixes`; a misspelt name is silently swallowed by `...`)
df <- merge(
  reshape(df[c("id", "id_num", names(df)[grep("date", names(df))])],
          v.names = "date", timevar = "id_num", idvar = "id",
          direction = "wide"),
  reshape(df[c("id", "id_num", names(df)[grep("iT", names(df))])],
          v.names = "iT", timevar = "id_num", idvar = "id",
          direction = "wide"),
  by = "id", suffixes = c("", "_")
)

# RE-ORDER COLUMNS BY PAIR COMBINATIONS
# outer() builds a 2 x n_max matrix of names; c() flattens it column by
# column, giving date.1, iT.1, date.2, iT.2, ...
df <- df[c("id", outer(c("date.", "iT."), seq_len(n_max), paste0))]
df
# id date.1 iT.1 date.2 iT.2 date.3 iT.3
# 1 A1 2010-01-15 z <NA> <NA> <NA> <NA>
# 2 A2 2016-03-05 z 2013-09-03 w 2011-07-07 v
# 3 C2 2017-05-21 v 2015-11-25 z <NA> <NA>
@Uwe 的回答很好。对于小数据,我还找到了使用 tidyverse 的这种方式:
# Collapse every id's values into one "; "-separated string per column, then
# split those strings back out into numbered columns (up to 10 per variable).
# The pipeline must start from the data frame; a leading bare %>% does not parse.
df %>%
  group_by(id) %>%
  summarise_at(vars(date, iT), paste, collapse = "; ") %>%
  separate(col = date, into = paste0("date", 1:10), sep = "; ",
           extra = "warn", fill = "right") %>%
  separate(col = iT, into = paste0("iT", 1:10), sep = "; ",
           extra = "warn", fill = "right") %>%
  # Drop the padding columns that are NA for every id.
  purrr::keep(~ !all(is.na(.)))
正如我所说,最好将它用于小数据,或者如果您想将重复 ID 的信息合并到一个单元格中。
我在 R 中有一个具有重复 ID 的数据框,我想将其对齐到同一行。
# Example input: one row per observation; an id may occur several times.
# Built with base data.frame() (data_frame() is deprecated and needs an
# attached package); stringsAsFactors = FALSE keeps every column character.
df <- data.frame(
  id = c("A1", "A2", "C2", "A2", "C2", "A2"),
  date = c("2010-01-15", "2016-03-05", "2017-05-21",
           "2013-09-03", "2015-11-25", "2011-07-07"),
  iT = c("z", "z", "v", "w", "z", "v"),
  stringsAsFactors = FALSE
)
我想用 for 循环将每个 ID 对齐到同一行上,但一次只能对一个变量进行子集化(实际上我有大约 10 个变量)。
# Gather, for every distinct id, the dates and the iT codes recorded for it.
unique_id <- unique(df$id)
datalist <- list()
datalist1 <- list()
for (i in unique_id) {
  # which() ignores NA comparisons, matching what subset() does on a vector.
  subdf <- df$date[which(df$id == i)]
  subdf1 <- df$iT[which(df$id == i)]
  datalist[[i]] <- subdf
  datalist1[[i]] <- subdf1
}

# Turn each per-id list into a wide table (ldply stores the list names in
# ".id"), then join the two tables on that column.
df1 <- plyr::ldply(datalist, rbind)
df2 <- plyr::ldply(datalist1, rbind)
df3 <- merge(df1, df2, by = ".id")
我有数千行这样的数据,每个 ID 有 2 到 7 个重复。
这是我想要的:
# Desired result: one row per id, repeated observations spread into numbered
# columns. Cells with no observation are real missing values (NA), not the
# string "NA".
df3 <- data.frame(
  id = c("A1", "A2", "C2"),
  date1 = c("2010-01-15", "2016-03-05", "2017-05-21"),
  iT = c("z", "z", "v"),
  date2 = c(NA, "2013-09-03", "2015-11-25"),
  iT.2 = c(NA, "w", "z"),
  date3 = c(NA, "2011-07-07", NA),
  iT.3 = c(NA, "v", NA),
  stringsAsFactors = FALSE
)
"align on the same row" = 重塑。 ;)
Base R(reshape)
# reshape() is used on a plain data.frame, sorted so that each id's rows
# are contiguous.
df <- as.data.frame(df)
df <- df[order(df$id), ]

# Running counter within each id; it becomes the suffix of the wide columns.
df$time <- ave(df$id, df$id, FUN = seq_along)

# Spread date and iT into one column pair per counter value
# (idvar and timevar are spelled out; they equal reshape()'s defaults).
stats::reshape(
  df,
  idvar = "id",
  timevar = "time",
  v.names = c("date", "iT"),
  direction = "wide"
)
id date.1 iT.1 date.2 iT.2 date.3 iT.3
1 A1 2010-01-15 z <NA> <NA> <NA> <NA>
2 A2 2016-03-05 z 2013-09-03 w 2011-07-07 v
3 C2 2017-05-21 v 2015-11-25 z <NA> <NA>
使用 dplyr 和 tidyr,您可以先把数据转换为长格式,为每个 id 和列名的组合创建唯一的列名,然后再把数据恢复为宽格式。
library(dplyr)
library(tidyr)
# Long format first: one row per (id, original column, value).
df %>%
  pivot_longer(cols = -id, names_to = "name", values_to = "value") %>%
  # Number the repeats of each original column within an id, e.g. date1, date2.
  group_by(id, name) %>%
  mutate(wide_name = paste0(name, row_number())) %>%
  ungroup() %>%
  select(-name) %>%
  # Back to wide format: one column per numbered name, one row per id.
  pivot_wider(names_from = wide_name, values_from = value)
# id date1 iT1 date2 iT2 date3 iT3
# <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 A1 2010-01-15 z NA NA NA NA
#2 A2 2016-03-05 z 2013-09-03 w 2011-07-07 v
#3 C2 2017-05-21 v 2015-11-25 z NA NA
data.table 的 dcast() 可以一次重塑多个值列:
library(data.table)
# Convert df to a data.table by reference, then cast date and iT to wide
# format in one call; rowid(id) numbers the repeats within each id.
setDT(df)
dcast(df, id ~ rowid(id), value.var = c("date", "iT"))
id date_1 date_2 date_3 iT_1 iT_2 iT_3
1: A1 2010-01-15 <NA> <NA> z <NA> <NA>
2: A2 2016-03-05 2013-09-03 2011-07-07 z w v
3: C2 2017-05-21 2015-11-25 <NA> v z <NA>
考虑使用基础 R(base R)中的 transform、ave、seq_along、reshape、grep、merge、paste0 和 outer:
# CREATE A RUNNING GROUP NUMBER FOR RESHAPING
# Work on a plain data.frame: stats::reshape() and the df[c(...)] column
# selection below are not reliable on tibbles or data.tables.
df <- as.data.frame(df)

# CREATE A RUNNING GROUP NUMBER FOR RESHAPING
df$id_num <- with(transform(df, n = 1), ave(n, id, FUN = seq_along))
# Largest number of rows sharing one id; used for the column re-ordering
# below instead of a hard-coded 1:3.
n_max <- max(df$id_num)

# MERGE TWO WIDE FORMAT SETS FOR date and iT
# (the argument is `suffixes`; a misspelt name is silently swallowed by `...`)
df <- merge(
  reshape(df[c("id", "id_num", names(df)[grep("date", names(df))])],
          v.names = "date", timevar = "id_num", idvar = "id",
          direction = "wide"),
  reshape(df[c("id", "id_num", names(df)[grep("iT", names(df))])],
          v.names = "iT", timevar = "id_num", idvar = "id",
          direction = "wide"),
  by = "id", suffixes = c("", "_")
)

# RE-ORDER COLUMNS BY PAIR COMBINATIONS
# outer() builds a 2 x n_max matrix of names; c() flattens it column by
# column, giving date.1, iT.1, date.2, iT.2, ...
df <- df[c("id", outer(c("date.", "iT."), seq_len(n_max), paste0))]
df
# id date.1 iT.1 date.2 iT.2 date.3 iT.3
# 1 A1 2010-01-15 z <NA> <NA> <NA> <NA>
# 2 A2 2016-03-05 z 2013-09-03 w 2011-07-07 v
# 3 C2 2017-05-21 v 2015-11-25 z <NA> <NA>
@Uwe 的回答很好。对于小数据,我还找到了使用 tidyverse 的这种方式:
# Collapse every id's values into one "; "-separated string per column, then
# split those strings back out into numbered columns (up to 10 per variable).
# The pipeline must start from the data frame; a leading bare %>% does not parse.
df %>%
  group_by(id) %>%
  summarise_at(vars(date, iT), paste, collapse = "; ") %>%
  separate(col = date, into = paste0("date", 1:10), sep = "; ",
           extra = "warn", fill = "right") %>%
  separate(col = iT, into = paste0("iT", 1:10), sep = "; ",
           extra = "warn", fill = "right") %>%
  # Drop the padding columns that are NA for every id.
  purrr::keep(~ !all(is.na(.)))
正如我所说,最好将它用于小数据,或者如果您想将重复 ID 的信息合并到一个单元格中。