合并具有相等和不相等数据的行
Merge rows with equal and unequal data
我正在努力合并一些杂乱的数据。
我有一个这样的数据框:
# Sample input: two rows per name, each carrying a product label and a volume.
df <- data.frame(
  name = rep(c("A", "B", "C"), each = 2),
  number = rep(c(1, 2, 3), each = 2),
  product = c(
    "fixed", "variable", "aggregate", "variable", "fixed", "fixed"
  ),
  vol = c(1, 9, 2, 6, 4, 7)
)
这是我想要得到的结果:
# Target output: one row per name, with the distinct products joined into a
# single string and the volumes of that name added together.
result <- data.frame(
  name = c("A", "B", "C"),
  number = as.numeric(1:3),
  new_product = c("fixed variable", "aggregate variable", "fixed"),
  vol = c(1 + 9, 2 + 6, 4 + 7)
)
我的问题是:我需要合并数据框中所有取值相同的行;如果某一列的取值并不唯一(例如 product),就需要像 result 中那样把这些取值合并成一个名称。
我试过 dplyr,但在 dplyr 中我无法让 new_product 以任何有意义的方式合并,因为我无法再次引用同一列。
# Non-working attempt from the question, kept as posted: the summarize() call
# is never closed, and paste(product, product) pastes each value to itself
# element by element instead of joining the distinct products of a group.
df %>% group_by(name) %>% summarize (name = name,
number = number,
newproduct = paste(product, product) # ????
非常感谢任何帮助!
下面是我将如何使用 data.table
来解决这个问题,尽管我不确定你是如何定义 number
library(data.table)
# setDT() turns df into a data.table by reference, so df itself is converted.
setDT(df)
# Per name: join the distinct products with ", " and add up the volumes.
result <- df[
  ,
  list(new_product = toString(unique(product)), vol = sum(vol)),
  by = name
]
# := adds the column in place; .I holds the row positions of result.
result[, number := .I]
result
# name new_product vol number
# 1: A fixed, variable 10 1
# 2: B aggregate, variable 8 2
# 3: C fixed 11 3
注意:如果您更喜欢以空格分隔的输出,可以使用 paste(unique(product), collapse = " ")
而不是 toString
。
或者用 dplyr 以类似的方式实现:
# Per name: join the distinct products with ", " and add up the volumes,
# then number the resulting rows from 1.
df %>%
  group_by(name) %>%
  summarise(
    new_product = toString(unique(product)),
    vol = sum(vol)
  ) %>%
  mutate(number = seq_len(n()))
其他人已经回复了,但我的解决方案是:
# Same idea, but the distinct products are joined with a single space and the
# columns are put in the order name, number, new_product, vol.
df %>%
  group_by(name) %>%
  summarise(
    new_product = paste(unique(product), collapse = " "),
    vol = sum(vol)
  ) %>%
  mutate(number = row_number()) %>%
  select(name, number, new_product, vol)
基础 R,再加一点柯里化(Curry):
library(functional)
# Compose() applies unique() first and then a paste() whose collapse argument
# has been pre-filled by Curry(), so distinct values are joined with ",".
aggregStrFunc <- Compose(unique, Curry(paste, collapse = ","))
# Aggregate vol and product separately by name, drop the repeated grouping
# column from the second result with [-1], then bind and rename the columns.
setNames(
  cbind(
    aggregate(df$vol, by = list(name = df$name), FUN = sum),
    aggregate(df$product, by = list(df$name), FUN = aggregStrFunc)[-1]
  ),
  c("Name", "Vol", "New_Product")
)
# Name Vol New_Product
#1 A 10 fixed,variable
#2 B 8 aggregate,variable
#3 C 11 fixed
这里再介绍两种比较纯粹的基础方式:
# Sample input for the base R approaches. Note that number cycles through
# 1, 2, 3 twice (1, 2, 3, 1, 2, 3), so it does not line up with name.
df <- data.frame(
  name = rep(c("A", "B", "C"), each = 2),
  number = rep(1:3, times = 2),
  product = c(
    "fixed", "variable", "aggregate", "variable", "fixed", "fixed"
  ),
  vol = c(1, 9, 2, 6, 4, 7)
)
- 这个只是使用
ave
作用于原始数据框,然后删除重复项
# ave() is given the row positions grouped by name; for each group the
# distinct products at those positions are joined with ", " and the result is
# written back to every row of the group. vol becomes the per-name sum, the
# product column is dropped, and only the first row of each name is kept.
within(df, {
  new_product <- ave(
    seq_along(name), name,
    FUN = function(idx) toString(unique(df[idx, "product"]))
  )
  vol <- ave(vol, name, FUN = sum)
  product <- NULL
})[!duplicated(df$name), ]
# name number vol new_product
# 1 A 1 10 fixed, variable
# 3 B 3 8 aggregate, variable
# 5 C 2 11 fixed
- 这个比较迂回,用
aggregate
创建new_product
,然后匹配回原始数据框,最后再次用 aggregate 按组求和
# Step 1: collapse the distinct products of each name into one string joined
# by a space. The outer parentheses print tmp as well as assigning it.
(tmp <- aggregate(
  product ~ name,
  data = df,
  FUN = function(x) paste0(unique(x), collapse = " ")
))
# name product
# 1 A fixed variable
# 2 B aggregate variable
# 3 C fixed
# Step 2: look up each row's combined label by name and attach it to df.
df$new_product <- tmp[match(df$name, tmp$name), "product"]
# Step 3: sum vol for every (name, new_product) pair.
res <- aggregate(vol ~ name + new_product, data = df, FUN = sum)
# Step 4: order by name and number the rows. seq_len() is used instead of
# 1:nrow(res) because it gives an empty sequence, not c(1, 0), for zero rows.
within(res[order(res$name), ], {
  number <- seq_len(nrow(res))
})
# name new_product vol number
# 3 A fixed variable 10 1
# 1 B aggregate variable 8 2
# 2 C fixed 11 3
我正在努力合并一些杂乱的数据。
我有一个这样的数据框:
# Sample input: two rows per name, each carrying a product label and a volume.
df <- data.frame(
  name = rep(c("A", "B", "C"), each = 2),
  number = rep(c(1, 2, 3), each = 2),
  product = c(
    "fixed", "variable", "aggregate", "variable", "fixed", "fixed"
  ),
  vol = c(1, 9, 2, 6, 4, 7)
)
这是我想要得到的结果:
# Target output: one row per name, with the distinct products joined into a
# single string and the volumes of that name added together.
result <- data.frame(
  name = c("A", "B", "C"),
  number = as.numeric(1:3),
  new_product = c("fixed variable", "aggregate variable", "fixed"),
  vol = c(1 + 9, 2 + 6, 4 + 7)
)
我的问题是:我需要合并数据框中所有取值相同的行;如果某一列的取值并不唯一(例如 product),就需要像 result 中那样把这些取值合并成一个名称。
我试过 dplyr,但在 dplyr 中我无法让 new_product 以任何有意义的方式合并,因为我无法再次引用同一列。
# Non-working attempt from the question, kept as posted: the summarize() call
# is never closed, and paste(product, product) pastes each value to itself
# element by element instead of joining the distinct products of a group.
df %>% group_by(name) %>% summarize (name = name,
number = number,
newproduct = paste(product, product) # ????
非常感谢任何帮助!
下面是我将如何使用 data.table
来解决这个问题,尽管我不确定你是如何定义 number
library(data.table)
# setDT() turns df into a data.table by reference, so df itself is converted.
setDT(df)
# Per name: join the distinct products with ", " and add up the volumes.
result <- df[
  ,
  list(new_product = toString(unique(product)), vol = sum(vol)),
  by = name
]
# := adds the column in place; .I holds the row positions of result.
result[, number := .I]
result
# name new_product vol number
# 1: A fixed, variable 10 1
# 2: B aggregate, variable 8 2
# 3: C fixed 11 3
注意:如果您更喜欢以空格分隔的输出,可以使用 paste(unique(product), collapse = " ")
而不是 toString
。
或者用 dplyr 以类似的方式实现:
# Per name: join the distinct products with ", " and add up the volumes,
# then number the resulting rows from 1.
df %>%
  group_by(name) %>%
  summarise(
    new_product = toString(unique(product)),
    vol = sum(vol)
  ) %>%
  mutate(number = seq_len(n()))
其他人已经回复了,但我的解决方案是:
# Same idea, but the distinct products are joined with a single space and the
# columns are put in the order name, number, new_product, vol.
df %>%
  group_by(name) %>%
  summarise(
    new_product = paste(unique(product), collapse = " "),
    vol = sum(vol)
  ) %>%
  mutate(number = row_number()) %>%
  select(name, number, new_product, vol)
基础 R,再加一点柯里化(Curry):
library(functional)
# Compose() applies unique() first and then a paste() whose collapse argument
# has been pre-filled by Curry(), so distinct values are joined with ",".
aggregStrFunc <- Compose(unique, Curry(paste, collapse = ","))
# Aggregate vol and product separately by name, drop the repeated grouping
# column from the second result with [-1], then bind and rename the columns.
setNames(
  cbind(
    aggregate(df$vol, by = list(name = df$name), FUN = sum),
    aggregate(df$product, by = list(df$name), FUN = aggregStrFunc)[-1]
  ),
  c("Name", "Vol", "New_Product")
)
# Name Vol New_Product
#1 A 10 fixed,variable
#2 B 8 aggregate,variable
#3 C 11 fixed
这里再介绍两种比较纯粹的基础方式:
# Sample input for the base R approaches. Note that number cycles through
# 1, 2, 3 twice (1, 2, 3, 1, 2, 3), so it does not line up with name.
df <- data.frame(
  name = rep(c("A", "B", "C"), each = 2),
  number = rep(1:3, times = 2),
  product = c(
    "fixed", "variable", "aggregate", "variable", "fixed", "fixed"
  ),
  vol = c(1, 9, 2, 6, 4, 7)
)
- 这个只是使用
ave
作用于原始数据框,然后删除重复项
# ave() is given the row positions grouped by name; for each group the
# distinct products at those positions are joined with ", " and the result is
# written back to every row of the group. vol becomes the per-name sum, the
# product column is dropped, and only the first row of each name is kept.
within(df, {
  new_product <- ave(
    seq_along(name), name,
    FUN = function(idx) toString(unique(df[idx, "product"]))
  )
  vol <- ave(vol, name, FUN = sum)
  product <- NULL
})[!duplicated(df$name), ]
# name number vol new_product
# 1 A 1 10 fixed, variable
# 3 B 3 8 aggregate, variable
# 5 C 2 11 fixed
- 这个比较迂回,用
aggregate
创建new_product
,然后匹配回原始数据框,最后再次用 aggregate 按组求和
# Step 1: collapse the distinct products of each name into one string joined
# by a space. The outer parentheses print tmp as well as assigning it.
(tmp <- aggregate(
  product ~ name,
  data = df,
  FUN = function(x) paste0(unique(x), collapse = " ")
))
# name product
# 1 A fixed variable
# 2 B aggregate variable
# 3 C fixed
# Step 2: look up each row's combined label by name and attach it to df.
df$new_product <- tmp[match(df$name, tmp$name), "product"]
# Step 3: sum vol for every (name, new_product) pair.
res <- aggregate(vol ~ name + new_product, data = df, FUN = sum)
# Step 4: order by name and number the rows. seq_len() is used instead of
# 1:nrow(res) because it gives an empty sequence, not c(1, 0), for zero rows.
within(res[order(res$name), ], {
  number <- seq_len(nrow(res))
})
# name new_product vol number
# 3 A fixed variable 10 1
# 1 B aggregate variable 8 2
# 2 C fixed 11 3