将篮子数据框融化为没有循环的单个数据框
Melt basket dataframe to single dataframe without loops
我有一个篮子格式的数据框,如下所示:
# Basket-format example: one row per transaction, one column per item slot.
# Slots a transaction does not use are left as empty strings.
V1 <- c("milk", "beer", "wrench", "milk")
V2 <- c("eggs", "elbow grease", "", "beer")
V3 <- c("water", "", "", "")
# Keep the columns as character on every R version (the default before
# R 4.0 turned strings into factors, which breaks string helpers such as
# nzchar()).
df <- data.frame(V1, V2, V3, stringsAsFactors = FALSE)
输出:
V1 V2 V3
1 milk eggs water
2 beer elbow grease
3 wrench
4 milk beer
我想要生成的是像这样的单一格式的数据框:
transaction product
1 1 milk
2 1 eggs
3 1 water
4 2 beer
5 2 elbow grease
6 3 wrench
7 4 milk
8 4 beer
现在,我想要数据框中的数据,这样我就可以在切换到 apriori R 包使用的事务格式之前进行过滤。
将此数据框从篮子格式转换为单一格式的最快方法是什么?
现在我正在使用一个非常慢的循环。
# Start from an empty result and append one transaction at a time.
dfSingle <- data.frame(product = character(),
                       transaction = integer())
# seq_len() gives an empty sequence for a zero-row df, whereas 1:nrow(df)
# would iterate over c(1, 0).
for (row in seq_len(nrow(df))) {
  # Flatten the current row into an unnamed vector of products
  productList <- unname(unlist(df[row, ]))
  # Drop the empty-string placeholders
  productList <- productList[!productList %in% ""]
  # Build the per-transaction frame in one step; rep() keeps this valid even
  # when the row has no products (assigning a scalar column to a zero-row
  # data frame is an error)
  dfTemp <- data.frame(product = productList,
                       transaction = rep(row, length(productList)))
  # Append to the accumulated result. rbind() copies dfSingle on every
  # iteration, which is what makes this loop slow on large inputs.
  dfSingle <- rbind(dfSingle, dfTemp)
}
我考虑过使用 apply
将此函数应用于每一行,但我对如何将多个结果行绑定到前几行的结果感到困惑。
您可以使用 stack
。诀窍是转置您的数据框,即
# Transpose first so that each transaction becomes a column; stack() then
# lists the items transaction by transaction, with the source column name
# in `ind`.
df1 <- stack(as.data.frame(t(df), stringsAsFactors = FALSE))
# Keep only the rows that hold an actual product (drop the blank cells)
df1[df1$values != '',]
values ind
#1 milk V1
#2 eggs V1
#3 water V1
#4 beer V2
#5 elbow grease V2
#7 wrench V3
#10 milk V4
#11 beer V4
注意: 用一个简单的正则表达式(regex)即可只保留 ind
列中的数字,即
# Strip every non-digit so that "V1" becomes "1". The backslash must be
# doubled inside an R string literal: '\D' is an unrecognised escape and
# fails to parse.
df1$ind <- gsub('\\D+', '', df1$ind)
这会给,
values ind
1 milk 1
2 eggs 1
3 water 1
4 beer 2
5 elbow grease 2
7 wrench 3
10 milk 4
11 beer 4
使用 tidyverse
你可以:
# Reshape the basket table into one (transaction, product) row per item.
df %>%
  # Turn blank cells into NA so they can be dropped while reshaping
  # (across() replaces the deprecated mutate_all(funs(...)) form)
  mutate(across(everything(), function(col) na_if(as.character(col), ""))) %>%
  # Number the rows; each row is one transaction
  rowid_to_column(var = "transaction") %>%
  # Wide -> long, discarding the NA (formerly blank) cells;
  # pivot_longer() supersedes gather()
  pivot_longer(-transaction, names_to = "var", values_to = "product",
               values_drop_na = TRUE) %>%
  select(-var) %>%
  arrange(transaction) %>%
  # pivot_longer() returns a tibble; convert back to a plain data frame
  as.data.frame()
transaction product
1 1 milk
2 1 eggs
3 1 water
4 2 beer
5 2 elbow grease
6 3 wrench
7 4 milk
8 4 beer
首先,它将空字符串替换为 NA_character_。其次,它创建一个名为 "transaction" 的行 ID 变量。第三,它将数据从宽格式转换为长格式,并删除值为 NA_character_ 的行。最后,它按照 "transaction"
排列数据。
或data.table方法
(单行)
首先从行名中获取交易:setDT(df)[, transaction := .I ]
然后融化,使用事务作为 id 列:melt( ... , id = "transaction" )
最后删除空值并返回第一列和第三列:...[!value == "", c(1,3) ]
# Number the rows as `transaction`, melt to long format keyed on that id,
# then drop the blank values and keep columns 1 and 3 (transaction, value).
# NOTE(review): setDT() and `:=` modify by reference in data.table, so `df`
# itself presumably becomes a data.table with the extra column -- confirm
# before reusing `df` afterwards.
melt( setDT(df)[, transaction := .I ], id = "transaction" )[!value == "", c(1,3) ]
# transaction value
# 1: 1 milk
# 2: 2 beer
# 3: 3 wrench
# 4: 4 milk
# 5: 1 eggs
# 6: 2 elbow grease
# 7: 4 beer
# 8: 1 water
将空字符串 ""
替换为缺失值 NA
后,就可以新建一列 transaction,然后使用 reshape2::melt
:
# Mark blank cells as missing so they can be removed with na.omit() later
df[df == ""] <- NA
# Number the transactions. seq_len() stays empty for a zero-row df, whereas
# 1:nrow(df) would produce c(1, 0).
df$transaction <- seq_len(nrow(df))
然后:
# Melt to long format keyed on `transaction`, then drop the rows whose value
# is NA (the former blank cells).
melted_df <- na.omit(reshape2::melt(data=df, id.vars="transaction"))
产生:
> melted_df
transaction variable value
1 1 V1 milk
2 2 V1 beer
3 3 V1 wrench
4 4 V1 milk
5 1 V2 eggs
6 2 V2 elbow grease
8 4 V2 beer
9 1 V3 water
这个函数的好处是它会生成一个 variable 列,
其中保存了原来 df
数据框中各列的名称。如果您不需要它,请使用 melted_df$variable <- NULL
删除此列。如果您还想按交易编号升序对结果进行排序:
# Reorder the rows by ascending transaction id; the original row names are
# kept, so they no longer appear in sequence.
out <- melted_df[order(melted_df$transaction), ]
最终产生:
> out
transaction value
1 1 milk
5 1 eggs
9 1 water
2 2 beer
6 2 elbow grease
3 3 wrench
4 4 milk
8 4 beer
另一个 base R
替代方案:
# Build (transaction, product) pairs column by column, then stack them.
# The transaction id is the ROW index of each non-blank cell; the column
# index `i` only identifies which item slot the product came from.
do.call(
  rbind,
  # lapply() always returns a list, which do.call() requires; sapply() may
  # simplify the pieces into a single matrix when they share a shape
  lapply(seq_along(df), function(i) {
    items <- as.character(df[[i]])
    keep <- which(nzchar(items))
    cbind(transaction = keep, product = items[keep])
  })
)
#      transaction product
# [1,] "1"         "milk"
# [2,] "2"         "beer"
# [3,] "3"         "wrench"
# [4,] "4"         "milk"
# [5,] "1"         "eggs"
# [6,] "2"         "elbow grease"
# [7,] "4"         "beer"
# [8,] "1"         "water"
我有一个篮子格式的数据框,如下所示:
# Basket-format example: one row per transaction, one column per item slot.
# Slots a transaction does not use are left as empty strings.
V1 <- c("milk", "beer", "wrench", "milk")
V2 <- c("eggs", "elbow grease", "", "beer")
V3 <- c("water", "", "", "")
# Keep the columns as character on every R version (the default before
# R 4.0 turned strings into factors, which breaks string helpers such as
# nzchar()).
df <- data.frame(V1, V2, V3, stringsAsFactors = FALSE)
输出:
V1 V2 V3
1 milk eggs water
2 beer elbow grease
3 wrench
4 milk beer
我想要生成的是像这样的单一格式的数据框:
transaction product
1 1 milk
2 1 eggs
3 1 water
4 2 beer
5 2 elbow grease
6 3 wrench
7 4 milk
8 4 beer
现在,我想要数据框中的数据,这样我就可以在切换到 apriori R 包使用的事务格式之前进行过滤。
将此数据框从篮子格式转换为单一格式的最快方法是什么?
现在我正在使用一个非常慢的循环。
# Start from an empty result and append one transaction at a time.
dfSingle <- data.frame(product = character(),
                       transaction = integer())
# seq_len() gives an empty sequence for a zero-row df, whereas 1:nrow(df)
# would iterate over c(1, 0).
for (row in seq_len(nrow(df))) {
  # Flatten the current row into an unnamed vector of products
  productList <- unname(unlist(df[row, ]))
  # Drop the empty-string placeholders
  productList <- productList[!productList %in% ""]
  # Build the per-transaction frame in one step; rep() keeps this valid even
  # when the row has no products (assigning a scalar column to a zero-row
  # data frame is an error)
  dfTemp <- data.frame(product = productList,
                       transaction = rep(row, length(productList)))
  # Append to the accumulated result. rbind() copies dfSingle on every
  # iteration, which is what makes this loop slow on large inputs.
  dfSingle <- rbind(dfSingle, dfTemp)
}
我考虑过使用 apply
将此函数应用于每一行,但我对如何将多个结果行绑定到前几行的结果感到困惑。
您可以使用 stack
。诀窍是转置您的数据框,即
# Transpose first so that each transaction becomes a column; stack() then
# lists the items transaction by transaction, with the source column name
# in `ind`.
df1 <- stack(as.data.frame(t(df), stringsAsFactors = FALSE))
# Keep only the rows that hold an actual product (drop the blank cells)
df1[df1$values != '',]
values ind
#1 milk V1
#2 eggs V1
#3 water V1
#4 beer V2
#5 elbow grease V2
#7 wrench V3
#10 milk V4
#11 beer V4
注意: 用一个简单的正则表达式(regex)即可只保留 ind
列中的数字,即
# Strip every non-digit so that "V1" becomes "1". The backslash must be
# doubled inside an R string literal: '\D' is an unrecognised escape and
# fails to parse.
df1$ind <- gsub('\\D+', '', df1$ind)
这会给,
values ind
1 milk 1
2 eggs 1
3 water 1
4 beer 2
5 elbow grease 2
7 wrench 3
10 milk 4
11 beer 4
使用 tidyverse
你可以:
# Reshape the basket table into one (transaction, product) row per item.
df %>%
  # Turn blank cells into NA so they can be dropped while reshaping
  # (across() replaces the deprecated mutate_all(funs(...)) form)
  mutate(across(everything(), function(col) na_if(as.character(col), ""))) %>%
  # Number the rows; each row is one transaction
  rowid_to_column(var = "transaction") %>%
  # Wide -> long, discarding the NA (formerly blank) cells;
  # pivot_longer() supersedes gather()
  pivot_longer(-transaction, names_to = "var", values_to = "product",
               values_drop_na = TRUE) %>%
  select(-var) %>%
  arrange(transaction) %>%
  # pivot_longer() returns a tibble; convert back to a plain data frame
  as.data.frame()
transaction product
1 1 milk
2 1 eggs
3 1 water
4 2 beer
5 2 elbow grease
6 3 wrench
7 4 milk
8 4 beer
首先,它将空字符串替换为 NA_character_。其次,它创建一个名为 "transaction" 的行 ID 变量。第三,它将数据从宽格式转换为长格式,并删除值为 NA_character_ 的行。最后,它按照 "transaction" 排列数据。
或 data.table 方法(单行)
首先从行名中获取交易:setDT(df)[, transaction := .I ]
然后融化,使用事务作为 id 列:melt( ... , id = "transaction" )
最后删除空值并返回第一列和第三列:...[!value == "", c(1,3) ]
# Number the rows as `transaction`, melt to long format keyed on that id,
# then drop the blank values and keep columns 1 and 3 (transaction, value).
# NOTE(review): setDT() and `:=` modify by reference in data.table, so `df`
# itself presumably becomes a data.table with the extra column -- confirm
# before reusing `df` afterwards.
melt( setDT(df)[, transaction := .I ], id = "transaction" )[!value == "", c(1,3) ]
# transaction value
# 1: 1 milk
# 2: 2 beer
# 3: 3 wrench
# 4: 4 milk
# 5: 1 eggs
# 6: 2 elbow grease
# 7: 4 beer
# 8: 1 water
将空字符串 ""
替换为缺失值 NA
后,就可以新建一列 transaction,然后使用 reshape2::melt
:
# Mark blank cells as missing so they can be removed with na.omit() later
df[df == ""] <- NA
# Number the transactions. seq_len() stays empty for a zero-row df, whereas
# 1:nrow(df) would produce c(1, 0).
df$transaction <- seq_len(nrow(df))
然后:
# Melt to long format keyed on `transaction`, then drop the rows whose value
# is NA (the former blank cells).
melted_df <- na.omit(reshape2::melt(data=df, id.vars="transaction"))
产生:
> melted_df
transaction variable value
1 1 V1 milk
2 2 V1 beer
3 3 V1 wrench
4 4 V1 milk
5 1 V2 eggs
6 2 V2 elbow grease
8 4 V2 beer
9 1 V3 water
这个函数的好处是它会生成一个 variable 列,
其中保存了原来 df
数据框中各列的名称。如果您不需要它,请使用 melted_df$variable <- NULL
删除此列。如果您还想按交易编号升序对结果进行排序:
# Reorder the rows by ascending transaction id; the original row names are
# kept, so they no longer appear in sequence.
out <- melted_df[order(melted_df$transaction), ]
最终产生:
> out
transaction value
1 1 milk
5 1 eggs
9 1 water
2 2 beer
6 2 elbow grease
3 3 wrench
4 4 milk
8 4 beer
另一个 base R
替代方案:
# Build (transaction, product) pairs column by column, then stack them.
# The transaction id is the ROW index of each non-blank cell; the column
# index `i` only identifies which item slot the product came from.
do.call(
  rbind,
  # lapply() always returns a list, which do.call() requires; sapply() may
  # simplify the pieces into a single matrix when they share a shape
  lapply(seq_along(df), function(i) {
    items <- as.character(df[[i]])
    keep <- which(nzchar(items))
    cbind(transaction = keep, product = items[keep])
  })
)
#      transaction product
# [1,] "1"         "milk"
# [2,] "2"         "beer"
# [3,] "3"         "wrench"
# [4,] "4"         "milk"
# [5,] "1"         "eggs"
# [6,] "2"         "elbow grease"
# [7,] "4"         "beer"
# [8,] "1"         "water"