R项目列表到宽格式
R item lists to wide format
我有一个项目列表数据框,其中数据框中的每一行都包含 LHS 和 RHS 关联规则以及相应的支持度、置信度和提升度。
这是数据:
structure(list(rules = structure(c(13L, 4L, 28L, 1L, 24L, 15L
), .Label = c("{butter,jam} => {whole milk}", "{butter,rice} => {whole milk}",
"{canned fish,hygiene articles} => {whole milk}", "{curd,cereals} => {whole milk}",
"{domestic eggs,rice} => {whole milk}", "{grapes,onions} => {other vegetables}",
"{hamburger meat,bottled beer} => {whole milk}", "{hamburger meat,curd} => {whole milk}",
"{hard cheese,oil} => {other vegetables}", "{herbs,fruit/vegetable juice} => {other vegetables}",
"{herbs,rolls/buns} => {whole milk}", "{herbs,shopping bags} => {other vegetables}",
"{liquor,red/blush wine} => {bottled beer}", "{meat,margarine} => {other vegetables}",
"{napkins,house keeping products} => {whole milk}", "{oil,mustard} => {whole milk}",
"{onions,butter milk} => {other vegetables}", "{onions,waffles} => {other vegetables}",
"{pastry,sweet spreads} => {whole milk}", "{pickled vegetables,chocolate} => {whole milk}",
"{pork,butter milk} => {other vegetables}", "{rice,bottled water} => {whole milk}",
"{rice,sugar} => {whole milk}", "{soups,bottled beer} => {whole milk}",
"{tropical fruit,herbs} => {whole milk}", "{turkey,curd} => {other vegetables}",
"{whipped/sour cream,house keeping products} => {whole milk}",
"{yogurt,cereals} => {whole milk}", "{yogurt,rice} => {other vegetables}"
), class = "factor"), support = c(0.00193187595322827, 0.00101677681748856,
0.00172852058973055, 0.00101677681748856, 0.00111845449923742,
0.00132180986273513), confidence = c(0.904761904761905, 0.909090909090909,
0.80952380952381, 0.833333333333333, 0.916666666666667, 0.8125
), lift = c(11.2352693602694, 3.55786275006331, 3.16819206791352,
3.26137418755803, 3.58751160631383, 3.17983983286908)), .Names = c("rules",
"support", "confidence", "lift"), row.names = c(NA, 6L), class = "data.frame")
我需要把这些规则转换成宽格式:规则 LHS 部分中的每个项目都对应一个专门的列,值为 1 表示该规则的 LHS 包含该项目;规则的 RHS 也做同样处理。例如,以前两条规则为例:
{liquor,red/blush wine} => {bottled beer} 0.0019 0.90 11.2
{curd,cereals} => {whole milk} 0.0010 0.91 3.6
结果应该是一个如下所示的数据框:
'rules_id' 'lhs_liquor' 'lhs_red/blush wine' 'lhs_curd' 'lhs_cereals' 'rhs_bottled beer' 'rhs_whole milk' 'support' 'confidence' 'lift'
1 1 1 0 0 1 0 0.0019 0.90 11.2
2 0 0 1 1 0 1 0.0010 0.91 3.6
由于我是 R 和 Stack Overflow 的新手,如果问题描述得不够清楚,请告诉我
任何帮助表示赞赏
你可以这样做
library(dplyr)
library(tidyr)
library(reshape2)

# `rules` is assumed to be the data frame shown in the question
# (columns: rules, support, confidence, lift) -- TODO confirm the name.
rules %>%
  mutate(id = seq_len(n())) %>%
  # Split "{a,b} => {c}" into its two sides. Braces are regex
  # metacharacters, and the backslashes must be doubled in an R string.
  separate(rules, c("lhs", "rhs"), "\\} => \\{") %>%
  # Remove the brace left at the outer edge of each side.
  mutate(
    lhs = sub("{", "", lhs, fixed = TRUE),
    rhs = sub("}", "", rhs, fixed = TRUE)
  ) %>%
  # Split on commas only, so multi-word items such as "red/blush wine"
  # stay intact (the default separator also splits on spaces and "/").
  separate_rows(lhs, sep = ",") %>%
  gather(value, var, lhs, rhs) %>%
  mutate(var = paste(value, var, sep = "_")) %>%
  # One row per rule, one 0/1 indicator column per "<side>_<item>".
  dcast(
    id + support + confidence + lift ~ var,
    fun.aggregate = function(x) (length(x) > 0) + 0L
  )
# Expected result for the six sample rules (columns wrapped for width):
#   id     support confidence      lift lhs_bottled beer lhs_butter lhs_cereals lhs_curd
# 1  1 0.001931876  0.9047619 11.235269                0          0           0        0
# 2  2 0.001016777  0.9090909  3.557863                0          0           1        1
# 3  3 0.001728521  0.8095238  3.168192                0          0           1        0
# 4  4 0.001016777  0.8333333  3.261374                0          1           0        0
# 5  5 0.001118454  0.9166667  3.587512                1          0           0        0
# 6  6 0.001321810  0.8125000  3.179840                0          0           0        0
#   lhs_house keeping products lhs_jam lhs_liquor lhs_napkins lhs_red/blush wine
# 1                          0       0          1           0                  1
# 2                          0       0          0           0                  0
# 3                          0       0          0           0                  0
# 4                          0       1          0           0                  0
# 5                          0       0          0           0                  0
# 6                          1       0          0           1                  0
#   lhs_soups lhs_yogurt rhs_bottled beer rhs_whole milk
# 1         0          0                1              0
# 2         0          0                0              1
# 3         0          1                0              1
# 4         0          0                0              1
# 5         1          0                0              1
# 6         0          0                0              1
随意使用 tidyr 的 spread
而不是 reshape2 的 dcast
- 我只是仍然觉得 dcast 更直观...
你可以做到这一点。
# Build a 0/1 indicator matrix from comma-separated item strings.
#
# x      : character vector (or factor); each element is a comma-separated
#          list of items, e.g. "liquor,red/blush wine".
# prefix : optional string prepended to every column name.
#
# Returns an integer matrix with one row per element of `x` and one column
# per distinct item (in order of first appearance); a cell is 1L when the
# row's string contains that item and 0L otherwise.
dummies <- function(x, prefix) {
  # Split once and reuse; as.character() lets factors be passed directly.
  items <- strsplit(as.character(x), ",", fixed = TRUE)
  x.names <- unique(unlist(items))
  # Size the result from the input itself, not from a global data frame.
  out <- array(0L, c(length(items), length(x.names)), list(NULL, x.names))
  # Matrix indexing sets every (row, item) pair in one vectorised step.
  hits <- cbind(
    rep(seq_along(items), lengths(items)),
    match(unlist(items), x.names)
  )
  out[hits] <- 1L
  # paste0(prefix, character(0)) would return a length-1 vector, so only
  # rename when there is at least one column.
  if (!missing(prefix) && length(x.names) > 0) {
    colnames(out) <- paste0(prefix, colnames(out))
  }
  out
}
# Capture the item list inside each pair of braces:
# group 1 is the LHS, group 2 is the RHS.
pat <- '[{](.*)[}] => [{](.*)[}]'
# `df` is assumed to be the data frame from the question -- TODO confirm.
# Backreferences must be written '\\1' / '\\2' in an R string; with a
# single backslash R parses an octal character escape instead.
cbind(as.data.frame(
  cbind(dummies(sub(pat, '\\1', df$rules), 'lhs.'),
        dummies(sub(pat, '\\2', df$rules), 'rhs.'))),
  df[c('support','confidence','lift')])
输出如下:
lhs.liquor lhs.red/blush wine lhs.curd lhs.cereals lhs.yogurt lhs.butter
1 1 1 0 0 0 0
2 0 0 1 1 0 0
3 0 0 0 1 1 0
4 0 0 0 0 0 1
5 0 0 0 0 0 0
6 0 0 0 0 0 0
lhs.jam lhs.soups lhs.bottled beer lhs.napkins lhs.house keeping products
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 1 0 0 0 0
5 0 1 1 0 0
6 0 0 0 1 1
rhs.bottled beer rhs.whole milk support confidence lift
1 1 0 0.001931876 0.9047619 11.235269
2 0 1 0.001016777 0.9090909 3.557863
3 0 1 0.001728521 0.8095238 3.168192
4 0 1 0.001016777 0.8333333 3.261374
5 0 1 0.001118454 0.9166667 3.587512
6 0 1 0.001321810 0.8125000 3.179840
我有一个项目列表数据框,其中数据框中的每一行都包含 LHS 和 RHS 关联规则以及相应的支持度、置信度和提升度。 这是数据:
structure(list(rules = structure(c(13L, 4L, 28L, 1L, 24L, 15L
), .Label = c("{butter,jam} => {whole milk}", "{butter,rice} => {whole milk}",
"{canned fish,hygiene articles} => {whole milk}", "{curd,cereals} => {whole milk}",
"{domestic eggs,rice} => {whole milk}", "{grapes,onions} => {other vegetables}",
"{hamburger meat,bottled beer} => {whole milk}", "{hamburger meat,curd} => {whole milk}",
"{hard cheese,oil} => {other vegetables}", "{herbs,fruit/vegetable juice} => {other vegetables}",
"{herbs,rolls/buns} => {whole milk}", "{herbs,shopping bags} => {other vegetables}",
"{liquor,red/blush wine} => {bottled beer}", "{meat,margarine} => {other vegetables}",
"{napkins,house keeping products} => {whole milk}", "{oil,mustard} => {whole milk}",
"{onions,butter milk} => {other vegetables}", "{onions,waffles} => {other vegetables}",
"{pastry,sweet spreads} => {whole milk}", "{pickled vegetables,chocolate} => {whole milk}",
"{pork,butter milk} => {other vegetables}", "{rice,bottled water} => {whole milk}",
"{rice,sugar} => {whole milk}", "{soups,bottled beer} => {whole milk}",
"{tropical fruit,herbs} => {whole milk}", "{turkey,curd} => {other vegetables}",
"{whipped/sour cream,house keeping products} => {whole milk}",
"{yogurt,cereals} => {whole milk}", "{yogurt,rice} => {other vegetables}"
), class = "factor"), support = c(0.00193187595322827, 0.00101677681748856,
0.00172852058973055, 0.00101677681748856, 0.00111845449923742,
0.00132180986273513), confidence = c(0.904761904761905, 0.909090909090909,
0.80952380952381, 0.833333333333333, 0.916666666666667, 0.8125
), lift = c(11.2352693602694, 3.55786275006331, 3.16819206791352,
3.26137418755803, 3.58751160631383, 3.17983983286908)), .Names = c("rules",
"support", "confidence", "lift"), row.names = c(NA, 6L), class = "data.frame")
我需要把这些规则转换成宽格式:规则 LHS 部分中的每个项目都对应一个专门的列,值为 1 表示该规则的 LHS 包含该项目;规则的 RHS 也做同样处理。例如,以前两条规则为例:
{liquor,red/blush wine} => {bottled beer} 0.0019 0.90 11.2
{curd,cereals} => {whole milk} 0.0010 0.91 3.6
结果应该是一个如下所示的数据框:
'rules_id' 'lhs_liquor' 'lhs_red/blush wine' 'lhs_curd' 'lhs_cereals' 'rhs_bottled beer' 'rhs_whole milk' 'support' 'confidence' 'lift'
1 1 1 0 0 1 0 0.0019 0.90 11.2
2 0 0 1 1 0 1 0.0010 0.91 3.6
由于我是 R 和 Stack Overflow 的新手,如果问题描述得不够清楚,请告诉我。非常感谢任何帮助
你可以这样做
library(dplyr)
library(tidyr)
library(reshape2)

# `rules` is assumed to be the data frame shown in the question
# (columns: rules, support, confidence, lift) -- TODO confirm the name.
rules %>%
  mutate(id = seq_len(n())) %>%
  # Split "{a,b} => {c}" into its two sides. Braces are regex
  # metacharacters, and the backslashes must be doubled in an R string.
  separate(rules, c("lhs", "rhs"), "\\} => \\{") %>%
  # Remove the brace left at the outer edge of each side.
  mutate(
    lhs = sub("{", "", lhs, fixed = TRUE),
    rhs = sub("}", "", rhs, fixed = TRUE)
  ) %>%
  # Split on commas only, so multi-word items such as "red/blush wine"
  # stay intact (the default separator also splits on spaces and "/").
  separate_rows(lhs, sep = ",") %>%
  gather(value, var, lhs, rhs) %>%
  mutate(var = paste(value, var, sep = "_")) %>%
  # One row per rule, one 0/1 indicator column per "<side>_<item>".
  dcast(
    id + support + confidence + lift ~ var,
    fun.aggregate = function(x) (length(x) > 0) + 0L
  )
# Expected result for the six sample rules (columns wrapped for width):
#   id     support confidence      lift lhs_bottled beer lhs_butter lhs_cereals lhs_curd
# 1  1 0.001931876  0.9047619 11.235269                0          0           0        0
# 2  2 0.001016777  0.9090909  3.557863                0          0           1        1
# 3  3 0.001728521  0.8095238  3.168192                0          0           1        0
# 4  4 0.001016777  0.8333333  3.261374                0          1           0        0
# 5  5 0.001118454  0.9166667  3.587512                1          0           0        0
# 6  6 0.001321810  0.8125000  3.179840                0          0           0        0
#   lhs_house keeping products lhs_jam lhs_liquor lhs_napkins lhs_red/blush wine
# 1                          0       0          1           0                  1
# 2                          0       0          0           0                  0
# 3                          0       0          0           0                  0
# 4                          0       1          0           0                  0
# 5                          0       0          0           0                  0
# 6                          1       0          0           1                  0
#   lhs_soups lhs_yogurt rhs_bottled beer rhs_whole milk
# 1         0          0                1              0
# 2         0          0                0              1
# 3         0          1                0              1
# 4         0          0                0              1
# 5         1          0                0              1
# 6         0          0                0              1
随意使用 tidyr 的 spread
而不是 reshape2 的 dcast
- 我只是仍然觉得 dcast 更直观...
你可以做到这一点。
# Build a 0/1 indicator matrix from comma-separated item strings.
#
# x      : character vector (or factor); each element is a comma-separated
#          list of items, e.g. "liquor,red/blush wine".
# prefix : optional string prepended to every column name.
#
# Returns an integer matrix with one row per element of `x` and one column
# per distinct item (in order of first appearance); a cell is 1L when the
# row's string contains that item and 0L otherwise.
dummies <- function(x, prefix) {
  # Split once and reuse; as.character() lets factors be passed directly.
  items <- strsplit(as.character(x), ",", fixed = TRUE)
  x.names <- unique(unlist(items))
  # Size the result from the input itself, not from a global data frame.
  out <- array(0L, c(length(items), length(x.names)), list(NULL, x.names))
  # Matrix indexing sets every (row, item) pair in one vectorised step.
  hits <- cbind(
    rep(seq_along(items), lengths(items)),
    match(unlist(items), x.names)
  )
  out[hits] <- 1L
  # paste0(prefix, character(0)) would return a length-1 vector, so only
  # rename when there is at least one column.
  if (!missing(prefix) && length(x.names) > 0) {
    colnames(out) <- paste0(prefix, colnames(out))
  }
  out
}
# Capture the item list inside each pair of braces:
# group 1 is the LHS, group 2 is the RHS.
pat <- '[{](.*)[}] => [{](.*)[}]'
# `df` is assumed to be the data frame from the question -- TODO confirm.
# Backreferences must be written '\\1' / '\\2' in an R string; with a
# single backslash R parses an octal character escape instead.
cbind(as.data.frame(
  cbind(dummies(sub(pat, '\\1', df$rules), 'lhs.'),
        dummies(sub(pat, '\\2', df$rules), 'rhs.'))),
  df[c('support','confidence','lift')])
输出如下:
lhs.liquor lhs.red/blush wine lhs.curd lhs.cereals lhs.yogurt lhs.butter
1 1 1 0 0 0 0
2 0 0 1 1 0 0
3 0 0 0 1 1 0
4 0 0 0 0 0 1
5 0 0 0 0 0 0
6 0 0 0 0 0 0
lhs.jam lhs.soups lhs.bottled beer lhs.napkins lhs.house keeping products
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 1 0 0 0 0
5 0 1 1 0 0
6 0 0 0 1 1
rhs.bottled beer rhs.whole milk support confidence lift
1 1 0 0.001931876 0.9047619 11.235269
2 0 1 0.001016777 0.9090909 3.557863
3 0 1 0.001728521 0.8095238 3.168192
4 0 1 0.001016777 0.8333333 3.261374
5 0 1 0.001118454 0.9166667 3.587512
6 0 1 0.001321810 0.8125000 3.179840