从长表创建二值(0/1)宽表(类似 tidyr::spread())
Create a binary wide table from a long table (like tidyr::spread() )
作为树模型的输入,我在 SQL 中创建了一个分析表。现在我想把它迁移到 R,因为以该表作为输入的模型也是在 R 中运行的。
其中有一个 SQL 步骤我无法转换为 R。
分析table具有以下形式:
# Example analysis table: several rows per pseudonym, each carrying three
# binary (0/1) indicator columns.
df <- data.frame(
  pseudonym = c("a", "a", "a", "b", "c", "c"),
  var1 = c(1, 1, 0, 1, 1, 0),
  var2 = c(1, 0, 0, 0, 0, 1),
  var3 = c(0, 0, 0, 0, 0, 1)
)
> df
pseudonym var1 var2 var3
1 a 1 1 0
2 a 1 0 0
3 a 0 0 0
4 b 1 0 0
5 c 1 0 0
6 c 0 1 1
在下一步中,我需要让每个 pseudonym 只保留一行,同时保留其他列 var1、var2、var3 中的信息(即值 1)。(在 SQL 中,这是通过
max(case when...then 1 else 0 end) as var1
)
因此,由 df 生成的结果 df2 应该是:
# Expected result: one row per pseudonym, each indicator set to 1 if any of
# that pseudonym's rows had a 1 in the column.
df2 <- data.frame(
  pseudonym = c("a", "b", "c"),
  var1 = c(1, 1, 1),
  var2 = c(1, 0, 1),
  var3 = c(0, 0, 1)
)
> df2
pseudonym var1 var2 var3
1 a 1 1 0
2 b 1 0 0
3 c 1 1 1
如果有人有想法会很有帮助。
这是一种方法:
library(dplyr)
library(tidyr)
# Example input: several rows per pseudonym with binary indicator columns.
df <- data.frame(
  pseudonym = c("a", "a", "a", "b", "c", "c"),
  var1 = c(1, 1, 0, 1, 1, 0),
  var2 = c(1, 0, 0, 0, 0, 1),
  var3 = c(0, 0, 0, 0, 0, 1)
)
# Collapse df to one row per pseudonym, keeping the largest value seen in each
# var column: reshape to long form, keep only the rows that hold the maximum
# of their (pseudonym, name) group, drop the resulting duplicates, then
# reshape back to wide form.
df %>%
  pivot_longer(cols = var1:var3, names_to = "name", values_to = "value") %>%
  group_by(pseudonym, name) %>%
  filter(value == max(value)) %>%
  ungroup() %>%
  distinct() %>%
  pivot_wider(names_from = name, values_from = value)
#># A tibble: 3 x 4
#> pseudonym var1 var2 var3
#> <fct> <dbl> <dbl> <dbl>
#>1 a 1 1 0
#>2 b 1 0 0
#>3 c 1 1 1
另一种 dplyr 方法,可能不够精巧,但有效:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Example input: several rows per pseudonym with binary indicator columns.
df <- data.frame(
  pseudonym = c("a", "a", "a", "b", "c", "c"),
  var1 = c(1, 1, 0, 1, 1, 0),
  var2 = c(1, 0, 0, 0, 0, 1),
  var3 = c(0, 0, 0, 0, 0, 1)
)
# Print the input so it appears in the rendered output.
df
#> pseudonym var1 var2 var3
#> 1 a 1 1 0
#> 2 a 1 0 0
#> 3 a 0 0 0
#> 4 b 1 0 0
#> 5 c 1 0 0
#> 6 c 0 1 1
# Collapse df to one row per pseudonym: each flag becomes 1 if any row of that
# pseudonym has a 1 in the column, otherwise 0.
# summarise() already returns exactly one row per group and the flag is
# computed directly as 0/1, so there is no need to de-duplicate afterwards or
# to back-fill NAs (a blanket NA replacement would also overwrite genuine
# missing values in other columns, including the key column).
df2 <- df %>%
  group_by(pseudonym) %>%
  summarise(
    var1 = as.numeric(1 %in% var1),
    var2 = as.numeric(1 %in% var2),
    var3 = as.numeric(1 %in% var3)
  ) %>%
  ungroup()
# Print the result so it appears in the rendered output.
df2
#> # A tibble: 3 x 4
#> pseudonym var1 var2 var3
#> <fct> <dbl> <dbl> <dbl>
#> 1 a 1 1 0
#> 2 b 1 0 0
#> 3 c 1 1 1
由 reprex package (v0.3.0)
于 2020-04-21 创建
我们可以使用 max:
library(data.table)
# Turn df into a data.table in place, then take the maximum of every remaining
# column (.SD) within each pseudonym group.
setDT(df)
df[, lapply(.SD, max), by = pseudonym]
# pseudonym var1 var2 var3
#1: a 1 1 0
#2: b 1 0 0
#3: c 1 1 1
作为树模型的输入,我在 SQL 中创建了一个分析表。现在我想把它迁移到 R,因为以该表作为输入的模型也是在 R 中运行的。其中有一个 SQL 步骤我无法转换为 R。
分析table具有以下形式:
# Example analysis table: several rows per pseudonym, each carrying three
# binary (0/1) indicator columns.
df <- data.frame(
  pseudonym = c("a", "a", "a", "b", "c", "c"),
  var1 = c(1, 1, 0, 1, 1, 0),
  var2 = c(1, 0, 0, 0, 0, 1),
  var3 = c(0, 0, 0, 0, 0, 1)
)
> df
pseudonym var1 var2 var3
1 a 1 1 0
2 a 1 0 0
3 a 0 0 0
4 b 1 0 0
5 c 1 0 0
6 c 0 1 1
在下一步中,我需要让每个 pseudonym 只保留一行,同时保留其他列 var1、var2、var3 中的信息(即值 1)。(在 SQL 中,这是通过
max(case when...then 1 else 0 end) as var1
)
因此,由 df 生成的结果 df2 应该是:
# Expected result: one row per pseudonym, each indicator set to 1 if any of
# that pseudonym's rows had a 1 in the column.
df2 <- data.frame(
  pseudonym = c("a", "b", "c"),
  var1 = c(1, 1, 1),
  var2 = c(1, 0, 1),
  var3 = c(0, 0, 1)
)
> df2
pseudonym var1 var2 var3
1 a 1 1 0
2 b 1 0 0
3 c 1 1 1
如果有人有想法会很有帮助。
这是一种方法:
library(dplyr)
library(tidyr)
# Example input: several rows per pseudonym with binary indicator columns.
df <- data.frame(
  pseudonym = c("a", "a", "a", "b", "c", "c"),
  var1 = c(1, 1, 0, 1, 1, 0),
  var2 = c(1, 0, 0, 0, 0, 1),
  var3 = c(0, 0, 0, 0, 0, 1)
)
# Collapse df to one row per pseudonym, keeping the largest value seen in each
# var column: reshape to long form, keep only the rows that hold the maximum
# of their (pseudonym, name) group, drop the resulting duplicates, then
# reshape back to wide form.
df %>%
  pivot_longer(cols = var1:var3, names_to = "name", values_to = "value") %>%
  group_by(pseudonym, name) %>%
  filter(value == max(value)) %>%
  ungroup() %>%
  distinct() %>%
  pivot_wider(names_from = name, values_from = value)
#># A tibble: 3 x 4
#> pseudonym var1 var2 var3
#> <fct> <dbl> <dbl> <dbl>
#>1 a 1 1 0
#>2 b 1 0 0
#>3 c 1 1 1
另一种 dplyr 方法,可能不够精巧,但有效:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Example input: several rows per pseudonym with binary indicator columns.
df <- data.frame(
  pseudonym = c("a", "a", "a", "b", "c", "c"),
  var1 = c(1, 1, 0, 1, 1, 0),
  var2 = c(1, 0, 0, 0, 0, 1),
  var3 = c(0, 0, 0, 0, 0, 1)
)
# Print the input so it appears in the rendered output.
df
#> pseudonym var1 var2 var3
#> 1 a 1 1 0
#> 2 a 1 0 0
#> 3 a 0 0 0
#> 4 b 1 0 0
#> 5 c 1 0 0
#> 6 c 0 1 1
# Collapse df to one row per pseudonym: each flag becomes 1 if any row of that
# pseudonym has a 1 in the column, otherwise 0.
# summarise() already returns exactly one row per group and the flag is
# computed directly as 0/1, so there is no need to de-duplicate afterwards or
# to back-fill NAs (a blanket NA replacement would also overwrite genuine
# missing values in other columns, including the key column).
df2 <- df %>%
  group_by(pseudonym) %>%
  summarise(
    var1 = as.numeric(1 %in% var1),
    var2 = as.numeric(1 %in% var2),
    var3 = as.numeric(1 %in% var3)
  ) %>%
  ungroup()
# Print the result so it appears in the rendered output.
df2
#> # A tibble: 3 x 4
#> pseudonym var1 var2 var3
#> <fct> <dbl> <dbl> <dbl>
#> 1 a 1 1 0
#> 2 b 1 0 0
#> 3 c 1 1 1
由 reprex package (v0.3.0)
于 2020-04-21 创建
我们可以使用 max:
library(data.table)
# Turn df into a data.table in place, then take the maximum of every remaining
# column (.SD) within each pseudonym group.
setDT(df)
df[, lapply(.SD, max), by = pseudonym]
# pseudonym var1 var2 var3
#1: a 1 1 0
#2: b 1 0 0
#3: c 1 1 1