如何将存储在单个值(每个记录)中的字符列表转换为可以分析的格式? R
How to convert a list of characters that is stored in a single value (per record) into a format that can be analyzed? R
所以我有以下数据。在这个数据框中,我有一个用户名、日期和一个包列表。我的目标是将该列 packages
转换为我可以分析的格式。
> print(data_example)
# A tibble: 4 x 3
username date packages
<chr> <chr> <chr>
1 John 1/5/2015 "compiler 4.1.2, magrittr 2.0.1, ellipsis 0.3.2, tools 4.1.2, pillar 1.6.4"
2 Karen 1/5/2015 "compiler 4.1.2, tools 4.1.2\""
3 Mike 1/5/2015 "evaluate 0.14, highr 0.9, httr 1.4.2, pillar 1.6.4, rlang 0.4.12"
4 Zoe 1/6/2015 "httr 1.4.2, viridisLite 0.4.0, jsonlite 1.7.2, splines 4.1.2"
我试过做这样的事情,但这并没有给我多大价值。
packages <- as.list(parsed_log$packages)
我的目标是尝试“延长支点”并获得这样的结果?我不确定这是否是分析此类数据的最佳方法。
username date packages
John 1/5/2015 compiler 4.1.2
John 1/5/2015 magrittr 2.0.1
John 1/5/2015 ellipsis 0.3.2
John 1/5/2015 tools 4.1.2
ect...
一个解决方案,基于tidyr::separate
:
library(tidyr)
df <- data.frame(
stringsAsFactors = FALSE,
username = c("John", "Karen", "Mike", "Zoe"),
date = c("1/5/2015", "1/5/2015", "1/5/2015", "1/6/2015"),
packages = c("compiler 4.1.2, magrittr 2.0.1, ellipsis 0.3.2, tools 4.1.2, pillar 1.6.4","compiler 4.1.2, tools 4.1.2\\"",
"evaluate 0.14, highr 0.9, httr 1.4.2, pillar 1.6.4, rlang 0.4.12",
"httr 1.4.2, viridisLite 0.4.0, jsonlite 1.7.2, splines 4.1.2")
)
separate_rows(df, packages, sep = ", ")
#> # A tibble: 16 × 3
#> username date packages
#> <chr> <chr> <chr>
#> 1 John 1/5/2015 "compiler 4.1.2"
#> 2 John 1/5/2015 "magrittr 2.0.1"
#> 3 John 1/5/2015 "ellipsis 0.3.2"
#> 4 John 1/5/2015 "tools 4.1.2"
#> 5 John 1/5/2015 "pillar 1.6.4"
#> 6 Karen 1/5/2015 "compiler 4.1.2"
#> 7 Karen 1/5/2015 "tools 4.1.2\\""
#> 8 Mike 1/5/2015 "evaluate 0.14"
#> 9 Mike 1/5/2015 "highr 0.9"
#> 10 Mike 1/5/2015 "httr 1.4.2"
#> 11 Mike 1/5/2015 "pillar 1.6.4"
#> 12 Mike 1/5/2015 "rlang 0.4.12"
#> 13 Zoe 1/6/2015 "httr 1.4.2"
#> 14 Zoe 1/6/2015 "viridisLite 0.4.0"
#> 15 Zoe 1/6/2015 "jsonlite 1.7.2"
#> 16 Zoe 1/6/2015 "splines 4.1.2"
一种选择是使用stringi
拆分:
s <- strsplit(as.character(df$packages), ',')
data.frame(
packages = unlist(s),
date = rep(df$date, length(s)),
username = rep(df$username, length(s))
)
packages date username
1 compiler 4.1.2 1/5/2015 John
2 magrittr 2.0.1 1/5/2015 Karen
3 ellipsis 0.3.2 1/5/2015 Mike
4 tools 4.1.2 1/6/2015 Zoe
5 pillar 1.6.4 1/5/2015 John
6 compiler 4.1.2 1/5/2015 Karen
7 tools 4.1.2\" 1/5/2015 Mike
8 evaluate 0.14 1/6/2015 Zoe
9 highr 0.9 1/5/2015 John
10 httr 1.4.2 1/5/2015 Karen
11 pillar 1.6.4 1/5/2015 Mike
12 rlang 0.4.12 1/6/2015 Zoe
13 httr 1.4.2 1/5/2015 John
14 viridisLite 0.4.0 1/5/2015 Karen
15 jsonlite 1.7.2 1/5/2015 Mike
16 splines 4.1.2 1/6/2015 Zoe
另一种选择是使用 cSplit
来自 splitstackshape
:
library(splitstackshape)
cSplit(df, "packages", ",", direction = "long")
data.table
选项:
library(data.table)
dt <- as.data.table(df)
dt[, strsplit(as.character(packages), ","), by = c("username", "date")]
数据(感谢@PaulS 提供数据)
df <- structure(list(username = c("John", "Karen", "Mike", "Zoe"),
date = c("1/5/2015", "1/5/2015", "1/5/2015", "1/6/2015"),
packages = c("compiler 4.1.2, magrittr 2.0.1, ellipsis 0.3.2, tools 4.1.2, pillar 1.6.4",
"compiler 4.1.2, tools 4.1.2\\"", "evaluate 0.14, highr 0.9, httr 1.4.2, pillar 1.6.4, rlang 0.4.12",
"httr 1.4.2, viridisLite 0.4.0, jsonlite 1.7.2, splines 4.1.2"
)), class = "data.frame", row.names = c(NA, -4L))
所以我有以下数据。在这个数据框中,我有一个用户名、日期和一个包列表。我的目标是将该列 packages
转换为我可以分析的格式。
> print(data_example)
# A tibble: 4 x 3
username date packages
<chr> <chr> <chr>
1 John 1/5/2015 "compiler 4.1.2, magrittr 2.0.1, ellipsis 0.3.2, tools 4.1.2, pillar 1.6.4"
2 Karen 1/5/2015 "compiler 4.1.2, tools 4.1.2\""
3 Mike 1/5/2015 "evaluate 0.14, highr 0.9, httr 1.4.2, pillar 1.6.4, rlang 0.4.12"
4 Zoe 1/6/2015 "httr 1.4.2, viridisLite 0.4.0, jsonlite 1.7.2, splines 4.1.2"
我试过做这样的事情,但这并没有给我多大价值。
packages <- as.list(parsed_log$packages)
我的目标是尝试“延长支点”并获得这样的结果?我不确定这是否是分析此类数据的最佳方法。
username date packages
John 1/5/2015 compiler 4.1.2
John 1/5/2015 magrittr 2.0.1
John 1/5/2015 ellipsis 0.3.2
John 1/5/2015 tools 4.1.2
ect...
一个解决方案,基于tidyr::separate
:
library(tidyr)
df <- data.frame(
stringsAsFactors = FALSE,
username = c("John", "Karen", "Mike", "Zoe"),
date = c("1/5/2015", "1/5/2015", "1/5/2015", "1/6/2015"),
packages = c("compiler 4.1.2, magrittr 2.0.1, ellipsis 0.3.2, tools 4.1.2, pillar 1.6.4","compiler 4.1.2, tools 4.1.2\\"",
"evaluate 0.14, highr 0.9, httr 1.4.2, pillar 1.6.4, rlang 0.4.12",
"httr 1.4.2, viridisLite 0.4.0, jsonlite 1.7.2, splines 4.1.2")
)
separate_rows(df, packages, sep = ", ")
#> # A tibble: 16 × 3
#> username date packages
#> <chr> <chr> <chr>
#> 1 John 1/5/2015 "compiler 4.1.2"
#> 2 John 1/5/2015 "magrittr 2.0.1"
#> 3 John 1/5/2015 "ellipsis 0.3.2"
#> 4 John 1/5/2015 "tools 4.1.2"
#> 5 John 1/5/2015 "pillar 1.6.4"
#> 6 Karen 1/5/2015 "compiler 4.1.2"
#> 7 Karen 1/5/2015 "tools 4.1.2\\""
#> 8 Mike 1/5/2015 "evaluate 0.14"
#> 9 Mike 1/5/2015 "highr 0.9"
#> 10 Mike 1/5/2015 "httr 1.4.2"
#> 11 Mike 1/5/2015 "pillar 1.6.4"
#> 12 Mike 1/5/2015 "rlang 0.4.12"
#> 13 Zoe 1/6/2015 "httr 1.4.2"
#> 14 Zoe 1/6/2015 "viridisLite 0.4.0"
#> 15 Zoe 1/6/2015 "jsonlite 1.7.2"
#> 16 Zoe 1/6/2015 "splines 4.1.2"
一种选择是使用stringi
拆分:
s <- strsplit(as.character(df$packages), ',')
data.frame(
packages = unlist(s),
date = rep(df$date, length(s)),
username = rep(df$username, length(s))
)
packages date username
1 compiler 4.1.2 1/5/2015 John
2 magrittr 2.0.1 1/5/2015 Karen
3 ellipsis 0.3.2 1/5/2015 Mike
4 tools 4.1.2 1/6/2015 Zoe
5 pillar 1.6.4 1/5/2015 John
6 compiler 4.1.2 1/5/2015 Karen
7 tools 4.1.2\" 1/5/2015 Mike
8 evaluate 0.14 1/6/2015 Zoe
9 highr 0.9 1/5/2015 John
10 httr 1.4.2 1/5/2015 Karen
11 pillar 1.6.4 1/5/2015 Mike
12 rlang 0.4.12 1/6/2015 Zoe
13 httr 1.4.2 1/5/2015 John
14 viridisLite 0.4.0 1/5/2015 Karen
15 jsonlite 1.7.2 1/5/2015 Mike
16 splines 4.1.2 1/6/2015 Zoe
另一种选择是使用 cSplit
来自 splitstackshape
:
library(splitstackshape)
cSplit(df, "packages", ",", direction = "long")
data.table
选项:
library(data.table)
dt <- as.data.table(df)
dt[, strsplit(as.character(packages), ","), by = c("username", "date")]
数据(感谢@PaulS 提供数据)
df <- structure(list(username = c("John", "Karen", "Mike", "Zoe"),
date = c("1/5/2015", "1/5/2015", "1/5/2015", "1/6/2015"),
packages = c("compiler 4.1.2, magrittr 2.0.1, ellipsis 0.3.2, tools 4.1.2, pillar 1.6.4",
"compiler 4.1.2, tools 4.1.2\\"", "evaluate 0.14, highr 0.9, httr 1.4.2, pillar 1.6.4, rlang 0.4.12",
"httr 1.4.2, viridisLite 0.4.0, jsonlite 1.7.2, splines 4.1.2"
)), class = "data.frame", row.names = c(NA, -4L))