如何在 R 中解压元组格式?
How do I unpack tuple format in R?
这是数据集。
library(data.table)

# Example input: one row per id. segment_stemming stores a bracketed,
# comma-separated list of ('token', 'tag') tuples as a single string.
x <- structure(
  list(
    id = c("A", "B"),
    segment_stemming = c(
      "[('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]",
      "[('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]"
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
x
# id segment_stemming
# 1: A [('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]
# 2: B [('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]
我想将元组拆分成行。这是我的预期结果。
id segment_stemming
A ('Brownie', 'Noun')
A ('From', 'Josa')
A ('Pi', 'Noun')
B ('Dung-caroon-gye', 'Noun')
B ('in', 'Josa')
B ('innovation', 'Noun')
我搜索过如何在 R 中处理元组格式,但没有找到任何能得到该结果的线索。
这是使用 separate_rows
的方法:
library(tidyverse)

# Split the bracketed tuple list into one row per tuple.
# Regex metacharacters must be double-escaped inside R string literals
# ("\\[" rather than "\["); a single backslash is an invalid escape.
x %>%
  # Drop the enclosing square brackets.
  mutate(segment_stemming = gsub("\\[|\\]", "", segment_stemming)) %>%
  # Split only on commas that sit outside parentheses: the negative
  # lookahead rejects a comma that is followed by ")" before any "(" or ")".
  separate_rows(segment_stemming, sep = ",\\s*(?![^()]*\\))")
# A tibble: 6 x 2
id segment_stemming
<chr> <chr>
1 A ('Brownie', 'Noun')
2 A ('From', 'Josa')
3 A ('Pi', 'Noun')
4 B ('Dung-caroon-gye', 'Noun')
5 B ('in', 'Josa')
6 B ('innovation', 'Noun')
通过一些操作获得更好结果的一种方法(unnest_wider
不是必需的)。
# One row per tuple, then spread the tuple's two members into columns.
# Regex metacharacters are double-escaped because a lone backslash before
# "[", "]", "s" or ")" is not a valid escape in an R string literal.
x %>%
  # Drop the enclosing square brackets.
  mutate(segment_stemming = gsub("\\[|\\]", "", segment_stemming)) %>%
  # Split on commas that are outside parentheses only.
  separate_rows(segment_stemming, sep = ",\\s*(?![^()]*\\))") %>%
  # Strip parentheses, quotes and commas, then split the remaining
  # "token tag" text on the space into a list-column.
  mutate(segment_stemming = segment_stemming %>%
    str_remove_all("[()',]") %>%
    str_split(" ")) %>%
  # NOTE(review): the list elements are unnamed; newer tidyr releases may
  # require `names_sep` here -- confirm against the installed version.
  unnest_wider(segment_stemming)
# A tibble: 6 x 3
id ...1 ...2
<chr> <chr> <chr>
1 A Brownie Noun
2 A From Josa
3 A Pi Noun
4 B Dung-caroon-gye Noun
5 B in Josa
6 B innovation Noun
data.table
方法
这是一个使用 data.table
+ reticulate
的选项
library(reticulate)
library(data.table)

# Wrap every "(...)" tuple in double quotes so that each cell becomes the
# text of a Python list of strings, then let Python parse it per id.
# The pattern needs "\\(" / "\\)" and the replacement needs "\\1": a bare
# "\1" inside an R string is the octal control character 0x01, not a
# backreference to the captured tuple.
# NOTE: py_eval() evaluates its argument as Python code; only run this on
# trusted input.
setDT(x)[
  ,
  segment_stemming := gsub("(\\(.*?\\))", '"\\1"', segment_stemming)
][
  ,
  lapply(.SD, py_eval),
  id
]
这给出了
id segment_stemming
1: A ('Brownie', 'Noun')
2: A ('From', 'Josa')
3: A ('Pi', 'Noun')
4: B ('Dung-caroon-gye', 'Noun')
5: B ('in', 'Josa')
6: B ('innovation', 'Noun')
另一个 data.table
选项使用 strsplit
+ trimws
如下所示
library(data.table)

# Split each cell between tuples and strip the enclosing square brackets.
# - The split pattern matches ", " only when it is preceded by ")" and
#   followed by "(", so commas inside a tuple are left alone.
# - Regex metacharacters are double-escaped; a single backslash before
#   ")", "(", "s", "[" or "]" is an invalid escape in an R string literal.
# - `whitespace` is a character class rather than the alternation
#   "\\[|\\]": trimws() appends "^" / "+$" to it, and with an alternation
#   those anchors bind to one branch only, so a bracket could be removed
#   from the middle of a value.
setDT(x)[
  ,
  .(segment_stemming = trimws(
    unlist(strsplit(segment_stemming, "(?<=\\)),\\s+(?=\\()", perl = TRUE)),
    whitespace = "[\\[\\]]"
  )),
  id
]
结果为
id segment_stemming
1: A ('Brownie', 'Noun')
2: A ('From', 'Josa')
3: A ('Pi', 'Noun')
4: B ('Dung-caroon-gye', 'Noun')
5: B ('in', 'Josa')
6: B ('innovation', 'Noun')
基础 R
一些基本的 R 选项也应该有效
# Base R: split every cell into its tuples per id, stack the per-id
# vectors into a long data frame, and restore the original column order
# and names.
# Regex metacharacters are double-escaped ("\\)" etc.); a single backslash
# is an invalid escape in an R string literal. `whitespace` uses a
# character class because trimws() appends "^" / "+$" to it, which an
# alternation such as "\\[|\\]" would only apply to one of its branches.
with(
  x,
  setNames(
    # stack() returns (values, ind); rev() puts the id column first.
    rev(
      stack(
        tapply(
          segment_stemming,
          id,
          function(v) {
            trimws(
              # Split on ", " only between ")" and "(".
              unlist(strsplit(v, "(?<=\\)),\\s+(?=\\()", perl = TRUE)),
              whitespace = "[\\[\\]]"
            )
          }
        )
      )
    ),
    names(x)
  )
)
或
# Base R: extract every "(...)" tuple with a lazy match, name the list of
# matches by id, and stack it into a long data frame.
# The parentheses are escaped as "\\(" and "\\)"; a single backslash is an
# invalid escape in an R string literal.
with(
  x,
  setNames(
    # stack() returns (values, ind); rev() puts the id column first.
    rev(
      stack(
        setNames(
          regmatches(
            segment_stemming,
            gregexpr("\\(.*?\\)", segment_stemming)
          ),
          id
        )
      )
    ),
    names(x)
  )
)
这是另一个可能的选择:
library(data.table)

# Example input (column name kept exactly as in the original snippet).
dt <- structure(
  list(
    id = c("A", "B"),
    segement_stemming = c(
      "[('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]",
      "[('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]"
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)

# Split on commas that do not directly follow a quote, i.e. the commas
# between tuples. The trailing "\\s*" also consumes the space after the
# comma so that later tuples do not start with a blank.
dt2 <- dt[
  ,
  c(segement_stemming = strsplit(
    segement_stemming, "(?<=[^']),\\s*",
    perl = TRUE
  )),
  by = id
]

# Remove the remaining square brackets. They are escaped as "\\[" / "\\]"
# because a single backslash is an invalid escape in an R string literal.
dt2[, names(dt2) := lapply(.SD, function(x) gsub("\\[|\\]", "", x))]
dt2
#> id segement_stemming
#> 1: A ('Brownie', 'Noun')
#> 2: A ('From', 'Josa')
#> 3: A ('Pi', 'Noun')
#> 4: B ('Dung-caroon-gye', 'Noun')
#> 5: B ('in', 'Josa')
#> 6: B ('innovation', 'Noun')
由 reprex package (v2.0.1)
创建于 2022-03-11
# Extract every "(...)" tuple (lazy match) and return one row per tuple
# for each id. The parentheses are escaped as "\\(" / "\\)"; a single
# backslash is an invalid escape in an R string literal.
x[
  ,
  .(segment_stemming = unlist(str_extract_all(segment_stemming, "\\(.*?\\)"))),
  by = id
]
或者您可以使用 tidyr::unnest
。这样就只有一次调用 str_extract_all
:
# Replace each cell, by reference, with the list of its "(...)" tuples,
# then expand that list-column into one row per tuple. The parentheses
# are escaped as "\\(" / "\\)"; a single backslash is an invalid escape in
# an R string literal.
x[, segment_stemming := str_extract_all(segment_stemming, "\\(.*?\\)")]
unnest(x, segment_stemming)
data.table 的一种做法如下:
library(stringr)

# Regex metacharacters are double-escaped throughout; a single backslash
# before "[", "]" or ")" is an invalid escape in an R string literal.

# Remove the enclosing square brackets.
x[, segment_stemming := gsub("\\[|\\]", "", segment_stemming, perl = TRUE)]

# Split between tuples. The separator "),", plus any following blanks, is
# consumed, so every piece except the last loses its closing parenthesis.
x[, parsed := str_split(segment_stemming, "\\),\\s*")]

# One row per piece for each id.
out <- x[, .(unlist(parsed, recursive = FALSE)), by = .(id)]

# Put back the closing parenthesis where the split removed it. sub() is
# used because the pattern can match the empty string at the end of the
# value, and a single replacement per value is what is wanted.
out[, V1 := sub("\\)?$", ")", V1)][]
id V1
<char> <char>
1: A ('Brownie', 'Noun')
2: A ('From', 'Josa')
3: A ('Pi', 'Noun')
4: B ('Dung-caroon-gye', 'Noun')
5: B ('in', 'Josa')
6: B ('innovation', 'Noun')
这是数据集。
library(data.table)

# Example input: one row per id. segment_stemming stores a bracketed,
# comma-separated list of ('token', 'tag') tuples as a single string.
x <- structure(
  list(
    id = c("A", "B"),
    segment_stemming = c(
      "[('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]",
      "[('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]"
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
x
# id segment_stemming
# 1: A [('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]
# 2: B [('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]
我想将元组拆分成行。这是我的预期结果。
id segment_stemming
A ('Brownie', 'Noun')
A ('From', 'Josa')
A ('Pi', 'Noun')
B ('Dung-caroon-gye', 'Noun')
B ('in', 'Josa')
B ('innovation', 'Noun')
我搜索过如何在 R 中处理元组格式,但没有找到任何能得到该结果的线索。
这是使用 separate_rows
的方法:
library(tidyverse)

# Split the bracketed tuple list into one row per tuple.
# Regex metacharacters must be double-escaped inside R string literals
# ("\\[" rather than "\["); a single backslash is an invalid escape.
x %>%
  # Drop the enclosing square brackets.
  mutate(segment_stemming = gsub("\\[|\\]", "", segment_stemming)) %>%
  # Split only on commas that sit outside parentheses: the negative
  # lookahead rejects a comma that is followed by ")" before any "(" or ")".
  separate_rows(segment_stemming, sep = ",\\s*(?![^()]*\\))")
# A tibble: 6 x 2
id segment_stemming
<chr> <chr>
1 A ('Brownie', 'Noun')
2 A ('From', 'Josa')
3 A ('Pi', 'Noun')
4 B ('Dung-caroon-gye', 'Noun')
5 B ('in', 'Josa')
6 B ('innovation', 'Noun')
通过一些操作获得更好结果的一种方法(unnest_wider
不是必需的)。
# One row per tuple, then spread the tuple's two members into columns.
# Regex metacharacters are double-escaped because a lone backslash before
# "[", "]", "s" or ")" is not a valid escape in an R string literal.
x %>%
  # Drop the enclosing square brackets.
  mutate(segment_stemming = gsub("\\[|\\]", "", segment_stemming)) %>%
  # Split on commas that are outside parentheses only.
  separate_rows(segment_stemming, sep = ",\\s*(?![^()]*\\))") %>%
  # Strip parentheses, quotes and commas, then split the remaining
  # "token tag" text on the space into a list-column.
  mutate(segment_stemming = segment_stemming %>%
    str_remove_all("[()',]") %>%
    str_split(" ")) %>%
  # NOTE(review): the list elements are unnamed; newer tidyr releases may
  # require `names_sep` here -- confirm against the installed version.
  unnest_wider(segment_stemming)
# A tibble: 6 x 3
id ...1 ...2
<chr> <chr> <chr>
1 A Brownie Noun
2 A From Josa
3 A Pi Noun
4 B Dung-caroon-gye Noun
5 B in Josa
6 B innovation Noun
data.table
方法
这是一个使用 data.table
+ reticulate
的选项
library(reticulate)
library(data.table)

# Wrap every "(...)" tuple in double quotes so that each cell becomes the
# text of a Python list of strings, then let Python parse it per id.
# The pattern needs "\\(" / "\\)" and the replacement needs "\\1": a bare
# "\1" inside an R string is the octal control character 0x01, not a
# backreference to the captured tuple.
# NOTE: py_eval() evaluates its argument as Python code; only run this on
# trusted input.
setDT(x)[
  ,
  segment_stemming := gsub("(\\(.*?\\))", '"\\1"', segment_stemming)
][
  ,
  lapply(.SD, py_eval),
  id
]
这给出了
id segment_stemming
1: A ('Brownie', 'Noun')
2: A ('From', 'Josa')
3: A ('Pi', 'Noun')
4: B ('Dung-caroon-gye', 'Noun')
5: B ('in', 'Josa')
6: B ('innovation', 'Noun')
另一个 data.table
选项使用 strsplit
+ trimws
如下所示
library(data.table)

# Split each cell between tuples and strip the enclosing square brackets.
# - The split pattern matches ", " only when it is preceded by ")" and
#   followed by "(", so commas inside a tuple are left alone.
# - Regex metacharacters are double-escaped; a single backslash before
#   ")", "(", "s", "[" or "]" is an invalid escape in an R string literal.
# - `whitespace` is a character class rather than the alternation
#   "\\[|\\]": trimws() appends "^" / "+$" to it, and with an alternation
#   those anchors bind to one branch only, so a bracket could be removed
#   from the middle of a value.
setDT(x)[
  ,
  .(segment_stemming = trimws(
    unlist(strsplit(segment_stemming, "(?<=\\)),\\s+(?=\\()", perl = TRUE)),
    whitespace = "[\\[\\]]"
  )),
  id
]
结果为
id segment_stemming
1: A ('Brownie', 'Noun')
2: A ('From', 'Josa')
3: A ('Pi', 'Noun')
4: B ('Dung-caroon-gye', 'Noun')
5: B ('in', 'Josa')
6: B ('innovation', 'Noun')
基础 R
一些基本的 R 选项也应该有效
# Base R: split every cell into its tuples per id, stack the per-id
# vectors into a long data frame, and restore the original column order
# and names.
# Regex metacharacters are double-escaped ("\\)" etc.); a single backslash
# is an invalid escape in an R string literal. `whitespace` uses a
# character class because trimws() appends "^" / "+$" to it, which an
# alternation such as "\\[|\\]" would only apply to one of its branches.
with(
  x,
  setNames(
    # stack() returns (values, ind); rev() puts the id column first.
    rev(
      stack(
        tapply(
          segment_stemming,
          id,
          function(v) {
            trimws(
              # Split on ", " only between ")" and "(".
              unlist(strsplit(v, "(?<=\\)),\\s+(?=\\()", perl = TRUE)),
              whitespace = "[\\[\\]]"
            )
          }
        )
      )
    ),
    names(x)
  )
)
或
# Base R: extract every "(...)" tuple with a lazy match, name the list of
# matches by id, and stack it into a long data frame.
# The parentheses are escaped as "\\(" and "\\)"; a single backslash is an
# invalid escape in an R string literal.
with(
  x,
  setNames(
    # stack() returns (values, ind); rev() puts the id column first.
    rev(
      stack(
        setNames(
          regmatches(
            segment_stemming,
            gregexpr("\\(.*?\\)", segment_stemming)
          ),
          id
        )
      )
    ),
    names(x)
  )
)
这是另一个可能的选择:
library(data.table)

# Example input (column name kept exactly as in the original snippet).
dt <- structure(
  list(
    id = c("A", "B"),
    segement_stemming = c(
      "[('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]",
      "[('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]"
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)

# Split on commas that do not directly follow a quote, i.e. the commas
# between tuples. The trailing "\\s*" also consumes the space after the
# comma so that later tuples do not start with a blank.
dt2 <- dt[
  ,
  c(segement_stemming = strsplit(
    segement_stemming, "(?<=[^']),\\s*",
    perl = TRUE
  )),
  by = id
]

# Remove the remaining square brackets. They are escaped as "\\[" / "\\]"
# because a single backslash is an invalid escape in an R string literal.
dt2[, names(dt2) := lapply(.SD, function(x) gsub("\\[|\\]", "", x))]
dt2
#> id segement_stemming
#> 1: A ('Brownie', 'Noun')
#> 2: A ('From', 'Josa')
#> 3: A ('Pi', 'Noun')
#> 4: B ('Dung-caroon-gye', 'Noun')
#> 5: B ('in', 'Josa')
#> 6: B ('innovation', 'Noun')
由 reprex package (v2.0.1)
创建于 2022-03-11
# Extract every "(...)" tuple (lazy match) and return one row per tuple
# for each id. The parentheses are escaped as "\\(" / "\\)"; a single
# backslash is an invalid escape in an R string literal.
x[
  ,
  .(segment_stemming = unlist(str_extract_all(segment_stemming, "\\(.*?\\)"))),
  by = id
]
或者您可以使用 tidyr::unnest
。这样就只有一次调用 str_extract_all
:
# Replace each cell, by reference, with the list of its "(...)" tuples,
# then expand that list-column into one row per tuple. The parentheses
# are escaped as "\\(" / "\\)"; a single backslash is an invalid escape in
# an R string literal.
x[, segment_stemming := str_extract_all(segment_stemming, "\\(.*?\\)")]
unnest(x, segment_stemming)
data.table 的一种做法如下:
library(stringr)

# Regex metacharacters are double-escaped throughout; a single backslash
# before "[", "]" or ")" is an invalid escape in an R string literal.

# Remove the enclosing square brackets.
x[, segment_stemming := gsub("\\[|\\]", "", segment_stemming, perl = TRUE)]

# Split between tuples. The separator "),", plus any following blanks, is
# consumed, so every piece except the last loses its closing parenthesis.
x[, parsed := str_split(segment_stemming, "\\),\\s*")]

# One row per piece for each id.
out <- x[, .(unlist(parsed, recursive = FALSE)), by = .(id)]

# Put back the closing parenthesis where the split removed it. sub() is
# used because the pattern can match the empty string at the end of the
# value, and a single replacement per value is what is wanted.
out[, V1 := sub("\\)?$", ")", V1)][]
id V1
<char> <char>
1: A ('Brownie', 'Noun')
2: A ('From', 'Josa')
3: A ('Pi', 'Noun')
4: B ('Dung-caroon-gye', 'Noun')
5: B ('in', 'Josa')
6: B ('innovation', 'Noun')