如何在 R 中拆分元组格式的字符串?

How do I unpack tuple format in R?

这是数据集。

library(data.table)

# Sample data: one row per id; segment_stemming holds a single string that
# looks like a Python list of (token, tag) tuples.
x <- structure(list(id = c("A", "B" ),
                    segment_stemming = c("[('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]", 
                                          "[('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]" )), 
               row.names = c(NA, -2L), 
               class = c("data.table", "data.frame" ))

x
# id                                                     segment_stemming
# 1:  A               [('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]
# 2:  B [('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]

我想将元组拆分成行。这是我的预期结果。

id             segment_stemming
A              ('Brownie', 'Noun')
A              ('From', 'Josa')
A              ('Pi', 'Noun')
B              ('Dung-caroon-gye', 'Noun')
B              ('in', 'Josa')
B              ('innovation', 'Noun')

我搜索过如何在 R 中处理元组格式,但没有找到任何能得到上述结果的线索。

这是使用 separate_rows 的方法:

library(tidyverse)

# Strip the outer square brackets, then split into one row per tuple.
# The separator is a comma (plus optional whitespace) that is not inside a
# tuple: the negative lookahead rejects any comma that is followed by a ")"
# with no other parenthesis in between.
# Backslashes must be doubled inside R string literals ("\[" is a parse error).
x %>%
  mutate(segment_stemming = gsub("\\[|\\]", "", segment_stemming)) %>%
  separate_rows(segment_stemming, sep = ",\\s*(?![^()]*\\))")

# A tibble: 6 x 2
  id    segment_stemming           
  <chr> <chr>                      
1 A     ('Brownie', 'Noun')        
2 A     ('From', 'Josa')           
3 A     ('Pi', 'Noun')             
4 B     ('Dung-caroon-gye', 'Noun')
5 B     ('in', 'Josa')             
6 B     ('innovation', 'Noun') 

通过一些操作获得更好结果的一种方法(unnest_wider 不是必需的)。

# Step 1: strip the outer square brackets and split into one row per tuple
#         (same separator regex as above: a comma that is not inside "(...)").
# Step 2: drop the parentheses, quotes and inner comma, split the remaining
#         "token tag" text on the space, and spread the pieces into columns.
# Backslashes must be doubled inside R string literals ("\[" is a parse error).
x %>%
  mutate(segment_stemming = gsub("\\[|\\]", "", segment_stemming)) %>%
  separate_rows(segment_stemming, sep = ",\\s*(?![^()]*\\))") %>%
  mutate(
    segment_stemming = segment_stemming %>%
      str_remove_all("[()',]") %>%
      str_split(" ")
  ) %>%
  unnest_wider(segment_stemming)

# A tibble: 6 x 3
  id    ...1            ...2 
  <chr> <chr>           <chr>
1 A     Brownie         Noun 
2 A     From            Josa 
3 A     Pi              Noun 
4 B     Dung-caroon-gye Noun 
5 B     in              Josa 
6 B     innovation      Noun 

data.table 方法

这是一个使用 data.table + reticulate 的选项:
library(reticulate)
library(data.table)
# Wrap every "(...)" tuple in double quotes so that each cell becomes a valid
# Python list-of-strings literal, then let py_eval() parse it once per id.
# "\\1" is the regex backreference to the captured tuple; a single-backslash
# "\1" would be read by R as an octal character escape instead.
# NOTE: py_eval() executes the string as Python code -- only use this on
# trusted data.
setDT(x)[
  ,
  segment_stemming := gsub("(\\(.*?\\))", '"\\1"', segment_stemming)
][
  ,
  lapply(.SD, py_eval),
  id
]

这给出了

   id            segment_stemming
1:  A         ('Brownie', 'Noun')
2:  A            ('From', 'Josa')
3:  A              ('Pi', 'Noun')
4:  B ('Dung-caroon-gye', 'Noun')
5:  B              ('in', 'Josa')
6:  B      ('innovation', 'Noun')

另一个 data.table 选项使用 strsplit + trimws 如下所示

library(data.table)
# Split each string at the ", " that sits between a closing ")" and an opening
# "(" (lookbehind/lookahead keep the parentheses), then trim the leftover
# outer square brackets from both ends of every piece.
# `whitespace` is given as a character class: trimws() prepends "^" / appends
# "+$" to it, so an ungrouped alternation would anchor only one branch.
# Backslashes must be doubled inside R string literals.
setDT(x)[
  ,
  .(segment_stemming = trimws(
    unlist(strsplit(segment_stemming, "(?<=\\)),\\s+(?=\\()", perl = TRUE)),
    whitespace = "[\\[\\]]"
  )),
  id
]

结果为

   id            segment_stemming
1:  A         ('Brownie', 'Noun')
2:  A            ('From', 'Josa')
3:  A              ('Pi', 'Noun')
4:  B ('Dung-caroon-gye', 'Noun')
5:  B              ('in', 'Josa')
6:  B      ('innovation', 'Noun')

基础 R

一些基本的 R 选项也应该有效

# Base R: for every id, split the string at the ", " between ")" and "(",
# trim the outer square brackets, stack the per-id vectors into a long
# data frame, and restore the original column order and names.
# `whitespace` is given as a character class: trimws() prepends "^" / appends
# "+$" to it, so an ungrouped alternation would anchor only one branch.
# Backslashes must be doubled inside R string literals.
with(
  x,
  setNames(
    rev(
      stack(
        tapply(
          segment_stemming,
          id,
          function(v) {
            trimws(
              unlist(strsplit(v, "(?<=\\)),\\s+(?=\\()", perl = TRUE)),
              whitespace = "[\\[\\]]"
            )
          }
        )
      )
    ),
    names(x)
  )
)

# Base R: extract every "(...)" tuple with a lazy match, name each list of
# matches by its id, stack into a long data frame, and restore the original
# column order and names.
# Backslashes must be doubled inside R string literals ("\(" is a parse error).
with(
  x,
  setNames(
    rev(
      stack(
        setNames(
          regmatches(segment_stemming, gregexpr("\\(.*?\\)", segment_stemming)),
          id
        )
      )
    ),
    names(x)
  )
)

这是另一个可能的选择:

library(data.table)

# Sample data (column name kept exactly as in the original answer).
dt <- structure(
  list(
    id = c("A", "B"),
    segement_stemming = c(
      "[('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]",
      "[('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]"
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)

# Split at every comma that is not directly preceded by a single quote, i.e.
# only at the commas between tuples, producing one row per tuple within each id.
dt2 <- dt[, c(segement_stemming = strsplit(segement_stemming, "(?<=[^']),", perl = TRUE)), by = id]
# Remove the leftover square brackets from every column.
# Backslashes must be doubled inside R string literals ("\[" is a parse error).
dt2[, names(dt2) := lapply(.SD, function(x) gsub("\\[|\\]", "", x))]
dt2
#>    id           segement_stemming
#> 1:  A         ('Brownie', 'Noun')
#> 2:  A            ('From', 'Josa')
#> 3:  A              ('Pi', 'Noun')
#> 4:  B ('Dung-caroon-gye', 'Noun')
#> 5:  B              ('in', 'Josa')
#> 6:  B      ('innovation', 'Noun')

由 reprex 包 (v2.0.1) 创建于 2022-03-11
# Extract every "(...)" tuple with a lazy match and return one row per tuple,
# grouped by id. Backslashes must be doubled inside R string literals.
x[
  ,
  .(segment_stemming = unlist(str_extract_all(segment_stemming, "\\(.*?\\)"))),
  by = id
]

或者您可以使用 tidyr::unnest。这样就只有一次调用 str_extract_all:

# Replace the column in place with a list column holding the "(...)" tuples
# of each row, then expand that list column into one row per tuple.
# Backslashes must be doubled inside R string literals.
x[, segment_stemming := str_extract_all(segment_stemming, "\\(.*?\\)")]
unnest(x, segment_stemming)

一种 data.table 的方式如下:

library(stringr)

# Backslashes must be doubled inside R string literals ("\[" is a parse error).
# Remove the outer square brackets.
x[, segment_stemming := gsub("\\[|\\]", "", segment_stemming, perl = TRUE)]
# Split at every "),"; the separator is consumed, so all pieces except the
# last one lose their closing parenthesis.
x[, parsed := str_split(segment_stemming, "\\),")]
# Flatten the list column into one row per piece within each id.
out <- x[, .(unlist(parsed, recursive = FALSE)), by = .(id)]
# Make every piece end in exactly one ")": replace an optional trailing ")"
# with ")". sub() is enough because only the end of the string is touched.
out[, V1 := sub("\\)?$", ")", V1)][]

       id                          V1
   <char>                      <char>
1:      A         ('Brownie', 'Noun')
2:      A            ('From', 'Josa')
3:      A              ('Pi', 'Noun')
4:      B ('Dung-caroon-gye', 'Noun')
5:      B              ('in', 'Josa')
6:      B      ('innovation', 'Noun')