拆分 R 中不同列中的值

Splitting values in different columns in R

我的数据集中有一列包含类似下面这样的值:

utm_source=google&utm_medium=cpc&utm_campaign=1234567&utm_term=brand%20&utm_content=Brand&gclid=ERtyuiipotf_YTj

我应该如何将其拆分为不同的列及其在 R 中的值?

utm_source utm_medium  utm_campaign utm_term   utm_content
  google      cpc          1234567   brand%20     Brand

dput(column) 给出以下输出

structure(list("null", "gclid=ertyyhglkdl-kjkY", 
    "utm_source=google&utm_medium=cpc&utm_campaign=1234556&utm_term=brand%20shirts&utm_content=Brand&gclid=jhajsgjdgd_ajs", 
    "utm_source=google&utm_medium=cpc&utm_campaign=1674814043&utm_term=brand%20shirts&utm_content=Brand&gclid=KvgMsEAAYASAAEgLq6vD_BwE", 
    "null", "null", "null", "null", "null", "null", "null", "null", 
    "null", "null", "utm_source=fb&utm_medium=ctw&utm_campaign=Shirt_rem&utm_content=CasciaShirt"), class = c("extracted", 
"list"))

以 OP 更新后的示例作为 list,我们遍历该 list:如果元素不是 "null",就创建一个 tibble,用 separate_rows 在 & 处把该列拆分为多行,再用 separate 在 = 处把该列拆分为两列,接着转换为命名向量 (deframe),最后用 as_tibble_row 生成单行 tibble。

library(dplyr)
library(tidyr)
library(tibble)
library(purrr)
# Parse every non-"null" query string in `lst1` into a one-row tibble and
# row-bind the results; "null" entries yield NULL and contribute no row.
map_dfr(lst1, ~ if (.x != "null") {
  tibble(col1 = .x) %>%
    # one row per "key=value" pair
    separate_rows(col1, sep = "&") %>%
    # Split on "=" ("\=" is not a valid escape in an R string literal).
    # extra = "merge" keeps any further "=" inside the value instead of
    # dropping it with a warning.
    separate(col1, into = c("col1", "col2"), sep = "=", extra = "merge") %>%
    # key/value columns -> named character vector -> one-row tibble
    deframe() %>%
    as_tibble_row()
})

-输出

# A tibble: 4 x 6
#  gclid                    utm_source utm_medium utm_campaign utm_term       utm_content
#  <chr>                    <chr>      <chr>      <chr>        <chr>          <chr>      
#1 ertyyhglkdl-kjkY         <NA>       <NA>       <NA>         <NA>           <NA>       
#2 jhajsgjdgd_ajs           google     cpc        1234556      brand%20shirts Brand      
#3 KvgMsEAAYASAAEgLq6vD_BwE google     cpc        1674814043   brand%20shirts Brand      
#4 <NA>                     fb         ctw        Shirt_rem    <NA>           CasciaShirt

或者不使用循环:先把 list 转换为 tibble 中的一列,一次性完成拆分,然后用 pivot_wider 转换为宽格式

library(data.table)
# Loop-free variant: drop the "null" entries, split all strings in one pass,
# then pivot the key/value pairs to wide format (one row per source string).
keep(lst1, ~ .x != "null") %>%
  flatten_chr() %>%
  tibble(col1 = .) %>%
  # remember which source string each key/value pair belongs to
  mutate(rn = row_number()) %>%
  separate_rows(col1, sep = "&") %>%
  # Split on "=" ("\=" is not a valid escape in an R string literal);
  # extra = "merge" keeps any further "=" inside the value.
  separate(col1, into = c("col1", "col2"), sep = "=", extra = "merge") %>%
  pivot_wider(names_from = col1, values_from = col2) %>%
  # the row id was only needed to keep rows apart while pivoting
  select(-rn)
# A tibble: 4 x 6
#  gclid                    utm_source utm_medium utm_campaign utm_term       utm_content
#  <chr>                    <chr>      <chr>      <chr>        <chr>          <chr>      
#1 ertyyhglkdl-kjkY         <NA>       <NA>       <NA>         <NA>           <NA>       
#2 jhajsgjdgd_ajs           google     cpc        1234556      brand%20shirts Brand      
#3 KvgMsEAAYASAAEgLq6vD_BwE google     cpc        1674814043   brand%20shirts Brand      
#4 <NA>                     fb         ctw        Shirt_rem    <NA>           CasciaShirt

数据

# Example input: 15 query strings (or the literal "null") stored as a list
# carrying the classes "extracted" and "list".
lst1 <- structure(
  c(
    list(
      "null",
      "gclid=ertyyhglkdl-kjkY",
      "utm_source=google&utm_medium=cpc&utm_campaign=1234556&utm_term=brand%20shirts&utm_content=Brand&gclid=jhajsgjdgd_ajs",
      "utm_source=google&utm_medium=cpc&utm_campaign=1674814043&utm_term=brand%20shirts&utm_content=Brand&gclid=KvgMsEAAYASAAEgLq6vD_BwE"
    ),
    # ten consecutive empty entries
    rep(list("null"), 10),
    list("utm_source=fb&utm_medium=ctw&utm_campaign=Shirt_rem&utm_content=CasciaShirt")
  ),
  class = c("extracted", "list")
)

我不确定这是否是您期望的输出。下面是一个可能满足您需求的 base R 方案

# Turn each query string in `column` into a one-row data.frame (column names
# are the keys, cells are the values) and outer-merge them into one table.
Reduce(
  function(...) merge(..., all = TRUE),
  lapply(
    column,
    function(x) {
      # "k1=v1&k2=v2" -> c("k1=v1", "k2=v2")
      u <- unlist(strsplit(x, "&"))
      # Cut each pair at its FIRST "=" only. The previous greedy ".*=" removed
      # everything up to the LAST "=", truncating values that contain "=".
      # A piece without "=" (such as "null") is used as both key and value.
      keys <- sub("=.*", "", u)
      values <- sub("^[^=]*=", "", u)
      setNames(data.frame(as.list(values)), keys)
    }
  )
)

这给出了

  utm_source utm_medium utm_campaign utm_content null                    gclid
1         fb        ctw    Shirt_rem CasciaShirt <NA>                     <NA>
2     google        cpc      1234556       Brand <NA>           jhajsgjdgd_ajs
3     google        cpc   1674814043       Brand <NA> KvgMsEAAYASAAEgLq6vD_BwE
4       <NA>       <NA>         <NA>        <NA> null         ertyyhglkdl-kjkY
        utm_term
1           <NA>
2 brand%20shirts
3 brand%20shirts
4           <NA>

更新

如果你想保留所有的数据,即使是null,你可以试试下面的代码

# Same parsing as the merge-based version, but "null" entries are kept as
# all-NA rows so the result has one row per element of `column`.
Reduce(
  function(x, y) {
    # If either side holds only NA there are no shared key columns to join
    # on, so stack the rows instead. `||` is the scalar operator intended
    # for an `if` condition.
    if (all(is.na(x)) || all(is.na(y))) {
      return(rbind(x, y))
    }
    dplyr::full_join(x, y)
  },
  lapply(
    column,
    function(x) {
      # placeholder for an empty entry; becomes an all-NA row
      if (x == "null") {
        return(NA)
      }
      # "k1=v1&k2=v2" -> c("k1=v1", "k2=v2")
      u <- unlist(strsplit(x, "&"))
      # Cut each pair at its FIRST "=" only, so values containing "=" are
      # not truncated (the previous greedy ".*=" kept only the text after
      # the last "=").
      keys <- sub("=.*", "", u)
      values <- sub("^[^=]*=", "", u)
      setNames(data.frame(as.list(values)), keys)
    }
  )
)

这给出了

                      gclid utm_source utm_medium utm_campaign       utm_term
1                      <NA>       <NA>       <NA>         <NA>           <NA>
2          ertyyhglkdl-kjkY       <NA>       <NA>         <NA>           <NA>
3            jhajsgjdgd_ajs     google        cpc      1234556 brand%20shirts
4  KvgMsEAAYASAAEgLq6vD_BwE     google        cpc   1674814043 brand%20shirts
5                      <NA>       <NA>       <NA>         <NA>           <NA>
6                      <NA>       <NA>       <NA>         <NA>           <NA>
7                      <NA>       <NA>       <NA>         <NA>           <NA>
8                      <NA>       <NA>       <NA>         <NA>           <NA>
9                      <NA>       <NA>       <NA>         <NA>           <NA>
10                     <NA>       <NA>       <NA>         <NA>           <NA>
11                     <NA>       <NA>       <NA>         <NA>           <NA>
12                     <NA>       <NA>       <NA>         <NA>           <NA>
13                     <NA>       <NA>       <NA>         <NA>           <NA>
14                     <NA>       <NA>       <NA>         <NA>           <NA>
15                     <NA>         fb        ctw    Shirt_rem           <NA>
   utm_content
1         <NA>
2         <NA>
3        Brand
4        Brand
5         <NA>
6         <NA>
7         <NA>
8         <NA>
9         <NA>
10        <NA>
11        <NA>
12        <NA>
13        <NA>
14        <NA>
15 CasciaShirt