拆分 R 中不同列中的值
Splitting values in different columns in R
我数据集中的一列包含类似下面这样的值:
utm_source=google&utm_medium=cpc&utm_campaign=1234567&utm_term=brand%20&utm_content=Brand&gclid=ERtyuiipotf_YTj
我应该如何在 R 中将其拆分为不同的列及其对应的值?期望的结果如下:
utm_source utm_medium utm_campaign utm_brand utm_content
google cpc 1234567 brand%20 Brand
dput(column)
给出以下输出
structure(list("null", "gclid=ertyyhglkdl-kjkY",
"utm_source=google&utm_medium=cpc&utm_campaign=1234556&utm_term=brand%20shirts&utm_content=Brand&gclid=jhajsgjdgd_ajs",
"utm_source=google&utm_medium=cpc&utm_campaign=1674814043&utm_term=brand%20shirts&utm_content=Brand&gclid=KvgMsEAAYASAAEgLq6vD_BwE",
"null", "null", "null", "null", "null", "null", "null", "null",
"null", "null", "utm_source=fb&utm_medium=ctw&utm_campaign=Shirt_rem&utm_content=CasciaShirt"), class = c("extracted",
"list"))
将 OP 更新后的示例作为 list,我们遍历该 list:如果 (if) 元素不是 "null",就创建一个 tibble,用 separate_rows 在 & 处把该列拆分成多行,再用 separate 把该列拆分成两列,然后用 deframe 得到命名向量,最后用 as_tibble_row 将该命名向量转换为单行的 tibble。
library(dplyr)
library(tidyr)
library(tibble)
library(purrr)
# Turn each query string in `lst1` into a one-row tibble and stack the rows.
# For elements equal to "null" the `if` has no else branch, so the lambda
# returns NULL; the printed result contains only the non-"null" entries.
map_dfr(lst1, ~ if (.x != "null") {
  tibble(col1 = .x) %>%
    # one row per "key=value" pair
    separate_rows(col1, sep = "&") %>%
    # "\=" is not a valid escape inside an R string literal (parse error);
    # a plain "=" is the intended separator. extra = "merge" splits only at
    # the first "=", so a value that itself contains "=" is kept whole.
    separate(col1, into = c("col1", "col2"), sep = "=", extra = "merge") %>%
    # two-column tibble -> named character vector (names are the keys)
    deframe() %>%
    # named vector -> single-row tibble with one column per key
    as_tibble_row()
})
输出
# A tibble: 4 x 6
# gclid utm_source utm_medium utm_campaign utm_term utm_content
# <chr> <chr> <chr> <chr> <chr> <chr>
#1 ertyyhglkdl-kjkY <NA> <NA> <NA> <NA> <NA>
#2 jhajsgjdgd_ajs google cpc 1234556 brand%20shirts Brand
#3 KvgMsEAAYASAAEgLq6vD_BwE google cpc 1674814043 brand%20shirts Brand
#4 <NA> fb ctw Shirt_rem <NA> CasciaShirt
或者不用循环:我们可以先把 list 转换为 data.frame 中的一列,一次性完成拆分,再转换为宽格式
library(data.table)
# Loop-free variant: drop the "null" entries, split all remaining strings in
# one pass, then pivot so that each original string becomes one row.
keep(lst1, ~ .x != "null") %>%
  flatten_chr() %>%
  tibble(col1 = .) %>%
  # rn records which input string each key/value pair came from, so that
  # pivot_wider() can put the pairs of one string back on a single row
  mutate(rn = row_number()) %>%
  # one row per "key=value" pair
  separate_rows(col1, sep = "&") %>%
  # "\=" is not a valid escape inside an R string literal (parse error);
  # a plain "=" is the intended separator. extra = "merge" splits only at
  # the first "=", so a value that itself contains "=" is kept whole.
  separate(col1, into = c("col1", "col2"), sep = "=", extra = "merge") %>%
  pivot_wider(names_from = col1, values_from = col2) %>%
  # the helper row id is not part of the result
  select(-rn)
# A tibble: 4 x 6
# gclid utm_source utm_medium utm_campaign utm_term utm_content
# <chr> <chr> <chr> <chr> <chr> <chr>
#1 ertyyhglkdl-kjkY <NA> <NA> <NA> <NA> <NA>
#2 jhajsgjdgd_ajs google cpc 1234556 brand%20shirts Brand
#3 KvgMsEAAYASAAEgLq6vD_BwE google cpc 1674814043 brand%20shirts Brand
#4 <NA> fb ctw Shirt_rem <NA> CasciaShirt
数据
# Sample input: an unnamed list of length-one character vectors carrying the
# S3 classes "extracted" and "list". Each entry is either the literal string
# "null" or a run of key=value pairs joined by "&".
lst1 <- structure(
  as.list(c(
    "null",
    "gclid=ertyyhglkdl-kjkY",
    "utm_source=google&utm_medium=cpc&utm_campaign=1234556&utm_term=brand%20shirts&utm_content=Brand&gclid=jhajsgjdgd_ajs",
    "utm_source=google&utm_medium=cpc&utm_campaign=1674814043&utm_term=brand%20shirts&utm_content=Brand&gclid=KvgMsEAAYASAAEgLq6vD_BwE",
    rep("null", 10L),
    "utm_source=fb&utm_medium=ctw&utm_campaign=Shirt_rem&utm_content=CasciaShirt"
  )),
  class = c("extracted", "list")
)
我不确定这是否是预期的输出。下面是一个可能满足您需求的 base R 方案
# Base R option: parse every element of `column` into a one-row data frame
# (one column per key), then outer-merge all of them into a single table.
Reduce(
  function(...) merge(..., all = TRUE),
  lapply(
    column,
    function(x) {
      # the individual "key=value" pairs of this entry
      u <- unlist(strsplit(x, "&"))
      # Split each pair at its FIRST "=" only. The greedy ".*=" used before
      # removed everything up to the LAST "=", which truncated values that
      # contain "=" themselves (e.g. "gclid=abc==" became "").
      # A pair without "=" (such as "null") is left unchanged by both calls,
      # so it serves as both its own column name and its own value.
      setNames(
        data.frame(as.list(sub("^[^=]*=", "", u))),
        sub("=.*", "", u)
      )
    }
  )
)
这给出了
utm_source utm_medium utm_campaign utm_content null gclid
1 fb ctw Shirt_rem CasciaShirt <NA> <NA>
2 google cpc 1234556 Brand <NA> jhajsgjdgd_ajs
3 google cpc 1674814043 Brand <NA> KvgMsEAAYASAAEgLq6vD_BwE
4 <NA> <NA> <NA> <NA> null ertyyhglkdl-kjkY
utm_term
1 <NA>
2 brand%20shirts
3 brand%20shirts
4 <NA>
更新
如果你想保留所有数据(包括值为 "null" 的行),可以试试下面的代码
# Variant that keeps a row for every element of `column`, including "null":
# each "null" entry is parsed to a bare NA, and the reducer stacks it with
# rbind() instead of joining it.
Reduce(
  function(x, y) {
    # `if` needs a single TRUE/FALSE, so use the scalar, short-circuiting
    # `||` rather than the elementwise `|`.
    # NOTE(review): this branch is also taken when the accumulated `x`
    # consists solely of NA values, not only when `x` is a bare NA.
    if (all(is.na(x)) || all(is.na(y))) {
      return(rbind(x, y))
    }
    # no `by` given: full_join() joins on the columns both sides share
    dplyr::full_join(x, y)
  },
  lapply(
    column,
    function(x) {
      # placeholder for entries that carry no key/value data
      if (x == "null") {
        return(NA)
      }
      # the individual "key=value" pairs of this entry
      u <- unlist(strsplit(x, "&"))
      # Split each pair at its FIRST "=" only. The greedy ".*=" used before
      # removed everything up to the LAST "=", which truncated values that
      # contain "=" themselves (e.g. "gclid=abc==" became "").
      setNames(
        data.frame(as.list(sub("^[^=]*=", "", u))),
        sub("=.*", "", u)
      )
    }
  )
)
这给出了
gclid utm_source utm_medium utm_campaign utm_term
1 <NA> <NA> <NA> <NA> <NA>
2 ertyyhglkdl-kjkY <NA> <NA> <NA> <NA>
3 jhajsgjdgd_ajs google cpc 1234556 brand%20shirts
4 KvgMsEAAYASAAEgLq6vD_BwE google cpc 1674814043 brand%20shirts
5 <NA> <NA> <NA> <NA> <NA>
6 <NA> <NA> <NA> <NA> <NA>
7 <NA> <NA> <NA> <NA> <NA>
8 <NA> <NA> <NA> <NA> <NA>
9 <NA> <NA> <NA> <NA> <NA>
10 <NA> <NA> <NA> <NA> <NA>
11 <NA> <NA> <NA> <NA> <NA>
12 <NA> <NA> <NA> <NA> <NA>
13 <NA> <NA> <NA> <NA> <NA>
14 <NA> <NA> <NA> <NA> <NA>
15 <NA> fb ctw Shirt_rem <NA>
utm_content
1 <NA>
2 <NA>
3 Brand
4 Brand
5 <NA>
6 <NA>
7 <NA>
8 <NA>
9 <NA>
10 <NA>
11 <NA>
12 <NA>
13 <NA>
14 <NA>
15 CasciaShirt
我数据集中的一列包含类似下面这样的值:
utm_source=google&utm_medium=cpc&utm_campaign=1234567&utm_term=brand%20&utm_content=Brand&gclid=ERtyuiipotf_YTj
我应该如何在 R 中将其拆分为不同的列及其对应的值?期望的结果如下:
utm_source utm_medium utm_campaign utm_brand utm_content
google cpc 1234567 brand%20 Brand
dput(column)
给出以下输出
structure(list("null", "gclid=ertyyhglkdl-kjkY",
"utm_source=google&utm_medium=cpc&utm_campaign=1234556&utm_term=brand%20shirts&utm_content=Brand&gclid=jhajsgjdgd_ajs",
"utm_source=google&utm_medium=cpc&utm_campaign=1674814043&utm_term=brand%20shirts&utm_content=Brand&gclid=KvgMsEAAYASAAEgLq6vD_BwE",
"null", "null", "null", "null", "null", "null", "null", "null",
"null", "null", "utm_source=fb&utm_medium=ctw&utm_campaign=Shirt_rem&utm_content=CasciaShirt"), class = c("extracted",
"list"))
将 OP 更新后的示例作为 list,我们遍历该 list:如果 (if) 元素不是 "null",就创建一个 tibble,用 separate_rows 在 & 处把该列拆分成多行,再用 separate 把该列拆分成两列,然后用 deframe 得到命名向量,最后用 as_tibble_row 将该命名向量转换为单行的 tibble。
library(dplyr)
library(tidyr)
library(tibble)
library(purrr)
# Turn each query string in `lst1` into a one-row tibble and stack the rows.
# For elements equal to "null" the `if` has no else branch, so the lambda
# returns NULL; the printed result contains only the non-"null" entries.
map_dfr(lst1, ~ if (.x != "null") {
  tibble(col1 = .x) %>%
    # one row per "key=value" pair
    separate_rows(col1, sep = "&") %>%
    # "\=" is not a valid escape inside an R string literal (parse error);
    # a plain "=" is the intended separator. extra = "merge" splits only at
    # the first "=", so a value that itself contains "=" is kept whole.
    separate(col1, into = c("col1", "col2"), sep = "=", extra = "merge") %>%
    # two-column tibble -> named character vector (names are the keys)
    deframe() %>%
    # named vector -> single-row tibble with one column per key
    as_tibble_row()
})
输出
# A tibble: 4 x 6
# gclid utm_source utm_medium utm_campaign utm_term utm_content
# <chr> <chr> <chr> <chr> <chr> <chr>
#1 ertyyhglkdl-kjkY <NA> <NA> <NA> <NA> <NA>
#2 jhajsgjdgd_ajs google cpc 1234556 brand%20shirts Brand
#3 KvgMsEAAYASAAEgLq6vD_BwE google cpc 1674814043 brand%20shirts Brand
#4 <NA> fb ctw Shirt_rem <NA> CasciaShirt
或者不用循环:我们可以先把 list 转换为 data.frame 中的一列,一次性完成拆分,再转换为宽格式
library(data.table)
# Loop-free variant: drop the "null" entries, split all remaining strings in
# one pass, then pivot so that each original string becomes one row.
keep(lst1, ~ .x != "null") %>%
  flatten_chr() %>%
  tibble(col1 = .) %>%
  # rn records which input string each key/value pair came from, so that
  # pivot_wider() can put the pairs of one string back on a single row
  mutate(rn = row_number()) %>%
  # one row per "key=value" pair
  separate_rows(col1, sep = "&") %>%
  # "\=" is not a valid escape inside an R string literal (parse error);
  # a plain "=" is the intended separator. extra = "merge" splits only at
  # the first "=", so a value that itself contains "=" is kept whole.
  separate(col1, into = c("col1", "col2"), sep = "=", extra = "merge") %>%
  pivot_wider(names_from = col1, values_from = col2) %>%
  # the helper row id is not part of the result
  select(-rn)
# A tibble: 4 x 6
# gclid utm_source utm_medium utm_campaign utm_term utm_content
# <chr> <chr> <chr> <chr> <chr> <chr>
#1 ertyyhglkdl-kjkY <NA> <NA> <NA> <NA> <NA>
#2 jhajsgjdgd_ajs google cpc 1234556 brand%20shirts Brand
#3 KvgMsEAAYASAAEgLq6vD_BwE google cpc 1674814043 brand%20shirts Brand
#4 <NA> fb ctw Shirt_rem <NA> CasciaShirt
数据
# Sample input: an unnamed list of length-one character vectors carrying the
# S3 classes "extracted" and "list". Each entry is either the literal string
# "null" or a run of key=value pairs joined by "&".
lst1 <- structure(
  as.list(c(
    "null",
    "gclid=ertyyhglkdl-kjkY",
    "utm_source=google&utm_medium=cpc&utm_campaign=1234556&utm_term=brand%20shirts&utm_content=Brand&gclid=jhajsgjdgd_ajs",
    "utm_source=google&utm_medium=cpc&utm_campaign=1674814043&utm_term=brand%20shirts&utm_content=Brand&gclid=KvgMsEAAYASAAEgLq6vD_BwE",
    rep("null", 10L),
    "utm_source=fb&utm_medium=ctw&utm_campaign=Shirt_rem&utm_content=CasciaShirt"
  )),
  class = c("extracted", "list")
)
我不确定这是否是预期的输出。下面是一个可能满足您需求的 base R 方案
# Base R option: parse every element of `column` into a one-row data frame
# (one column per key), then outer-merge all of them into a single table.
Reduce(
  function(...) merge(..., all = TRUE),
  lapply(
    column,
    function(x) {
      # the individual "key=value" pairs of this entry
      u <- unlist(strsplit(x, "&"))
      # Split each pair at its FIRST "=" only. The greedy ".*=" used before
      # removed everything up to the LAST "=", which truncated values that
      # contain "=" themselves (e.g. "gclid=abc==" became "").
      # A pair without "=" (such as "null") is left unchanged by both calls,
      # so it serves as both its own column name and its own value.
      setNames(
        data.frame(as.list(sub("^[^=]*=", "", u))),
        sub("=.*", "", u)
      )
    }
  )
)
这给出了
utm_source utm_medium utm_campaign utm_content null gclid
1 fb ctw Shirt_rem CasciaShirt <NA> <NA>
2 google cpc 1234556 Brand <NA> jhajsgjdgd_ajs
3 google cpc 1674814043 Brand <NA> KvgMsEAAYASAAEgLq6vD_BwE
4 <NA> <NA> <NA> <NA> null ertyyhglkdl-kjkY
utm_term
1 <NA>
2 brand%20shirts
3 brand%20shirts
4 <NA>
更新
如果你想保留所有数据(包括值为 "null" 的行),可以试试下面的代码
# Variant that keeps a row for every element of `column`, including "null":
# each "null" entry is parsed to a bare NA, and the reducer stacks it with
# rbind() instead of joining it.
Reduce(
  function(x, y) {
    # `if` needs a single TRUE/FALSE, so use the scalar, short-circuiting
    # `||` rather than the elementwise `|`.
    # NOTE(review): this branch is also taken when the accumulated `x`
    # consists solely of NA values, not only when `x` is a bare NA.
    if (all(is.na(x)) || all(is.na(y))) {
      return(rbind(x, y))
    }
    # no `by` given: full_join() joins on the columns both sides share
    dplyr::full_join(x, y)
  },
  lapply(
    column,
    function(x) {
      # placeholder for entries that carry no key/value data
      if (x == "null") {
        return(NA)
      }
      # the individual "key=value" pairs of this entry
      u <- unlist(strsplit(x, "&"))
      # Split each pair at its FIRST "=" only. The greedy ".*=" used before
      # removed everything up to the LAST "=", which truncated values that
      # contain "=" themselves (e.g. "gclid=abc==" became "").
      setNames(
        data.frame(as.list(sub("^[^=]*=", "", u))),
        sub("=.*", "", u)
      )
    }
  )
)
这给出了
gclid utm_source utm_medium utm_campaign utm_term
1 <NA> <NA> <NA> <NA> <NA>
2 ertyyhglkdl-kjkY <NA> <NA> <NA> <NA>
3 jhajsgjdgd_ajs google cpc 1234556 brand%20shirts
4 KvgMsEAAYASAAEgLq6vD_BwE google cpc 1674814043 brand%20shirts
5 <NA> <NA> <NA> <NA> <NA>
6 <NA> <NA> <NA> <NA> <NA>
7 <NA> <NA> <NA> <NA> <NA>
8 <NA> <NA> <NA> <NA> <NA>
9 <NA> <NA> <NA> <NA> <NA>
10 <NA> <NA> <NA> <NA> <NA>
11 <NA> <NA> <NA> <NA> <NA>
12 <NA> <NA> <NA> <NA> <NA>
13 <NA> <NA> <NA> <NA> <NA>
14 <NA> <NA> <NA> <NA> <NA>
15 <NA> fb ctw Shirt_rem <NA>
utm_content
1 <NA>
2 <NA>
3 Brand
4 Brand
5 <NA>
6 <NA>
7 <NA>
8 <NA>
9 <NA>
10 <NA>
11 <NA>
12 <NA>
13 <NA>
14 <NA>
15 CasciaShirt