来自字符向量的数据框,其中变量名称及其数据共同存储
Dataframe from a character vector where variable name and its data were stored jointly
我遇到这种情况:
# Sample input: a single character column in which every element packs
# several "name: value" units, separated from each other by "|".
foo <- data.frame(
  vars = c(
    "animal: mouse | wks: 12 | site: cage | PI: 78",
    "animal: dog | wks: 32 | GI: 0.2",
    "animal: cat | wks: 8 | site: wild | PI: 13"
  )
)
其中变量名和相关数据一起存储在字符串中,如上例所示。具体来说,每个 variable_name/its_data 单元由 `|` 分隔;在每个单元内,变量名后跟 `:`,其后是相关数据。
我想要一个像这样的最终数据框:
animal wks site PI GI
mouse 12 cage 78 NA
dog 32 <NA> NA 0.2
cat 8 wild 13 NA
我们可以使用 base R 中的 read.dcf:
# Parse foo$vars with read.dcf(): each "name: value" unit becomes one DCF
# field line, and a blank line separates the records of different rows.
# Backslashes are doubled because "\s" and "\|" are not valid escapes in an
# R string literal.
records <- paste(gsub("\\s+\\|\\s+", "\n", foo$vars), collapse = "\n\n")
con <- textConnection(records)
# read.dcf() returns a character matrix with one column per field name;
# type.convert() then picks integer/double/character per column.
out <- type.convert(as.data.frame(read.dcf(con)), as.is = TRUE)
# The connection was opened here, so close it here.
close(con)
输出:
> out
animal wks site PI GI
1 mouse 12 cage 78 NA
2 dog 32 <NA> NA 0.2
3 cat 8 wild 13 NA
> str(out)
'data.frame': 3 obs. of 5 variables:
$ animal: chr "mouse" "dog" "cat"
$ wks : int 12 32 8
$ site : chr "cage" NA "wild"
$ PI : int 78 NA 13
$ GI : num NA 0.2 NA
这是一个 dplyr 解决方案:
library(dplyr)
library(tidyr)
# Reshape the "name: value | name: value" strings in foo$vars into one column
# per variable name and one row per original record.
tibble(foo) %>%
  # Remember which record each unit came from before the rows are split up.
  mutate(row = row_number()) %>%
  # One row per "name: value" unit; "|" is a regex metacharacter, so it is
  # escaped (with a doubled backslash, as R string literals require).
  separate_rows(vars, sep = "\\|") %>%
  # Split each unit into its name (a) and its value (b) at the colon.
  separate(vars, c("a", "b"), sep = ":") %>%
  # Strip the padding left around the separators; base trimws() is used so
  # that no package beyond dplyr/tidyr has to be attached.
  mutate(across(everything(), trimws)) %>%
  # Names become columns; `row` is the remaining id column, so no grouping
  # is needed before pivoting.
  pivot_wider(names_from = a, values_from = b) %>%
  type.convert(as.is = TRUE) %>%
  select(-row)
animal wks site PI GI
<chr> <int> <chr> <int> <dbl>
1 mouse 12 cage 78 NA
2 dog 32 NA NA 0.2
3 cat 8 wild 13 NA
另一个 base R 方案,使用 Reduce + merge:
# Build a one-row data frame per record, then fold them together with a full
# outer merge so that variables missing from a record become NA.
type.convert(
  Reduce(
    function(x, y) merge(x, y, all = TRUE),
    lapply(
      # Split on either separator; "|" must be escaped in the regex, and the
      # backslash doubled because "\|" is not a valid R string escape.
      strsplit(foo$vars, ":|\\|"),
      function(x) {
        # The pieces alternate name, value, name, value, ...; filling a
        # two-row matrix column-wise puts names in row 1 and values in row 2.
        m <- matrix(trimws(x), nrow = 2)
        setNames(data.frame(m[2, , drop = FALSE]), m[1, ])
      }
    )
  ),
  as.is = TRUE
)
结果为:
animal wks site PI GI
1 cat 8 wild 13 NA
2 dog 32 <NA> NA 0.2
3 mouse 12 cage 78 NA
这是一种先用 scan() 逐行解析、再用 merge 合并的方法:
# Parse every record with scan(), turn it into a one-row data frame, then
# combine all rows with a full outer merge (missing variables become NA).
# seq_len() keeps the loop empty when foo has no rows.
lapply(seq_len(nrow(foo)), \(i) {
  # Split one record on "|" and drop the padding around each unit.
  scan(
    text = foo$vars[i], what = character(), sep = "|",
    strip.white = TRUE, quiet = TRUE
  ) |>
    # Two-column character matrix: variable name, value.
    (\(units) do.call(rbind, strsplit(units, ": ")))() |>
    # Values become a single row, names become the column names.
    (\(kv) setNames(data.frame(t(kv[, 2])), kv[, 1]))()
}) |>
  (\(rows) Reduce(\(...) merge(..., all = TRUE), rows))()
# animal wks site PI GI
# 1 cat 8 wild 13 <NA>
# 2 dog 32 <NA> <NA> 0.2
# 3 mouse 12 cage 78 <NA>
注意:以上代码使用了 R >= 4.1 的语法(匿名函数简写 `\(x)` 和原生管道 `|>`)。
我遇到这种情况:
# Sample input: a single character column in which every element packs
# several "name: value" units, separated from each other by "|".
foo <- data.frame(
  vars = c(
    "animal: mouse | wks: 12 | site: cage | PI: 78",
    "animal: dog | wks: 32 | GI: 0.2",
    "animal: cat | wks: 8 | site: wild | PI: 13"
  )
)
其中变量名和相关数据一起存储在字符串中,如上例所示。具体来说,每个 variable_name/its_data 单元由 `|` 分隔;在每个单元内,变量名后跟 `:`,其后是相关数据。
我想要一个像这样的最终数据框:
animal wks site PI GI
mouse 12 cage 78 NA
dog 32 <NA> NA 0.2
cat 8 wild 13 NA
我们可以使用 base R 中的 read.dcf:
# Parse foo$vars with read.dcf(): each "name: value" unit becomes one DCF
# field line, and a blank line separates the records of different rows.
# Backslashes are doubled because "\s" and "\|" are not valid escapes in an
# R string literal.
records <- paste(gsub("\\s+\\|\\s+", "\n", foo$vars), collapse = "\n\n")
con <- textConnection(records)
# read.dcf() returns a character matrix with one column per field name;
# type.convert() then picks integer/double/character per column.
out <- type.convert(as.data.frame(read.dcf(con)), as.is = TRUE)
# The connection was opened here, so close it here.
close(con)
输出:
> out
animal wks site PI GI
1 mouse 12 cage 78 NA
2 dog 32 <NA> NA 0.2
3 cat 8 wild 13 NA
> str(out)
'data.frame': 3 obs. of 5 variables:
$ animal: chr "mouse" "dog" "cat"
$ wks : int 12 32 8
$ site : chr "cage" NA "wild"
$ PI : int 78 NA 13
$ GI : num NA 0.2 NA
这是一个 dplyr 解决方案:
library(dplyr)
library(tidyr)
# Reshape the "name: value | name: value" strings in foo$vars into one column
# per variable name and one row per original record.
tibble(foo) %>%
  # Remember which record each unit came from before the rows are split up.
  mutate(row = row_number()) %>%
  # One row per "name: value" unit; "|" is a regex metacharacter, so it is
  # escaped (with a doubled backslash, as R string literals require).
  separate_rows(vars, sep = "\\|") %>%
  # Split each unit into its name (a) and its value (b) at the colon.
  separate(vars, c("a", "b"), sep = ":") %>%
  # Strip the padding left around the separators; base trimws() is used so
  # that no package beyond dplyr/tidyr has to be attached.
  mutate(across(everything(), trimws)) %>%
  # Names become columns; `row` is the remaining id column, so no grouping
  # is needed before pivoting.
  pivot_wider(names_from = a, values_from = b) %>%
  type.convert(as.is = TRUE) %>%
  select(-row)
animal wks site PI GI
<chr> <int> <chr> <int> <dbl>
1 mouse 12 cage 78 NA
2 dog 32 NA NA 0.2
3 cat 8 wild 13 NA
另一个 base R 方案,使用 Reduce + merge:
# Build a one-row data frame per record, then fold them together with a full
# outer merge so that variables missing from a record become NA.
type.convert(
  Reduce(
    function(x, y) merge(x, y, all = TRUE),
    lapply(
      # Split on either separator; "|" must be escaped in the regex, and the
      # backslash doubled because "\|" is not a valid R string escape.
      strsplit(foo$vars, ":|\\|"),
      function(x) {
        # The pieces alternate name, value, name, value, ...; filling a
        # two-row matrix column-wise puts names in row 1 and values in row 2.
        m <- matrix(trimws(x), nrow = 2)
        setNames(data.frame(m[2, , drop = FALSE]), m[1, ])
      }
    )
  ),
  as.is = TRUE
)
结果为:
animal wks site PI GI
1 cat 8 wild 13 NA
2 dog 32 <NA> NA 0.2
3 mouse 12 cage 78 NA
这是一种先用 scan() 逐行解析、再用 merge 合并的方法:
# Parse every record with scan(), turn it into a one-row data frame, then
# combine all rows with a full outer merge (missing variables become NA).
# seq_len() keeps the loop empty when foo has no rows.
lapply(seq_len(nrow(foo)), \(i) {
  # Split one record on "|" and drop the padding around each unit.
  scan(
    text = foo$vars[i], what = character(), sep = "|",
    strip.white = TRUE, quiet = TRUE
  ) |>
    # Two-column character matrix: variable name, value.
    (\(units) do.call(rbind, strsplit(units, ": ")))() |>
    # Values become a single row, names become the column names.
    (\(kv) setNames(data.frame(t(kv[, 2])), kv[, 1]))()
}) |>
  (\(rows) Reduce(\(...) merge(..., all = TRUE), rows))()
# animal wks site PI GI
# 1 cat 8 wild 13 <NA>
# 2 dog 32 <NA> <NA> 0.2
# 3 mouse 12 cage 78 <NA>
注意:以上代码使用了 R >= 4.1 的语法(匿名函数简写 `\(x)` 和原生管道 `|>`)。