根据R中特定字符分隔的行中的数据创建新列
Creating new columns based on data in row separated by specific character in R
我有以下表格:
Owner | Pet | Housing_Type |
---|---|---|
A | Cats;Dog;Rabbit | 3 |
B | Dog;Rabbit | 2 |
C | Cats | 2 |
D | Cats;Rabbit | 3 |
E | Cats;Fish | 1 |
代码如下:
# Example data: one row per owner. `Pets` is a factor whose values are
# ";"-separated animal lists; note that the level "Cats " contains a
# trailing space.
Data_Pets <- data.frame(
  Owner = factor(
    c("A", "B", "C", "D", "E"),
    levels = c("A", "B", "C", "D", "E")
  ),
  Pets = factor(
    c("Cats;Dog;Rabbit", "Dog;Rabbit", "Cats ", "Cats;Rabbit", "Cats;Fish"),
    levels = c(
      "Cats ", "Cats;Dog;Rabbit", "Cats;Fish", "Cats;Rabbit", "Dog;Rabbit"
    )
  ),
  House_Type = c(3L, 2L, 2L, 3L, 1L)
)
谁能告诉我如何根据 Pets 列中的数据创建新列——即按 ; 拆分后为每种动物各创建一列——使结果看起来像下面这张表?
Owner | Cats | Dog | Rabbit | Fish | Housing_Type |
---|---|---|---|---|---|
A | Y | Y | Y | N | 3 |
B | N | Y | Y | N | 2 |
C | Y | N | N | N | 2 |
D | Y | N | Y | N | 3 |
E | Y | N | N | Y | 1 |
谢谢!
一种方法是定义一个匹配特定动物的辅助函数,然后将列绑定到原始框架。
请注意,这里做了一些数据整理(wrangling)来去除空格,以便识别出需要查询的各种不同动物。
#' Flag which pet lists contain a given animal
#'
#' Vectorised over `match`: for several strings and several animal names the
#' result is a character matrix with one row per element of `string` and one
#' column per animal.
#'
#' @param string Character vector (or factor) of `sep`-separated pet lists.
#' @param match Animal name(s) to look for. Names are compared literally
#'   against whole entries, so "Cat" does not match "Cats" and regex
#'   metacharacters in a name have no special meaning.
#' @param sep Literal separator between entries; defaults to ";".
#' @return "Y" where the animal is listed, "N" otherwise.
f <- Vectorize(function(string, match, sep = ";") {
  # Split each list into its entries; whitespace around an entry (such as
  # the stray "Cats " value) is trimmed before comparing.
  entries <- strsplit(as.character(string), sep, fixed = TRUE)
  listed <- vapply(entries, function(x) match %in% trimws(x), logical(1))
  ifelse(listed, "Y", "N")
}, c("match"))
library(dplyr)

# Append one Y/N column per animal to the example data frame `Data_Pets`.
# The animal names are the distinct entries of the Pets column: split on the
# literal ";" and trim every entry, so "Cats " and "Cats" count as one name.
Data_Pets %>%
  bind_cols(
    f(
      Data_Pets$Pets,
      unique(trimws(unlist(
        strsplit(as.character(Data_Pets$Pets), ";", fixed = TRUE)
      )))
    )
  )
Owner Pets House_Type Cats Dog Rabbit Fish
1 A Cats;Dog;Rabbit 3 Y Y Y N
2 B Dog;Rabbit 2 N Y Y N
3 C Cats 2 Y N N N
4 D Cats;Rabbit 3 Y N Y N
5 E Cats;Fish 1 Y N N Y
或者更一般化一点:如果您不确定分隔符是否为 ;,并且数据中存在空格,stringi 会很有用:
# Same idea, but the animal names are the distinct words that stringi
# extracts from the Pets column of the example data frame `Data_Pets`.
dplyr::bind_cols(
  Data_Pets,
  f(
    Data_Pets$Pets,
    unique(unlist(
      stringi::stri_extract_all_words(as.character(Data_Pets$Pets))
    ))
  )
)
您可以使用 tidyr 库中的 separate_rows 和 pivot_wider:
library(tidyr)
library(dplyr)
# Reshape to one Y/N column per animal without hard-coding any animal name
# or relying on the order of the generated columns.
Data_Pets %>%
  # One row per (owner, animal) pair.
  separate_rows(Pets, sep = ";") %>%
  # Remove stray whitespace such as the trailing space in "Cats ", and mark
  # every remaining pair as present.
  mutate(Pets = trimws(Pets), present = "Y") %>%
  # An animal repeated within one owner must yield a single cell value.
  distinct() %>%
  # Spread animals into columns; owner/animal pairs that never occur get "N".
  pivot_wider(
    names_from = Pets,
    values_from = present,
    values_fill = "N"
  ) %>%
  # Keep House_Type as the final column, whichever animals are present.
  dplyr::relocate(House_Type, .after = last_col())
这将生成:
# Owner Cats Dog Rabbit Fish House_Type
# <fct> <chr> <chr> <chr> <chr> <int>
# 1 A Y Y Y N 3
# 2 B N Y Y N 2
# 3 C Y N N N 2
# 4 D Y N Y N 3
# 5 E Y N N Y 1
数据:
# Example data: one row per owner. `Pets` is a factor whose values are
# ";"-separated animal lists; note that the level "Cats " contains a
# trailing space.
Data_Pets <- data.frame(
  Owner = factor(
    c("A", "B", "C", "D", "E"),
    levels = c("A", "B", "C", "D", "E")
  ),
  Pets = factor(
    c("Cats;Dog;Rabbit", "Dog;Rabbit", "Cats ", "Cats;Rabbit", "Cats;Fish"),
    levels = c(
      "Cats ", "Cats;Dog;Rabbit", "Cats;Fish", "Cats;Rabbit", "Dog;Rabbit"
    )
  ),
  House_Type = c(3L, 2L, 2L, 3L, 1L)
)
我有以下表格:
Owner | Pet | Housing_Type |
---|---|---|
A | Cats;Dog;Rabbit | 3 |
B | Dog;Rabbit | 2 |
C | Cats | 2 |
D | Cats;Rabbit | 3 |
E | Cats;Fish | 1 |
代码如下:
# Example data: one row per owner. `Pets` is a factor whose values are
# ";"-separated animal lists; note that the level "Cats " contains a
# trailing space.
Data_Pets <- data.frame(
  Owner = factor(
    c("A", "B", "C", "D", "E"),
    levels = c("A", "B", "C", "D", "E")
  ),
  Pets = factor(
    c("Cats;Dog;Rabbit", "Dog;Rabbit", "Cats ", "Cats;Rabbit", "Cats;Fish"),
    levels = c(
      "Cats ", "Cats;Dog;Rabbit", "Cats;Fish", "Cats;Rabbit", "Dog;Rabbit"
    )
  ),
  House_Type = c(3L, 2L, 2L, 3L, 1L)
)
谁能告诉我如何根据 Pets 列中的数据创建新列——即按 ; 拆分后为每种动物各创建一列——使结果看起来像下面这张表?
Owner | Cats | Dog | Rabbit | Fish | Housing_Type |
---|---|---|---|---|---|
A | Y | Y | Y | N | 3 |
B | N | Y | Y | N | 2 |
C | Y | N | N | N | 2 |
D | Y | N | Y | N | 3 |
E | Y | N | N | Y | 1 |
谢谢!
一种方法是定义一个匹配特定动物的辅助函数,然后将列绑定到原始框架。
请注意,这里做了一些数据整理(wrangling)来去除空格,以便识别出需要查询的各种不同动物。
#' Flag which pet lists contain a given animal
#'
#' Vectorised over `match`: for several strings and several animal names the
#' result is a character matrix with one row per element of `string` and one
#' column per animal.
#'
#' @param string Character vector (or factor) of `sep`-separated pet lists.
#' @param match Animal name(s) to look for. Names are compared literally
#'   against whole entries, so "Cat" does not match "Cats" and regex
#'   metacharacters in a name have no special meaning.
#' @param sep Literal separator between entries; defaults to ";".
#' @return "Y" where the animal is listed, "N" otherwise.
f <- Vectorize(function(string, match, sep = ";") {
  # Split each list into its entries; whitespace around an entry (such as
  # the stray "Cats " value) is trimmed before comparing.
  entries <- strsplit(as.character(string), sep, fixed = TRUE)
  listed <- vapply(entries, function(x) match %in% trimws(x), logical(1))
  ifelse(listed, "Y", "N")
}, c("match"))
library(dplyr)

# Append one Y/N column per animal to the example data frame `Data_Pets`.
# The animal names are the distinct entries of the Pets column: split on the
# literal ";" and trim every entry, so "Cats " and "Cats" count as one name.
Data_Pets %>%
  bind_cols(
    f(
      Data_Pets$Pets,
      unique(trimws(unlist(
        strsplit(as.character(Data_Pets$Pets), ";", fixed = TRUE)
      )))
    )
  )
Owner Pets House_Type Cats Dog Rabbit Fish
1 A Cats;Dog;Rabbit 3 Y Y Y N
2 B Dog;Rabbit 2 N Y Y N
3 C Cats 2 Y N N N
4 D Cats;Rabbit 3 Y N Y N
5 E Cats;Fish 1 Y N N Y
或者更一般化一点:如果您不确定分隔符是否为 ;,并且数据中存在空格,stringi 会很有用:
# Same idea, but the animal names are the distinct words that stringi
# extracts from the Pets column of the example data frame `Data_Pets`.
dplyr::bind_cols(
  Data_Pets,
  f(
    Data_Pets$Pets,
    unique(unlist(
      stringi::stri_extract_all_words(as.character(Data_Pets$Pets))
    ))
  )
)
您可以使用 tidyr 库中的 separate_rows 和 pivot_wider:
library(tidyr)
library(dplyr)
# Reshape to one Y/N column per animal without hard-coding any animal name
# or relying on the order of the generated columns.
Data_Pets %>%
  # One row per (owner, animal) pair.
  separate_rows(Pets, sep = ";") %>%
  # Remove stray whitespace such as the trailing space in "Cats ", and mark
  # every remaining pair as present.
  mutate(Pets = trimws(Pets), present = "Y") %>%
  # An animal repeated within one owner must yield a single cell value.
  distinct() %>%
  # Spread animals into columns; owner/animal pairs that never occur get "N".
  pivot_wider(
    names_from = Pets,
    values_from = present,
    values_fill = "N"
  ) %>%
  # Keep House_Type as the final column, whichever animals are present.
  dplyr::relocate(House_Type, .after = last_col())
这将生成:
# Owner Cats Dog Rabbit Fish House_Type
# <fct> <chr> <chr> <chr> <chr> <int>
# 1 A Y Y Y N 3
# 2 B N Y Y N 2
# 3 C Y N N N 2
# 4 D Y N Y N 3
# 5 E Y N N Y 1
数据:
# Example data: one row per owner. `Pets` is a factor whose values are
# ";"-separated animal lists; note that the level "Cats " contains a
# trailing space.
Data_Pets <- data.frame(
  Owner = factor(
    c("A", "B", "C", "D", "E"),
    levels = c("A", "B", "C", "D", "E")
  ),
  Pets = factor(
    c("Cats;Dog;Rabbit", "Dog;Rabbit", "Cats ", "Cats;Rabbit", "Cats;Fish"),
    levels = c(
      "Cats ", "Cats;Dog;Rabbit", "Cats;Fish", "Cats;Rabbit", "Dog;Rabbit"
    )
  ),
  House_Type = c(3L, 2L, 2L, 3L, 1L)
)