识别复杂字符串中的特定术语时创建新行
Creating new rows when specific terms within complex strings are recognised
我有一个数据框,其中的字符串格式各异(单词、大小写、特殊字符、空格、连字符、重叠的单词)。这些字符串是测量员从预定义列表中选择得到的,而且测量员可以在每一行中选择多个术语。我想做的是识别这些预定义术语出现的位置,将每个术语放到新的一行中,并复制所有其他列。预定义术语是各种不同的压力(pressure)。以下是一些示例数据:
# Vocabulary of pressure terms that surveyors choose from.
Pre_defined_pressures <- c(
  "Urbanisation",
  "Land cover (general)",
  "Agriculture / Horticulture",
  "Water Quality",
  "General-effects"
)

# Example survey records. Each Pressure string concatenates several of the
# predefined terms, separated only by single spaces.
Surveyor_df <- data.frame(
  ID = c("A", "B", "C", "D"),
  Year = c(1999, 1999, 2000, 2000),
  Pressure = c(
    "Urbanisation Land cover (general)",
    "Urbanisation Land cover (general) Agriculture / Horticulture General-effects",
    "Urbanisation Land cover (general) Water Quality General-effects",
    "Urbanisation Land cover (general) Agriculture / Horticulture Water Quality General-effects"
  )
)
正确的数据框应该是什么样子。
# Expected result: one row per (record, term) pair, with ID and Year repeated
# for every term found in that record.
Correct_df <- data.frame(
  ID = rep(c("A", "B", "C", "D"), times = c(2, 4, 4, 5)),
  Year = rep(c(1999, 2000), times = c(6, 9)),
  Pressure = c(
    # A
    "Urbanisation", "Land cover (general)",
    # B
    "Urbanisation", "Land cover (general)", "Agriculture / Horticulture",
    "General-effects",
    # C
    "Urbanisation", "Land cover (general)", "Water Quality", "General-effects",
    # D
    "Urbanisation", "Land cover (general)", "Agriculture / Horticulture",
    "Water Quality", "General-effects"
  )
)
一种做法是把每个可能的值提取到单独的列中,然后用 pivot_longer 转换为长格式。如果各个值之间由唯一的字符分隔,这会更容易;但在您的数据中,空格既可能是术语内部的空格,也可能是分隔符。
library(dplyr)
library(stringr)
library(tidyr)
# Extract each predefined term into its own column (NA when the term is absent
# from the row), then stack those columns into one row per matched term.
# Note: "General-effects" has no extraction column here, so it is not returned.
Surveyor_df |>
  mutate(
    urb = str_extract(Pressure, "Urbanisation"),
    # "(" and ")" are regex metacharacters, and "\(" is not a valid escape in
    # an R string literal, so the backslashes have to be doubled.
    lan = str_extract(Pressure, "Land cover \\(general\\)"),
    agr = str_extract(Pressure, "Agriculture / Horticulture"),
    wat = str_extract(Pressure, "Water Quality")
  ) |>
  pivot_longer(cols = urb:wat, values_to = "pressure") |>
  # Drop the original combined string and the helper column names.
  select(-Pressure, -name) |>
  # Keep only the terms that were actually found.
  filter(!is.na(pressure))
输出:
#> # A tibble: 12 x 3
#> ID Year pressure
#> <chr> <dbl> <chr>
#> 1 A 1999 Urbanisation
#> 2 A 1999 Land cover (general)
#> 3 B 1999 Urbanisation
#> 4 B 1999 Land cover (general)
#> 5 B 1999 Agriculture / Horticulture
#> 6 C 2000 Urbanisation
#> 7 C 2000 Land cover (general)
#> 8 C 2000 Water Quality
#> 9 D 2000 Urbanisation
#> 10 D 2000 Land cover (general)
#> 11 D 2000 Agriculture / Horticulture
#> 12 D 2000 Water Quality
定义一个命名向量,在每个术语后面加上逗号作为分隔符(可以是任何符号)。用它替换现有的 Pressure 值,并删除末尾的逗号,然后使用 tidyr::separate_rows 为每个术语创建单独的行,最后清理多余的空白。
如果 Pressure 有很多不同的值,手动输入会很痛苦,而此方法避免了手动输入这些值。
根据 OP 的评论更新并修改了输入数据集。
library(tidyr)
library(dplyr)
library(stringr)
# Named lookup: each predefined term (the name) maps to the same term followed
# by a comma, giving every recognised term an unambiguous trailing delimiter.
vec_p <- setNames(paste0(Pre_defined_pressures, ","), Pre_defined_pressures)

Correct_df <- Surveyor_df |>
  # Literal (non-regex) replacement of every term with "term,".
  mutate(Pressure = str_replace_all(Pressure, coll(vec_p))) |>
  # The last term leaves a dangling comma at the end of the string.
  mutate(Pressure = str_remove(Pressure, ",$")) |>
  # One row per comma-separated term; the other columns are repeated.
  separate_rows(Pressure, sep = ",") |>
  # Remove the spaces that used to separate the terms.
  mutate(Pressure = str_squish(Pressure))

Correct_df
#> # A tibble: 15 × 3
#> ID Year Pressure
#> <chr> <dbl> <chr>
#> 1 A 1999 Urbanisation
#> 2 A 1999 Land cover (general)
#> 3 B 1999 Urbanisation
#> 4 B 1999 Land cover (general)
#> 5 B 1999 Agriculture / Horticulture
#> 6 B 1999 General-effects
#> 7 C 2000 Urbanisation
#> 8 C 2000 Land cover (general)
#> 9 C 2000 Water Quality
#> 10 C 2000 General-effects
#> 11 D 2000 Urbanisation
#> 12 D 2000 Land cover (general)
#> 13 D 2000 Agriculture / Horticulture
#> 14 D 2000 Water Quality
#> 15 D 2000 General-effects
由 reprex package (v2.0.1) 于 2022-05-24 创建
base R 中的一种方法是使用 grepl 查找匹配项,然后使用 col 扩展 Surveyor_df,并使用 row 扩展 Pre_defined_pressures。
# Logical matrix with one row per predefined term and one column per surveyor
# record: TRUE where the term occurs literally in that record's Pressure string
# (fixed = TRUE, so "(" and "/" are not treated as regex syntax).
# The matrix is built explicitly rather than with t(sapply(...)): sapply()
# simplifies to a plain vector when Surveyor_df has a single row, which would
# swap the roles of row() and col() below.
. <- matrix(
  unlist(
    lapply(Pre_defined_pressures, grepl, x = Surveyor_df$Pressure, fixed = TRUE)
  ),
  nrow = length(Pre_defined_pressures),
  ncol = nrow(Surveyor_df),
  byrow = TRUE
)
# Walk the TRUE cells column by column: col() selects the record whose ID and
# Year are repeated, row() selects the matching term.
. <- cbind(Surveyor_df[col(.)[.], 1:2], Pressure = Pre_defined_pressures[row(.)[.]])
all.equal(., Correct_df, check.attributes = FALSE)
#[1] TRUE
.
# ID Year Pressure
#1 A 1999 Urbanisation
#1.1 A 1999 Land cover (general)
#2 B 1999 Urbanisation
#2.1 B 1999 Land cover (general)
#2.2 B 1999 Agriculture / Horticulture
#2.3 B 1999 General-effects
#3 C 2000 Urbanisation
#3.1 C 2000 Land cover (general)
#3.2 C 2000 Water Quality
#3.3 C 2000 General-effects
#4 D 2000 Urbanisation
#4.1 D 2000 Land cover (general)
#4.2 D 2000 Agriculture / Horticulture
#4.3 D 2000 Water Quality
#4.4 D 2000 General-effects
我有一个数据框,其中的字符串格式各异(单词、大小写、特殊字符、空格、连字符、重叠的单词)。这些字符串是测量员从预定义列表中选择得到的,而且测量员可以在每一行中选择多个术语。我想做的是识别这些预定义术语出现的位置,将每个术语放到新的一行中,并复制所有其他列。预定义术语是各种不同的压力(pressure)。以下是一些示例数据:
# Vocabulary of pressure terms that surveyors choose from.
Pre_defined_pressures <- c(
  "Urbanisation",
  "Land cover (general)",
  "Agriculture / Horticulture",
  "Water Quality",
  "General-effects"
)

# Example survey records. Each Pressure string concatenates several of the
# predefined terms, separated only by single spaces.
Surveyor_df <- data.frame(
  ID = c("A", "B", "C", "D"),
  Year = c(1999, 1999, 2000, 2000),
  Pressure = c(
    "Urbanisation Land cover (general)",
    "Urbanisation Land cover (general) Agriculture / Horticulture General-effects",
    "Urbanisation Land cover (general) Water Quality General-effects",
    "Urbanisation Land cover (general) Agriculture / Horticulture Water Quality General-effects"
  )
)
正确的数据框应该是什么样子。
# Expected result: one row per (record, term) pair, with ID and Year repeated
# for every term found in that record.
Correct_df <- data.frame(
  ID = rep(c("A", "B", "C", "D"), times = c(2, 4, 4, 5)),
  Year = rep(c(1999, 2000), times = c(6, 9)),
  Pressure = c(
    # A
    "Urbanisation", "Land cover (general)",
    # B
    "Urbanisation", "Land cover (general)", "Agriculture / Horticulture",
    "General-effects",
    # C
    "Urbanisation", "Land cover (general)", "Water Quality", "General-effects",
    # D
    "Urbanisation", "Land cover (general)", "Agriculture / Horticulture",
    "Water Quality", "General-effects"
  )
)
一种做法是把每个可能的值提取到单独的列中,然后用 pivot_longer 转换为长格式。如果各个值之间由唯一的字符分隔,这会更容易;但在您的数据中,空格既可能是术语内部的空格,也可能是分隔符。
library(dplyr)
library(stringr)
library(tidyr)
# Extract each predefined term into its own column (NA when the term is absent
# from the row), then stack those columns into one row per matched term.
# Note: "General-effects" has no extraction column here, so it is not returned.
Surveyor_df |>
  mutate(
    urb = str_extract(Pressure, "Urbanisation"),
    # "(" and ")" are regex metacharacters, and "\(" is not a valid escape in
    # an R string literal, so the backslashes have to be doubled.
    lan = str_extract(Pressure, "Land cover \\(general\\)"),
    agr = str_extract(Pressure, "Agriculture / Horticulture"),
    wat = str_extract(Pressure, "Water Quality")
  ) |>
  pivot_longer(cols = urb:wat, values_to = "pressure") |>
  # Drop the original combined string and the helper column names.
  select(-Pressure, -name) |>
  # Keep only the terms that were actually found.
  filter(!is.na(pressure))
输出:
#> # A tibble: 12 x 3
#> ID Year pressure
#> <chr> <dbl> <chr>
#> 1 A 1999 Urbanisation
#> 2 A 1999 Land cover (general)
#> 3 B 1999 Urbanisation
#> 4 B 1999 Land cover (general)
#> 5 B 1999 Agriculture / Horticulture
#> 6 C 2000 Urbanisation
#> 7 C 2000 Land cover (general)
#> 8 C 2000 Water Quality
#> 9 D 2000 Urbanisation
#> 10 D 2000 Land cover (general)
#> 11 D 2000 Agriculture / Horticulture
#> 12 D 2000 Water Quality
定义一个命名向量,在每个术语后面加上逗号作为分隔符(可以是任何符号)。用它替换现有的 Pressure 值,并删除末尾的逗号,然后使用 tidyr::separate_rows 为每个术语创建单独的行,最后清理多余的空白。
如果 Pressure 有很多不同的值,手动输入会很痛苦,而此方法避免了手动输入这些值。
根据 OP 的评论更新并修改了输入数据集。
library(tidyr)
library(dplyr)
library(stringr)
# Named lookup: each predefined term (the name) maps to the same term followed
# by a comma, giving every recognised term an unambiguous trailing delimiter.
vec_p <- setNames(paste0(Pre_defined_pressures, ","), Pre_defined_pressures)

Correct_df <- Surveyor_df |>
  # Literal (non-regex) replacement of every term with "term,".
  mutate(Pressure = str_replace_all(Pressure, coll(vec_p))) |>
  # The last term leaves a dangling comma at the end of the string.
  mutate(Pressure = str_remove(Pressure, ",$")) |>
  # One row per comma-separated term; the other columns are repeated.
  separate_rows(Pressure, sep = ",") |>
  # Remove the spaces that used to separate the terms.
  mutate(Pressure = str_squish(Pressure))

Correct_df
#> # A tibble: 15 × 3
#> ID Year Pressure
#> <chr> <dbl> <chr>
#> 1 A 1999 Urbanisation
#> 2 A 1999 Land cover (general)
#> 3 B 1999 Urbanisation
#> 4 B 1999 Land cover (general)
#> 5 B 1999 Agriculture / Horticulture
#> 6 B 1999 General-effects
#> 7 C 2000 Urbanisation
#> 8 C 2000 Land cover (general)
#> 9 C 2000 Water Quality
#> 10 C 2000 General-effects
#> 11 D 2000 Urbanisation
#> 12 D 2000 Land cover (general)
#> 13 D 2000 Agriculture / Horticulture
#> 14 D 2000 Water Quality
#> 15 D 2000 General-effects
由 reprex package (v2.0.1) 于 2022-05-24 创建
base R 中的一种方法是使用 grepl 查找匹配项,然后使用 col 扩展 Surveyor_df,并使用 row 扩展 Pre_defined_pressures。
# Logical matrix with one row per predefined term and one column per surveyor
# record: TRUE where the term occurs literally in that record's Pressure string
# (fixed = TRUE, so "(" and "/" are not treated as regex syntax).
# The matrix is built explicitly rather than with t(sapply(...)): sapply()
# simplifies to a plain vector when Surveyor_df has a single row, which would
# swap the roles of row() and col() below.
. <- matrix(
  unlist(
    lapply(Pre_defined_pressures, grepl, x = Surveyor_df$Pressure, fixed = TRUE)
  ),
  nrow = length(Pre_defined_pressures),
  ncol = nrow(Surveyor_df),
  byrow = TRUE
)
# Walk the TRUE cells column by column: col() selects the record whose ID and
# Year are repeated, row() selects the matching term.
. <- cbind(Surveyor_df[col(.)[.], 1:2], Pressure = Pre_defined_pressures[row(.)[.]])
all.equal(., Correct_df, check.attributes = FALSE)
#[1] TRUE
.
# ID Year Pressure
#1 A 1999 Urbanisation
#1.1 A 1999 Land cover (general)
#2 B 1999 Urbanisation
#2.1 B 1999 Land cover (general)
#2.2 B 1999 Agriculture / Horticulture
#2.3 B 1999 General-effects
#3 C 2000 Urbanisation
#3.1 C 2000 Land cover (general)
#3.2 C 2000 Water Quality
#3.3 C 2000 General-effects
#4 D 2000 Urbanisation
#4.1 D 2000 Land cover (general)
#4.2 D 2000 Agriculture / Horticulture
#4.3 D 2000 Water Quality
#4.4 D 2000 General-effects