如何将值中的字符串转换为属性和值?
How to convert string in value to attributes and values?
我有约 300 万条观测值,其中有一个名为“other_tags”的属性。“other_tags”中的值需要拆分并转换为新的属性(列)及其对应的值。
dput()
# Output of dput() for the sample data: a data.frame with a numeric osm_id
# column and a factor other_tags column (three levels, used in the order
# 2, 3, 1). The expression is not assigned here; assign it to a name such as
# `df` to work with it.
structure(list(osm_id = c(105093, 107975, 373652), other_tags = structure(c(2L,
3L, 1L), .Label = c("\"addr:city\"=>\"Neuenegg\",\"addr:street\"=>\"Stuberweg\",\"building\"=>\"school\",\"building:levels\"=>\"2\"",
"\"building\"=>\"commercial\",\"name\"=>\"Pollahof\",\"type\"=>\"multipolygon\"",
"\"building\"=>\"yes\",\"amenity\"=>\"sport\",\"type\"=>\"multipolygon\""
), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
这是数据的子样本:
osm_id other_tags
105093 "building"=>"commercial","name"=>"Pollahof","type"=>"multipolygon"
107975 "building"=>"yes","amenity"=>"sport","type"=>"multipolygon"
373652 "addr:city"=>"Neuenegg","addr:street"=>"Stuberweg","building"=>"school","building:levels"=>"2"
这是所需的数据格式:创建新属性(仅用于建筑和便利设施)并添加值。
osm_id building amenity
105093 commercial
107975 yes sport
373652 school
感谢您的帮助!
没那么难。
other_tags
是因子列,所以我们必须先对它使用 as.character
- 在中间列表中提取结果
s
,其中所有变量都已分开;这是使用 strsplit
并以 split = ','
作为分隔符拆分之后得到的
- 将这些属性存放到单独的行(separate rows)中,每个属性一行,保存在一个新的数据框里,例如
df2
- 使用
tidyr
中的 separate()
在两个单独的列中拆分属性名称和值。分隔符 sep
这次用作 =>
- 使用
str_remove_all
删除多余的引号
- 可选
filter
数据集,最后用
pivot_wider
转换成想要的格式。
library(tidyverse)
# Split every other_tags string into its individual "key"=>"value" entries.
# as.character() is required because other_tags is a factor column.
s <- strsplit(as.character(df$other_tags), split = ",")

# One row per tag: repeat each osm_id once for every entry it owns.
# lengths() always yields an integer vector (sapply(s, length) returns a list
# for empty input) and avoids an R-level loop over millions of rows.
df2 <- data.frame(
  osm_id = rep(df$osm_id, lengths(s)),
  other_tags = unlist(s)
)

df2 %>%
  # Col1 receives the tag name, Col2 the tag value.
  separate(other_tags, into = c("Col1", "Col2"), sep = "=>") %>%
  # Remove the double quotes around names and values.
  mutate(across(starts_with("Col"), ~ str_remove_all(., '"'))) %>%
  # Keep only the tags that should become columns.
  filter(Col1 %in% c("amenity", "building")) %>%
  # One row per osm_id, one column per remaining tag name.
  pivot_wider(id_cols = osm_id, names_from = Col1, values_from = Col2)
# A tibble: 3 x 3
osm_id building amenity
<dbl> <chr> <chr>
1 105093 commercial NA
2 107975 yes sport
3 373652 school NA
但是,如果不使用过滤器
# Same reshaping without the filter step, so every tag name found in the
# data becomes its own column.
df2 %>%
  separate(col = other_tags, into = c("Col1", "Col2"), sep = "=>") %>%
  mutate(
    across(
      .cols = starts_with("Col"),
      .fns = function(x) str_remove_all(x, '"')
    )
  ) %>%
  pivot_wider(id_cols = osm_id, names_from = Col1, values_from = Col2)
# A tibble: 3 x 8
osm_id building name type amenity `addr:city` `addr:street` `building:levels`
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 105093 commercial Pollahof multipolygon NA NA NA NA
2 107975 yes NA multipolygon sport NA NA NA
3 373652 school NA NA NA Neuenegg Stuberweg 2
单管道语法
# The whole transformation as a single pipeline starting from df.
df %>%
  # Factor -> character, then split into a list column of tag entries.
  mutate(other_tags = str_split(as.character(other_tags), ",")) %>%
  # One row per tag entry.
  unnest(cols = other_tags) %>%
  # Remove the double quotes before separating name and value.
  mutate(other_tags = str_remove_all(other_tags, '"')) %>%
  separate(col = other_tags, into = c("Col1", "Col2"), sep = "=>") %>%
  filter(Col1 %in% c("amenity", "building")) %>%
  pivot_wider(id_cols = osm_id, names_from = Col1, values_from = Col2)
# A tibble: 3 x 3
osm_id building amenity
<dbl> <chr> <chr>
1 105093 commercial NA
2 107975 yes sport
3 373652 school NA
我们可以使用 (g)sub
和 str_extract
以及环视(只需两行代码):
library(stringr)
# Strip the double quotes, then extract the word following "building=>" and
# "amenity=>" with a lookbehind. The backslash of \w has to be doubled inside
# an R string literal ('\w' is an unrecognized escape and fails to parse).
# The lookahead accepts a comma or the end of the string, so a tag that is
# the last entry of other_tags is still extracted instead of yielding NA.
df$building <- str_extract(
  gsub('"', '', df$other_tags), '(?<=building=>)\\w+(?=,|$)'
)
df$amenity <- str_extract(
  gsub('"', '', df$other_tags), '(?<=amenity=>)\\w+(?=,|$)'
)
如果出于某种原因你想删除列 other_tags
:
# Assigning NULL to a column removes it from the data frame.
df[["other_tags"]] <- NULL
结果:
df
osm_id building amenity
1 105093 commercial <NA>
2 107975 yes sport
3 373652 school <NA>
我有约 300 万条观测值,其中有一个名为“other_tags”的属性。“other_tags”中的值需要拆分并转换为新的属性(列)及其对应的值。
dput()
# Output of dput() for the sample data: a data.frame with a numeric osm_id
# column and a factor other_tags column (three levels, used in the order
# 2, 3, 1). The expression is not assigned here; assign it to a name such as
# `df` to work with it.
structure(list(osm_id = c(105093, 107975, 373652), other_tags = structure(c(2L,
3L, 1L), .Label = c("\"addr:city\"=>\"Neuenegg\",\"addr:street\"=>\"Stuberweg\",\"building\"=>\"school\",\"building:levels\"=>\"2\"",
"\"building\"=>\"commercial\",\"name\"=>\"Pollahof\",\"type\"=>\"multipolygon\"",
"\"building\"=>\"yes\",\"amenity\"=>\"sport\",\"type\"=>\"multipolygon\""
), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
这是数据的子样本:
osm_id other_tags
105093 "building"=>"commercial","name"=>"Pollahof","type"=>"multipolygon"
107975 "building"=>"yes","amenity"=>"sport","type"=>"multipolygon"
373652 "addr:city"=>"Neuenegg","addr:street"=>"Stuberweg","building"=>"school","building:levels"=>"2"
这是所需的数据格式:创建新属性(仅用于建筑和便利设施)并添加值。
osm_id building amenity
105093 commercial
107975 yes sport
373652 school
感谢您的帮助!
没那么难。
other_tags
是因子列,所以我们必须先对它使用 as.character 进行转换
- 在中间列表中提取结果
s
,其中所有变量都已分开(使用 strsplit 并以 split = ',' 作为分隔符拆分之后)
- 将这些属性存放到单独的行(separate rows)中,每个属性一行,保存在一个新的数据框里,例如
df2
- 使用
tidyr
中的separate()
在两个单独的列中拆分属性名称和值。分隔符sep
这次用作=>
- 使用
str_remove_all
删除多余的引号
- 可选
filter
数据集,最后用 pivot_wider
转换成想要的格式。
as.character
split = ','
中分离出来之后
library(tidyverse)
# Split every other_tags string into its individual "key"=>"value" entries.
# as.character() is required because other_tags is a factor column.
s <- strsplit(as.character(df$other_tags), split = ",")

# One row per tag: repeat each osm_id once for every entry it owns.
# lengths() always yields an integer vector (sapply(s, length) returns a list
# for empty input) and avoids an R-level loop over millions of rows.
df2 <- data.frame(
  osm_id = rep(df$osm_id, lengths(s)),
  other_tags = unlist(s)
)

df2 %>%
  # Col1 receives the tag name, Col2 the tag value.
  separate(other_tags, into = c("Col1", "Col2"), sep = "=>") %>%
  # Remove the double quotes around names and values.
  mutate(across(starts_with("Col"), ~ str_remove_all(., '"'))) %>%
  # Keep only the tags that should become columns.
  filter(Col1 %in% c("amenity", "building")) %>%
  # One row per osm_id, one column per remaining tag name.
  pivot_wider(id_cols = osm_id, names_from = Col1, values_from = Col2)
# A tibble: 3 x 3
osm_id building amenity
<dbl> <chr> <chr>
1 105093 commercial NA
2 107975 yes sport
3 373652 school NA
但是,如果不使用过滤器
# Same reshaping without the filter step, so every tag name found in the
# data becomes its own column.
df2 %>%
  separate(col = other_tags, into = c("Col1", "Col2"), sep = "=>") %>%
  mutate(
    across(
      .cols = starts_with("Col"),
      .fns = function(x) str_remove_all(x, '"')
    )
  ) %>%
  pivot_wider(id_cols = osm_id, names_from = Col1, values_from = Col2)
# A tibble: 3 x 8
osm_id building name type amenity `addr:city` `addr:street` `building:levels`
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 105093 commercial Pollahof multipolygon NA NA NA NA
2 107975 yes NA multipolygon sport NA NA NA
3 373652 school NA NA NA Neuenegg Stuberweg 2
单管道语法
# The whole transformation as a single pipeline starting from df.
df %>%
  # Factor -> character, then split into a list column of tag entries.
  mutate(other_tags = str_split(as.character(other_tags), ",")) %>%
  # One row per tag entry.
  unnest(cols = other_tags) %>%
  # Remove the double quotes before separating name and value.
  mutate(other_tags = str_remove_all(other_tags, '"')) %>%
  separate(col = other_tags, into = c("Col1", "Col2"), sep = "=>") %>%
  filter(Col1 %in% c("amenity", "building")) %>%
  pivot_wider(id_cols = osm_id, names_from = Col1, values_from = Col2)
# A tibble: 3 x 3
osm_id building amenity
<dbl> <chr> <chr>
1 105093 commercial NA
2 107975 yes sport
3 373652 school NA
我们可以使用 (g)sub
和 str_extract
以及环视(只需两行代码):
library(stringr)
# Strip the double quotes, then extract the word following "building=>" and
# "amenity=>" with a lookbehind. The backslash of \w has to be doubled inside
# an R string literal ('\w' is an unrecognized escape and fails to parse).
# The lookahead accepts a comma or the end of the string, so a tag that is
# the last entry of other_tags is still extracted instead of yielding NA.
df$building <- str_extract(
  gsub('"', '', df$other_tags), '(?<=building=>)\\w+(?=,|$)'
)
df$amenity <- str_extract(
  gsub('"', '', df$other_tags), '(?<=amenity=>)\\w+(?=,|$)'
)
如果出于某种原因你想删除列 other_tags
:
# Assigning NULL to a column removes it from the data frame.
df[["other_tags"]] <- NULL
结果:
df
osm_id building amenity
1 105093 commercial <NA>
2 107975 yes sport
3 373652 school <NA>