如何在 R 中将一个字符列拆分为多个列
How to split a character column into multiple columns in R
我有一个数据框x
:
dput(x)
structure(list(District = structure(c(6L, 6L, 6L, 6L, 6L, 6L), .Label = c("District - Central (06)",
"District - East (04)", "District - New Delhi (05)", "District - North (02)",
"District - North East (03)", "District - North West (01)", "District - South (09)",
"District - South West (08)", "District - West (07)"), class = "factor"),
Age = structure(c(103L, 1L, 2L, 14L, 25L, 36L), .Label = c("0",
"1", "10", "100+", "11", "12", "13", "14", "15", "16", "17",
"18", "19", "2", "20", "21", "22", "23", "24", "25", "26",
"27", "28", "29", "3", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "4", "40", "41", "42", "43", "44",
"45", "46", "47", "48", "49", "5", "50", "51", "52", "53",
"54", "55", "56", "57", "58", "59", "6", "60", "61", "62",
"63", "64", "65", "66", "67", "68", "69", "7", "70", "71",
"72", "73", "74", "75", "76", "77", "78", "79", "8", "80",
"81", "82", "83", "84", "85", "86", "87", "88", "89", "9",
"90", "91", "92", "93", "94", "95", "96", "97", "98", "99",
"Age not stated", "All ages"), class = "factor"), Total = c(3656539L,
56131L, 58644L, 63835L, 63859L, 64945L), Rural = c(213950L,
3589L, 3757L, 4200L, 4102L, 4223L), Urban = c(3442589L, 52542L,
54887L, 59635L, 59757L, 60722L)), .Names = c("District",
"Age", "Total", "Rural", "Urban"), row.names = c(NA, 6L), class = "data.frame")
我想拆分 District
列以将分区名称提取到新列 Name
中。例如,"District - North West (01)" 应该拆分得到 "North West"。
我尝试了 str_split_fixed
并得到:
x
District Age Total Rural Urban 1 name
1 District - North West (01) All ages 3656539 213950 3442589 North West (01)
2 District - North West (01) 0 56131 3589 52542 North West (01)
3 District - North West (01) 1 58644 3757 54887 North West (01)
4 District - North West (01) 2 63835 4200 59635 North West (01)
5 District - North West (01) 3 63859 4102 59757 North West (01)
6 District - North West (01) 4 64945 4223 60722 North West (01)
我尝试再次使用相同的函数拆分 name
列以将地区名称与代码分开,但它给我以下错误:
Error in stri_split_regex(string, pattern, n = n, simplify = TRUE, opts_regex = attr(pattern, :
Incorrectly nested parentheses in regexp pattern. (U_REGEX_MISMATCHED_PAREN)
有没有办法在单个函数中根据模式将字符列拆分为多个列?
你可以用 gsub 得到你想要的结果:
# Keep only the district name: capture the letters/spaces between " - " and " (".
# Backslashes are doubled because an R string literal needs "\\(" to hand the
# regex engine an escaped parenthesis and "\\1" to hand it the backreference.
gsub("^.* +- +([A-Za-z ]+) \\(.*$", "\\1", df$District)
[1] "North West" "North West" "North West" "North West" "North West" "North West"
gsub
的第一个参数("^.* +- +([A-Za-z ]+) \\(.*$")是正则表达式,可以解释如下:
从字符串开头“^”开始,匹配任意字符“.*”,后跟至少一个空格、一个连字符和至少一个空格“ +- +”。然后用“()”捕获接下来由(至少一个)字母和空格组成的文本“[A-Za-z ]+”。遇到空格后跟左括号“ \\(”时停止捕获,然后用“.*$”匹配直到字符串末尾的所有内容。
gsub
的第二个参数“\\1”表示用第一个括号捕获到的文本替换整个匹配。
将其分配给变量:
# Store the extracted district name in a new column. The doubled backslashes
# ("\\(" and "\\1") are required inside an R string literal so the regex engine
# receives an escaped parenthesis and a backreference to the captured group.
df$name <- gsub("^.* +- +([A-Za-z ]+) \\(.*$", "\\1", df$District)
你可以使用
library(stringr)
data.frame(str_split_fixed(df$District, " ", 3))
X1 X2 X3
1 District - North West (01)
2 District - North West (01)
3 District - North West (01)
4 District - North West (01)
5 District - North West (01)
6 District - North West (01)
将上面的结果赋值给 df 之后(df <- data.frame(str_split_fixed(df$District, " ", 3))),您可以使用 gsub
删除 X3 列中多余的内容,
gsub("[[:digit:]]","",df$X3)
gsub("[[:punct:]]","",df$X3)
等等
您还可以匹配提取:
library(stringi)
library(dplyr)
library(purrr)
# Add two columns extracted from District: `name` (the text between "- " and
# " (") and `code` (the digits inside the parentheses).
mutate(x,
       # Letters and spaces up to the opening parenthesis, so one-word and
       # multi-word district names are both captured.
       name = map_chr(stri_match_all_regex(District, "- ([[:alpha:] ]+) \\("),
                      function(m) m[, 2]),
       # "\\(" and "\\)" match literal parentheses; column 2 of the match
       # matrix holds the first capture group.
       code = map_chr(stri_match_all_regex(District, "\\(([[:digit:]]+)\\)"),
                      function(m) m[, 2]))
## District Age Total Rural Urban name code
## 1 District - North West (01) All ages 3656539 213950 3442589 North West 01
## 2 District - North West (01) 0 56131 3589 52542 North West 01
## 3 District - North West (01) 1 58644 3757 54887 North West 01
## 4 District - North West (01) 2 63835 4200 59635 North West 01
## 5 District - North West (01) 3 63859 4102 59757 North West 01
## 6 District - North West (01) 4 64945 4223 60722 North West 01
我有一个数据框x
:
dput(x)
structure(list(District = structure(c(6L, 6L, 6L, 6L, 6L, 6L), .Label = c("District - Central (06)",
"District - East (04)", "District - New Delhi (05)", "District - North (02)",
"District - North East (03)", "District - North West (01)", "District - South (09)",
"District - South West (08)", "District - West (07)"), class = "factor"),
Age = structure(c(103L, 1L, 2L, 14L, 25L, 36L), .Label = c("0",
"1", "10", "100+", "11", "12", "13", "14", "15", "16", "17",
"18", "19", "2", "20", "21", "22", "23", "24", "25", "26",
"27", "28", "29", "3", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "4", "40", "41", "42", "43", "44",
"45", "46", "47", "48", "49", "5", "50", "51", "52", "53",
"54", "55", "56", "57", "58", "59", "6", "60", "61", "62",
"63", "64", "65", "66", "67", "68", "69", "7", "70", "71",
"72", "73", "74", "75", "76", "77", "78", "79", "8", "80",
"81", "82", "83", "84", "85", "86", "87", "88", "89", "9",
"90", "91", "92", "93", "94", "95", "96", "97", "98", "99",
"Age not stated", "All ages"), class = "factor"), Total = c(3656539L,
56131L, 58644L, 63835L, 63859L, 64945L), Rural = c(213950L,
3589L, 3757L, 4200L, 4102L, 4223L), Urban = c(3442589L, 52542L,
54887L, 59635L, 59757L, 60722L)), .Names = c("District",
"Age", "Total", "Rural", "Urban"), row.names = c(NA, 6L), class = "data.frame")
我想拆分 District
列以将分区名称提取到新列 Name
中。例如,"District - North West (01)" 应该拆分得到 "North West"。
我尝试了 str_split_fixed
并得到:
x
District Age Total Rural Urban 1 name
1 District - North West (01) All ages 3656539 213950 3442589 North West (01)
2 District - North West (01) 0 56131 3589 52542 North West (01)
3 District - North West (01) 1 58644 3757 54887 North West (01)
4 District - North West (01) 2 63835 4200 59635 North West (01)
5 District - North West (01) 3 63859 4102 59757 North West (01)
6 District - North West (01) 4 64945 4223 60722 North West (01)
我尝试再次使用相同的函数拆分 name
列以将地区名称与代码分开,但它给我以下错误:
Error in stri_split_regex(string, pattern, n = n, simplify = TRUE, opts_regex = attr(pattern, : Incorrectly nested parentheses in regexp pattern. (U_REGEX_MISMATCHED_PAREN)
有没有办法在单个函数中根据模式将字符列拆分为多个列?
你可以用 gsub 得到你想要的结果:
# Keep only the district name: capture the letters/spaces between " - " and " (".
# Backslashes are doubled because an R string literal needs "\\(" to hand the
# regex engine an escaped parenthesis and "\\1" to hand it the backreference.
gsub("^.* +- +([A-Za-z ]+) \\(.*$", "\\1", df$District)
[1] "North West" "North West" "North West" "North West" "North West" "North West"
gsub
的第一个参数("^.* +- +([A-Za-z ]+) \\(.*$")是正则表达式,可以解释如下:
从字符串开头“^”开始,匹配任意字符“.*”,后跟至少一个空格、一个连字符和至少一个空格“ +- +”。然后用“()”捕获接下来由(至少一个)字母和空格组成的文本“[A-Za-z ]+”。遇到空格后跟左括号“ \\(”时停止捕获,然后用“.*$”匹配直到字符串末尾的所有内容。
gsub
的第二个参数“\\1”表示用第一个括号捕获到的文本替换整个匹配。
将其分配给变量:
# Store the extracted district name in a new column. The doubled backslashes
# ("\\(" and "\\1") are required inside an R string literal so the regex engine
# receives an escaped parenthesis and a backreference to the captured group.
df$name <- gsub("^.* +- +([A-Za-z ]+) \\(.*$", "\\1", df$District)
你可以使用
library(stringr)
data.frame(str_split_fixed(df$District, " ", 3))
X1 X2 X3
1 District - North West (01)
2 District - North West (01)
3 District - North West (01)
4 District - North West (01)
5 District - North West (01)
6 District - North West (01)
将上面的结果赋值给 df 之后(df <- data.frame(str_split_fixed(df$District, " ", 3))),您可以使用 gsub
删除 X3 列中多余的内容,
gsub("[[:digit:]]","",df$X3)
gsub("[[:punct:]]","",df$X3)
等等
您还可以匹配提取:
library(stringi)
library(dplyr)
library(purrr)
# Add two columns extracted from District: `name` (the text between "- " and
# " (") and `code` (the digits inside the parentheses).
mutate(x,
       # Letters and spaces up to the opening parenthesis, so one-word and
       # multi-word district names are both captured.
       name = map_chr(stri_match_all_regex(District, "- ([[:alpha:] ]+) \\("),
                      function(m) m[, 2]),
       # "\\(" and "\\)" match literal parentheses; column 2 of the match
       # matrix holds the first capture group.
       code = map_chr(stri_match_all_regex(District, "\\(([[:digit:]]+)\\)"),
                      function(m) m[, 2]))
## District Age Total Rural Urban name code
## 1 District - North West (01) All ages 3656539 213950 3442589 North West 01
## 2 District - North West (01) 0 56131 3589 52542 North West 01
## 3 District - North West (01) 1 58644 3757 54887 North West 01
## 4 District - North West (01) 2 63835 4200 59635 North West 01
## 5 District - North West (01) 3 63859 4102 59757 North West 01
## 6 District - North West (01) 4 64945 4223 60722 North West 01