长字符向量的特定字符串操作
Specific string manipulation of a long vector of characters
我是 R 的初学者。我有以下来自更大数据集的字符向量。我只想保留每个字符串中的数值。我该如何进行?我已经尝试使用 stringr 包的一些功能,但没有成功。感谢您的帮助。
```r
x <- c("(799.88) (966.01) (1634.17) (4714.35) (2992.45) (3200.66)",
"Per capita monthly income 226.9 312.29 452.16 1037.67 1145.13 1178.85",
"(375.99) (293.48) (749.61) (1832.05) (980.07) (1224.46)", "Per capita income / Hour of work 4.10 10.63 8.91 14.40 22.52 18.12 ",
"(6.88) (20.87) (17.30) (27.44) (27.68) (24.47)", "Number of observations (with weight) 727,671 142,936 630,353 413,807 86,717 248,179"
)
```
可以使用 `str_extract_all`;用字符类 `[.,]` 匹配数字之间的小数点 `.` 或千位分隔符 `,`:
library(stringr)
# Extract every number of the form digits + "." or "," + digits from each
# element of `x`; the result is a list with one character vector per element.
# The backslash must be doubled in an R string literal: "\d" is not a valid
# escape sequence and fails to parse, while "\\d" passes \d to the regex engine.
str_extract_all(x, "\\d+[.,]\\d+")
[[1]]
[1] "799.88" "966.01" "1634.17" "4714.35" "2992.45" "3200.66"
[[2]]
[1] "226.9" "312.29" "452.16" "1037.67" "1145.13" "1178.85"
[[3]]
[1] "375.99" "293.48" "749.61" "1832.05" "980.07" "1224.46"
[[4]]
[1] "4.10" "10.63" "8.91" "14.40" "22.52" "18.12"
[[5]]
[1] "6.88" "20.87" "17.30" "27.44" "27.68" "24.47"
[[6]]
[1] "727,671" "142,936" "630,353" "413,807" "86,717" "248,179"
要将它们全部放在一个向量(而不是列表)中,请使用 `unlist`:
# Flatten the per-element list of matches into a single character vector.
# "\\d" (doubled backslash) is required: "\d" is an invalid escape in an R
# string literal and would not parse.
unlist(str_extract_all(x, "\\d+[.,]\\d+"))
[1] "799.88" "966.01" "1634.17" "4714.35" "2992.45" "3200.66" "226.9" "312.29" "452.16" "1037.67" "1145.13"
[12] "1178.85" "375.99" "293.48" "749.61" "1832.05" "980.07" "1224.46" "4.10" "10.63" "8.91" "14.40"
[23] "22.52" "18.12" "6.88" "20.87" "17.30" "27.44" "27.68" "24.47" "727,671" "142,936" "630,353"
[34] "413,807" "86,717" "248,179"
数据:
# Example input: six strings mixing labels, parenthesised values and
# comma-grouped counts (the fourth string ends with a trailing blank).
x <- c(
  "(799.88) (966.01) (1634.17) (4714.35) (2992.45) (3200.66)",
  "Per capita monthly income 226.9 312.29 452.16 1037.67 1145.13 1178.85",
  "(375.99) (293.48) (749.61) (1832.05) (980.07) (1224.46)",
  "Per capita income / Hour of work 4.10 10.63 8.91 14.40 22.52 18.12 ",
  "(6.88) (20.87) (17.30) (27.44) (27.68) (24.47)",
  "Number of observations (with weight) 727,671 142,936 630,353 413,807 86,717 248,179"
)
使用 dplyr、tidyr 和 readr 获取数字的另一个选项:
library(dplyr)
library(tidyr)
library(readr)
# Example data: a data frame with a single character column `vec`.
df <- data.frame(
  vec = c(
    "(799.88) (966.01) (1634.17) (4714.35) (2992.45) (3200.66)",
    "Per capita monthly income 226.9 312.29 452.16 1037.67 1145.13 1178.85",
    "(375.99) (293.48) (749.61) (1832.05) (980.07) (1224.46)",
    "Per capita income / Hour of work 4.10 10.63 8.91 14.40 22.52 18.12 ",
    "(6.88) (20.87) (17.30) (27.44) (27.68) (24.47)",
    "Number of observations (with weight) 727,671 142,936 630,353 413,807 86,717 248,179"
  )
)

# Long format: one row per numeric token, keyed by the row it came from.
df1 <- df %>%
  # remember which input string each token belongs to
  dplyr::mutate(ID = dplyr::row_number()) %>%
  # split every string into one row per blank-separated token
  tidyr::separate_rows(vec, sep = " ") %>%
  # keep only tokens that contain a digit: label words, "/" and the empty
  # token left by a trailing blank would otherwise be parsed to NA (with
  # parsing warnings) and shift the numeric columns of the wide table below
  dplyr::filter(grepl("[0-9]", vec)) %>%
  # use automatic number extraction from readr
  dplyr::mutate(NEW = readr::parse_number(vec))

# Wide ("rectangular") format: one row per input string, one column per
# position of the number within that string.
df1 %>%
  dplyr::group_by(ID) %>%
  dplyr::mutate(ID2 = dplyr::row_number()) %>%
  # drop the grouping so it does not leak into later steps
  dplyr::ungroup() %>%
  # select ID explicitly instead of relying on select() re-adding the
  # grouping variable with a message
  dplyr::select(ID, ID2, NEW) %>%
  tidyr::pivot_wider(names_from = "ID2", values_from = "NEW")
如果还需要匹配不带小数点的数字(例如整数),可以把分隔符部分设为可选(修改自 Chris R 的答案):
# Small example containing an integer without any separator ("10").
s <- c("(6.88) (10) (17.30) ", "Num obs: 7,671 48,179")
# Match digits optionally followed by "." or "," and more digits. Making the
# whole separator-plus-digits group optional (instead of only the separator)
# also matches single-digit integers, which "\\d+[.,]?\\d+" would skip because
# it requires at least two digits. Backslashes are doubled because "\d" is not
# a valid escape in an R string literal.
str_extract_all(s, "\\d+(?:[.,]\\d+)?")
我是 R 的初学者。我有以下来自更大数据集的字符向量。我只想保留每个字符串中的数值。我该如何进行?我已经尝试使用 stringr 包的一些功能,但没有成功。感谢您的帮助。
```r
x <- c("(799.88) (966.01) (1634.17) (4714.35) (2992.45) (3200.66)",
"Per capita monthly income 226.9 312.29 452.16 1037.67 1145.13 1178.85",
"(375.99) (293.48) (749.61) (1832.05) (980.07) (1224.46)", "Per capita income / Hour of work 4.10 10.63 8.91 14.40 22.52 18.12 ",
"(6.88) (20.87) (17.30) (27.44) (27.68) (24.47)", "Number of observations (with weight) 727,671 142,936 630,353 413,807 86,717 248,179"
)
```
可以使用 `str_extract_all`;用字符类 `[.,]` 匹配数字之间的小数点 `.` 或千位分隔符 `,`:
library(stringr)
# Extract every number of the form digits + "." or "," + digits from each
# element of `x`; the result is a list with one character vector per element.
# The backslash must be doubled in an R string literal: "\d" is not a valid
# escape sequence and fails to parse, while "\\d" passes \d to the regex engine.
str_extract_all(x, "\\d+[.,]\\d+")
[[1]]
[1] "799.88" "966.01" "1634.17" "4714.35" "2992.45" "3200.66"
[[2]]
[1] "226.9" "312.29" "452.16" "1037.67" "1145.13" "1178.85"
[[3]]
[1] "375.99" "293.48" "749.61" "1832.05" "980.07" "1224.46"
[[4]]
[1] "4.10" "10.63" "8.91" "14.40" "22.52" "18.12"
[[5]]
[1] "6.88" "20.87" "17.30" "27.44" "27.68" "24.47"
[[6]]
[1] "727,671" "142,936" "630,353" "413,807" "86,717" "248,179"
要将它们全部放在一个向量(而不是列表)中,请使用 `unlist`:
# Flatten the per-element list of matches into a single character vector.
# "\\d" (doubled backslash) is required: "\d" is an invalid escape in an R
# string literal and would not parse.
unlist(str_extract_all(x, "\\d+[.,]\\d+"))
[1] "799.88" "966.01" "1634.17" "4714.35" "2992.45" "3200.66" "226.9" "312.29" "452.16" "1037.67" "1145.13"
[12] "1178.85" "375.99" "293.48" "749.61" "1832.05" "980.07" "1224.46" "4.10" "10.63" "8.91" "14.40"
[23] "22.52" "18.12" "6.88" "20.87" "17.30" "27.44" "27.68" "24.47" "727,671" "142,936" "630,353"
[34] "413,807" "86,717" "248,179"
数据:
# Example input: six strings mixing labels, parenthesised values and
# comma-grouped counts (the fourth string ends with a trailing blank).
x <- c(
  "(799.88) (966.01) (1634.17) (4714.35) (2992.45) (3200.66)",
  "Per capita monthly income 226.9 312.29 452.16 1037.67 1145.13 1178.85",
  "(375.99) (293.48) (749.61) (1832.05) (980.07) (1224.46)",
  "Per capita income / Hour of work 4.10 10.63 8.91 14.40 22.52 18.12 ",
  "(6.88) (20.87) (17.30) (27.44) (27.68) (24.47)",
  "Number of observations (with weight) 727,671 142,936 630,353 413,807 86,717 248,179"
)
使用 dplyr、tidyr 和 readr 获取数字的另一个选项:
library(dplyr)
library(tidyr)
library(readr)
# Example data: a data frame with a single character column `vec`.
df <- data.frame(
  vec = c(
    "(799.88) (966.01) (1634.17) (4714.35) (2992.45) (3200.66)",
    "Per capita monthly income 226.9 312.29 452.16 1037.67 1145.13 1178.85",
    "(375.99) (293.48) (749.61) (1832.05) (980.07) (1224.46)",
    "Per capita income / Hour of work 4.10 10.63 8.91 14.40 22.52 18.12 ",
    "(6.88) (20.87) (17.30) (27.44) (27.68) (24.47)",
    "Number of observations (with weight) 727,671 142,936 630,353 413,807 86,717 248,179"
  )
)

# Long format: one row per numeric token, keyed by the row it came from.
df1 <- df %>%
  # remember which input string each token belongs to
  dplyr::mutate(ID = dplyr::row_number()) %>%
  # split every string into one row per blank-separated token
  tidyr::separate_rows(vec, sep = " ") %>%
  # keep only tokens that contain a digit: label words, "/" and the empty
  # token left by a trailing blank would otherwise be parsed to NA (with
  # parsing warnings) and shift the numeric columns of the wide table below
  dplyr::filter(grepl("[0-9]", vec)) %>%
  # use automatic number extraction from readr
  dplyr::mutate(NEW = readr::parse_number(vec))

# Wide ("rectangular") format: one row per input string, one column per
# position of the number within that string.
df1 %>%
  dplyr::group_by(ID) %>%
  dplyr::mutate(ID2 = dplyr::row_number()) %>%
  # drop the grouping so it does not leak into later steps
  dplyr::ungroup() %>%
  # select ID explicitly instead of relying on select() re-adding the
  # grouping variable with a message
  dplyr::select(ID, ID2, NEW) %>%
  tidyr::pivot_wider(names_from = "ID2", values_from = "NEW")
如果还需要匹配不带小数点的数字(例如整数),可以把分隔符部分设为可选(修改自 Chris R 的答案):
# Small example containing an integer without any separator ("10").
s <- c("(6.88) (10) (17.30) ", "Num obs: 7,671 48,179")
# Match digits optionally followed by "." or "," and more digits. Making the
# whole separator-plus-digits group optional (instead of only the separator)
# also matches single-digit integers, which "\\d+[.,]?\\d+" would skip because
# it requires at least two digits. Backslashes are doubled because "\d" is not
# a valid escape in an R string literal.
str_extract_all(s, "\\d+(?:[.,]\\d+)?")