如何从 R 中的字符串中提取数字?
how can I extract numbers from a string in R?
names(score)
[1] "(Intercept)" "aado2_calc(20,180]" "aado2_calc(360,460]"
[4] "aado2_calc(460,629]" "albumin[1,1.8]" "albumin(1.8,2.2]"
[7] "albumin(2.2,2.8]" "aniongap(15,18]" "aniongap(18,20]"
[10] "aniongap(20,22]" "aniongap(22,25]" "aniongap(25,49]"
我想提取括号内的两个数字(括号外的数字不需要),左括号可能是“(”或“[”。第一个数字赋给对象 "low",第二个数字赋给 "high"。
您可以使用 readr
包和函数 parse_number
以方便使用。要获得更多功能,您需要使用 r 中的基本正则表达式函数或 stringi
之类的包
# Example input: twelve model-term names. All but the intercept end in an
# interval such as "(20,180]" or "[1,1.8]".
scorenames <- c(
  "(Intercept)", "aado2_calc(20,180]", "aado2_calc(360,460]",
  "aado2_calc(460,629]", "albumin[1,1.8]", "albumin(1.8,2.2]",
  "albumin(2.2,2.8]", "aniongap(15,18]", "aniongap(18,20]",
  "aniongap(20,22]", "aniongap(22,25]", "aniongap(25,49]"
)
第一步可能是提取 "parens" 分隔符内的所有内容(包括 ()
、[]
和逗号 ,
)。
# Collect every run of digits/dots that sits directly after "[", "(" or ","
# and directly before "]", ")" or ",". The result is a list with one
# character vector of matches per element of `scorenames`.
# Backslashes are doubled because "\[" on its own is an unrecognized escape
# in an R string literal and prevents the code from parsing.
mat <- regmatches(
  scorenames,
  gregexpr("(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", scorenames, perl = TRUE)
)
str(mat)
# List of 12
# $ : chr(0)
# $ : chr [1:2] "20" "180"
# $ : chr [1:2] "360" "460"
# $ : chr [1:2] "460" "629"
# $ : chr [1:2] "1" "1.8"
# $ : chr [1:2] "1.8" "2.2"
# $ : chr [1:2] "2.2" "2.8"
# $ : chr [1:2] "15" "18"
# $ : chr [1:2] "18" "20"
# $ : chr [1:2] "20" "22"
# $ : chr [1:2] "22" "25"
# $ : chr [1:2] "25" "49"
从这里,我们可以看出 (1) 第一个有问题(毫不奇怪,你需要在这里弄清楚你想要什么),以及 (2) 其余的看起来是正确的。
这是处理此列表的一种粗略方法。它完全信任输入,做法比较简单……你应该添加检查,确保列表中每个元素的长度都是 2,并且所有内容都能正确转换为数值(可以放在 tryCatch
中),等等
# Turn each element of `mat` (a character vector of matched numbers) into a
# named list with numeric entries `low` and `high`.
newnames <- lapply(mat, function(m) {
  # Anything other than exactly two matches cannot be mapped onto low/high.
  # NA_real_ (rather than logical NA) keeps both entries numeric.
  if (length(m) != 2L) {
    return(list(low = NA_real_, high = NA_real_))
  }
  setNames(as.list(as.numeric(m)), nm = c("low", "high"))
})
str(newnames)
# List of 12
# $ :List of 2
# ..$ low : logi NA
# ..$ high: logi NA
# $ :List of 2
# ..$ low : num 20
# ..$ high: num 180
# $ :List of 2
# ..$ low : num 360
# ..$ high: num 460
# ...snip...
您可以通过以下方式将其变成 data.frame:
head(do.call(rbind.data.frame, newnames))
# low high
# 1 NA NA
# 2 20.0 180.0
# 3 360.0 460.0
# 4 460.0 629.0
# 5 1.0 1.8
# 6 1.8 2.2
就像@jake-kaupp 说的-使用stringi
:) 如您所见,stringi 解决方案更短、更容易理解,而且更快——最多快约 30 倍!
简答:
# Extract the bracketed numbers from `x` into a character matrix (one row per
# string), then convert the first two columns to numeric low/high bounds.
# Backslashes are doubled because "\[" on its own is an unrecognized escape
# in an R string literal.
arr <- stri_extract_all_regex(
  x, "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])",
  simplify = NA
)
data.frame(low = as.numeric(arr[, 1]), high = as.numeric(arr[, 2]))
长答案:
require(stringi)
require(microbenchmark)
# Extract the two interval bounds from strings such as "albumin(1.8,2.2]"
# using base R regular expressions.
#
# x: character vector.
# Returns a data.frame with numeric columns `low` and `high`, one row per
# element of `x`. Elements that do not contain exactly two bracketed numbers
# yield a row of NA.
grepFun <- function(x) {
  # Empty input: return a correctly shaped zero-row result.
  if (length(x) == 0L) {
    return(data.frame(low = numeric(0), high = numeric(0)))
  }
  # Digits/dots directly after "[", "(" or "," and directly before "]", ")"
  # or ",". Backslashes are doubled because "\[" on its own is an
  # unrecognized escape in an R string literal.
  pattern <- "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])"
  mat <- regmatches(x, gregexpr(pattern, x, perl = TRUE))
  newnames <- lapply(mat, function(m) {
    # NA_real_ keeps both columns numeric even when the first row is missing.
    if (length(m) != 2L) {
      return(list(low = NA_real_, high = NA_real_))
    }
    setNames(as.list(as.numeric(m)), nm = c("low", "high"))
  })
  do.call(rbind.data.frame, newnames)
}
# Extract the two interval bounds from strings such as "albumin(1.8,2.2]"
# using stringi.
#
# x: character vector.
# Returns a data.frame with numeric columns `low` and `high`, one row per
# element of `x`; missing matches become NA.
striFun <- function(x) {
  # Digits/dots directly after "[", "(" or "," and directly before "]", ")"
  # or ",". Backslashes are doubled because "\[" on its own is an
  # unrecognized escape in an R string literal.
  pattern <- "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])"
  arr <- stri_extract_all_regex(x, pattern, simplify = NA)
  # When no element has two matches the matrix has fewer than two columns and
  # arr[, 2] would be out of bounds; pad with NA columns first.
  if (ncol(arr) < 2L) {
    arr <- cbind(arr, matrix(NA_character_, nrow(arr), 2L - ncol(arr)))
  }
  data.frame(low = as.numeric(arr[, 1]), high = as.numeric(arr[, 2]))
}
# both functions work the same
grepFun(scorenames)
low high
1 NA NA
2 20.0 180.0
3 360.0 460.0
4 460.0 629.0
...
12 25.0 49.0
striFun(scorenames)
low high
1 NA NA
2 20.0 180.0
3 360.0 460.0
4 460.0 629.0
...
12 25.0 49.0
# Generate a larger, more varied test vector. Each element is a random
# prefix, an opening "(" or "[", two integers drawn from 1..1000 separated
# by a comma, and a closing ")" or "]".
# NOTE(review): no set.seed() call is visible here, so the generated values
# differ between runs.
n <- 10000
# presumably the prefix lengths 1:10 are recycled across the n strings —
# confirm against the stringi documentation
x <- stri_paste(stri_rand_strings(n, length = 1:10), sample(c("(","["),n,TRUE),
sample(1000,n,TRUE), ",", sample(1000,n,TRUE), sample(c(")","]"), n, TRUE))
head(x) # check first elements
[1] "O[68,434]" "Ql[783,151)" "Zk0(773,60)" "ETfV(446,518]" "Xixbr(576,855)" "G6QnHu(92,955)"
#short test using new data
grepFun(x[1:6])
low high
1 68 434
2 783 151
3 773 60
4 446 518
5 576 855
6 92 955
striFun(x[1:6])
low high
1 68 434
2 783 151
3 773 60
4 446 518
5 576 855
6 92 955
#and some benchmark to prove performance
microbenchmark(grepFun(x), striFun(x))
Unit: milliseconds
expr min lq mean median uq max neval
grepFun(x) 330.27733 366.09306 416.56330 406.08914 465.29829 568.15250 100
striFun(x) 11.57449 11.97825 13.38157 12.46927 13.67699 25.97455 100
names(score)
[1] "(Intercept)" "aado2_calc(20,180]" "aado2_calc(360,460]"
[4] "aado2_calc(460,629]" "albumin[1,1.8]" "albumin(1.8,2.2]"
[7] "albumin(2.2,2.8]" "aniongap(15,18]" "aniongap(18,20]"
[10] "aniongap(20,22]" "aniongap(22,25]" "aniongap(25,49]"
我想提取括号内的两个数字(括号外的数字不需要),左括号可能是“(”或“[”。第一个数字赋给对象 "low",第二个数字赋给 "high"。
您可以使用 readr
包和函数 parse_number
以方便使用。要获得更多功能,您需要使用 r 中的基本正则表达式函数或 stringi
# Example input: twelve model-term names. All but the intercept end in an
# interval such as "(20,180]" or "[1,1.8]".
scorenames <- c(
  "(Intercept)", "aado2_calc(20,180]", "aado2_calc(360,460]",
  "aado2_calc(460,629]", "albumin[1,1.8]", "albumin(1.8,2.2]",
  "albumin(2.2,2.8]", "aniongap(15,18]", "aniongap(18,20]",
  "aniongap(20,22]", "aniongap(22,25]", "aniongap(25,49]"
)
第一步可能是提取 "parens" 分隔符内的所有内容(包括 ()
、[]
和逗号 ,
)。
# Collect every run of digits/dots that sits directly after "[", "(" or ","
# and directly before "]", ")" or ",". The result is a list with one
# character vector of matches per element of `scorenames`.
# Backslashes are doubled because "\[" on its own is an unrecognized escape
# in an R string literal and prevents the code from parsing.
mat <- regmatches(
  scorenames,
  gregexpr("(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", scorenames, perl = TRUE)
)
str(mat)
# List of 12
# $ : chr(0)
# $ : chr [1:2] "20" "180"
# $ : chr [1:2] "360" "460"
# $ : chr [1:2] "460" "629"
# $ : chr [1:2] "1" "1.8"
# $ : chr [1:2] "1.8" "2.2"
# $ : chr [1:2] "2.2" "2.8"
# $ : chr [1:2] "15" "18"
# $ : chr [1:2] "18" "20"
# $ : chr [1:2] "20" "22"
# $ : chr [1:2] "22" "25"
# $ : chr [1:2] "25" "49"
从这里,我们可以看出 (1) 第一个有问题(毫不奇怪,你需要在这里弄清楚你想要什么),以及 (2) 其余的看起来是正确的。
这是处理此列表的一种粗略方法。它完全信任输入,做法比较简单……你应该添加检查,确保列表中每个元素的长度都是 2,并且所有内容都能正确转换为数值(可以放在 tryCatch
中),等等
# Turn each element of `mat` (a character vector of matched numbers) into a
# named list with numeric entries `low` and `high`.
newnames <- lapply(mat, function(m) {
  # Anything other than exactly two matches cannot be mapped onto low/high.
  # NA_real_ (rather than logical NA) keeps both entries numeric.
  if (length(m) != 2L) {
    return(list(low = NA_real_, high = NA_real_))
  }
  setNames(as.list(as.numeric(m)), nm = c("low", "high"))
})
str(newnames)
# List of 12
# $ :List of 2
# ..$ low : logi NA
# ..$ high: logi NA
# $ :List of 2
# ..$ low : num 20
# ..$ high: num 180
# $ :List of 2
# ..$ low : num 360
# ..$ high: num 460
# ...snip...
您可以通过以下方式将其变成 data.frame:
head(do.call(rbind.data.frame, newnames))
# low high
# 1 NA NA
# 2 20.0 180.0
# 3 360.0 460.0
# 4 460.0 629.0
# 5 1.0 1.8
# 6 1.8 2.2
就像@jake-kaupp 说的-使用stringi
:) 如您所见,stringi 解决方案更短、更容易理解,而且更快——最多快约 30 倍!
简答:
# Extract the bracketed numbers from `x` into a character matrix (one row per
# string), then convert the first two columns to numeric low/high bounds.
# Backslashes are doubled because "\[" on its own is an unrecognized escape
# in an R string literal.
arr <- stri_extract_all_regex(
  x, "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])",
  simplify = NA
)
data.frame(low = as.numeric(arr[, 1]), high = as.numeric(arr[, 2]))
长答案:
require(stringi)
require(microbenchmark)
# Extract the two interval bounds from strings such as "albumin(1.8,2.2]"
# using base R regular expressions.
#
# x: character vector.
# Returns a data.frame with numeric columns `low` and `high`, one row per
# element of `x`. Elements that do not contain exactly two bracketed numbers
# yield a row of NA.
grepFun <- function(x) {
  # Empty input: return a correctly shaped zero-row result.
  if (length(x) == 0L) {
    return(data.frame(low = numeric(0), high = numeric(0)))
  }
  # Digits/dots directly after "[", "(" or "," and directly before "]", ")"
  # or ",". Backslashes are doubled because "\[" on its own is an
  # unrecognized escape in an R string literal.
  pattern <- "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])"
  mat <- regmatches(x, gregexpr(pattern, x, perl = TRUE))
  newnames <- lapply(mat, function(m) {
    # NA_real_ keeps both columns numeric even when the first row is missing.
    if (length(m) != 2L) {
      return(list(low = NA_real_, high = NA_real_))
    }
    setNames(as.list(as.numeric(m)), nm = c("low", "high"))
  })
  do.call(rbind.data.frame, newnames)
}
# Extract the two interval bounds from strings such as "albumin(1.8,2.2]"
# using stringi.
#
# x: character vector.
# Returns a data.frame with numeric columns `low` and `high`, one row per
# element of `x`; missing matches become NA.
striFun <- function(x) {
  # Digits/dots directly after "[", "(" or "," and directly before "]", ")"
  # or ",". Backslashes are doubled because "\[" on its own is an
  # unrecognized escape in an R string literal.
  pattern <- "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])"
  arr <- stri_extract_all_regex(x, pattern, simplify = NA)
  # When no element has two matches the matrix has fewer than two columns and
  # arr[, 2] would be out of bounds; pad with NA columns first.
  if (ncol(arr) < 2L) {
    arr <- cbind(arr, matrix(NA_character_, nrow(arr), 2L - ncol(arr)))
  }
  data.frame(low = as.numeric(arr[, 1]), high = as.numeric(arr[, 2]))
}
# both functions work the same
grepFun(scorenames)
#    low  high
# 1   NA    NA
# 2 20.0 180.0
# 3 360.0 460.0
# 4 460.0 629.0
# ...
# 12 25.0 49.0
striFun(scorenames)
#    low  high
# 1   NA    NA
# 2 20.0 180.0
# 3 360.0 460.0
# 4 460.0 629.0
# ...
# 12 25.0 49.0
# Generate a larger, more varied test vector. Each element is a random
# prefix, an opening "(" or "[", two integers drawn from 1..1000 separated
# by a comma, and a closing ")" or "]".
# NOTE(review): no set.seed() call is visible here, so the generated values
# differ between runs.
n <- 10000
# presumably the prefix lengths 1:10 are recycled across the n strings —
# confirm against the stringi documentation
x <- stri_paste(stri_rand_strings(n, length = 1:10), sample(c("(","["),n,TRUE),
sample(1000,n,TRUE), ",", sample(1000,n,TRUE), sample(c(")","]"), n, TRUE))
head(x) # check first elements
[1] "O[68,434]" "Ql[783,151)" "Zk0(773,60)" "ETfV(446,518]" "Xixbr(576,855)" "G6QnHu(92,955)"
#short test using new data
grepFun(x[1:6])
#   low high
# 1  68  434
# 2 783  151
# 3 773   60
# 4 446  518
# 5 576  855
# 6  92  955
striFun(x[1:6])
#   low high
# 1  68  434
# 2 783  151
# 3 773   60
# 4 446  518
# 5 576  855
# 6  92  955
#and some benchmark to prove performance
microbenchmark(grepFun(x), striFun(x))
# Unit: milliseconds
#       expr       min        lq      mean    median        uq       max neval
# grepFun(x) 330.27733 366.09306 416.56330 406.08914 465.29829 568.15250   100
# striFun(x)  11.57449  11.97825  13.38157  12.46927  13.67699  25.97455   100