使用 str_extract_all() 后从 R 中的字符串向量中提取数字总和
extracting the sum of numbers from a vector of strings in R after using str_extract_all()
我有一个格式不正确的数据框,其中包含一个字符串向量,例如
# Example data: one free-text column. c() coerces the bare 0 to the string "0"
# because the other elements are character.
f <- data.frame(
  FruitQuantity = c(
    "10 apple",
    "orange(15), bananas(30)",
    "cucumber-15",
    0,
    "not sure",
    NA
  )
)
> f
FruitQuantity
1 10 apple
2 orange(15), bananas(30)
3 cucumber-15
4 0
5 not sure
6 <NA>
我希望从中提取计数数据的总和到另一个向量中,如下所示:
FruitQuantity Total
1 10 apple 10
2 orange(15), bananas(30) 45
3 cucumber-15 15
4 0 0
5 not sure NA
6 <NA> NA
为了提取数值数据,我执行了以下操作
library(tidyverse)
# The backslash must be doubled inside an R string literal: "\\d+" is the
# regex \d+ (one or more digits); a single "\d" is an unrecognized escape and
# does not parse. str_extract_all() returns a list holding one character
# vector of matches per input element.
f$SeperateCount <- str_extract_all(f$FruitQuantity, "\\d+")
结果:
>f
FruitQuantity SeperateCount
1 10 apple 10
2 orange(15), bananas(30) 15, 30
3 cucumber-15 15
4 0 0
5 not sure
6 <NA> NA
> f$SeperateCount
[[1]]
[1] "10"
[[2]]
[1] "15" "30"
[[3]]
[1] "15"
[[4]]
[1] "0"
[[5]]
character(0)
[[6]]
[1] NA
它返回了一个列表,其中包含提取为字符的数字列表,例如第二行 c(15,30)
和第五行 character(0)
为了获取每个列表中元素的总和,我尝试了以下方法
# Attempt 1: sum each list element directly. The elements are character
# vectors, and sum() rejects character input, which produces the
# "invalid 'type' (character) of argument" error shown below.
f$Total<-sapply(f$SeperateCount,sum)
返回错误
Error in FUN(X[[i]], ...) : invalid 'type' (character) of argument
然后我尝试将列表中的字符转换为整数
# Attempt 2: convert a list element to integer only when every entry matches
# the digits-and-dots pattern; otherwise leave the element unchanged.
# An empty element passes the test (all() of an empty vector is TRUE) and
# becomes integer(0).
# NOTE(review): grepl() returns FALSE for NA, so the NA element (row 6) takes
# the `else x` branch and stays character -- this is presumably why sum()
# still fails afterwards.
f$SeperateCountNumeric<-lapply(f$SeperateCount, function(x) if(all(grepl('^[0-9.]+$', x))) as.integer(x) else x)
> f$SeperateCountNumeric
[[1]]
[1] 10
[[2]]
[1] 15 30
[[3]]
[1] 15
[[4]]
[1] 0
[[5]]
integer(0)
[[6]]
[1] NA
> f
FruitQuantity SeperateCount SeperateCountNumeric
1 10 apple 10 10
2 orange(15), bananas(30) 15, 30 15, 30
3 cucumber-15 15 15
4 0 0 0
5 not sure
6 <NA> NA NA
但即使转换为整数后,相同的字符错误仍然存在
> sapply(f$SeperateCountNumeric,sum)
Error in FUN(X[[i]], ...) : invalid 'type' (character) of argument
有没有其他方法可以做到这一点?
非常感谢您的帮助!
借助软件包 stringr
你可以试试这个
library(stringr)
# Extract every run of digits from each entry, then total them per row.
# Entries without any digits yield an empty match vector and are reported as
# NA; an NA entry stays NA because the sum of NA is NA.
# vapply() with numeric(1) keeps the result a numeric vector even when no row
# contains digits or the data frame has zero rows.
f$Total <- vapply(
  str_extract_all(f$FruitQuantity, "[[:digit:]]+"),
  function(x) if (length(x) == 0L) NA_real_ else sum(as.numeric(x)),
  numeric(1)
)
f
FruitQuantity Total
1 10 apple 10
2 orange(15), bananas(30) 45
3 cucumber-15 15
4 0 0
5 not sure NA
6 <NA> NA
一个类似的 base R(基础 R,不依赖额外软件包)解决方案
# Base R version: pull every run of digits out of each entry and add them up.
# Matching the digits directly (instead of deleting selected characters and
# splitting on spaces) keeps the result correct when entries contain repeated
# spaces or punctuation other than "(),-".
# Entries with no digits, and NA entries, end up as NA in Total.
qty <- as.character(f$FruitQuantity)
f$Total <- vapply(
  regmatches(qty, gregexpr("[[:digit:]]+", qty)),
  function(x) if (length(x) == 0L) NA_real_ else sum(as.numeric(x)),
  numeric(1)
)
f
FruitQuantity Total
1 10 apple 10
2 orange(15), bananas(30) 45
3 cucumber-15 15
4 0 0
5 not sure NA
6 <NA> NA
我有一个格式不正确的数据框,其中包含一个字符串向量,例如
# Example data: one free-text column. c() coerces the bare 0 to the string "0"
# because the other elements are character.
f <- data.frame(
  FruitQuantity = c(
    "10 apple",
    "orange(15), bananas(30)",
    "cucumber-15",
    0,
    "not sure",
    NA
  )
)
> f
FruitQuantity
1 10 apple
2 orange(15), bananas(30)
3 cucumber-15
4 0
5 not sure
6 <NA>
我希望从中提取计数数据的总和到另一个向量中,如下所示:
FruitQuantity Total
1 10 apple 10
2 orange(15), bananas(30) 45
3 cucumber-15 15
4 0 0
5 not sure NA
6 <NA> NA
为了提取数值数据,我执行了以下操作
library(tidyverse)
# The backslash must be doubled inside an R string literal: "\\d+" is the
# regex \d+ (one or more digits); a single "\d" is an unrecognized escape and
# does not parse. str_extract_all() returns a list holding one character
# vector of matches per input element.
f$SeperateCount <- str_extract_all(f$FruitQuantity, "\\d+")
结果:
>f
FruitQuantity SeperateCount
1 10 apple 10
2 orange(15), bananas(30) 15, 30
3 cucumber-15 15
4 0 0
5 not sure
6 <NA> NA
> f$SeperateCount
[[1]]
[1] "10"
[[2]]
[1] "15" "30"
[[3]]
[1] "15"
[[4]]
[1] "0"
[[5]]
character(0)
[[6]]
[1] NA
它返回了一个列表,其中包含提取为字符的数字列表,例如第二行 c(15,30)
和第五行 character(0)
为了获取每个列表中元素的总和,我尝试了以下方法
# Attempt 1: sum each list element directly. The elements are character
# vectors, and sum() rejects character input, which produces the
# "invalid 'type' (character) of argument" error shown below.
f$Total<-sapply(f$SeperateCount,sum)
返回错误
Error in FUN(X[[i]], ...) : invalid 'type' (character) of argument
然后我尝试将列表中的字符转换为整数
# Attempt 2: convert a list element to integer only when every entry matches
# the digits-and-dots pattern; otherwise leave the element unchanged.
# An empty element passes the test (all() of an empty vector is TRUE) and
# becomes integer(0).
# NOTE(review): grepl() returns FALSE for NA, so the NA element (row 6) takes
# the `else x` branch and stays character -- this is presumably why sum()
# still fails afterwards.
f$SeperateCountNumeric<-lapply(f$SeperateCount, function(x) if(all(grepl('^[0-9.]+$', x))) as.integer(x) else x)
> f$SeperateCountNumeric
[[1]]
[1] 10
[[2]]
[1] 15 30
[[3]]
[1] 15
[[4]]
[1] 0
[[5]]
integer(0)
[[6]]
[1] NA
> f
FruitQuantity SeperateCount SeperateCountNumeric
1 10 apple 10 10
2 orange(15), bananas(30) 15, 30 15, 30
3 cucumber-15 15 15
4 0 0 0
5 not sure
6 <NA> NA NA
但即使转换为整数后,相同的字符错误仍然存在
> sapply(f$SeperateCountNumeric,sum)
Error in FUN(X[[i]], ...) : invalid 'type' (character) of argument
有没有其他方法可以做到这一点?
非常感谢您的帮助!
借助软件包 stringr
你可以试试这个
library(stringr)
# Extract every run of digits from each entry, then total them per row.
# Entries without any digits yield an empty match vector and are reported as
# NA; an NA entry stays NA because the sum of NA is NA.
# vapply() with numeric(1) keeps the result a numeric vector even when no row
# contains digits or the data frame has zero rows.
f$Total <- vapply(
  str_extract_all(f$FruitQuantity, "[[:digit:]]+"),
  function(x) if (length(x) == 0L) NA_real_ else sum(as.numeric(x)),
  numeric(1)
)
f
FruitQuantity Total
1 10 apple 10
2 orange(15), bananas(30) 45
3 cucumber-15 15
4 0 0
5 not sure NA
6 <NA> NA
一个类似的 base R(基础 R,不依赖额外软件包)解决方案
# Base R version: pull every run of digits out of each entry and add them up.
# Matching the digits directly (instead of deleting selected characters and
# splitting on spaces) keeps the result correct when entries contain repeated
# spaces or punctuation other than "(),-".
# Entries with no digits, and NA entries, end up as NA in Total.
qty <- as.character(f$FruitQuantity)
f$Total <- vapply(
  regmatches(qty, gregexpr("[[:digit:]]+", qty)),
  function(x) if (length(x) == 0L) NA_real_ else sum(as.numeric(x)),
  numeric(1)
)
f
FruitQuantity Total
1 10 apple 10
2 orange(15), bananas(30) 45
3 cucumber-15 15
4 0 0
5 not sure NA
6 <NA> NA