从 r 中的字符串格式中提取数字形式的年龄
Extract Age in numeric form from string format in r
我有一个字符串向量 Age,其中包含各种对象的年龄,如下所示:
"1 Months8 Days" "1 Years" "10 Days" "10 Months" "10 Months1 Days"
现在,我希望它转换为一个数字,等于年龄。
最好的方法是什么?
我试过使用:
regmatches(years, gregexpr("[[:digit:]]+", years))
但这并没有区分月和年。该向量没有上述格式以外的任何其他格式的字符串。如何做到这一点?
下面是使用 gsub 的一种较为繁琐的方式。我在匹配天数(Days)时遇到了问题,@BhargavRao、@Cath 和 @Tensibai 在聊天中帮我修复了它。
x <- c("1 Months8 Days", "1 Years", "10 Days", "10 Months", "10 Months1 Days", "1 Months")

# Return the number that directly precedes `unit` in each element of
# `strings`, or NA when the unit does not occur in that element.
unit_count <- function(strings, unit) {
  # Lazy prefix + capture group: keeps only the digits attached to `unit`.
  # Backslashes are doubled because "\d" is not a valid R string escape.
  pattern <- paste0("^.*?(\\d+)\\s*", unit, ".*$")
  digits <- ifelse(
    grepl(unit, strings, fixed = TRUE),
    gsub(pattern, "\\1", strings, perl = TRUE),
    NA_character_ # absent unit -> NA without a coercion warning
  )
  as.numeric(digits)
}

xy <- data.frame(
  original = x,
  years = unit_count(x, "Years"),
  months = unit_count(x, "Months"),
  days = unit_count(x, "Days")
)
# Express every component in years (12 months / 365 days per year); missing
# components contribute nothing to the total.
xy$result <- rowSums(cbind(xy$years, xy$months / 12, xy$days / 365), na.rm = TRUE)
xy
original years months days result
1 1 Months8 Days NA 1 8 0.10525114
2 1 Years 1 NA NA 1.00000000
3 10 Days NA NA 10 0.02739726
4 10 Months NA 10 NA 0.83333333
5 10 Months1 Days NA 10 1 0.83607306
6 1 Months NA 1 NA 0.08333333
这适用于给定格式
age <- c("1 Months8 Days", "1 Years", "10 Days", "10 Months", "10 Months1 Days")
# Every run of digits in each string, in order of appearance.
numbers <- regmatches(age, gregexpr("[[:digit:]]+", age))
has_months <- grepl("Months", age, fixed = TRUE)
has_days <- grepl("Days", age, fixed = TRUE)
# mapply pairs each element's digits with its own unit flags, so no global
# counter has to be advanced with `<<-` while iterating.
age_in_years <- mapply(function(digits, is_months, is_days) {
  value <- as.numeric(digits)
  if (length(value) > 1) {
    # Two numbers are read as months followed by days; this only holds for
    # the "N MonthsM Days" format present in the sample data.
    value[1] / 12 + value[2] / 360
  } else if (is_months) {
    value / 12
  } else if (is_days) {
    # Days are divided by 360 here, i.e. a 360-day year.
    value / 360
  } else {
    # Neither "Months" nor "Days" found: the number is taken as years.
    value
  }
}, numbers, has_months, has_days)
age_in_years
#[1] 0.10555556 1.00000000 0.02777778 0.83333333 0.83611111
这是一个可能的解决方案:
v <- c("1 Months8 Days", "1 Years","10 Days","10 Months","10 Months1 Days")
# Factor that converts one of each unit into years.
periods <- c(Years = 1, Months = 1/12, Days = 1/365)
# Running total in years, one slot per input string.
ages <- numeric(length(v))
for (unit in names(periods)) {
  # Digits immediately followed by " <unit>" (lookahead keeps only the digits).
  lookahead <- paste0("[[:digit:]]+(?= ", unit, ")")
  found <- regmatches(v, gregexpr(lookahead, v, perl = TRUE))
  contribution <- vapply(found, function(hit) {
    # No occurrence of this unit adds nothing; otherwise the first hit counts.
    if (length(hit) == 0) {
      0
    } else {
      as.numeric(hit[[1]]) * periods[[unit]]
    }
  }, numeric(1))
  ages <- ages + contribution
}
> ages
[1] 0.10525114 1.00000000 0.02739726 0.83333333 0.83607306
我们可以使用gsubfn
library(gsubfn)
# Arithmetic suffix substituted for each unit word.
unit_factors <- list(Years = "* 1", Months = "* 1/12", Days = "* 1/365")
# Insert " + " at the letter-to-digit boundary, e.g. "Months8" -> "Months + 8".
joined <- sub("(?<=[A-Za-z])(?=[0-9])", " + ", v1, perl=TRUE)
# Each string becomes an arithmetic expression such as "1 * 1/12 + 8 * 1/365".
v2 <- gsubfn("[A-Za-z]+", unit_factors, joined)
# NOTE(review): eval(parse()) executes the generated text as R code; only use
# this on trusted input.
unname(sapply(v2, function(expr) eval(parse(text = expr))))
#[1] 0.10525114 1.00000000 0.02739726 0.83333333 0.83607306
数据
# Sample input: age strings written as "N Years", "N Months", "N Days" or "N MonthsM Days".
v1 <- c("1 Months8 Days", "1 Years", "10 Days", "10 Months", "10 Months1 Days")
我有一个字符串向量 Age,其中包含各种对象的年龄,如下所示:
"1 Months8 Days" "1 Years" "10 Days" "10 Months" "10 Months1 Days"
现在,我希望它转换为一个数字,等于年龄。 最好的方法是什么?
我试过使用:
regmatches(years, gregexpr("[[:digit:]]+", years))
但这并没有区分月和年。该向量没有上述格式以外的任何其他格式的字符串。如何做到这一点?
下面是使用 gsub 的一种较为繁琐的方式。我在匹配天数(Days)时遇到了问题,@BhargavRao、@Cath 和 @Tensibai 在聊天中帮我修复了它。
x <- c("1 Months8 Days", "1 Years", "10 Days", "10 Months", "10 Months1 Days", "1 Months")

# Return the number that directly precedes `unit` in each element of
# `strings`, or NA when the unit does not occur in that element.
unit_count <- function(strings, unit) {
  # Lazy prefix + capture group: keeps only the digits attached to `unit`.
  # Backslashes are doubled because "\d" is not a valid R string escape.
  pattern <- paste0("^.*?(\\d+)\\s*", unit, ".*$")
  digits <- ifelse(
    grepl(unit, strings, fixed = TRUE),
    gsub(pattern, "\\1", strings, perl = TRUE),
    NA_character_ # absent unit -> NA without a coercion warning
  )
  as.numeric(digits)
}

xy <- data.frame(
  original = x,
  years = unit_count(x, "Years"),
  months = unit_count(x, "Months"),
  days = unit_count(x, "Days")
)
# Express every component in years (12 months / 365 days per year); missing
# components contribute nothing to the total.
xy$result <- rowSums(cbind(xy$years, xy$months / 12, xy$days / 365), na.rm = TRUE)
xy
original years months days result
1 1 Months8 Days NA 1 8 0.10525114
2 1 Years 1 NA NA 1.00000000
3 10 Days NA NA 10 0.02739726
4 10 Months NA 10 NA 0.83333333
5 10 Months1 Days NA 10 1 0.83607306
6 1 Months NA 1 NA 0.08333333
这适用于给定格式
age <- c("1 Months8 Days", "1 Years", "10 Days", "10 Months", "10 Months1 Days")
# Every run of digits in each string, in order of appearance.
numbers <- regmatches(age, gregexpr("[[:digit:]]+", age))
has_months <- grepl("Months", age, fixed = TRUE)
has_days <- grepl("Days", age, fixed = TRUE)
# mapply pairs each element's digits with its own unit flags, so no global
# counter has to be advanced with `<<-` while iterating.
age_in_years <- mapply(function(digits, is_months, is_days) {
  value <- as.numeric(digits)
  if (length(value) > 1) {
    # Two numbers are read as months followed by days; this only holds for
    # the "N MonthsM Days" format present in the sample data.
    value[1] / 12 + value[2] / 360
  } else if (is_months) {
    value / 12
  } else if (is_days) {
    # Days are divided by 360 here, i.e. a 360-day year.
    value / 360
  } else {
    # Neither "Months" nor "Days" found: the number is taken as years.
    value
  }
}, numbers, has_months, has_days)
age_in_years
#[1] 0.10555556 1.00000000 0.02777778 0.83333333 0.83611111
这是一个可能的解决方案:
v <- c("1 Months8 Days", "1 Years","10 Days","10 Months","10 Months1 Days")
# Factor that converts one of each unit into years.
periods <- c(Years = 1, Months = 1/12, Days = 1/365)
# Running total in years, one slot per input string.
ages <- numeric(length(v))
for (unit in names(periods)) {
  # Digits immediately followed by " <unit>" (lookahead keeps only the digits).
  lookahead <- paste0("[[:digit:]]+(?= ", unit, ")")
  found <- regmatches(v, gregexpr(lookahead, v, perl = TRUE))
  contribution <- vapply(found, function(hit) {
    # No occurrence of this unit adds nothing; otherwise the first hit counts.
    if (length(hit) == 0) {
      0
    } else {
      as.numeric(hit[[1]]) * periods[[unit]]
    }
  }, numeric(1))
  ages <- ages + contribution
}
> ages
[1] 0.10525114 1.00000000 0.02739726 0.83333333 0.83607306
我们可以使用gsubfn
library(gsubfn)
# Arithmetic suffix substituted for each unit word.
unit_factors <- list(Years = "* 1", Months = "* 1/12", Days = "* 1/365")
# Insert " + " at the letter-to-digit boundary, e.g. "Months8" -> "Months + 8".
joined <- sub("(?<=[A-Za-z])(?=[0-9])", " + ", v1, perl=TRUE)
# Each string becomes an arithmetic expression such as "1 * 1/12 + 8 * 1/365".
v2 <- gsubfn("[A-Za-z]+", unit_factors, joined)
# NOTE(review): eval(parse()) executes the generated text as R code; only use
# this on trusted input.
unname(sapply(v2, function(expr) eval(parse(text = expr))))
#[1] 0.10525114 1.00000000 0.02739726 0.83333333 0.83607306
数据
# Sample input: age strings written as "N Years", "N Months", "N Days" or "N MonthsM Days".
v1 <- c("1 Months8 Days", "1 Years", "10 Days", "10 Months", "10 Months1 Days")