在R时间序列数据框中,如何基于正则表达式进行分离和分类
In R time series dataframe, how to separate and categorize based on regex
你好,我有一个时间序列数据框,其中包含一系列产品及其不同的税率。我需要将每个税率拆分为两类:百分比数字 (AV) 和文本 (SPEC,即百分比数字之外的所有其他内容),两者以字符串中的第一个加号为分隔:
# Note: the real data has many more years.
# Assumes tidyr is attached (provides `%>%` and `separate()`).
product <- c("01", "02")
yr1 <- c("0%", "11.5% + 190 GBP/100kg")
yr2 <- c("0%", "15% + 190 GBP/100kg + MAX 8.5%/100kg")
# yearnum is the number of year columns.
yearnum <- 2
sched <- data.frame(product, yr1, yr2)
# Names of the year columns: "yr1", "yr2", ...
schedule <- c(paste0("yr", 1:yearnum))
# Categorize the percentage part (av) and the remaining text (spec) of each
# duty rate by splitting at the first " + ".
# NOTE: the nested loops below are the asker's original attempt and are the
# part that does not work as intended; they are left as-is on purpose.
# The separator is written "\\+" because a single backslash before "+" is
# not a valid escape inside an R string literal.
for (j in 1:yearnum) {
  for (i in schedule) {
    sched <- sched %>%
      separate(
        i, c(paste0("av.yr", j), paste0("spec.yr", j)), " \\+ ",
        remove = FALSE, extra = "merge"
      )
  }
}
我想把它们拆分成下面这样的结果,但我的 for 循环写法有问题。有人可以帮忙吗?
# Desired result: the original columns plus one av.* column (leading
# percentage) and one spec.* column (everything after the first " + ")
# per year.
product <- c("01", "02")
yr1 <- c("0%", "11.5% + 190 GBP/100kg")
yr2 <- c("0%", "15% + 190 GBP/100kg + MAX 8.5%/100kg")
av.yr1 <- c("0%", "11.5%")
av.yr2 <- c("0%", "15%")
# Rows without a " + " part have an empty spec value.
spec.yr1 <- c("", "190 GBP/100kg")
spec.yr2 <- c("", "190 GBP/100kg + MAX 8.5%/100kg")
sched <- data.frame(
  product, yr1, yr2,
  av.yr1, av.yr2,
  spec.yr1, spec.yr2
)
你只需要遍历一个索引:
library(tidyr)

# Note: the real data has many more years.
product <- c("01", "02")
yr1 <- c("0%", "11.5% + 190 GBP/100kg")
yr2 <- c("0%", "15% + 190 GBP/100kg + MAX 8.5%/100kg")
# yearnum is the number of year columns.
yearnum <- 2
sched <- data.frame(product, yr1, yr2)
# Names of the year columns: "yr1", "yr2", ...
schedule <- paste0("yr", seq_len(yearnum))

# Categorize the percentage part (av) and the remaining text (spec) of each
# duty rate. A single index j selects both the source column and the suffix
# of the two new columns, so only one loop is needed.
for (j in seq_len(yearnum)) {
  i <- schedule[j]
  sched <- sched %>%
    separate(
      all_of(i), # column name comes from a variable, so select it explicitly
      into = c(paste0("av.yr", j), paste0("spec.yr", j)),
      sep = " \\+ ", # regex: literal "+" surrounded by single spaces
      remove = FALSE, # keep the original yr column
      extra = "merge", # split at the first " + " only
      fill = "right" # rows without " + " get NA in the spec column
    )
}
sched
#> product yr1 av.yr1 spec.yr1
#> 1 01 0% 0% <NA>
#> 2 02 11.5% + 190 GBP/100kg 11.5% 190 GBP/100kg
#> yr2 av.yr2 spec.yr2
#> 1 0% 0% <NA>
#> 2 15% + 190 GBP/100kg + MAX 8.5%/100kg 15% 190 GBP/100kg + MAX 8.5%/100kg
由 reprex 包 (v2.0.1) 于 2022-05-25 创建
如果你有很多年,我认为最好的办法是先将数据转换为长格式,使用 separate 或 mutate 配合正则表达式进行拆分,然后再转换回宽格式。
# Requires tidyr. Assumes `sched` holds only `product` and the yr* columns.
# Reshape to long form, split every rate at its first " + " into av / spec,
# then reshape back to wide (av.yr1, av.yr2, spec.yr1, spec.yr2).
pivot_longer(sched, -product) %>%
  separate(
    value,
    into = c("av", "spec"),
    sep = " [+] ",
    extra = "merge",
    fill = "right" # rows without " + " get NA in spec, without a warning
  ) %>%
  pivot_wider(names_from = name, values_from = av:spec, names_sep = ".")
输出:
product av.yr1 av.yr2 spec.yr1 spec.yr2
<chr> <chr> <chr> <chr> <chr>
1 01 0% 0% NA NA
2 02 11.5% 15% 190 GBP/100kg 190 GBP/100kg + MAX 8.5%/100kg
下面是一个使用 mutate 的方案,它同时保留了原始列:
# Requires tidyr, dplyr (mutate) and stringr (str_extract / str_remove).
# Assumes `sched` holds only `product` and the yr* columns.
# Reshape to long form (the "yr" prefix is stripped, so `name` is "1", "2",
# ...), derive av / spec with regular expressions, then reshape back to wide
# while keeping the original rate columns.
pivot_longer(sched, -product, values_to = "yr", names_prefix = "yr") %>%
  mutate(
    # Leading percentage such as "0%" or "11.5%".
    av.yr = str_extract(yr, "^\\d*[.]?\\d*%"),
    # Everything after the leading percentage and an optional " + ".
    spec.yr = str_remove(yr, "^\\d*[.]?\\d*%( [+] )?")
  ) %>%
  pivot_wider(names_from = name, values_from = yr:spec.yr, names_sep = "")
输出
product yr1 yr2 av.yr1 av.yr2 spec.yr1 spec.yr2
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 01 0% 0% 0% 0% "" ""
2 02 11.5% + 190 GBP/100kg 15% + 190 GBP/100kg + MAX 8.5%/100kg 11.5% 15% "190 GBP/100kg" "190 GBP/100kg + MAX 8.5%/100kg"
你好,我有一个时间序列数据框,其中包含一系列产品及其不同的税率。我需要将每个税率拆分为两类:百分比数字 (AV) 和文本 (SPEC,即百分比数字之外的所有其他内容),两者以字符串中的第一个加号为分隔:
# Note: the real data has many more years.
# Assumes tidyr is attached (provides `%>%` and `separate()`).
product <- c("01", "02")
yr1 <- c("0%", "11.5% + 190 GBP/100kg")
yr2 <- c("0%", "15% + 190 GBP/100kg + MAX 8.5%/100kg")
# yearnum is the number of year columns.
yearnum <- 2
sched <- data.frame(product, yr1, yr2)
# Names of the year columns: "yr1", "yr2", ...
schedule <- c(paste0("yr", 1:yearnum))
# Categorize the percentage part (av) and the remaining text (spec) of each
# duty rate by splitting at the first " + ".
# NOTE: the nested loops below are the asker's original attempt and are the
# part that does not work as intended; they are left as-is on purpose.
# The separator is written "\\+" because a single backslash before "+" is
# not a valid escape inside an R string literal.
for (j in 1:yearnum) {
  for (i in schedule) {
    sched <- sched %>%
      separate(
        i, c(paste0("av.yr", j), paste0("spec.yr", j)), " \\+ ",
        remove = FALSE, extra = "merge"
      )
  }
}
我想把它们拆分成下面这样的结果,但我的 for 循环写法有问题。有人可以帮忙吗?
# Desired result: the original columns plus one av.* column (leading
# percentage) and one spec.* column (everything after the first " + ")
# per year.
product <- c("01", "02")
yr1 <- c("0%", "11.5% + 190 GBP/100kg")
yr2 <- c("0%", "15% + 190 GBP/100kg + MAX 8.5%/100kg")
av.yr1 <- c("0%", "11.5%")
av.yr2 <- c("0%", "15%")
# Rows without a " + " part have an empty spec value.
spec.yr1 <- c("", "190 GBP/100kg")
spec.yr2 <- c("", "190 GBP/100kg + MAX 8.5%/100kg")
sched <- data.frame(
  product, yr1, yr2,
  av.yr1, av.yr2,
  spec.yr1, spec.yr2
)
你只需要遍历一个索引:
library(tidyr)

# Note: the real data has many more years.
product <- c("01", "02")
yr1 <- c("0%", "11.5% + 190 GBP/100kg")
yr2 <- c("0%", "15% + 190 GBP/100kg + MAX 8.5%/100kg")
# yearnum is the number of year columns.
yearnum <- 2
sched <- data.frame(product, yr1, yr2)
# Names of the year columns: "yr1", "yr2", ...
schedule <- paste0("yr", seq_len(yearnum))

# Categorize the percentage part (av) and the remaining text (spec) of each
# duty rate. A single index j selects both the source column and the suffix
# of the two new columns, so only one loop is needed.
for (j in seq_len(yearnum)) {
  i <- schedule[j]
  sched <- sched %>%
    separate(
      all_of(i), # column name comes from a variable, so select it explicitly
      into = c(paste0("av.yr", j), paste0("spec.yr", j)),
      sep = " \\+ ", # regex: literal "+" surrounded by single spaces
      remove = FALSE, # keep the original yr column
      extra = "merge", # split at the first " + " only
      fill = "right" # rows without " + " get NA in the spec column
    )
}
sched
#> product yr1 av.yr1 spec.yr1
#> 1 01 0% 0% <NA>
#> 2 02 11.5% + 190 GBP/100kg 11.5% 190 GBP/100kg
#> yr2 av.yr2 spec.yr2
#> 1 0% 0% <NA>
#> 2 15% + 190 GBP/100kg + MAX 8.5%/100kg 15% 190 GBP/100kg + MAX 8.5%/100kg
由 reprex 包 (v2.0.1) 于 2022-05-25 创建
如果你有很多年,我认为最好的办法是先将数据转换为长格式,使用 separate 或 mutate 配合正则表达式进行拆分,然后再转换回宽格式。
# Requires tidyr. Assumes `sched` holds only `product` and the yr* columns.
# Reshape to long form, split every rate at its first " + " into av / spec,
# then reshape back to wide (av.yr1, av.yr2, spec.yr1, spec.yr2).
pivot_longer(sched, -product) %>%
  separate(
    value,
    into = c("av", "spec"),
    sep = " [+] ",
    extra = "merge",
    fill = "right" # rows without " + " get NA in spec, without a warning
  ) %>%
  pivot_wider(names_from = name, values_from = av:spec, names_sep = ".")
输出:
product av.yr1 av.yr2 spec.yr1 spec.yr2
<chr> <chr> <chr> <chr> <chr>
1 01 0% 0% NA NA
2 02 11.5% 15% 190 GBP/100kg 190 GBP/100kg + MAX 8.5%/100kg
下面是一个使用 mutate 的方案,它同时保留了原始列:
# Requires tidyr, dplyr (mutate) and stringr (str_extract / str_remove).
# Assumes `sched` holds only `product` and the yr* columns.
# Reshape to long form (the "yr" prefix is stripped, so `name` is "1", "2",
# ...), derive av / spec with regular expressions, then reshape back to wide
# while keeping the original rate columns.
pivot_longer(sched, -product, values_to = "yr", names_prefix = "yr") %>%
  mutate(
    # Leading percentage such as "0%" or "11.5%".
    av.yr = str_extract(yr, "^\\d*[.]?\\d*%"),
    # Everything after the leading percentage and an optional " + ".
    spec.yr = str_remove(yr, "^\\d*[.]?\\d*%( [+] )?")
  ) %>%
  pivot_wider(names_from = name, values_from = yr:spec.yr, names_sep = "")
输出
product yr1 yr2 av.yr1 av.yr2 spec.yr1 spec.yr2
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 01 0% 0% 0% 0% "" ""
2 02 11.5% + 190 GBP/100kg 15% + 190 GBP/100kg + MAX 8.5%/100kg 11.5% 15% "190 GBP/100kg" "190 GBP/100kg + MAX 8.5%/100kg"