如何 select 数字后缀小于某个值的变量
How to select variables with numeric suffixes lower than a value
我有一个与此类似的数据框。
df <- data.frame(id=c(1,2,3), tot_1=runif(3, 0, 100), tot_2=runif(3, 0, 100), tot_3=runif(3, 0, 100), tot_4=runif(3, 0, 100))
我想select或者只对后缀小于3的进行操作
#select
df <- df %>% select(id, tot_1, tot_2)
#or sum
df <- df %>% mutate(sumVar = rowSums(across(c(tot_1, tot_2))))
但是,在我的真实数据中,变量比较多,而且顺序不对。那么我怎么能 select 他们不手动做呢?
我们可以用matches
df %>%
mutate(sumVar = rowSums(across(matches('tot_[1-2]$'))))
如果我们需要更灵活,从以 'tot' 开头的列名称中提取数字部分,根据条件子集并使用新名称
library(stringr)
nm1 <- str_subset(names(df), 'tot')
nm2 <- nm1[readr::parse_number(nm1) <3]
df %>%
mutate(sumVar = rowSums(across(all_of(nm2))))
num_range
的解决方案
这是 dplyr 中经常被遗忘的 num_range
选择助手的罕见情况,它一步从名称中提取数字,然后选择一个范围:
确定阈值
suffix_threshold <- 3
Select( )
library(dplyr)
df %>% select(id, num_range(prefix='tot_',
range=seq_len(suffix_threshold-1)))
id tot_1 tot_2
1 1 26.75082 26.89506
2 2 21.86453 18.11683
3 3 51.67968 51.85761
mutate() with rowSums()
library(dplyr)
df %>% mutate(sumVar = across(num_range(prefix='tot_', range=seq_len(suffix_threshold-1)))%>%
rowSums)
id tot_1 tot_2 tot_3 tot_4 sumVar
1 1 26.75082 26.89506 56.27829 71.79353 53.64588
2 2 21.86453 18.11683 12.91569 96.14099 39.98136
3 3 51.67968 51.85761 25.63676 10.01408 103.53730
这是基本的 R 方式 -
cols <- grep('tot_', names(df), value = TRUE)
#Select
df[c('id', cols[as.numeric(sub('tot_', '',cols)) < 3])]
# id tot_1 tot_2
#1 1 75.409112 30.59338
#2 2 9.613496 44.96151
#3 3 58.589574 64.90672
#Rowsums
df$sumVar <- rowSums(df[cols[as.numeric(sub('tot_', '',cols)) < 3]])
df
# id tot_1 tot_2 tot_3 tot_4 sumVar
#1 1 75.409112 30.59338 59.82815 50.495758 106.00250
#2 2 9.613496 44.96151 84.19916 2.189482 54.57501
#3 3 58.589574 64.90672 18.17310 71.390459 123.49629
我有一个与此类似的数据框。
df <- data.frame(id=c(1,2,3), tot_1=runif(3, 0, 100), tot_2=runif(3, 0, 100), tot_3=runif(3, 0, 100), tot_4=runif(3, 0, 100))
我想select或者只对后缀小于3的进行操作
#select
df <- df %>% select(id, tot_1, tot_2)
#or sum
df <- df %>% mutate(sumVar = rowSums(across(c(tot_1, tot_2))))
但是,在我的真实数据中,变量比较多,而且顺序不对。那么我怎么能 select 他们不手动做呢?
我们可以用matches
df %>%
mutate(sumVar = rowSums(across(matches('tot_[1-2]$'))))
如果我们需要更灵活,从以 'tot' 开头的列名称中提取数字部分,根据条件子集并使用新名称
library(stringr)
nm1 <- str_subset(names(df), 'tot')
nm2 <- nm1[readr::parse_number(nm1) <3]
df %>%
mutate(sumVar = rowSums(across(all_of(nm2))))
num_range
这是 dplyr 中经常被遗忘的 num_range
选择助手的罕见情况,它一步从名称中提取数字,然后选择一个范围:
确定阈值
suffix_threshold <- 3
Select( )
library(dplyr)
df %>% select(id, num_range(prefix='tot_',
range=seq_len(suffix_threshold-1)))
id tot_1 tot_2
1 1 26.75082 26.89506
2 2 21.86453 18.11683
3 3 51.67968 51.85761
mutate() with rowSums()
library(dplyr)
df %>% mutate(sumVar = across(num_range(prefix='tot_', range=seq_len(suffix_threshold-1)))%>%
rowSums)
id tot_1 tot_2 tot_3 tot_4 sumVar
1 1 26.75082 26.89506 56.27829 71.79353 53.64588
2 2 21.86453 18.11683 12.91569 96.14099 39.98136
3 3 51.67968 51.85761 25.63676 10.01408 103.53730
这是基本的 R 方式 -
cols <- grep('tot_', names(df), value = TRUE)
#Select
df[c('id', cols[as.numeric(sub('tot_', '',cols)) < 3])]
# id tot_1 tot_2
#1 1 75.409112 30.59338
#2 2 9.613496 44.96151
#3 3 58.589574 64.90672
#Rowsums
df$sumVar <- rowSums(df[cols[as.numeric(sub('tot_', '',cols)) < 3]])
df
# id tot_1 tot_2 tot_3 tot_4 sumVar
#1 1 75.409112 30.59338 59.82815 50.495758 106.00250
#2 2 9.613496 44.96151 84.19916 2.189482 54.57501
#3 3 58.589574 64.90672 18.17310 71.390459 123.49629