如何将相同的转换应用于数据框中的变量组?
How to apply the same transformation to groups of variables in a data frame?
我有一个包含许多变量的数据框,这些变量的名称包含标签。
# Example data with two column groups, "var_*" and "other_*". Each group has
# an integer "*_x" column plus two columns of uniform random numbers.
mydf <- data.frame(
  var_x   = 1:5,
  var_y   = runif(5),
  var_z   = runif(5),
  other_x = 10:14,
  other_p = runif(5),
  other_r = runif(5)
)
mydf
var_x var_y var_z other_x other_p other_r
1 1 0.2700212 0.05893272 10 0.6212327 0.6177092
2 2 0.1284033 0.27333098 11 0.6933060 0.7520978
3 3 0.7313771 0.69352560 12 0.3154764 0.8479646
4 4 0.2400357 0.25151053 13 0.2057361 0.5138406
5 5 0.1797793 0.78550584 14 0.6671606 0.5801830
我想将 var_* 变量除以 var_x,将 other_* 变量除以 other_x。我怎样才能轻松做到这一点?
我尝试使用 dplyr 的 mutate_each。如果只有一组变量需要缩放,以下方法是有效的。我怎样才能对每个标签自动执行此操作?
library(dplyr)
# Keep the divisor in its own vector: the "^var" pattern also matches var_x,
# so that column is divided along with the rest of the group.
scale_var <- mydf[["var_x"]]
mydf %>%
  mutate_each(funs(. / scale_var), matches("^var"))
我试着写了自己的函数如下。
# Attempted helper: divide every column whose name matches "^<type>" by the
# "<type>_x" column.
# NOTE(review): the divisor is read from the global `mydf`, not from the
# `data` argument.
# NOTE(review): the question reports that calling this fails with
# "object 'type' not found"; presumably `type` is not visible where
# mutate_each() evaluates the matches() call -- confirm against the dplyr
# version in use.
mymutate <- function(data, type) {
scale_var <- mydf[[paste0(type, "_x")]]
data %>% mutate_each(
funs(./scale_var),
matches(paste0("^", type))
)
}
但是当我试图对某一种类型运行它时,例如 mymutate(mydf, type = "var"),
它抛出了一个我不太明白的错误:Error in paste0("^", type) : object 'type' not found
更新
我只打算使用新生成的变量,所以该方法把 *_x 变量本身也除了一遍并不要紧。
我有很多像 var 和 other 这样的标签,所以不想逐一写出来。这就是我尝试编写自己的函数、以便稍后在 lapply 中使用它的原因。
更新2
这些是我的数据框的变量。
[1] "location_50_all_1" "location_50_both_sides_important_1"
[3] "location_50_left_important_1" "location_50_other_important_1"
[5] "location_50_right_important_1" "ownership_all_1"
[7] "ownership_both_sides_important_1" "ownership_left_important_1"
[9] "ownership_other_important_1" "ownership_right_important_1"
[11] "person_all_1" "person_both_sides_important_1"
[13] "person_left_important_1" "person_other_important_1"
[15] "person_right_important_1" "union_all_1"
[17] "union_both_sides_important_1" "union_left_important_1"
[19] "union_other_important_1" "union_right_important_1"
[21] "total_left_important" "total_right_important"
[23] "total_both_sides_important" "total_other_important"
[25] "total_firm_officials" "left"
[27] "right" "connected"
我想将 location_50* 变量除以 location_50_all_1,location_200*、ownership*、person*、union* 也做同样的处理。
更新3
这是“为什么会出现 'type' not found 错误”的答案。
这就是你想要的吗...
library(dplyr)
# Example data: two column groups that share a name prefix ("var", "other").
mydf <- data.frame(
  var_x = 1:5, var_y = runif(5), var_z = runif(5),
  other_x = 10:14, other_p = runif(5), other_r = runif(5)
)

# Select each group by prefix. starts_with() is used rather than contains()
# so that a column that merely contains the tag elsewhere in its name
# (e.g. "location_50_other_important_1" for the tag "other") is not selected.
my_df_var <-
  mydf %>%
  select(starts_with("var"))

# Divide every column of the group by the group's first column (the "*_x"
# column must therefore come first). `[[1]]` extracts it as a plain vector.
# The first column is divided by itself and ends up as all 1s.
my_divided_var_df <-
  my_df_var / my_df_var[[1]]

my_df_other <-
  mydf %>%
  select(starts_with("other"))

my_divided_other_df <-
  my_df_other / my_df_other[[1]]

# Put the two scaled groups back side by side.
my_final_df <-
  bind_cols(my_divided_var_df,
            my_divided_other_df)

my_final_df
var_x var_y var_z other_x other_p other_r
1 1 0.1505216 0.50006694 1 0.01507284 0.04272813
2 1 0.3694496 0.07608916 1 0.03721775 0.07758692
3 1 0.1615257 0.05903999 1 0.04790595 0.00702291
4 1 0.1867266 0.15325190 1 0.06612689 0.03709427
5 1 0.1823187 0.15005917 1 0.02325902 0.05880811
# Keep the original divisors in separate vectors: the "^var" and "^other"
# patterns also match var_x and other_x, so those columns are divided too.
var_x_orig <- mydf[["var_x"]]
other_x_orig <- mydf[["other_x"]]

# Scale one group per step; each step only touches columns matching its prefix.
mydf %>%
  mutate_each(funs(. / var_x_orig), matches("^var")) %>%
  mutate_each(funs(. / other_x_orig), matches("^other"))
这可能会有帮助。如果每个变量名前缀都对应三列(例如,三列 'var' 和三列 'other'),我会使用 lapply()。然后,如有必要,再按列绑定以恢复到原始数据格式。
# mydf
# var_x var_y var_z other_x other_p other_r
#1 1 0.8393539 0.2685360 10 0.82749405 0.77923222
#2 2 0.8966534 0.6157903 11 0.30657267 0.97301619
#3 3 0.7426782 0.6982445 12 0.75195632 0.03107233
#4 4 0.9448537 0.3711827 13 0.68455120 0.45232667
#5 5 0.4848614 0.2108115 14 0.01126723 0.91213041
library(dplyr)
# Index of the first column of each consecutive three-column group.
num <- seq(from = 1, to = ncol(mydf), by = 3)

# Split the data frame into one three-column piece per group.
foo <- lapply(num, function(first) mydf[, first:(first + 2)])

# In every piece, divide columns 2 and 3 by column 1 (column 1 is left
# untouched), then bind the pieces back together column-wise.
foo %>%
  lapply(function(piece) {
    divisor <- piece[, 1]
    piece[, 2] <- piece[, 2] / divisor
    piece[, 3] <- piece[, 3] / divisor
    piece
  }) %>%
  bind_cols()
# var_x var_y var_z other_x other_p other_r
#1 1 0.83935391 0.26853595 10 0.0827494049 0.077923222
#2 2 0.44832669 0.30789516 11 0.0278702429 0.088456017
#3 3 0.24755938 0.23274817 12 0.0626630264 0.002589361
#4 4 0.23621343 0.09279569 13 0.0526577848 0.034794359
#5 5 0.09697229 0.04216230 14 0.0008048022 0.065152172
这个 mymutate 的修改版本即使在数据框结构不规则的情况下也能正常工作(即每组中需要缩放的列数不同)。
# mydf
# var_x var_y var_z other_x other_p other_r
# 1 1 0.1913353 0.4706113 10 0.003120607 0.17808048
# 2 2 0.1620725 0.6228830 11 0.844399758 0.01361841
# 3 3 0.5148884 0.3671178 12 0.996055741 0.33513972
# 4 4 0.8086168 0.3265216 13 0.984819261 0.96802056
# 5 5 0.9902217 0.9087540 14 0.951119864 0.82479090
# Divide every column whose name starts with `type` by the "<type>_x" column
# and return only that group of columns.
#
# data: a data frame that contains a "<type>_x" column.
# type: the name prefix shared by the group, e.g. "var".
#
# Returns the columns matching "^<type>", each divided by "<type>_x"; the
# "<type>_x" column itself is returned with its original values.
# Stops with an error if the "<type>_x" column is missing.
#
# Implemented in base R so that it depends neither on magrittr's `%<>%`
# (which library(dplyr) does not attach) nor on the deprecated
# mutate_each()/funs().
mymutate <- function(data, type) {
  scale_name <- paste0(type, "_x")
  if (!scale_name %in% names(data)) {
    stop("column '", scale_name, "' not found in `data`", call. = FALSE)
  }
  scale_var <- data[[scale_name]]

  # Case-insensitive prefix match, like the default of dplyr's matches().
  in_group <- grepl(paste0("^", type), names(data), ignore.case = TRUE)
  scaled <- data[, in_group, drop = FALSE]
  scaled[] <- lapply(scaled, function(column) column / scale_var)

  # The divisor column was divided by itself above; restore the original.
  scaled[[scale_name]] <- scale_var
  scaled
}
# Scale each group on its own, then bind the per-group results column-wise.
types <- c("var", "other")
types %>%
  lapply(function(type) mymutate(data = mydf, type = type)) %>%
  bind_cols()
# var_x var_y var_z other_x other_p other_r
# 1 1 0.19133528 0.47061133 10 0.0003120607 0.017808048
# 2 2 0.08103626 0.31144148 11 0.0767636144 0.001238037
# 3 3 0.17162946 0.12237259 12 0.0830046451 0.027928310
# 4 4 0.20215421 0.08163039 13 0.0757553278 0.074463120
# 5 5 0.19804435 0.18175081 14 0.0679371332 0.058913635
我有一个包含许多变量的数据框,这些变量的名称包含标签。
# Example data with two column groups, "var_*" and "other_*". Each group has
# an integer "*_x" column plus two columns of uniform random numbers.
mydf <- data.frame(
  var_x   = 1:5,
  var_y   = runif(5),
  var_z   = runif(5),
  other_x = 10:14,
  other_p = runif(5),
  other_r = runif(5)
)
mydf
var_x var_y var_z other_x other_p other_r
1 1 0.2700212 0.05893272 10 0.6212327 0.6177092
2 2 0.1284033 0.27333098 11 0.6933060 0.7520978
3 3 0.7313771 0.69352560 12 0.3154764 0.8479646
4 4 0.2400357 0.25151053 13 0.2057361 0.5138406
5 5 0.1797793 0.78550584 14 0.6671606 0.5801830
我想将 var_* 变量除以 var_x,将 other_* 变量除以 other_x。我怎样才能轻松做到这一点?
我尝试使用 dplyr 的 mutate_each。如果只有一组变量需要缩放,以下方法是有效的。我怎样才能对每个标签自动执行此操作?
library(dplyr)
# Keep the divisor in its own vector: the "^var" pattern also matches var_x,
# so that column is divided along with the rest of the group.
scale_var <- mydf[["var_x"]]
mydf %>%
  mutate_each(funs(. / scale_var), matches("^var"))
我试着写了自己的函数如下。
# Attempted helper: divide every column whose name matches "^<type>" by the
# "<type>_x" column.
# NOTE(review): the divisor is read from the global `mydf`, not from the
# `data` argument.
# NOTE(review): the question reports that calling this fails with
# "object 'type' not found"; presumably `type` is not visible where
# mutate_each() evaluates the matches() call -- confirm against the dplyr
# version in use.
mymutate <- function(data, type) {
scale_var <- mydf[[paste0(type, "_x")]]
data %>% mutate_each(
funs(./scale_var),
matches(paste0("^", type))
)
}
但是当我试图对某一种类型运行它时,例如 mymutate(mydf, type = "var"),
它抛出了一个我不太明白的错误:Error in paste0("^", type) : object 'type' not found
更新
我只打算使用新生成的变量,所以该方法把 *_x 变量本身也除了一遍并不要紧。
我有很多像 var 和 other 这样的标签,所以不想逐一写出来。这就是我尝试编写自己的函数、以便稍后在 lapply 中使用它的原因。
更新2
这些是我的数据框的变量。
[1] "location_50_all_1" "location_50_both_sides_important_1"
[3] "location_50_left_important_1" "location_50_other_important_1"
[5] "location_50_right_important_1" "ownership_all_1"
[7] "ownership_both_sides_important_1" "ownership_left_important_1"
[9] "ownership_other_important_1" "ownership_right_important_1"
[11] "person_all_1" "person_both_sides_important_1"
[13] "person_left_important_1" "person_other_important_1"
[15] "person_right_important_1" "union_all_1"
[17] "union_both_sides_important_1" "union_left_important_1"
[19] "union_other_important_1" "union_right_important_1"
[21] "total_left_important" "total_right_important"
[23] "total_both_sides_important" "total_other_important"
[25] "total_firm_officials" "left"
[27] "right" "connected"
我想将 location_50* 变量除以 location_50_all_1,location_200*、ownership*、person*、union* 也做同样的处理。
更新3
这是“为什么会出现 'type' not found 错误”的答案。
这就是你想要的吗...
library(dplyr)
# Example data: two column groups that share a name prefix ("var", "other").
mydf <- data.frame(
  var_x = 1:5, var_y = runif(5), var_z = runif(5),
  other_x = 10:14, other_p = runif(5), other_r = runif(5)
)

# Select each group by prefix. starts_with() is used rather than contains()
# so that a column that merely contains the tag elsewhere in its name
# (e.g. "location_50_other_important_1" for the tag "other") is not selected.
my_df_var <-
  mydf %>%
  select(starts_with("var"))

# Divide every column of the group by the group's first column (the "*_x"
# column must therefore come first). `[[1]]` extracts it as a plain vector.
# The first column is divided by itself and ends up as all 1s.
my_divided_var_df <-
  my_df_var / my_df_var[[1]]

my_df_other <-
  mydf %>%
  select(starts_with("other"))

my_divided_other_df <-
  my_df_other / my_df_other[[1]]

# Put the two scaled groups back side by side.
my_final_df <-
  bind_cols(my_divided_var_df,
            my_divided_other_df)

my_final_df
var_x var_y var_z other_x other_p other_r
1 1 0.1505216 0.50006694 1 0.01507284 0.04272813
2 1 0.3694496 0.07608916 1 0.03721775 0.07758692
3 1 0.1615257 0.05903999 1 0.04790595 0.00702291
4 1 0.1867266 0.15325190 1 0.06612689 0.03709427
5 1 0.1823187 0.15005917 1 0.02325902 0.05880811
# Keep the original divisors in separate vectors: the "^var" and "^other"
# patterns also match var_x and other_x, so those columns are divided too.
var_x_orig <- mydf[["var_x"]]
other_x_orig <- mydf[["other_x"]]

# Scale one group per step; each step only touches columns matching its prefix.
mydf %>%
  mutate_each(funs(. / var_x_orig), matches("^var")) %>%
  mutate_each(funs(. / other_x_orig), matches("^other"))
这可能会有帮助。如果每个变量名前缀都对应三列(例如,三列 'var' 和三列 'other'),我会使用 lapply()。然后,如有必要,再按列绑定以恢复到原始数据格式。
# mydf
# var_x var_y var_z other_x other_p other_r
#1 1 0.8393539 0.2685360 10 0.82749405 0.77923222
#2 2 0.8966534 0.6157903 11 0.30657267 0.97301619
#3 3 0.7426782 0.6982445 12 0.75195632 0.03107233
#4 4 0.9448537 0.3711827 13 0.68455120 0.45232667
#5 5 0.4848614 0.2108115 14 0.01126723 0.91213041
library(dplyr)
# Index of the first column of each consecutive three-column group.
num <- seq(from = 1, to = ncol(mydf), by = 3)

# Split the data frame into one three-column piece per group.
foo <- lapply(num, function(first) mydf[, first:(first + 2)])

# In every piece, divide columns 2 and 3 by column 1 (column 1 is left
# untouched), then bind the pieces back together column-wise.
foo %>%
  lapply(function(piece) {
    divisor <- piece[, 1]
    piece[, 2] <- piece[, 2] / divisor
    piece[, 3] <- piece[, 3] / divisor
    piece
  }) %>%
  bind_cols()
# var_x var_y var_z other_x other_p other_r
#1 1 0.83935391 0.26853595 10 0.0827494049 0.077923222
#2 2 0.44832669 0.30789516 11 0.0278702429 0.088456017
#3 3 0.24755938 0.23274817 12 0.0626630264 0.002589361
#4 4 0.23621343 0.09279569 13 0.0526577848 0.034794359
#5 5 0.09697229 0.04216230 14 0.0008048022 0.065152172
这个 mymutate 的修改版本即使在数据框结构不规则的情况下也能正常工作(即每组中需要缩放的列数不同)。
# mydf
# var_x var_y var_z other_x other_p other_r
# 1 1 0.1913353 0.4706113 10 0.003120607 0.17808048
# 2 2 0.1620725 0.6228830 11 0.844399758 0.01361841
# 3 3 0.5148884 0.3671178 12 0.996055741 0.33513972
# 4 4 0.8086168 0.3265216 13 0.984819261 0.96802056
# 5 5 0.9902217 0.9087540 14 0.951119864 0.82479090
# Divide every column whose name starts with `type` by the "<type>_x" column
# and return only that group of columns.
#
# data: a data frame that contains a "<type>_x" column.
# type: the name prefix shared by the group, e.g. "var".
#
# Returns the columns matching "^<type>", each divided by "<type>_x"; the
# "<type>_x" column itself is returned with its original values.
# Stops with an error if the "<type>_x" column is missing.
#
# Implemented in base R so that it depends neither on magrittr's `%<>%`
# (which library(dplyr) does not attach) nor on the deprecated
# mutate_each()/funs().
mymutate <- function(data, type) {
  scale_name <- paste0(type, "_x")
  if (!scale_name %in% names(data)) {
    stop("column '", scale_name, "' not found in `data`", call. = FALSE)
  }
  scale_var <- data[[scale_name]]

  # Case-insensitive prefix match, like the default of dplyr's matches().
  in_group <- grepl(paste0("^", type), names(data), ignore.case = TRUE)
  scaled <- data[, in_group, drop = FALSE]
  scaled[] <- lapply(scaled, function(column) column / scale_var)

  # The divisor column was divided by itself above; restore the original.
  scaled[[scale_name]] <- scale_var
  scaled
}
# Scale each group on its own, then bind the per-group results column-wise.
types <- c("var", "other")
types %>%
  lapply(function(type) mymutate(data = mydf, type = type)) %>%
  bind_cols()
# var_x var_y var_z other_x other_p other_r
# 1 1 0.19133528 0.47061133 10 0.0003120607 0.017808048
# 2 2 0.08103626 0.31144148 11 0.0767636144 0.001238037
# 3 3 0.17162946 0.12237259 12 0.0830046451 0.027928310
# 4 4 0.20215421 0.08163039 13 0.0757553278 0.074463120
# 5 5 0.19804435 0.18175081 14 0.0679371332 0.058913635