计算大量变量之间的差异
Calculate the difference between a large number of variables
我正在尝试计算不同列之间的差异。我是用一个循环来完成的,但我知道这不是一个优雅的解决方案,也不是 R 中最好的做法(效率不高);而且我的结果中包含重复或没有意义的运算(例如 disp-disp,或者同时出现 hp_disp 和 disp_hp)。
我的真实数据中有 NA,所以我试着在示例中模拟了这一点。我的目标是改进我的命令,以得到与下面相同的表格。
我的命令示例如下:
# Show the column names available in the built-in mtcars data set.
names(mtcars)
# Simulate missing data: set mpg to NA wherever am equals 1.
mtcars$mpg[mtcars$am==1]=NA
# The same four columns are used on both sides of the subtraction.
vars1= c("mpg","cyl","disp","hp")
vars2= c("mpg","cyl","disp","hp")
df=data.frame()
df_all=data.frame()
# NOTE(review): this overwrites the empty data frame above with a single
# number (the column count of mtcars), so the first cbind() below keeps that
# number, recycled, as an extra first column of the result.
df_all=length(mtcars)
# Visit every ordered pair (i, k). This includes i == k as well as both
# (i, k) and (k, i), which is why the result contains self-differences and
# mirrored, sign-flipped columns.
for(i in vars1){
for(k in vars2) {
# Element-wise difference: column i minus column k (NA where either is NA).
df= mtcars[[i]]-mtcars[[k]]
# Append the new difference as the last column; the result object is
# re-created by cbind() on every pass.
df_all=cbind(df_all, df)
# Index of the column that was just appended (this variable reuses the name
# of base length()).
length =ncol(df_all)
# Label the new column "<i>_<k>".
colnames(df_all)[length]= paste0(i,"_",k)
}
}
# Print the first rows of the combined result.
head(df_all)
disp_mpg disp_cyl disp_disp disp_hp hp_mpg hp_cyl hp_disp hp_hp
[1,] NA 154 0 50 NA 104 -50 0
[2,] NA 154 0 50 NA 104 -50 0
[3,] NA 104 0 15 NA 89 -15 0
[4,] 236.6 252 0 148 88.6 104 -148 0
[5,] 341.3 352 0 185 156.3 167 -185 0
[6,] 206.9 219 0 120 86.9 99 -120 0
这是一种方法,使用 data.table 库
library(data.table)

# Columns whose pairwise differences are wanted.
vars <- c("mpg", "cyl", "disp", "hp")

# Create the table of pairs to diff. Keeping only V1 < V2 drops the
# self-pairs (e.g. disp - disp) and keeps exactly one ordering of each pair
# (hp/disp is kept once, not as both hp_disp and disp_hp).
# The CJ() arguments are named explicitly: since data.table 1.12.0, CJ()
# auto-names its columns after the input expressions, so CJ(vars, vars) would
# produce two columns called "vars" and the V1 / V2 used below would not exist.
to_diff <- CJ(V1 = vars, V2 = vars)[V1 < V2]

# Calculate the diffs: one group per pair, each group holding the row-wise
# difference of the two columns. `[[` extracts the column by its name and
# works whether mtcars is a data.frame or a data.table.
diffs <- to_diff[
  ,
  .(diff_val = mtcars[[V1]] - mtcars[[V2]]),
  by = .(cols = paste0(V1, "_minus_", V2))
]

# Number each row within each "cols" group so it can serve as the row key.
diffs[, rid := rowid(cols)]

# Reshape to wide format: rid determines the row, cols determines the column,
# and the cells hold the values of diff_val.
dcast(diffs, rid ~ cols, value.var = "diff_val")
输出
#
# rid cyl_minus_disp cyl_minus_hp cyl_minus_mpg disp_minus_hp disp_minus_mpg hp_minus_mpg
# 1: 1 -154.0 -104 -15.0 50.0 139.0 89.0
# 2: 2 -154.0 -104 -15.0 50.0 139.0 89.0
# 3: 3 -104.0 -89 -18.8 15.0 85.2 70.2
# 4: 4 -252.0 -104 -15.4 148.0 236.6 88.6
# 5: 5 -352.0 -167 -10.7 185.0 341.3 156.3
# 6: 6 -219.0 -99 -12.1 120.0 206.9 86.9
# 7: 7 -352.0 -237 -6.3 115.0 345.7 230.7
# 8: 8 -142.7 -58 -20.4 84.7 122.3 37.6
# 9: 9 -136.8 -91 -18.8 45.8 118.0 72.2
# 10: 10 -161.6 -117 -13.2 44.6 148.4 103.8
# 11: 11 -161.6 -117 -11.8 44.6 149.8 105.2
# 12: 12 -267.8 -172 -8.4 95.8 259.4 163.6
# 13: 13 -267.8 -172 -9.3 95.8 258.5 162.7
# 14: 14 -267.8 -172 -7.2 95.8 260.6 164.8
# 15: 15 -464.0 -197 -2.4 267.0 461.6 194.6
# 16: 16 -452.0 -207 -2.4 245.0 449.6 204.6
# 17: 17 -432.0 -222 -6.7 210.0 425.3 215.3
# 18: 18 -74.7 -62 -28.4 12.7 46.3 33.6
# 19: 19 -71.7 -48 -26.4 23.7 45.3 21.6
# 20: 20 -67.1 -61 -29.9 6.1 37.2 31.1
# 21: 21 -116.1 -93 -17.5 23.1 98.6 75.5
# 22: 22 -310.0 -142 -7.5 168.0 302.5 134.5
# 23: 23 -296.0 -142 -7.2 154.0 288.8 134.8
# 24: 24 -342.0 -237 -5.3 105.0 336.7 231.7
# 25: 25 -392.0 -167 -11.2 225.0 380.8 155.8
# 26: 26 -75.0 -62 -23.3 13.0 51.7 38.7
# 27: 27 -116.3 -87 -22.0 29.3 94.3 65.0
# 28: 28 -91.1 -109 -26.4 -17.9 64.7 82.6
# 29: 29 -343.0 -256 -7.8 87.0 335.2 248.2
# 30: 30 -139.0 -169 -13.7 -30.0 125.3 155.3
# 31: 31 -293.0 -327 -7.0 -34.0 286.0 320.0
# 32: 32 -117.0 -105 -17.4 12.0 99.6 87.6
# rid cyl_minus_disp cyl_minus_hp cyl_minus_mpg disp_minus_hp disp_minus_mpg hp_minus_mpg
我正在尝试计算不同列之间的差异。我是用一个循环来完成的,但我知道这不是一个优雅的解决方案,也不是 R 中最好的做法(效率不高);而且我的结果中包含重复或没有意义的运算(例如 disp-disp,或者同时出现 hp_disp 和 disp_hp)。
我的真实数据中有 NA,所以我试着在示例中模拟了这一点。我的目标是改进我的命令,以得到与下面相同的表格。
我的命令示例如下:
# Show the column names available in the built-in mtcars data set.
names(mtcars)
# Simulate missing data: set mpg to NA wherever am equals 1.
mtcars$mpg[mtcars$am==1]=NA
# The same four columns are used on both sides of the subtraction.
vars1= c("mpg","cyl","disp","hp")
vars2= c("mpg","cyl","disp","hp")
df=data.frame()
df_all=data.frame()
# NOTE(review): this overwrites the empty data frame above with a single
# number (the column count of mtcars), so the first cbind() below keeps that
# number, recycled, as an extra first column of the result.
df_all=length(mtcars)
# Visit every ordered pair (i, k). This includes i == k as well as both
# (i, k) and (k, i), which is why the result contains self-differences and
# mirrored, sign-flipped columns.
for(i in vars1){
for(k in vars2) {
# Element-wise difference: column i minus column k (NA where either is NA).
df= mtcars[[i]]-mtcars[[k]]
# Append the new difference as the last column; the result object is
# re-created by cbind() on every pass.
df_all=cbind(df_all, df)
# Index of the column that was just appended (this variable reuses the name
# of base length()).
length =ncol(df_all)
# Label the new column "<i>_<k>".
colnames(df_all)[length]= paste0(i,"_",k)
}
}
# Print the first rows of the combined result.
head(df_all)
disp_mpg disp_cyl disp_disp disp_hp hp_mpg hp_cyl hp_disp hp_hp
[1,] NA 154 0 50 NA 104 -50 0
[2,] NA 154 0 50 NA 104 -50 0
[3,] NA 104 0 15 NA 89 -15 0
[4,] 236.6 252 0 148 88.6 104 -148 0
[5,] 341.3 352 0 185 156.3 167 -185 0
[6,] 206.9 219 0 120 86.9 99 -120 0
这是一种方法,使用 data.table 库
library(data.table)

# Columns whose pairwise differences are wanted.
vars <- c("mpg", "cyl", "disp", "hp")

# Create the table of pairs to diff. Keeping only V1 < V2 drops the
# self-pairs (e.g. disp - disp) and keeps exactly one ordering of each pair
# (hp/disp is kept once, not as both hp_disp and disp_hp).
# The CJ() arguments are named explicitly: since data.table 1.12.0, CJ()
# auto-names its columns after the input expressions, so CJ(vars, vars) would
# produce two columns called "vars" and the V1 / V2 used below would not exist.
to_diff <- CJ(V1 = vars, V2 = vars)[V1 < V2]

# Calculate the diffs: one group per pair, each group holding the row-wise
# difference of the two columns. `[[` extracts the column by its name and
# works whether mtcars is a data.frame or a data.table.
diffs <- to_diff[
  ,
  .(diff_val = mtcars[[V1]] - mtcars[[V2]]),
  by = .(cols = paste0(V1, "_minus_", V2))
]

# Number each row within each "cols" group so it can serve as the row key.
diffs[, rid := rowid(cols)]

# Reshape to wide format: rid determines the row, cols determines the column,
# and the cells hold the values of diff_val.
dcast(diffs, rid ~ cols, value.var = "diff_val")
输出
#
# rid cyl_minus_disp cyl_minus_hp cyl_minus_mpg disp_minus_hp disp_minus_mpg hp_minus_mpg
# 1: 1 -154.0 -104 -15.0 50.0 139.0 89.0
# 2: 2 -154.0 -104 -15.0 50.0 139.0 89.0
# 3: 3 -104.0 -89 -18.8 15.0 85.2 70.2
# 4: 4 -252.0 -104 -15.4 148.0 236.6 88.6
# 5: 5 -352.0 -167 -10.7 185.0 341.3 156.3
# 6: 6 -219.0 -99 -12.1 120.0 206.9 86.9
# 7: 7 -352.0 -237 -6.3 115.0 345.7 230.7
# 8: 8 -142.7 -58 -20.4 84.7 122.3 37.6
# 9: 9 -136.8 -91 -18.8 45.8 118.0 72.2
# 10: 10 -161.6 -117 -13.2 44.6 148.4 103.8
# 11: 11 -161.6 -117 -11.8 44.6 149.8 105.2
# 12: 12 -267.8 -172 -8.4 95.8 259.4 163.6
# 13: 13 -267.8 -172 -9.3 95.8 258.5 162.7
# 14: 14 -267.8 -172 -7.2 95.8 260.6 164.8
# 15: 15 -464.0 -197 -2.4 267.0 461.6 194.6
# 16: 16 -452.0 -207 -2.4 245.0 449.6 204.6
# 17: 17 -432.0 -222 -6.7 210.0 425.3 215.3
# 18: 18 -74.7 -62 -28.4 12.7 46.3 33.6
# 19: 19 -71.7 -48 -26.4 23.7 45.3 21.6
# 20: 20 -67.1 -61 -29.9 6.1 37.2 31.1
# 21: 21 -116.1 -93 -17.5 23.1 98.6 75.5
# 22: 22 -310.0 -142 -7.5 168.0 302.5 134.5
# 23: 23 -296.0 -142 -7.2 154.0 288.8 134.8
# 24: 24 -342.0 -237 -5.3 105.0 336.7 231.7
# 25: 25 -392.0 -167 -11.2 225.0 380.8 155.8
# 26: 26 -75.0 -62 -23.3 13.0 51.7 38.7
# 27: 27 -116.3 -87 -22.0 29.3 94.3 65.0
# 28: 28 -91.1 -109 -26.4 -17.9 64.7 82.6
# 29: 29 -343.0 -256 -7.8 87.0 335.2 248.2
# 30: 30 -139.0 -169 -13.7 -30.0 125.3 155.3
# 31: 31 -293.0 -327 -7.0 -34.0 286.0 320.0
# 32: 32 -117.0 -105 -17.4 12.0 99.6 87.6
# rid cyl_minus_disp cyl_minus_hp cyl_minus_mpg disp_minus_hp disp_minus_mpg hp_minus_mpg