消除高度相关的列但保留我的非数字列
Eliminate highly correlated columns but conserving my non-numerical columns
我有一个包含分类变量和数值变量的数据框。我想去掉高度相关的变量。
所以,首先我去掉了我所有的分类列,制作了矩阵并找出哪些数字列需要删除。现在我想回去,也有我的分类变量列。
而且我对最简单的方法感到困惑......也许是一个连接选择我想从第二个添加的列table......
library(caret)
training_2 <- subset(training, select = -c(user_name ,timestamp, etc))
corr_matrix <- cor(training_2)
highCorr <- findCorrelation(corr_matrix, .75)
training_2<- training_2[,-highCorr]
编辑:数据
structure(list(X = 1:15, yaw_belt = c(-94.4, -94.4, -94.4, -94.4,
-94.4, -94.4, -94.4, -94.4, -94.4, -94.4, -94.4, -94.4, -94.4,
-94.4, -94.4), gyros_belt_x = c(0, 0.02, 0, 0.02, 0.02, 0.02,
0.02, 0.02, 0.02, 0.03, 0.03, 0.02, 0.02, 0.02, 0), gyros_belt_y = c(0,
0, 0, 0, 0.02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), gyros_belt_z = c(-0.02,
-0.02, -0.02, -0.03, -0.02, -0.02, -0.02, -0.02, -0.02, 0, -0.02,
-0.02, 0, -0.02, 0), magnet_belt_x = c(-3L, -7L, -2L, -6L, -6L,
0L, -4L, -2L, 1L, -3L, -5L, -2L, -3L, -8L, -1L), magnet_belt_y = c(599L,
608L, 600L, 604L, 600L, 603L, 599L, 603L, 602L, 609L, 596L, 602L,
606L, 598L, 597L), roll_arm = c(-128, -128, -128, -128, -128,
-128, -128, -128, -128, -128, -128, -128, -128, -128, -129),
pitch_arm = c(22.5, 22.5, 22.5, 22.1, 22.1, 22, 21.9, 21.8,
21.7, 21.6, 21.5, 21.5, 21.4, 21.4, 21.4), yaw_arm = c(-161,
-161, -161, -161, -161, -161, -161, -161, -161, -161, -161,
-161, -161, -161, -161), total_accel_arm = c(34L, 34L, 34L,
34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L
), gyros_arm_y = c(0, -0.02, -0.02, -0.03, -0.03, -0.03,
-0.03, -0.02, -0.03, -0.03, -0.03, -0.03, -0.02, 0, 0), gyros_arm_z = c(-0.02,
-0.02, -0.02, 0.02, 0, 0, 0, 0, -0.02, -0.02, 0, 0, -0.02,
-0.03, -0.03), magnet_arm_x = c(-368L, -369L, -368L, -372L,
-374L, -369L, -373L, -372L, -369L, -376L, -366L, -363L, -372L,
-371L, -374L), magnet_arm_z = c(516L, 513L, 513L, 512L, 506L,
513L, 509L, 510L, 518L, 516L, 509L, 520L, 509L, 523L, 510L
), roll_dumbbell = c(13.05217456, 13.13073959, 12.85074981,
13.43119971, 13.37871611, 13.38245941, 13.12694911, 12.75083041,
13.15463353, 13.33033728, 13.13073959, 13.10320545, 13.38245941,
13.41047767, 13.07948887), pitch_dumbbell = c(-70.49400371,
-70.63750507, -70.27811982, -70.39379464, -70.42855971, -70.81758832,
-70.24756905, -70.34768359, -70.42520377, -70.85058796, -70.63750507,
-70.45974712, -70.81758832, -70.99594236, -70.67116245),
yaw_dumbbell = c(-84.87393888, -84.71064711, -85.14078134,
-84.87362553, -84.85305745, -84.46500278, -85.09961258, -85.09708174,
-84.91563379, -84.44601983, -84.71064711, -84.89472246, -84.46500278,
-84.28004856, -84.69053461), total_accel_dumbbell = c(37L,
37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L,
37L, 37L), gyros_dumbbell_y = c(-0.02, -0.02, -0.02, -0.02,
-0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0.02,
-0.02, -0.02), magnet_dumbbell_z = c(-65, -64, -63, -60,
-68, -66, -70, -74, -65, -69, -64, -65, -69, -68, -63), roll_forearm = c(28.4,
28.3, 28.3, 28.1, 28, 27.9, 27.9, 27.8, 27.7, 27.7, 27.6,
27.5, 27.2, 27.2, 27.2), pitch_forearm = c(-63.9, -63.9,
-63.9, -63.9, -63.9, -63.9, -63.9, -63.8, -63.8, -63.8, -63.8,
-63.8, -63.9, -63.9, -63.9), yaw_forearm = c(-153, -153,
-152, -152, -152, -152, -152, -152, -152, -152, -152, -152,
-151, -151, -151), total_accel_forearm = c(36L, 36L, 36L,
36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L
), gyros_forearm_x = c(0.03, 0.02, 0.03, 0.02, 0.02, 0.02,
0.02, 0.02, 0.03, 0.02, 0.02, 0.02, 0, 0, 0), gyros_forearm_z = c(-0.02,
-0.02, 0, 0, -0.02, -0.03, -0.02, 0, -0.02, -0.02, -0.02,
-0.03, -0.03, -0.03, -0.02), accel_forearm_x = c(192L, 192L,
196L, 189L, 189L, 193L, 195L, 193L, 193L, 190L, 193L, 191L,
193L, 193L, 192L), accel_forearm_z = c(-215L, -216L, -213L,
-214L, -214L, -215L, -215L, -213L, -214L, -215L, -214L, -215L,
-215L, -214L, -214L), magnet_forearm_x = c(-17L, -18L, -18L,
-16L, -17L, -9L, -18L, -9L, -16L, -22L, -17L, -11L, -15L,
-14L, -16L), magnet_forearm_y = c(654, 661, 658, 658, 655,
660, 659, 660, 653, 656, 657, 657, 655, 659, 656), magnet_forearm_z = c(476,
473, 469, 469, 473, 478, 470, 474, 476, 473, 465, 478, 472,
478, 472)), .Names = c("X", "yaw_belt", "gyros_belt_x", "gyros_belt_y",
"gyros_belt_z", "magnet_belt_x", "magnet_belt_y", "roll_arm",
"pitch_arm", "yaw_arm", "total_accel_arm", "gyros_arm_y", "gyros_arm_z",
"magnet_arm_x", "magnet_arm_z", "roll_dumbbell", "pitch_dumbbell",
"yaw_dumbbell", "total_accel_dumbbell", "gyros_dumbbell_y", "magnet_dumbbell_z",
"roll_forearm", "pitch_forearm", "yaw_forearm", "total_accel_forearm",
"gyros_forearm_x", "gyros_forearm_z", "accel_forearm_x", "accel_forearm_z",
"magnet_forearm_x", "magnet_forearm_y", "magnet_forearm_z"), row.names = c(NA,
15L), class = "data.frame")
谢谢
不幸的是我无法使用你的数据@mclzc,因为其中一列只有一个值,当我尝试计算相关性时它 returns 出错了,但我会提供一个样本数据设置。
library(caret)
#sample data, 4 numeric fields and 2 categorical
df <- data.frame(a=runif(10), b=runif(10), c=1:10, d=1:10, e=letters[1:10], f=letters[1:10])
#categorical columns
cat_cols <- c('e', 'f')
#remove categorical
df2 <- df[!names(df) %in% cat_cols]
#run correlations
cor_mat <- cor(df2)
#the index of the columns to be removed because they have a high correlation
index <- findCorrelation(cor_mat, .75)
#the name of the columns chosen above
to_be_removed <- colnames(cor_mat)[index]
#now go back to df and use to_be_removed to subset the original df
df[!names(df) %in% to_be_removed]
输出(按照 findCorrelations 的指示仅删除列 c,而分类列 e 和 f 仍保留):
a b d e f
1 0.8495639 0.52388246 1 a a
2 0.9478409 0.11466655 2 b b
3 0.9086955 0.12097760 3 c c
4 0.7724303 0.03160203 4 d d
5 0.1269406 0.10740182 5 e e
6 0.7454943 0.95892265 6 f f
7 0.2651961 0.15223472 7 g g
8 0.5714763 0.49808509 8 h h
9 0.1226760 0.06088693 9 i i
10 0.5526266 0.14715471 10 j j
注意上面的 findCorrelation
函数 returns 要删除的列的索引而不是名称。您需要有名称,以便可以将其从原始数据集中排除(您不能使用索引作为排除分类列后更改的列数)。所以,在我上面的代码中,我使用 index
找到列名,然后用它来删除不需要的列(保留分类列)
我有一个包含分类变量和数值变量的数据框。我想去掉高度相关的变量。
所以,首先我去掉了我所有的分类列,制作了矩阵并找出哪些数字列需要删除。现在我想回去,也有我的分类变量列。
而且我对最简单的方法感到困惑......也许是一个连接选择我想从第二个添加的列table......
library(caret)
training_2 <- subset(training, select = -c(user_name ,timestamp, etc))
corr_matrix <- cor(training_2)
highCorr <- findCorrelation(corr_matrix, .75)
training_2<- training_2[,-highCorr]
编辑:数据
structure(list(X = 1:15, yaw_belt = c(-94.4, -94.4, -94.4, -94.4,
-94.4, -94.4, -94.4, -94.4, -94.4, -94.4, -94.4, -94.4, -94.4,
-94.4, -94.4), gyros_belt_x = c(0, 0.02, 0, 0.02, 0.02, 0.02,
0.02, 0.02, 0.02, 0.03, 0.03, 0.02, 0.02, 0.02, 0), gyros_belt_y = c(0,
0, 0, 0, 0.02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), gyros_belt_z = c(-0.02,
-0.02, -0.02, -0.03, -0.02, -0.02, -0.02, -0.02, -0.02, 0, -0.02,
-0.02, 0, -0.02, 0), magnet_belt_x = c(-3L, -7L, -2L, -6L, -6L,
0L, -4L, -2L, 1L, -3L, -5L, -2L, -3L, -8L, -1L), magnet_belt_y = c(599L,
608L, 600L, 604L, 600L, 603L, 599L, 603L, 602L, 609L, 596L, 602L,
606L, 598L, 597L), roll_arm = c(-128, -128, -128, -128, -128,
-128, -128, -128, -128, -128, -128, -128, -128, -128, -129),
pitch_arm = c(22.5, 22.5, 22.5, 22.1, 22.1, 22, 21.9, 21.8,
21.7, 21.6, 21.5, 21.5, 21.4, 21.4, 21.4), yaw_arm = c(-161,
-161, -161, -161, -161, -161, -161, -161, -161, -161, -161,
-161, -161, -161, -161), total_accel_arm = c(34L, 34L, 34L,
34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L
), gyros_arm_y = c(0, -0.02, -0.02, -0.03, -0.03, -0.03,
-0.03, -0.02, -0.03, -0.03, -0.03, -0.03, -0.02, 0, 0), gyros_arm_z = c(-0.02,
-0.02, -0.02, 0.02, 0, 0, 0, 0, -0.02, -0.02, 0, 0, -0.02,
-0.03, -0.03), magnet_arm_x = c(-368L, -369L, -368L, -372L,
-374L, -369L, -373L, -372L, -369L, -376L, -366L, -363L, -372L,
-371L, -374L), magnet_arm_z = c(516L, 513L, 513L, 512L, 506L,
513L, 509L, 510L, 518L, 516L, 509L, 520L, 509L, 523L, 510L
), roll_dumbbell = c(13.05217456, 13.13073959, 12.85074981,
13.43119971, 13.37871611, 13.38245941, 13.12694911, 12.75083041,
13.15463353, 13.33033728, 13.13073959, 13.10320545, 13.38245941,
13.41047767, 13.07948887), pitch_dumbbell = c(-70.49400371,
-70.63750507, -70.27811982, -70.39379464, -70.42855971, -70.81758832,
-70.24756905, -70.34768359, -70.42520377, -70.85058796, -70.63750507,
-70.45974712, -70.81758832, -70.99594236, -70.67116245),
yaw_dumbbell = c(-84.87393888, -84.71064711, -85.14078134,
-84.87362553, -84.85305745, -84.46500278, -85.09961258, -85.09708174,
-84.91563379, -84.44601983, -84.71064711, -84.89472246, -84.46500278,
-84.28004856, -84.69053461), total_accel_dumbbell = c(37L,
37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L,
37L, 37L), gyros_dumbbell_y = c(-0.02, -0.02, -0.02, -0.02,
-0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0.02,
-0.02, -0.02), magnet_dumbbell_z = c(-65, -64, -63, -60,
-68, -66, -70, -74, -65, -69, -64, -65, -69, -68, -63), roll_forearm = c(28.4,
28.3, 28.3, 28.1, 28, 27.9, 27.9, 27.8, 27.7, 27.7, 27.6,
27.5, 27.2, 27.2, 27.2), pitch_forearm = c(-63.9, -63.9,
-63.9, -63.9, -63.9, -63.9, -63.9, -63.8, -63.8, -63.8, -63.8,
-63.8, -63.9, -63.9, -63.9), yaw_forearm = c(-153, -153,
-152, -152, -152, -152, -152, -152, -152, -152, -152, -152,
-151, -151, -151), total_accel_forearm = c(36L, 36L, 36L,
36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L
), gyros_forearm_x = c(0.03, 0.02, 0.03, 0.02, 0.02, 0.02,
0.02, 0.02, 0.03, 0.02, 0.02, 0.02, 0, 0, 0), gyros_forearm_z = c(-0.02,
-0.02, 0, 0, -0.02, -0.03, -0.02, 0, -0.02, -0.02, -0.02,
-0.03, -0.03, -0.03, -0.02), accel_forearm_x = c(192L, 192L,
196L, 189L, 189L, 193L, 195L, 193L, 193L, 190L, 193L, 191L,
193L, 193L, 192L), accel_forearm_z = c(-215L, -216L, -213L,
-214L, -214L, -215L, -215L, -213L, -214L, -215L, -214L, -215L,
-215L, -214L, -214L), magnet_forearm_x = c(-17L, -18L, -18L,
-16L, -17L, -9L, -18L, -9L, -16L, -22L, -17L, -11L, -15L,
-14L, -16L), magnet_forearm_y = c(654, 661, 658, 658, 655,
660, 659, 660, 653, 656, 657, 657, 655, 659, 656), magnet_forearm_z = c(476,
473, 469, 469, 473, 478, 470, 474, 476, 473, 465, 478, 472,
478, 472)), .Names = c("X", "yaw_belt", "gyros_belt_x", "gyros_belt_y",
"gyros_belt_z", "magnet_belt_x", "magnet_belt_y", "roll_arm",
"pitch_arm", "yaw_arm", "total_accel_arm", "gyros_arm_y", "gyros_arm_z",
"magnet_arm_x", "magnet_arm_z", "roll_dumbbell", "pitch_dumbbell",
"yaw_dumbbell", "total_accel_dumbbell", "gyros_dumbbell_y", "magnet_dumbbell_z",
"roll_forearm", "pitch_forearm", "yaw_forearm", "total_accel_forearm",
"gyros_forearm_x", "gyros_forearm_z", "accel_forearm_x", "accel_forearm_z",
"magnet_forearm_x", "magnet_forearm_y", "magnet_forearm_z"), row.names = c(NA,
15L), class = "data.frame")
谢谢
不幸的是我无法使用你的数据@mclzc,因为其中一列只有一个值,当我尝试计算相关性时它 returns 出错了,但我会提供一个样本数据设置。
library(caret)
#sample data, 4 numeric fields and 2 categorical
df <- data.frame(a=runif(10), b=runif(10), c=1:10, d=1:10, e=letters[1:10], f=letters[1:10])
#categorical columns
cat_cols <- c('e', 'f')
#remove categorical
df2 <- df[!names(df) %in% cat_cols]
#run correlations
cor_mat <- cor(df2)
#the index of the columns to be removed because they have a high correlation
index <- findCorrelation(cor_mat, .75)
#the name of the columns chosen above
to_be_removed <- colnames(cor_mat)[index]
#now go back to df and use to_be_removed to subset the original df
df[!names(df) %in% to_be_removed]
输出(按照 findCorrelations 的指示仅删除列 c,而分类列 e 和 f 仍保留):
a b d e f
1 0.8495639 0.52388246 1 a a
2 0.9478409 0.11466655 2 b b
3 0.9086955 0.12097760 3 c c
4 0.7724303 0.03160203 4 d d
5 0.1269406 0.10740182 5 e e
6 0.7454943 0.95892265 6 f f
7 0.2651961 0.15223472 7 g g
8 0.5714763 0.49808509 8 h h
9 0.1226760 0.06088693 9 i i
10 0.5526266 0.14715471 10 j j
注意上面的 findCorrelation
函数 returns 要删除的列的索引而不是名称。您需要有名称,以便可以将其从原始数据集中排除(您不能使用索引作为排除分类列后更改的列数)。所以,在我上面的代码中,我使用 index
找到列名,然后用它来删除不需要的列(保留分类列)