R - Predict(),重命名列,以及警告“'newdata' 有 10 行但找到的变量有 20 行”

R - Predict(), renaming columns, and "'newdata' had 10 rows but variables found have 20 rows"

在其他帖子中,我看到人们提供了针对具体问题的解决方案,但我不明白出现问题的根本原因。

我执行了以下代码...

# Fit a logistic regression of the first label column on nine predictors.
# No `data` argument is given, so A..I are taken from the environment in which
# the formula is evaluated rather than from a data frame.
modTest = glm( trainLabels[,1] ~ A + B + C + 
           D + E + F + G + 
            H + I, family=binomial(link='logit') )

以上是 20 个标签,9 个向量,每个向量有 20 个值。

然后我尝试预测 10 个未见过的例子。这是 10 行,9 个特征,相同的顺序。

# Predict for the 10 unseen rows; predict() matches the model's variables to
# the columns of `newdata` (here testFeatures) by name.
preds = predict( modTest, testFeatures )

我收到错误...

Warning message:
'newdata' had 10 rows but variables found have 20 rows 

编辑:简化、删除长特征名称等

> names(trainFeatures)
[1] "Neg"  "Pos"  "Num"  "UN"   "UNA"  "UNUA" "UP"   "UPA"  "UPUA"
names(testFeatures)
[1] "Neg"  "Pos"  "Num"  "UN"   "UNA"  "UNUA" "UP"   "UPA"  "UPUA"

编辑:Dputs...

为了使用输出,我所做的是...

 # The whole feature matrix enters the formula as a single term named
 # `as.matrix(trainFeatures)`; testFeatures has no column by that name, which is
 # consistent with the row-count warning shown below.
 modTest = glm( trainLabels[,1] ~ as.matrix(trainFeatures) )
 preds = predict( modTest, testFeatures )

Warning message:
'newdata' had 10 rows but variables found have 20 rows 

不确定为什么我仍然收到该警告。

  dput(trainLabels)
    structure(list(Neg = c(1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 
0, 0, 0, 1, 1, 1, 0), Pos = c(1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 
1, 1, 0, 0, 0, 1, 1, 1, 0), Num = c(1, 1, 0, 0, 0, 0, 1, 0, 0, 
0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UN = c(1, 1, 0, 0, 0, 0, 1, 
0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UNA = c(1, 1, 0, 0, 0, 
0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UNUA = c(1, 1, 
0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UP = c(1, 
1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UPA = c(1, 
1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UPUA = c(1, 
1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0)), .Names = c("Neg", 
"Pos", "Num", "UN", "UNA", "UNUA", "UP", "UPA", "UPUA"), row.names = c(NA, 
-20L), class = "data.frame")
    dput(trainFeatures)
    structure(list(Neg = c(39106, 44664, 114130, 26526, 22122, 19175, 
    29438, 17741, 17589, 20666, 66024, 168336, 86283, 74826, 88998, 
    75756, 16041, 17087, 15235, 16659), Pos = c(16129, 21064, 57730, 
    10314, 18105, 16837, 19300, 16873, 13681, 18414, 27148, 120497, 
    60031, 49016, 59250, 36264, 15786, 16315, 14556, 16057), Num = c(82994, 
    121367, 306842, 55458, 69148, 63167, 85891, 58674, 55874, 67505, 
    152475, 427106, 221043, 190043, 223744, 177388, 51657, 54883, 

    48378, 54115), UN = c(32343, 35433, 74835, 22271, 17686, 15498, 
    22416, 14238, 14078, 16800, 54636, 121211, 68079, 59913, 70884, 
    61408, 13221, 14114, 12647, 13487), UNA = c(95.1499874, 95.0987263, 
    95.3942596, 95.5444865, 113.1263844, 112.3827424, 111.2684513, 
    113.2184128, 112.4336258, 114.1739588, 113.5086472, 111.6715378, 
    112.2842917, 111.9490612, 113.6465561, 111.5254103, 112.2179148, 
    111.2933853, 112.9056117, 113.1511475), UNUA = c(-94.4280737, 
    -94.5019854, -94.9246672, -95.0379578, -113.2247115, -112.3497485, 
    -111.1631387, -113.2051289, -112.1822898, -114.0431466, -113.7435412, 
    -111.6226818, -112.4077795, -111.9886653, -113.8072166, -111.6138577, 
    -113.0855995, -112.3075275, -114.2628431, -114.1088453), UP = c(10384, 
    13015, 24470, 6891, 13445, 12852, 13008, 13093, 9878, 14272, 
    14938, 77058, 40595, 32518, 39889, 21424, 8322, 8451, 7440, 8071
    ), UPA = c(58.6289931, 57.73430079, 61.3480343, 57.8297594, 62.1749994, 
    65.1140073, 62.619361, 63.6791219, 63.412582, 65.1856906, 45.18365794, 
    71.32918265, 56.04488913, 58.13008276, 53.16603128, 50.36242011, 
    64.6742956, 64.0982314, 63.4422878, 64.24099034), UPUA = c(88.9216885, 
    88.3012858, 88.1996008, 88.9910129, 91.0232669, 89.4524702, 91.9122816, 
    89.8549338, 90.6487273, 88.2063941, 99.9573821, 109.9128868, 
    103.7989926, 104.0274764, 103.4209936, 101.5065677, 85.8110039, 
    87.0786241, 86.1020646, 86.8835026)), .Names = c("Neg", "Pos", 
    "Num", "UN", "UNA", "UNUA", "UP", "UPA", "UPUA"), row.names = c(NA, 
    -20L), class = "data.frame")
    dput(testLabels)
    structure(list(Neg = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), Pos = c(0, 
    1, 1, 1, 0, 1, 1, 1, 1, 1), Num = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 
    1), UN = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), UNA = c(0, 1, 1, 1, 
    0, 1, 1, 1, 1, 1), UNUA = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), UP = c(0, 
    1, 1, 1, 0, 1, 1, 1, 1, 1), UPA = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 
    1), UPUA = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1)), .Names = c("Neg", 
"Pos", "Num", "UN", "UNA", "UNUA", "UP", "UPA", "UPUA"), row.names = c(NA, 
-10L), class = "data.frame")
    > dput(testFeatures)
    structure(list(Neg = c(51404, 32447, 24642, 95979, 15743, 90843, 
    13813, 11496, 12871, 13546), Pos = c(23350, 13525, 19941, 49984, 
    10867, 64404, 13324, 11302, 12918, 13118), Num = c(121342, 68160, 
77219, 248890, 49259, 232645, 43707, 35674, 40734, 42979), UN = c(40766, 
27363, 19590, 71772, 12615, 71496, 11529, 9739, 10810, 11346), 
    UNA = c(95.2486872, 93.4642772, 111.3853297, 112.6770471, 
    110.0845355, 113.6696598, 111.8409793, 116.0476022, 120.3481302, 
    111.9496978), UNUA = c(-94.6150698, -92.5605373, -111.1994432, 
    -112.4947319, -109.7130777, -113.8083912, -112.5678322, -116.5407619, 
    -121.4756386, -113.4991191), UP = c(14285, 9043, 14862, 31626, 
    7491, 43903, 7021, 5559, 6149, 6789), UPA = c(61.25585053, 
    62.6231081, 64.191128, 64.6397131, 63.4911744, 58.4792454, 
    63.5063289, 60.5667637, 60.3857056, 64.1569975), UPUA = c(88.4605419, 
    88.2790682, 90.0217465, 88.8441004, 91.0222662, 105.0494229, 
    85.8914139, 86.7685668, 84.8304901, 86.9786109)), .Names = c("Neg", 
"Pos", "Num", "UN", "UNA", "UNUA", "UP", "UPA", "UPUA"), row.names = c(NA, 
-10L), class = "data.frame")

所以,我使用您提供的所有数据运行了代码,并得到了正常的结果。这是模型拟合:

# Fit the logistic regression using bare column names in the formula and
# supply the training features through `data`. Because the model terms are
# plain column names (Neg, Pos, ...), predict() can later find the same
# columns in a `newdata` data frame such as testFeatures.
# `data` must be the training data frame from the question (`trainFeatures`).
modTest <- glm(
  trainLabels[, 1] ~ Neg + Pos + Num + UN + UNA + UNUA + UP + UPA + UPUA,
  family = binomial(link = "logit"),
  data = trainFeatures
)

以下是测试数据的预测值:

predict( modTest, testFeatures)
         1          2          3          4          5          6          7          8 
 4.6711576 -1.3572345 -2.0639104 18.7625539 -7.6961149  0.4317324 -0.8983256 -8.2052158 
         9         10 
-1.5968013 10.8357174 

注意:替代规范可以是这样的:

# Alternative (problematic) specification: every predictor is written as
# `trainFeatues$<col>`, so the fitted terms carry that prefix in their names
# and are not looked up as plain columns of `newdata` by predict().
# NOTE(review): `trainFeatues` looks like a typo for `trainFeatures` — confirm.
modTest = glm(trainLabels[,1] ~ trainFeatues$Neg + trainFeatues$Pos +
                   trainFeatues$Num + trainFeatues$UN + trainFeatues$UNA +
                   trainFeatues$UNUA + trainFeatues$UP + trainFeatues$UPA + 
                   trainFeatues$UPUA, family=binomial(link='logit'))

但是拟合模型如下:

modTest$coefficients
      (Intercept)  trainFeatues$Neg  trainFeatues$Pos  trainFeatues$Num   trainFeatues$UN 
     4.027803e+01      8.874801e-04     -3.000123e-03      1.277138e-04     -4.521793e-04 
 trainFeatues$UNA trainFeatues$UNUA   trainFeatues$UP  trainFeatues$UPA trainFeatues$UPUA 
    -1.519463e+01     -1.480503e+01      2.930261e-03      4.741432e-01     -3.690940e-01 

当您输入测试数据进行预测时,这会导致问题:上面拟合出的特征名称(例如 trainFeatues$Neg)与传给 predict() 的新数据中的列名(例如 Neg)不匹配,于是 predict() 找不到对应的变量,转而使用原来 20 行的训练数据。结果是:

predict( modTest, testFeatures)
          1           2           3           4           5           6           7 
 0.21651890  3.23450117 -2.16298672 -0.06949967 -0.91026504 -0.91484739 -1.69209826 
          8           9          10          11          12          13          14 
-2.45603982 -6.35855600 -1.84871546 -0.25027815  2.72625440 -0.50422297 -1.76701963 
         15          16          17          18          19          20 
 0.05033351  0.65101666  0.27680835  1.79176029  6.79618311 -0.16186455 
Warning message:
'newdata' had 10 rows but variables found have 20 rows