在 R 中,predict 返回结果的长度不正确
in R, predict yields wrong length
调用 new.y = predict(model, newx = new.x) 后,new.y 的长度与 new.x 的行数不一致。
代码在这里:
# Reproduce the length mismatch between the predictions and the test set.
install.packages('ISLR')
library(ISLR)
fix(Hitters) # load data
Hitters = na.omit(Hitters) # remove rows containing NA
# Predictor matrix; [ , -1] drops the first column of the model matrix.
x = model.matrix(Salary ~ ., Hitters)[ , -1]
y = Hitters$Salary
set.seed(1)
train = sample(1:nrow(x), nrow(x)/2) # random sample of row indices for training
test = (-train) # negative indices: every row that is not in train
lm.fit = lm(y ~ x, subset=train)
# NOTE: `newx` is not an argument predict() recognises for this fit, so
# x[test,] is ignored and the result does not describe the test rows.
lm.pred = predict( lm.fit, newx = x[test,])
dim(x[test,]) # output 132*19
length(lm.pred) # output 131 (does not match the 132 test rows)
length(y[test]) # output 132
有人知道为什么长度不对吗?谢谢!
更新:错误在于 predict 不识别参数 newx(即 newx = x[test, ] 被忽略),应改用 newdata。
谢谢@Pascal!
为了让它更明显:
# Demonstration: the same split predicted twice, first through the
# unrecognised `newx` argument, then through `newdata`.
install.packages('ISLR')
library(ISLR)
fix(Hitters) # load data
Hitters = na.omit(Hitters) # remove rows containing NA
# Predictor matrix; [ , -1] drops the first column of the model matrix.
x = model.matrix(Salary ~ ., Hitters)[ , -1]
y = Hitters$Salary
set.seed(2)
train = sample(1:nrow(x), 150) # random sampling (specify size for testing)
test = (1:nrow(x))[-train] # positive indices of the rows not in train
# Attempt 1: model on the matrix x, prediction requested through `newx`.
lm.fit = lm(y ~ x, subset=train)
lm.pred = predict( lm.fit, newx = x[test,])
dim(x[test,]) # output 113 19
length(lm.pred) # output 150 - still using training data
# Attempt 2: model on the data frame, prediction requested through `newdata`.
lm.fit = lm(Salary ~ ., data = Hitters, subset = train)
lm.pred = predict( lm.fit, newdata = Hitters[test,])
dim(x[test,]) # output 113 19
length(lm.pred) # output 113
第一段和第二段代码中定义 test 的两种方式应该是等价的。验证如下:
# Check that negated training indices and an explicit complement of the
# training indices select the same elements.
# The "# output" values are the ones recorded by the original author; the
# sampled indices may differ between R versions - not re-verified here.
x <- c("A", "B", "C", "D", "E")
n <- length(x)
set.seed(2)
train <- sample(seq_len(n), n / 2) # random sampling

# Variant 1: negative indices.
test <- -train
test # output -1 -3
x[test] # output "B" "D" "E"

# Variant 2: positive indices of everything outside train.
test <- seq_len(n)[-train]
test # output 2 4 5
x[test] # output "B" "D" "E"
尝试为参数 newdata 提供一个 data.frame,例如:
# Predict on the held-out rows of the predictor matrix `x`.
# The model was fitted as y ~ x with `x` a matrix, so `newdata` has to hold a
# single matrix-valued column called `x`. I() stops data.frame() from
# splitting the matrix into separate x.<name> columns, which would leave the
# model term `x` missing from `newdata`.
# drop = FALSE keeps a one-row test set as a matrix. No dummy response column
# is needed for prediction.
lm.pred <- predict(lm.fit,
                   newdata = data.frame(x = I(x[test, , drop = FALSE])))
此外,我不确定参数 subset 是否如您预期的那样起作用。我会改为在调用 lm 时提供参数 data,例如:
# Fit on the training rows only by subsetting the data handed to lm().
# I() keeps the predictor matrix as one matrix-valued column named `x`.
# Without it data.frame() expands the matrix into x.<name> columns, the
# formula's `x` is then looked up outside `data`, and its rows no longer
# match the subsetted `y`.
lm.fit <- lm(y ~ x,
             data = data.frame(x = I(x), y = y)[train, ])
试试这个:
# Alternative attempt: build explicit train/test data frames before fitting.
install.packages('ISLR')
library(ISLR)
fix(Hitters) # load data
Hitters = na.omit(Hitters) # remove rows containing NA
# NOTE(review): [,-1] drops the first column by position, not Salary by name;
# if Salary is not the first column it stays among the predictors - confirm.
x = Hitters[,-1]
y = Hitters$Salary
set.seed(1)
train = sample(1:nrow(x), nrow(x)/2) # random sampling
test_data <- x[-train,] # rows not drawn for training
y_test <- y[-train]
y_train<-y[train]
train_data <- data.frame(Y= y[train],x[train,]) # response Y plus training predictors
lm.fit = lm(Y ~ ., train_data)
# NOTE(review): this still passes `newx`, the argument the question's update
# reports predict does not recognise; `newdata` is presumably what is needed.
lm.pred = predict( lm.fit, newx = test_data)
dim(test_data) # output 161*19
length(lm.pred) # output 130
length(y_test) # output 161
我猜 lm.pred 的长度差异是由 y 中的缺失值(NA)造成的。
您可以通过以下方式简化:
# Simplified workflow: fit on the data frame itself and predict via `newdata`.
library(ISLR)
Hitters <- na.omit(Hitters) # remove NA

# Split the row indices into a random training half and the remaining rows.
set.seed(1)
n_rows <- nrow(Hitters)
row_ids <- 1:n_rows
train <- sample(row_ids, n_rows / 2) # random sampling
test <- row_ids[-train] # positive indices of the rows left out of train

# Fit on the training rows, then predict for the held-out rows.
lm.fit <- lm(Salary ~ ., data = Hitters, subset = train)
held_out <- Hitters[test, ]
lm.pred <- predict(lm.fit, newdata = held_out)
dim(held_out) # output 132*20
length(lm.pred) # output 132
调用 new.y = predict(model, newx = new.x) 后,new.y 的长度与 new.x 的行数不一致。
代码在这里:
# Reproduce the length mismatch between the predictions and the test set.
install.packages('ISLR')
library(ISLR)
fix(Hitters) # load data
Hitters = na.omit(Hitters) # remove rows containing NA
# Predictor matrix; [ , -1] drops the first column of the model matrix.
x = model.matrix(Salary ~ ., Hitters)[ , -1]
y = Hitters$Salary
set.seed(1)
train = sample(1:nrow(x), nrow(x)/2) # random sample of row indices for training
test = (-train) # negative indices: every row that is not in train
lm.fit = lm(y ~ x, subset=train)
# NOTE: `newx` is not an argument predict() recognises for this fit, so
# x[test,] is ignored and the result does not describe the test rows.
lm.pred = predict( lm.fit, newx = x[test,])
dim(x[test,]) # output 132*19
length(lm.pred) # output 131 (does not match the 132 test rows)
length(y[test]) # output 132
有人知道为什么长度不对吗?谢谢!
更新:错误在于 predict 不识别参数 newx(即 newx = x[test, ] 被忽略),应改用 newdata。
谢谢@Pascal!
为了让它更明显:
# Demonstration: the same split predicted twice, first through the
# unrecognised `newx` argument, then through `newdata`.
install.packages('ISLR')
library(ISLR)
fix(Hitters) # load data
Hitters = na.omit(Hitters) # remove rows containing NA
# Predictor matrix; [ , -1] drops the first column of the model matrix.
x = model.matrix(Salary ~ ., Hitters)[ , -1]
y = Hitters$Salary
set.seed(2)
train = sample(1:nrow(x), 150) # random sampling (specify size for testing)
test = (1:nrow(x))[-train] # positive indices of the rows not in train
# Attempt 1: model on the matrix x, prediction requested through `newx`.
lm.fit = lm(y ~ x, subset=train)
lm.pred = predict( lm.fit, newx = x[test,])
dim(x[test,]) # output 113 19
length(lm.pred) # output 150 - still using training data
# Attempt 2: model on the data frame, prediction requested through `newdata`.
lm.fit = lm(Salary ~ ., data = Hitters, subset = train)
lm.pred = predict( lm.fit, newdata = Hitters[test,])
dim(x[test,]) # output 113 19
length(lm.pred) # output 113
第一段和第二段代码中定义 test 的两种方式应该是等价的。验证如下:
# Check that negated training indices and an explicit complement of the
# training indices select the same elements.
# The "# output" values are the ones recorded by the original author; the
# sampled indices may differ between R versions - not re-verified here.
x <- c("A", "B", "C", "D", "E")
n <- length(x)
set.seed(2)
train <- sample(seq_len(n), n / 2) # random sampling

# Variant 1: negative indices.
test <- -train
test # output -1 -3
x[test] # output "B" "D" "E"

# Variant 2: positive indices of everything outside train.
test <- seq_len(n)[-train]
test # output 2 4 5
x[test] # output "B" "D" "E"
尝试为参数 newdata 提供一个 data.frame,例如:
# Predict on the held-out rows of the predictor matrix `x`.
# The model was fitted as y ~ x with `x` a matrix, so `newdata` has to hold a
# single matrix-valued column called `x`. I() stops data.frame() from
# splitting the matrix into separate x.<name> columns, which would leave the
# model term `x` missing from `newdata`.
# drop = FALSE keeps a one-row test set as a matrix. No dummy response column
# is needed for prediction.
lm.pred <- predict(lm.fit,
                   newdata = data.frame(x = I(x[test, , drop = FALSE])))
此外,我不确定参数 subset 是否如您预期的那样起作用。我会改为在调用 lm 时提供参数 data,例如:
# Fit on the training rows only by subsetting the data handed to lm().
# I() keeps the predictor matrix as one matrix-valued column named `x`.
# Without it data.frame() expands the matrix into x.<name> columns, the
# formula's `x` is then looked up outside `data`, and its rows no longer
# match the subsetted `y`.
lm.fit <- lm(y ~ x,
             data = data.frame(x = I(x), y = y)[train, ])
试试这个:
# Alternative attempt: build explicit train/test data frames before fitting.
install.packages('ISLR')
library(ISLR)
fix(Hitters) # load data
Hitters = na.omit(Hitters) # remove rows containing NA
# NOTE(review): [,-1] drops the first column by position, not Salary by name;
# if Salary is not the first column it stays among the predictors - confirm.
x = Hitters[,-1]
y = Hitters$Salary
set.seed(1)
train = sample(1:nrow(x), nrow(x)/2) # random sampling
test_data <- x[-train,] # rows not drawn for training
y_test <- y[-train]
y_train<-y[train]
train_data <- data.frame(Y= y[train],x[train,]) # response Y plus training predictors
lm.fit = lm(Y ~ ., train_data)
# NOTE(review): this still passes `newx`, the argument the question's update
# reports predict does not recognise; `newdata` is presumably what is needed.
lm.pred = predict( lm.fit, newx = test_data)
dim(test_data) # output 161*19
length(lm.pred) # output 130
length(y_test) # output 161
我猜 lm.pred 的长度差异是由 y 中的缺失值(NA)造成的。
您可以通过以下方式简化:
# Simplified workflow: fit on the data frame itself and predict via `newdata`.
library(ISLR)
Hitters <- na.omit(Hitters) # remove NA

# Split the row indices into a random training half and the remaining rows.
set.seed(1)
n_rows <- nrow(Hitters)
row_ids <- 1:n_rows
train <- sample(row_ids, n_rows / 2) # random sampling
test <- row_ids[-train] # positive indices of the rows left out of train

# Fit on the training rows, then predict for the held-out rows.
lm.fit <- lm(Salary ~ ., data = Hitters, subset = train)
held_out <- Hitters[test, ]
lm.pred <- predict(lm.fit, newdata = held_out)
dim(held_out) # output 132*20
length(lm.pred) # output 132