新数据中的预测变量类型与训练数据中的预测变量类型不匹配
Type of predictors in new data do not match that of the training data
我想在 R 中使用随机森林预测自杀率 (log_suicides_per_100k),我遇到的问题是,当我尝试选择变量的一个级别时,出现错误:
Type of predictors in new data do not match that of the training data.
模型是:
# Fit a random forest with log_suicides_per_100k as the response.
# `train` is the asker's training data frame; it is not defined in this snippet.
rf3 <- randomForest(
  # formula
  log_suicides_per_100k ~ age + sex + log_gdp_per_capita +
    log_population + year,
  data = train, # data
  ntree = 500
)
性别有两个水平:"female"(女性)和 "male"(男性)
年龄有六个水平:"15-24 years"、"25-34 years"、"35-54 years"、
"5-14 years"、"55-74 years"、"75+ years"
# Six-row sample of the data (the format looks like dput() output):
# integer `year`, factors `sex` (2 levels) and `age` (6 levels), and three
# numeric log_* columns. Row names are the original row indices.
structure(list(year = c(2001L, 2004L, 2008L, 2010L, 2004L, 2011L
), sex = structure(c(2L, 2L, 1L, 2L, 2L, 1L), .Label = c("female",
"male"), class = "factor"), age = structure(c(1L, 6L, 3L, 6L,
2L, 3L), .Label = c("15-24 years", "25-34 years", "35-54 years",
"5-14 years", "55-74 years", "75+ years"), class = "factor"),
log_population = c(14.0462476055718, 10.0651811415341,
13.5550389013841,
10.2665669441479, 15.5047227728237, 13.4021140795298),
log_suicides_per_100k = c(2.42657107277504,
4.03069453514564, 2.38508631450579, 4.15261347034608,
2.88480071284671,
0.647103242058539), log_gdp_per_capita = c(7.67786350067821,
9.13701670755734, 11.1338150021447, 9.65117262392164,
7.95472333449791,
8.14177220465645)), row.names = c(7888L, 8465L, 7593L, 8535L,
25159L, 9656L), class = "data.frame")
我想预测 2025 年 75 岁以上男性的自杀率。
# Predict for a single new observation (the call reported to fail).
# `age` and `sex` are supplied here as plain string literals rather than as
# factors built with the training data's levels.
# NOTE(review): presumably this mismatch is what triggers the
# "Type of predictors in new data do not match" error -- confirm.
prediction <- predict(rf3, data.frame (age = '75+ years', sex= 'male', log_gdp_per_capita = 13.082, log_population = 9.393, year = 2025))
下面是一段可以运行的代码。由于您没有给出全部代码,它在您那里不一定能直接运行。关键在于因子(factor)及其水平(levels)必须一致:下面的代码把训练数据中各因子的水平复制过来,并用它们来设置测试数据中对应的因子。
library(randomForest)
# Six-row training sample, spelled out as a regular data.frame() call.
# `sex` and `age` are factors with their full level sets listed explicitly
# (including levels that do not occur in these six rows), because the
# prediction data is later given the same levels.
traindf <- data.frame(
  year = c(2001L, 2004L, 2008L, 2010L, 2004L, 2011L),
  sex = factor(
    c("male", "male", "female", "male", "male", "female"),
    levels = c("female", "male")
  ),
  age = factor(
    c(
      "15-24 years", "75+ years", "35-54 years",
      "75+ years", "25-34 years", "35-54 years"
    ),
    levels = c(
      "15-24 years", "25-34 years", "35-54 years",
      "5-14 years", "55-74 years", "75+ years"
    )
  ),
  log_population = c(
    14.0462476055718, 10.0651811415341, 13.5550389013841,
    10.2665669441479, 15.5047227728237, 13.4021140795298
  ),
  log_suicides_per_100k = c(
    2.42657107277504, 4.03069453514564, 2.38508631450579,
    4.15261347034608, 2.88480071284671, 0.647103242058539
  ),
  log_gdp_per_capita = c(
    7.67786350067821, 9.13701670755734, 11.1338150021447,
    9.65117262392164, 7.95472333449791, 8.14177220465645
  ),
  # Integer row names are kept as the original row indices.
  row.names = c(7888L, 8465L, 7593L, 8535L, 25159L, 9656L)
)
# Fit the forest on `traindf` and predict for one new observation.
# Random forests are fitted with random resampling, so fix the RNG seed to
# make the fit (and therefore the prediction) reproducible.
set.seed(1)
rf3 <- randomForest(
  log_suicides_per_100k ~ age + sex + log_gdp_per_capita +
    log_population + year,
  data = traindf
)

# New observation: male, 75+ years, year 2025.
testdf <- data.frame(
  age = "75+ years",
  sex = "male",
  log_gdp_per_capita = 13.082,
  log_population = 9.393,
  year = 2025
)

# The factor predictors must carry the same levels as in the training data,
# so rebuild them from the training factors' level sets.
testdf$sex <- factor(testdf$sex, levels = levels(traindf$sex))
testdf$age <- factor(testdf$age, levels = levels(traindf$age))

# factor() silently maps a value outside `levels` to NA; fail loudly instead
# of predicting from a missing predictor.
stopifnot(!anyNA(testdf$sex), !anyNA(testdf$age))

prediction <- predict(rf3, testdf)
prediction
# The original, unseeded run printed 3.200609; the exact value depends on
# the seed.
我想在 R 中使用随机森林预测自杀率 (log_suicides_per_100k),我遇到的问题是,当我尝试选择变量的一个级别时,出现错误:
Type of predictors in new data do not match that of the training data.
模型是:
# Fit a random forest with log_suicides_per_100k as the response.
# `train` is the asker's training data frame; it is not defined in this snippet.
rf3 <- randomForest(
  # formula
  log_suicides_per_100k ~ age + sex + log_gdp_per_capita +
    log_population + year,
  data = train, # data
  ntree = 500
)
性别有两个水平:"female"(女性)和 "male"(男性)。年龄有六个水平:"15-24 years"、"25-34 years"、"35-54 years"、"5-14 years"、"55-74 years"、"75+ years"
# Six-row sample of the data (the format looks like dput() output):
# integer `year`, factors `sex` (2 levels) and `age` (6 levels), and three
# numeric log_* columns. Row names are the original row indices.
structure(list(year = c(2001L, 2004L, 2008L, 2010L, 2004L, 2011L
), sex = structure(c(2L, 2L, 1L, 2L, 2L, 1L), .Label = c("female",
"male"), class = "factor"), age = structure(c(1L, 6L, 3L, 6L,
2L, 3L), .Label = c("15-24 years", "25-34 years", "35-54 years",
"5-14 years", "55-74 years", "75+ years"), class = "factor"),
log_population = c(14.0462476055718, 10.0651811415341,
13.5550389013841,
10.2665669441479, 15.5047227728237, 13.4021140795298),
log_suicides_per_100k = c(2.42657107277504,
4.03069453514564, 2.38508631450579, 4.15261347034608,
2.88480071284671,
0.647103242058539), log_gdp_per_capita = c(7.67786350067821,
9.13701670755734, 11.1338150021447, 9.65117262392164,
7.95472333449791,
8.14177220465645)), row.names = c(7888L, 8465L, 7593L, 8535L,
25159L, 9656L), class = "data.frame")
我想预测 2025 年 75 岁以上男性的自杀率。
# Predict for a single new observation (the call reported to fail).
# `age` and `sex` are supplied here as plain string literals rather than as
# factors built with the training data's levels.
# NOTE(review): presumably this mismatch is what triggers the
# "Type of predictors in new data do not match" error -- confirm.
prediction <- predict(rf3, data.frame (age = '75+ years', sex= 'male', log_gdp_per_capita = 13.082, log_population = 9.393, year = 2025))
下面是一段可以运行的代码。由于您没有给出全部代码,它在您那里不一定能直接运行。关键在于因子(factor)及其水平(levels)必须一致:下面的代码把训练数据中各因子的水平复制过来,并用它们来设置测试数据中对应的因子。
library(randomForest)
# Six-row training sample, spelled out as a regular data.frame() call.
# `sex` and `age` are factors with their full level sets listed explicitly
# (including levels that do not occur in these six rows), because the
# prediction data is later given the same levels.
traindf <- data.frame(
  year = c(2001L, 2004L, 2008L, 2010L, 2004L, 2011L),
  sex = factor(
    c("male", "male", "female", "male", "male", "female"),
    levels = c("female", "male")
  ),
  age = factor(
    c(
      "15-24 years", "75+ years", "35-54 years",
      "75+ years", "25-34 years", "35-54 years"
    ),
    levels = c(
      "15-24 years", "25-34 years", "35-54 years",
      "5-14 years", "55-74 years", "75+ years"
    )
  ),
  log_population = c(
    14.0462476055718, 10.0651811415341, 13.5550389013841,
    10.2665669441479, 15.5047227728237, 13.4021140795298
  ),
  log_suicides_per_100k = c(
    2.42657107277504, 4.03069453514564, 2.38508631450579,
    4.15261347034608, 2.88480071284671, 0.647103242058539
  ),
  log_gdp_per_capita = c(
    7.67786350067821, 9.13701670755734, 11.1338150021447,
    9.65117262392164, 7.95472333449791, 8.14177220465645
  ),
  # Integer row names are kept as the original row indices.
  row.names = c(7888L, 8465L, 7593L, 8535L, 25159L, 9656L)
)
# Fit the forest on `traindf` and predict for one new observation.
# Random forests are fitted with random resampling, so fix the RNG seed to
# make the fit (and therefore the prediction) reproducible.
set.seed(1)
rf3 <- randomForest(
  log_suicides_per_100k ~ age + sex + log_gdp_per_capita +
    log_population + year,
  data = traindf
)

# New observation: male, 75+ years, year 2025.
testdf <- data.frame(
  age = "75+ years",
  sex = "male",
  log_gdp_per_capita = 13.082,
  log_population = 9.393,
  year = 2025
)

# The factor predictors must carry the same levels as in the training data,
# so rebuild them from the training factors' level sets.
testdf$sex <- factor(testdf$sex, levels = levels(traindf$sex))
testdf$age <- factor(testdf$age, levels = levels(traindf$age))

# factor() silently maps a value outside `levels` to NA; fail loudly instead
# of predicting from a missing predictor.
stopifnot(!anyNA(testdf$sex), !anyNA(testdf$age))

prediction <- predict(rf3, testdf)
prediction
# The original, unseeded run printed 3.200609; the exact value depends on
# the seed.