测试数据在执行 logit 时具有新级别，但在 C5 中进行预测时不会出错

Question

我不知道 2 模型如何处理因子水平，但 Logit 不会预测并给出一条错误消息，说明新的因子水平。当我预测使用 C5 时它工作正常。我已经从单个数据帧创建了训练和测试，并且两者的级别相互匹配。

我正在寻求对此行为的解释和解决方案。我知道测试中的新级别无法计算它们的系数，但我认为将它们设置为 NULL 应该没问题。

这是一些代码。我用它来匹配保持和训练的水平。 tr=数据集被分成训练和测试。

tr=structure(
        list(
            production_year = c(
                2007L, 2010L, 2010L, 2008L,
                2007L, 2008L, 2008L, 2008L, 2007L, 2011L, 2009L, 2009L, 2009L,
                2008L, 2007L, 2007L, 2010L, 2009L, 2008L, 2008L, 2010L, 2010L,
                2007L, 2010L, 2009L, 2008L, 2007L, 2007L, 2008L, 2007L, 2010L,
                2011L, 2010L, 2007L, 2009L, 2009L, 2008L, 2008L, 2010L, 2011L
            ), movie_sequel = structure(
                c(
                    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
                    1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
                    1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
                ), .Label = c("0", "1"), class = "factor"
            ), creative_type = structure(
                c(
                    1L,
                    4L, 1L, 4L, 5L, 1L, 1L, 6L, 2L, 1L, 6L, 1L, 1L, 1L, 1L, 1L, 1L,
                    1L, 8L, 1L, 7L, 1L, 1L, 3L, 1L, 1L, 2L, 4L, 4L, 1L, 1L, 4L, 5L,
                    5L, 1L, 4L, 1L, 1L, 1L, 1L
                ), .Label = c(
                    "Contemporary Fiction",
                    "Dramatization", "Factual", "Fantasy", "Historical Fiction",
                    "Kids Fiction", "Science Fiction", "Super Hero"
                ), class = "factor"
            ),
            source = structure(
                c(
                    6L, 2L, 6L, 7L, 2L, 6L, 6L, 6L, 4L,
                    6L, 2L, 7L, 6L, 6L, 6L, 3L, 6L, 6L, 1L, 2L, 6L, 5L, 6L, 5L,
                    5L, 6L, 4L, 2L, 2L, 6L, 6L, 2L, 7L, 4L, 6L, 5L, 6L, 2L, 6L,
                    6L
                ), .Label = c(
                    "Based on Comic/Graphic Novel", "Based on Fiction Book/Short Story",
                    "Based on Folk Tale/Legend/Fairytale", "Based on Real Life Events",
                    "Based on TV", "Original Screenplay", "Remake"
                ), class = "factor"
            ),
            production_method = structure(
                c(
                    3L, 3L, 3L, 3L, 3L, 3L, 3L,
                    2L, 3L, 3L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L,
                    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
                    3L, 3L, 3L
                ), .Label = c(
                    "Animation/Live Action", "Digital Animation",
                    "Live Action", "Stop-Motion Animation"
                ), class = "factor"
            ),
            genre = structure(
                c(
                    3L, 1L, 4L, 5L, 1L, 4L, 3L, 3L, 4L, 5L,
                    2L, 7L, 6L, 5L, 7L, 3L, 3L, 7L, 1L, 7L, 7L, 3L, 4L, 3L, 3L,
                    6L, 4L, 2L, 1L, 2L, 6L, 4L, 7L, 1L, 4L, 2L, 3L, 7L, 7L, 5L
                ), .Label = c(
                    "Action", "Adventure", "Comedy", "Drama", "Horror",
                    "Romantic Comedy", "Thriller/Suspense"
                ), class = "factor"
            ),
            language = structure(
                c(
                    2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
                    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
                    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
                    2L
                ), .Label = c("Danish", "English"), class = "factor"
            ),
            movie_board_rating_display_name = structure(
                c(
                    3L, 3L, 3L,
                    2L, 2L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 3L, 3L, 2L, 3L, 3L,
                    3L, 3L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 1L, 2L, 3L, 2L, 2L, 3L,
                    2L, 3L, 1L, 2L, 3L, 3L, 2L
                ), .Label = c("PG", "PG-13", "R"), class = "factor"
            ), movie_release_pattern_display_name = structure(
                c(
                    4L,
                    4L, 3L, 4L, 4L, 3L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 4L, 3L, 4L,
                    3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 1L, 4L,
                    4L, 4L, 2L, 3L, 4L, 4L, 4L, 3L, 4L
                ), .Label = c("Exclusive",
                              "Expands Wide", "Limited", "Wide"), class = "factor"
            ), Category1 = structure(
                c(
                    1L,
                    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
                    1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
                    2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
                ), .Label = c("0", "1"), class = "factor"
            )
        ), .Names = c(
            "production_year",
            "movie_sequel", "creative_type", "source", "production_method",
            "genre", "language", "movie_board_rating_display_name", "movie_release_pattern_display_name",
            "Category1"
        ), row.names = c(
            506L, 474L, 1011L, 569L, 737L, 1124L,
            602L, 717L, 747L, 977L, 284L, 620L, 100L, 301L, 514L, 865L, 828L,
            283L, 921L, 839L, 15L, 937L, 931L, 201L, 273L, 507L, 1180L, 689L,
            276L, 649L, 603L, 22L, 555L, 974L, 552L, 500L, 216L, 312L, 796L,
            682L
        ), class = "data.frame"
    )

    train=tr[1:25,] # training data
    hold=tr[26:40,] # test data

    for(i in 1:ncol(train)){
        if(is.factor(train[,i])){
            hold[,i] <- factor(hold[,i],levels=levels(train[,i]))
            
        }
    }

m.glm=glm(Category1 ~ ., data = train, family = 'binomial')
labels=hold$Category1
hold$Category1=NULL
p=predict(m.glm, hold)

所有关卡

structure(list(production_year = 2011L, movie_sequel = structure(1L, .Label = c("0", 
"1"), class = "factor"), creative_type = structure(5L, .Label = c("Contemporary Fiction", 
"Dramatization", "Factual", "Fantasy", "Historical Fiction", 
"Kids Fiction", "Multiple Creative Types", "Science Fiction", 
"Super Hero"), class = "factor"), source = structure(14L, .Label = c("Based on Comic/Graphic Novel", 
"Based on Factual Book/Article", "Based on Fiction Book/Short Story", 
"Based on Folk Tale/Legend/Fairytale", "Based on Game", "Based on Musical or Opera", 
"Based on Play", "Based on Real Life Events", "Based on Short Film", 
"Based on Theme Park Ride", "Based on Toy", "Based on TV", "Compilation", 
"Original Screenplay", "Remake", "Spin-Off"), class = "factor"), 
    production_method = structure(4L, .Label = c("Animation/Live Action", 
    "Digital Animation", "Hand Animation", "Live Action", "Multiple Production Methods", 
    "Stop-Motion Animation"), class = "factor"), genre = structure(13L, .Label = c("Action", 
    "Adventure", "Black Comedy", "Comedy", "Concert/Performance", 
    "Documentary", "Drama", "Horror", "Multiple Genres", "Musical", 
    "Romantic Comedy", "Thriller/Suspense", "Western"), class = "factor"), 
    language = structure(3L, .Label = c("Arabic", "Danish", "English", 
    "Farsi", "French", "German", "Hebrew", "Hindi", "Italian", 
    "Japanese", "Norwegian", "Polish", "Portuguese", "Silent", 
    "Spanish", "Swedish"), class = "factor"), movie_board_rating_display_name = structure(6L, .Label = c("G", 
    "NC-17", "Not Rated", "PG", "PG-13", "R"), class = "factor"), 
    movie_release_pattern_display_name = structure(7L, .Label = c("Exclusive", 
    "Expands Wide", "IMAX", "Limited", "Oscar Qualifying Run", 
    "Special Engagement", "Wide"), class = "factor"), Category1 = structure(1L, .Label = c("0", 
    "1"), class = "factor")), .Names = c("production_year", "movie_sequel", 
"creative_type", "source", "production_method", "genre", "language", 
"movie_board_rating_display_name", "movie_release_pattern_display_name", 
"Category1"), row.names = 304L, class = "data.frame")

Answer 1

按照我的看法，您必须排除具有未用于拟合模型的级别的行。

predict(m.glm, hold[!hold$movie_release_pattern_display_name %in% c("Exclusive", "Expands Wide"), ])

测试数据在执行 logit 时具有新级别，但在 C5 中进行预测时不会出错

Test data has new levels while doing a logit but doesn't gives an error while predicting in C5

r

predict

logistic-regression