计算数据帧的多个子集的斜率时的时间不匹配

time mismatch in calculating the slope of multiple subsets of a data frame

我正在尝试根据年份和国家对我的数据进行子集化,并计算每个数据的回归系数。

我的数据结构的一个子集是:

pdata <- structure(list(movie_odid = c(10100L, 10100L, 520100L, 520100L, 
650100L, 650100L, 10100L, 10100L, 520100L, 780100L, 780100L, 
950100L, 950100L, 540100L, 540100L, 780100L, 780100L, 880100L, 
880100L, 450100L, 450100L, 540100L, 540100L, 640100L, 640100L, 
800100L, 800100L, 420100L, 420100L, 450100L, 450100L, 490100L, 
490100L, 640100L, 640100L, 430100L, 430100L, 490100L, 490100L, 
590100L, 590100L, 1620100L, 1620100L, 390100L, 390100L, 8810100L, 
8810100L, 9480100L, 9480100L, 570100L, 570100L, 590100L, 590100L
), chart_date = structure(c(5L, 6L, 3L, 4L, 1L, 2L, 7L, 8L, 7L, 
11L, 12L, 9L, 10L, 17L, 18L, 13L, 14L, 15L, 16L, 23L, 24L, 19L, 
20L, 25L, 26L, 21L, 22L, 29L, 30L, 27L, 28L, 31L, 32L, 27L, 28L, 
37L, 38L, 33L, 34L, 35L, 36L, 39L, 40L, 41L, 42L, 47L, 48L, 45L, 
46L, 43L, 44L, 39L, 40L), .Label = c("1997-05-23", "1997-05-30", 
"1997-07-04", "1997-07-11", "1997-12-19", "1997-12-26", "1998-01-02", 
"1998-01-09", "1998-06-26", "1998-07-03", "1998-07-24", "1998-07-31", 
"1999-02-05", "1999-02-12", "1999-06-04", "1999-06-11", "1999-11-19", 
"1999-11-26", "2000-01-07", "2000-01-14", "2000-05-26", "2000-06-02", 
"2000-11-17", "2000-11-24", "2000-12-22", "2000-12-29", "2001-01-05", 
"2001-01-12", "2001-05-18", "2001-05-25", "2001-11-02", "2001-11-09", 
"2002-01-04", "2002-01-11", "2002-04-19", "2002-04-26", "2002-11-15", 
"2002-11-22", "2003-01-03", "2003-01-10", "2003-05-09", "2003-05-16", 
"2003-05-23", "2003-05-30", "2003-09-12", "2003-09-19", "2003-11-07", 
"2003-11-14"), class = "factor"), chart_year = c(1997L, 1997L, 
1997L, 1997L, 1997L, 1997L, 1998L, 1998L, 1998L, 1998L, 1998L, 
1998L, 1998L, 1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 2000L, 
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L), revenue = c(52969336L, 
71183357L, 76457208L, 43593212L, 81172327L, 45111185L, 45012810L, 
37568867L, 48261L, 49760360L, 36612617L, 1962627L, 1441774L, 
23093123L, 65927993L, 5107876L, 4771193L, 3000000L, 82216507L, 
84977355L, 59922105L, 8431650L, 7296370L, 78711571L, 40769776L, 
83347490L, 37133438L, 56498192L, 63650772L, 2968580L, 788895L, 
76599345L, 57025088L, 28499878L, 22837762L, 106131568L, 61909948L, 
4502006L, 3272808L, 822068L, 1078673L, 3843873L, 2101748L, 42508303L, 
121361422L, 10163670L, 11628760L, 29944555L, 14018616L, 100066590L, 
49010220L, 3536766L, 3321470L), theaters = c(2674L, 2711L, 3020L, 
3020L, 3281L, 3282L, 2727L, 2746L, 58L, 2453L, 2540L, 214L, 214L, 
3236L, 3236L, 1027L, 1140L, 0L, 3312L, 3127L, 3134L, 2752L, 2326L, 
2774L, 2929L, 3653L, 3653L, 3587L, 3623L, 2594L, 912L, 3237L, 
3269L, 2948L, 3048L, 3682L, 3682L, 1425L, 1313L, 108L, 141L, 
1808L, 1180L, 3603L, 3603L, 576L, 1177L, 3282L, 3289L, 3483L, 
3492L, 1194L, 1212L), running_time = c(194L, 194L, 98L, 98L, 
134L, 134L, 194L, 194L, 98L, 169L, 169L, 220L, 220L, 92L, 92L, 
169L, 169L, 95L, 95L, 105L, 105L, 92L, 92L, 143L, 143L, 126L, 
126L, 90L, 90L, 105L, 105L, 92L, 92L, 143L, 143L, 161L, 161L, 
92L, 92L, 95L, 95L, 133L, 133L, 138L, 138L, 135L, 135L, 0L, 0L, 
102L, 102L, 95L, 95L), ifUS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L)), class = "data.frame", row.names = c(NA, 
-53L))

我的代码是:

coefLM <- function(x) {
  coef(lm(log(revenue) ~ running_time, data = x))[2]
}

spl <- with(pdata, split(pdata, list(chart_year = chart_year, ifUS = ifUS)))
out <- unique(pdata[, c("chart_year", "ifUS")])
out <- transform(out, slope = sapply(spl, coefLM))

out

但是,结果中存在时间不匹配。例如,对于“1999.0.running_time”,chart_year 是“2012”。能否请您指导我可能的原因是什么?

在您对 out <- transform(...) 的调用中,chart_yearifUS 的顺序与 spl 列表不匹配。

spl <- with(pdata, split(pdata, list(chart_year = chart_year, ifUS = ifUS)))
out <- unique(pdata[, c("chart_year", "ifUS")])
out
#    chart_year ifUS
# 1        1997    1
# 7        1998    1
# 14       1999    1
# 20       2000    1
# 28       2001    1
# 36       2002    0
# 38       2002    1
# 42       2003    0
# 44       2003    1
nrow(out)
# [1] 9

但是当你尝试 transform 它时,

out <- transform(out, slope = sapply(spl, coefLM))
# Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) : 
#   0 (non-NA) cases

我怀疑你没有得到这个错误是因为你有一个不同的(更大的)spl然而,如果你看outnames(spl),即使长度相同,也似乎不在一条直线上。 (我将在一个框架中展示只是为了展示

nrow(out)
# [1] 9

length(spl)
# [1] 14

sapply(spl, nrow)
# 1997.0 1998.0 1999.0 2000.0 2001.0 2002.0 2003.0 1997.1 1998.1 1999.1 2000.1 2001.1 2002.1 2003.1 
#      0      0      0      0      0      2      6      6      7      6      8      8      4      6 

即使我们从 spl 中过滤掉 0 行帧(确实 给我们长度 9),splout 不一样:

names(Filter(c, sapply(spl, nrow)))
# [1] "2002.0" "2003.0" "1997.1" "1998.1" "1999.1" "2000.1" "2001.1" "2002.1" "2003.1"
do.call(paste, c(out, sep = "."))
# [1] "1997.1" "1998.1" "1999.1" "2000.1" "2001.1" "2002.0" "2002.1" "2003.0" "2003.1"

(不要关注我使用的代码,而是关注我展示的内容。)

(老实说,你应该很高兴你发现了这个问题。否则,你很可能已经识别出具有显着性的模型,正确的,但分配给了错误的组。损坏的结果。)

如何解决?这是一个建议,转向 dplyr 方法:

library(dplyr)
pdata %>%
  group_by(chart_year, ifUS) %>%
  summarize(coef = coefLM(cur_data())) %>%
  ungroup()
# `summarise()` regrouping output by 'chart_year' (override with `.groups` argument)
# # A tibble: 9 x 3
#   chart_year  ifUS      coef
#        <int> <int>     <dbl>
# 1       1997     1  0.000602
# 2       1998     1  0.0288  
# 3       1999     1 -0.0217  
# 4       2000     1  0.0299  
# 5       2001     1 -0.0125  
# 6       2002     0 NA       
# 7       2002     1 -0.468   
# 8       2003     0 -0.00962 
# 9       2003     1  0.0479  

使用这种方法的一个优点是所使用的数据和产生的系数始终与相同的 categories/factors.

相关联

快速 演练,虽然 dplyr 上更详细的教程会带来更多学习点:

  • %>% 只是一个“管道运算符”,它有助于直观地分解正在发生的事情。

    这两个在功能上是相同的:

    pdata %>% somefunc(.)
    somefunc(pdata)
    

    它的优势在于更长的管道:

    pdata %>% somefunc(.) %>% anotherfunc(., arg = 7) %>% finalfunc(., n = 1)
    finalfunc(anotherfunc(somefunc(pdata), arg = 7), n = 1)
    
  • group_by确保所有data-creating/changing操作一次只对一组数据进行操作。 (注意:“分组”的东西会一直粘在数据上,直到用 ungroup() 主动删除,并且一些计算根据数据大小而有所不同。我一知道就养成总是 ungroup 的习惯不需要它,这样我以后就不会不小心计算错了。)

  • mutate(此处未使用)添加或替换框架中的列,保持相同的行数; summarize 通常将一组减少到一行(尽管 dplyr 的较新版本应该能够汇总到多于 1 行......关键是它对形状进行了重大更改的数据)。对于 summarize,所有其他列都被删除(因为在汇总到 1 行结果时不能保留正常的 n 行列)。


不需要您学习的备选方案 dplyr:拆分单个名称可以更轻松地重新连接到其余数据。可以使用 paste 之类的,尽管我更喜欢 interaction 函数的传达意图(声明性)流程(...调用 paste)。

pdata$groupname <- interaction(pdata$chart_year, pdata$ifUS)
head(pdata)
#   movie_odid chart_date chart_year  revenue theaters running_time ifUS groupname
# 1      10100 1997-12-19       1997 52969336     2674          194    1    1997.1
# 2      10100 1997-12-26       1997 71183357     2711          194    1    1997.1
# 3     520100 1997-07-04       1997 76457208     3020           98    1    1997.1
# 4     520100 1997-07-11       1997 43593212     3020           98    1    1997.1
# 5     650100 1997-05-23       1997 81172327     3281          134    1    1997.1
# 6     650100 1997-05-30       1997 45111185     3282          134    1    1997.1

这并没有做太多,但我们将拆分这个单一字段(与您的 2 字段拆分的效果相同)和 运行 您的函数。

我将使用它的表亲函数 by 而不是 split,它做同样的事情并且 运行 在每一帧上都有一个函数。 if by 的输出实际上只是一个 list,即使它看起来非常不同。

out <- by(pdata, pdata$groupname, FUN = coefLM)
out
# pdata$groupname: 1997.0
# [1] NA
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
# pdata$groupname: 1998.0
# [1] NA
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
# pdata$groupname: 1999.0
# [1] NA
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
# pdata$groupname: 2000.0
# [1] NA
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
# pdata$groupname: 2001.0
# [1] NA
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
# pdata$groupname: 2002.0
# [1] NA
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
# pdata$groupname: 2003.0
# [1] -0.009621244
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
# pdata$groupname: 1997.1
# [1] 0.0006017306
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
# pdata$groupname: 1998.1
# [1] 0.02882223
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
# pdata$groupname: 1999.1
# [1] -0.02168886
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
# pdata$groupname: 2000.1
# [1] 0.02991097
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
# pdata$groupname: 2001.1
# [1] -0.01250767
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
# pdata$groupname: 2002.1
# [1] -0.4683953
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
# pdata$groupname: 2003.1
# [1] 0.04785696