带减法的累计总计

Running Total with subtraction

我有一个数据集,其中包含加利福尼亚州公立学校的关闭日期和开办日期。dput() 的输出见问题底部。数据还列出了学校的类型和位置。我正在尝试创建一个累计总计(running total)列,该列还要考虑学校的关闭情况和学校类型。

这是我想出的解决方案,它基本上需要我根据条件使用 ifelse:

对许多不同的 1 和 0 进行编码
# Tag every school with four 0/1 indicator columns, one per combination of
# charter status (Charter "Y"/"N") and open/closed state. A school is treated
# as still open when its ClosedDate is NA.
#
# `%in%` is used instead of `==` so that a missing Charter value yields 0 in
# all four columns rather than NA; an NA flag would turn the yearly sums and
# every later running total into NA.

# open charter schools
pubschls$open_chart <- as.numeric(
  pubschls$Charter %in% "Y" & is.na(pubschls$ClosedDate)
)
# open public schools
pubschls$open_pub <- as.numeric(
  pubschls$Charter %in% "N" & is.na(pubschls$ClosedDate)
)
# closed charters
pubschls$closed_chart <- as.numeric(
  pubschls$Charter %in% "Y" & !is.na(pubschls$ClosedDate)
)
# closed public schools
pubschls$closed_pub <- as.numeric(
  pubschls$Charter %in% "N" & !is.na(pubschls$ClosedDate)
)

# Keep only the schools whose NCES district id is "0622710".
lausd <- filter(pubschls, NCESDist == "0622710")
# count number open during each year

然后我将各列相减以获得总计。

# Sum the four indicator columns per opening year. The grouping element is
# named `year` so that aggregate() labels the key column `year` instead of its
# default `Group.1`; the later select() asks for a column called "year".
la_schools_count <- aggregate(
  lausd[c("open_chart", "closed_chart", "open_pub", "closed_pub")],
  by = list(year = year(lausd$OpenDate)),
  FUN = sum
)

# find net charters by subtracting closed from open
la_schools_count$net_chart <-
  la_schools_count$open_chart - la_schools_count$closed_chart
# find net public schools by subtracting closed from open
la_schools_count$net_pub <-
  la_schools_count$open_pub - la_schools_count$closed_pub
# add running totals
la_schools_count$cum_chart <- cumsum(la_schools_count$net_chart)
la_schools_count$cum_pub <- cumsum(la_schools_count$net_pub)
# total totals
la_schools_count$total <- la_schools_count$cum_chart + la_schools_count$cum_pub
# Charter share of the running total, in percent. This column is selected
# further down but was never computed; the formula matches the printed output
# (e.g. 55 / 524 -> 10.49618).
la_schools_count$pen_rate <-
  100 * la_schools_count$cum_chart / la_schools_count$total

我的输出如下所示:

# Reduce the table to the key column, the running totals and the charter share.
la_schools_count <- la_schools_count %>%
  select("year", "cum_chart", "cum_pub", "pen_rate", "total")
     year cum_chart cum_pub  pen_rate total
1  1952         1       0 100.00000     1
2  1956         1       1  50.00000     2
3  1969         1       2  33.33333     3
4  1980        55     469  10.49618   524
5  1989        55     470  10.47619   525
6  1990        55     470  10.47619   525
7  1991        55     473  10.41667   528
8  1992        55     476  10.35782   531
9  1993        55     477  10.33835   532
10 1994        56     478  10.48689   534
11 1995        57     478  10.65421   535
12 1996        57     479  10.63433   536
13 1997        58     481  10.76067   539
14 1998        59     480  10.94620   539
15 1999        61     480  11.27542   541
16 2000        61     481  11.25461   542
17 2001        62     482  11.39706   544
18 2002        64     484  11.67883   548
19 2003        73     485  13.08244   558
20 2004        83     496  14.33506   579
21 2005        90     524  14.65798   614
22 2006        96     532  15.28662   628
23 2007        90     534  14.42308   624
24 2008        97     539  15.25157   636
25 2009       108     546  16.51376   654
26 2010       124     566  17.97101   690
27 2011       140     580  19.44444   720
28 2012       144     605  19.22563   749
29 2013       162     609  21.01167   771
30 2014       179     611  22.65823   790
31 2015       195     611  24.19355   806
32 2016       203     614  24.84700   817
33 2017       211     619  25.42169   830

我只是想知道是否有更好的方式来完成这件事。比如,按条件对所有行使用类似 apply 的语句?

dput:
# dput() dump of a six-row sample tibble with four columns: CDSCode
# (character id), OpenDate and ClosedDate (numeric values carrying class
# "Date"; ClosedDate is NA for all but the second row) and Charter ("Y" in
# every sampled row). The expression is not assigned to a name here.
structure(list(CDSCode = c("19647330100289", "19647330100297", 
"19647330100669", "19647330100677", "19647330100743", "19647330100750"
), OpenDate = structure(c(12324, 12297, 12240, 12299, 12634, 
12310), class = "Date"), ClosedDate = structure(c(NA, 15176, 
NA, NA, NA, NA), class = "Date"), Charter = c("Y", "Y", "Y", 
"Y", "Y", "Y")), .Names = c("CDSCode", "OpenDate", "ClosedDate", 
"Charter"), row.names = c(NA, -6L), class = c("tbl_df", "tbl", 
"data.frame"))

除了 pen_rate,我按照你的代码理解了你在做什么。pen_rate 似乎是用 cum_chart 除以 total 计算得出的。我下载了原始数据集并执行了以下操作。我把数据集称为 foo。你创建了四个组(即 open_chart、closed_chart、open_pub 和 closed_pub),而我则把 Charter 和 ClosedDate 合并成了一列。我检查了 ClosedDate 是否为 NA,并将逻辑输出转换为数字(1 = 开放,0 = 关闭)。这就是我创建四个组的方式(即 open_chart、closed_chart、open_pub 和 closed_pub)。我想这样可以让你少打一些字。由于日期是字符型,我使用 substr() 提取年份。如果你的是日期对象,则需要用别的方法。有了年份之后,就可以用它对数据进行分组,并使用 count() 计算每种类型的学校各有多少所。这部分相当于你的 aggregate() 代码。然后,使用 spread() 将输出转换为宽格式数据,并按照你在代码中演示的那样进行其余计算。最终输出似乎与你在问题中的输出不同,但我的结果与我运行你的代码得到的结果相同。希望对你有所帮助。

library(dplyr)
library(tidyr)
library(readxl)

# Read the spreadsheet, keep the five columns the analysis uses, then narrow
# to district "0622710" and drop rows whose Charter value is missing.
foo <- "pubschls.xls" %>%
  read_xls() %>%
  select(NCESDist, CDSCode, OpenDate, ClosedDate, Charter) %>%
  filter(NCESDist == "0622710", !is.na(Charter))


# Build the per-year running totals in a single pipeline; the result is
# printed, not assigned.
#
# `group` labels each school as "<Charter>_<flag>", where the flag is 1 when
# ClosedDate is NA (no closing date) and 0 otherwise, e.g. "Y_1" or "N_0".
# `year` is the last four characters of OpenDate, so OpenDate is handled as
# text here. `start` is spelled out in full rather than relying on partial
# argument matching (`star`).
foo %>%
  mutate(
    group = paste(Charter, as.numeric(is.na(ClosedDate)), sep = "_"),
    year = substr(OpenDate, start = nchar(OpenDate) - 3, stop = nchar(OpenDate))
  ) %>%
  # Number of schools per opening year and label.
  count(year, group) %>%
  # One column per label; year/label combinations with no schools become 0.
  # NOTE(review): the mutate() below needs all four of Y_1, Y_0, N_1 and N_0
  # to occur in the data; confirm for other districts.
  spread(key = group, value = n, fill = 0) %>%
  mutate(
    net_chart = Y_1 - Y_0,
    net_pub = N_1 - N_0,
    cum_chart = cumsum(net_chart),
    cum_pub = cumsum(net_pub),
    total = cum_chart + cum_pub,
    # Charter share of the running total, as a fraction (not a percentage).
    pen_rate = cum_chart / total
  )

# A part of the outcome
#    year N_0 N_1 Y_0 Y_1 net_chart net_pub cum_chart cum_pub total   pen_rate
#1   1866   0   1   0   0         0       1         0       1     1 0.00000000
#2   1873   0   1   0   0         0       1         0       2     2 0.00000000
#3   1878   0   1   0   0         0       1         0       3     3 0.00000000
#4   1881   0   1   0   0         0       1         0       4     4 0.00000000
#5   1882   0   2   0   0         0       2         0       6     6 0.00000000
#110 2007   0   2  15   9        -6       2        87     393   480 0.18125000
#111 2008   2   8   9  15         6       6        93     399   492 0.18902439
#112 2009   1   9   4  15        11       8       104     407   511 0.20352250
#113 2010   5  26   5  21        16      21       120     428   548 0.21897810
#114 2011   2  16   2  18        16      14       136     442   578 0.23529412
#115 2012   2  27   3   7         4      25       140     467   607 0.23064250
#116 2013   1   5   1  19        18       4       158     471   629 0.25119237
#117 2014   1   3   1  18        17       2       175     473   648 0.27006173
#118 2015   0   0   2  18        16       0       191     473   664 0.28765060
#119 2016   0   3   0   8         8       3       199     476   675 0.29481481
#120 2017   0   5   0   9         9       5       208     481   689 0.30188679