运行 总计减法
Running Total with subtraction
我有一个数据集,其中包含加利福尼亚 public 所学校的关闭和开学日期。问题底部可用 here 或 dput()
。数据还列出了学校的类型和位置。我正在尝试创建一个 运行 总计列,该列还考虑了学校停课和学校类型。
这是我想出的解决方案,它基本上需要我根据条件使用 ifelse
:
对许多不同的 1 和 0 进行编码
# open charter schools
pubschls$open_chart <- ifelse(pubschls$Charter=="Y" & is.na(pubschls$ClosedDate)==TRUE, 1, 0)
# open public schools
pubschls$open_pub <- ifelse(pubschls$Charter=="N" & is.na(pubschls$ClosedDate)==TRUE, 1, 0)
# closed charters
pubschls$closed_chart <- ifelse(pubschls$Charter=="Y" & is.na(pubschls$ClosedDate)==FALSE, 1, 0)
# closed public schools
pubschls$closed_pub <- ifelse(pubschls$Charter=="N" & is.na(pubschls$ClosedDate)==FALSE, 1, 0)
lausd <- filter(pubschls, NCESDist=="0622710")
# count number open during each year
然后我将各列相减以获得总计。
la_schools_count <- aggregate(lausd[c('open_chart','closed_chart','open_pub','closed_pub')],
by=list(year(lausd$OpenDate)), sum)
# find net charters by subtracting closed from open
la_schools_count$net_chart <- la_schools_count$open_chart - la_schools_count$closed_chart
# find net public schools by subtracting closed from open
la_schools_count$net_pub <- la_schools_count$open_pub - la_schools_count$closed_pub
# add running totals
la_schools_count$cum_chart <- cumsum(la_schools_count$net_chart)
la_schools_count$cum_pub <- cumsum(la_schools_count$net_pub)
# total totals
la_schools_count$total <- la_schools_count$cum_chart + la_schools_count$cum_pub
我的输出如下所示:
la_schools_count <- select(la_schools_count, "year", "cum_chart", "cum_pub", "pen_rate", "total")
year cum_chart cum_pub pen_rate total
1 1952 1 0 100.00000 1
2 1956 1 1 50.00000 2
3 1969 1 2 33.33333 3
4 1980 55 469 10.49618 524
5 1989 55 470 10.47619 525
6 1990 55 470 10.47619 525
7 1991 55 473 10.41667 528
8 1992 55 476 10.35782 531
9 1993 55 477 10.33835 532
10 1994 56 478 10.48689 534
11 1995 57 478 10.65421 535
12 1996 57 479 10.63433 536
13 1997 58 481 10.76067 539
14 1998 59 480 10.94620 539
15 1999 61 480 11.27542 541
16 2000 61 481 11.25461 542
17 2001 62 482 11.39706 544
18 2002 64 484 11.67883 548
19 2003 73 485 13.08244 558
20 2004 83 496 14.33506 579
21 2005 90 524 14.65798 614
22 2006 96 532 15.28662 628
23 2007 90 534 14.42308 624
24 2008 97 539 15.25157 636
25 2009 108 546 16.51376 654
26 2010 124 566 17.97101 690
27 2011 140 580 19.44444 720
28 2012 144 605 19.22563 749
29 2013 162 609 21.01167 771
30 2014 179 611 22.65823 790
31 2015 195 611 24.19355 806
32 2016 203 614 24.84700 817
33 2017 211 619 25.42169 830
我只是想知道是否可以用更好的方式来完成。就像基于条件对所有行的 apply
语句?
dput:
structure(list(CDSCode = c("19647330100289", "19647330100297",
"19647330100669", "19647330100677", "19647330100743", "19647330100750"
), OpenDate = structure(c(12324, 12297, 12240, 12299, 12634,
12310), class = "Date"), ClosedDate = structure(c(NA, 15176,
NA, NA, NA, NA), class = "Date"), Charter = c("Y", "Y", "Y",
"Y", "Y", "Y")), .Names = c("CDSCode", "OpenDate", "ClosedDate",
"Charter"), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
除了 pen_rate
,我遵循了你的代码并了解了你在做什么。 pen_rate
似乎是用 cum_chart
除以 total
计算得出的。我下载了原始数据集并执行了以下操作。我称数据集为foo
。 Whenclosed_pub),我合并了 Charter
和 ClosedDate
。我检查了 ClosedDate 是否为 NA,并将逻辑输出转换为数字(1 = 打开,0 = 关闭)。这就是我创建四个组的方式(即 open_chart、closed_chart、open_pub 和 closed_pub)。我想这会要求你少打字。由于日期是字符,我使用 substr()
提取年份。如果你有一个日期对象,你需要做一些别的事情。一旦你有了年份,你就可以用它对数据进行分组,并使用 count()
计算每种类型的学校有多少所学校。这部分相当于您的 aggregate()
代码。然后,使用 spread()
将输出转换为宽格式数据,并按照您在代码中演示的那样进行其余计算。最终输出似乎与您在问题中的输出不同,但我的结果与我通过 运行 您的代码获得的结果相同。希望对您有所帮助。
library(dplyr)
library(tidyr)
library(readxl)
# Get the necessary data
foo <- read_xls("pubschls.xls") %>%
select(NCESDist, CDSCode, OpenDate, ClosedDate, Charter) %>%
filter(NCESDist == "0622710" & (!Charter %in% NA))
mutate(foo, group = paste(Charter, as.numeric(is.na(ClosedDate)), sep = "_"),
year = substr(OpenDate, star = nchar(OpenDate) - 3, stop = nchar(OpenDate))) %>%
count(year, group) %>%
spread(key = group, value = n, fill = 0) %>%
mutate(net_chart = Y_1 - Y_0,
net_pub = N_1 - N_0,
cum_chart = cumsum(net_chart),
cum_pub = cumsum(net_pub),
total = cum_chart + cum_pub,
pen_rate = cum_chart / total)
# A part of the outcome
# year N_0 N_1 Y_0 Y_1 net_chart net_pub cum_chart cum_pub total pen_rate
#1 1866 0 1 0 0 0 1 0 1 1 0.00000000
#2 1873 0 1 0 0 0 1 0 2 2 0.00000000
#3 1878 0 1 0 0 0 1 0 3 3 0.00000000
#4 1881 0 1 0 0 0 1 0 4 4 0.00000000
#5 1882 0 2 0 0 0 2 0 6 6 0.00000000
#110 2007 0 2 15 9 -6 2 87 393 480 0.18125000
#111 2008 2 8 9 15 6 6 93 399 492 0.18902439
#112 2009 1 9 4 15 11 8 104 407 511 0.20352250
#113 2010 5 26 5 21 16 21 120 428 548 0.21897810
#114 2011 2 16 2 18 16 14 136 442 578 0.23529412
#115 2012 2 27 3 7 4 25 140 467 607 0.23064250
#116 2013 1 5 1 19 18 4 158 471 629 0.25119237
#117 2014 1 3 1 18 17 2 175 473 648 0.27006173
#118 2015 0 0 2 18 16 0 191 473 664 0.28765060
#119 2016 0 3 0 8 8 3 199 476 675 0.29481481
#120 2017 0 5 0 9 9 5 208 481 689 0.30188679
我有一个数据集,其中包含加利福尼亚 public 所学校的关闭和开学日期。问题底部可用 here 或 dput()
。数据还列出了学校的类型和位置。我正在尝试创建一个 运行 总计列,该列还考虑了学校停课和学校类型。
这是我想出的解决方案,它基本上需要我根据条件使用 ifelse
:
# open charter schools
pubschls$open_chart <- ifelse(pubschls$Charter=="Y" & is.na(pubschls$ClosedDate)==TRUE, 1, 0)
# open public schools
pubschls$open_pub <- ifelse(pubschls$Charter=="N" & is.na(pubschls$ClosedDate)==TRUE, 1, 0)
# closed charters
pubschls$closed_chart <- ifelse(pubschls$Charter=="Y" & is.na(pubschls$ClosedDate)==FALSE, 1, 0)
# closed public schools
pubschls$closed_pub <- ifelse(pubschls$Charter=="N" & is.na(pubschls$ClosedDate)==FALSE, 1, 0)
lausd <- filter(pubschls, NCESDist=="0622710")
# count number open during each year
然后我将各列相减以获得总计。
la_schools_count <- aggregate(lausd[c('open_chart','closed_chart','open_pub','closed_pub')],
by=list(year(lausd$OpenDate)), sum)
# find net charters by subtracting closed from open
la_schools_count$net_chart <- la_schools_count$open_chart - la_schools_count$closed_chart
# find net public schools by subtracting closed from open
la_schools_count$net_pub <- la_schools_count$open_pub - la_schools_count$closed_pub
# add running totals
la_schools_count$cum_chart <- cumsum(la_schools_count$net_chart)
la_schools_count$cum_pub <- cumsum(la_schools_count$net_pub)
# total totals
la_schools_count$total <- la_schools_count$cum_chart + la_schools_count$cum_pub
我的输出如下所示:
la_schools_count <- select(la_schools_count, "year", "cum_chart", "cum_pub", "pen_rate", "total")
year cum_chart cum_pub pen_rate total
1 1952 1 0 100.00000 1
2 1956 1 1 50.00000 2
3 1969 1 2 33.33333 3
4 1980 55 469 10.49618 524
5 1989 55 470 10.47619 525
6 1990 55 470 10.47619 525
7 1991 55 473 10.41667 528
8 1992 55 476 10.35782 531
9 1993 55 477 10.33835 532
10 1994 56 478 10.48689 534
11 1995 57 478 10.65421 535
12 1996 57 479 10.63433 536
13 1997 58 481 10.76067 539
14 1998 59 480 10.94620 539
15 1999 61 480 11.27542 541
16 2000 61 481 11.25461 542
17 2001 62 482 11.39706 544
18 2002 64 484 11.67883 548
19 2003 73 485 13.08244 558
20 2004 83 496 14.33506 579
21 2005 90 524 14.65798 614
22 2006 96 532 15.28662 628
23 2007 90 534 14.42308 624
24 2008 97 539 15.25157 636
25 2009 108 546 16.51376 654
26 2010 124 566 17.97101 690
27 2011 140 580 19.44444 720
28 2012 144 605 19.22563 749
29 2013 162 609 21.01167 771
30 2014 179 611 22.65823 790
31 2015 195 611 24.19355 806
32 2016 203 614 24.84700 817
33 2017 211 619 25.42169 830
我只是想知道是否可以用更好的方式来完成。就像基于条件对所有行的 apply
语句?
dput:
structure(list(CDSCode = c("19647330100289", "19647330100297",
"19647330100669", "19647330100677", "19647330100743", "19647330100750"
), OpenDate = structure(c(12324, 12297, 12240, 12299, 12634,
12310), class = "Date"), ClosedDate = structure(c(NA, 15176,
NA, NA, NA, NA), class = "Date"), Charter = c("Y", "Y", "Y",
"Y", "Y", "Y")), .Names = c("CDSCode", "OpenDate", "ClosedDate",
"Charter"), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
除了 pen_rate
,我遵循了你的代码并了解了你在做什么。 pen_rate
似乎是用 cum_chart
除以 total
计算得出的。我下载了原始数据集并执行了以下操作。我称数据集为foo
。 Whenclosed_pub),我合并了 Charter
和 ClosedDate
。我检查了 ClosedDate 是否为 NA,并将逻辑输出转换为数字(1 = 打开,0 = 关闭)。这就是我创建四个组的方式(即 open_chart、closed_chart、open_pub 和 closed_pub)。我想这会要求你少打字。由于日期是字符,我使用 substr()
提取年份。如果你有一个日期对象,你需要做一些别的事情。一旦你有了年份,你就可以用它对数据进行分组,并使用 count()
计算每种类型的学校有多少所学校。这部分相当于您的 aggregate()
代码。然后,使用 spread()
将输出转换为宽格式数据,并按照您在代码中演示的那样进行其余计算。最终输出似乎与您在问题中的输出不同,但我的结果与我通过 运行 您的代码获得的结果相同。希望对您有所帮助。
library(dplyr)
library(tidyr)
library(readxl)
# Get the necessary data
foo <- read_xls("pubschls.xls") %>%
select(NCESDist, CDSCode, OpenDate, ClosedDate, Charter) %>%
filter(NCESDist == "0622710" & (!Charter %in% NA))
mutate(foo, group = paste(Charter, as.numeric(is.na(ClosedDate)), sep = "_"),
year = substr(OpenDate, star = nchar(OpenDate) - 3, stop = nchar(OpenDate))) %>%
count(year, group) %>%
spread(key = group, value = n, fill = 0) %>%
mutate(net_chart = Y_1 - Y_0,
net_pub = N_1 - N_0,
cum_chart = cumsum(net_chart),
cum_pub = cumsum(net_pub),
total = cum_chart + cum_pub,
pen_rate = cum_chart / total)
# A part of the outcome
# year N_0 N_1 Y_0 Y_1 net_chart net_pub cum_chart cum_pub total pen_rate
#1 1866 0 1 0 0 0 1 0 1 1 0.00000000
#2 1873 0 1 0 0 0 1 0 2 2 0.00000000
#3 1878 0 1 0 0 0 1 0 3 3 0.00000000
#4 1881 0 1 0 0 0 1 0 4 4 0.00000000
#5 1882 0 2 0 0 0 2 0 6 6 0.00000000
#110 2007 0 2 15 9 -6 2 87 393 480 0.18125000
#111 2008 2 8 9 15 6 6 93 399 492 0.18902439
#112 2009 1 9 4 15 11 8 104 407 511 0.20352250
#113 2010 5 26 5 21 16 21 120 428 548 0.21897810
#114 2011 2 16 2 18 16 14 136 442 578 0.23529412
#115 2012 2 27 3 7 4 25 140 467 607 0.23064250
#116 2013 1 5 1 19 18 4 158 471 629 0.25119237
#117 2014 1 3 1 18 17 2 175 473 648 0.27006173
#118 2015 0 0 2 18 16 0 191 473 664 0.28765060
#119 2016 0 3 0 8 8 3 199 476 675 0.29481481
#120 2017 0 5 0 9 9 5 208 481 689 0.30188679