基于变量对列进行操作
Operate on columns based on a variable
我有以下数据
# Example data (dput-style literal): a 52-row tibble with the columns year,
# newly_engaged, qualification, subject, grade, c, e and m.
# Each of the two years (2015, 2016) contributes 26 rows, and every row has
# subject "Physics".
# NOTE(review): m looks like c / e (e.g. 2032 / 17662 = 0.1150...) - confirm.
df <- structure(list(year = c(2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L,
2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L,
2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L,
2016L), newly_engaged = c(FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), qualification = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("A2", "AS"), class = "factor"), subject = structure(c(7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L), .Label = c("Biology", "Chemistry", "Mathematics",
"Mathematics (Further)", "Mathematics (Pure)", "Mathematics (Statistics)",
"Physics"), class = "factor"), grade = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 7L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 2L, 3L,
4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 2L, 3L, 4L, 5L, 6L,
7L), .Label = c("S", "A", "B", "C", "D", "E", "No.results"), class = "factor"),
c = c(2032L, 3871L, 3728L, 3130L, 2514L, 1796L, 591L, 7694L,
5486L, 4885L, 3790L, 2493L, 2734L, 1079L, 2142L, 2082L, 1703L,
1273L, 779L, 219L, 4096L, 2880L, 2366L, 1700L, 1139L, 1051L,
1807L, 3961L, 3921L, 3237L, 2521L, 1760L, 609L, 8160L, 6661L,
7035L, 5934L, 4811L, 6155L, 1009L, 2022L, 2127L, 1664L, 1224L,
779L, 192L, 4214L, 3350L, 3336L, 2701L, 2044L, 2280L), e = c(17662L,
17662L, 17662L, 17662L, 17662L, 17662L, 17662L, 27082L, 27082L,
27082L, 27082L, 27082L, 27082L, 9277L, 9277L, 9277L, 9277L,
9277L, 9277L, 9277L, 13232L, 13232L, 13232L, 13232L, 13232L,
13232L, 17816L, 17816L, 17816L, 17816L, 17816L, 17816L, 17816L,
38756L, 38756L, 38756L, 38756L, 38756L, 38756L, 9017L, 9017L,
9017L, 9017L, 9017L, 9017L, 9017L, 17925L, 17925L, 17925L,
17925L, 17925L, 17925L), m = c(0.115049258294644, 0.219171101800476,
0.211074623485449, 0.177216623258974, 0.142339485901936,
0.101687238138376, 0.0334616691201449, 0.2841001403146, 0.202569972675578,
0.180378110922384, 0.139945351155749, 0.0920537626467765,
0.100952662284912, 0.116309151665409, 0.230893607847364,
0.224425999784413, 0.183572275520103, 0.137221084402285,
0.0839711113506521, 0.0236067694297726, 0.309552599758162,
0.217654171704958, 0.178808948004837, 0.128476420798065,
0.0860792019347038, 0.0794286577992745, 0.101425684777728,
0.222328244274809, 0.220083071396498, 0.181690615177369,
0.14150202065559, 0.0987876066457117, 0.0341827570722946,
0.210548044173805, 0.171870162039426, 0.181520280730726,
0.153111776241098, 0.124135617710806, 0.158814119104139,
0.11189974492625, 0.224243096373517, 0.235887767550183, 0.184540312742597,
0.135743595430853, 0.0863923699678385, 0.0212931130087612,
0.235090655509066, 0.186889818688982, 0.186108786610879,
0.15068340306834, 0.114030683403068, 0.127196652719665)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -52L), .Names = c("year",
"newly_engaged", "qualification", "subject", "grade", "c", "e",
"m"))
我需要取2015年和2016年m
对应值的差值,来显示2015年到2016年分配的成绩比例的差异。我想我可以结合使用 reshape2::cast
和 dplyr::summarise
来计算差异,但我不确定首先应该如何使用 cast
。
如果我们将 plyr
库与 dplyr
一起加载,则会发生错误,因为两者中的函数名称相同,并且这些函数可能会被另一个包屏蔽
# Failing example (kept on purpose): mutate() is taken from plyr here, and
# row_number() ends up being called without its argument, which produces the
# error printed right after this snippet.
df %>%
group_by(year) %>%
plyr::mutate(n = row_number()) %>%
group_by(n) %>%
summarise(m = diff(m))
Error in rank(x, ties.method = "first", na.last = "keep") :
argument "x" is missing, with no default
在那种情况下,明确指定 dplyr::
# Pair rows by their position within each year: n is the row index inside a
# year, so regrouping by n puts the i-th row of every year in one group, and
# diff(m) then subtracts the earlier row's m from the later row's m.
# This relies on the rows being in the same order within every year.
df %>%
  dplyr::group_by(year) %>%
  dplyr::mutate(n = dplyr::row_number()) %>%
  dplyr::group_by(n) %>%
  dplyr::summarise(m = diff(m))
# A tibble: 26 × 2
# n m
# <int> <dbl>
#1 1 -0.0136235735
#2 2 0.0031571425
#3 3 0.0090084479
#4 4 0.0044739919
#5 5 -0.0008374652
#6 6 -0.0028996315
#7 7 0.0007210880
#8 8 -0.0735520961
#9 9 -0.0306998106
#10 10 0.0011421698
# ... with 16 more rows
使用 dplyr
和 tidyr
您可以轻松地重塑数据框,将 2015 年和 2016 年的 m 值并排列出,然后计算差值
library(dplyr)
library(tidyr)

# Put the 2015 and 2016 values of m side by side and take their difference.
# c and e are dropped first: they differ between the two years, so keeping
# them would leave each year's value on a separate row after reshaping.
# pivot_wider() is used in place of the superseded spread().
df2 <- df %>%
  select(-c(c, e)) %>%
  pivot_wider(names_from = year, values_from = m) %>%
  mutate(diff = `2016` - `2015`)
df2
# A tibble: 26 × 7
newly_engaged qualification subject grade `2015` `2016` diff
<lgl> <fctr> <fctr> <fctr> <dbl> <dbl> <dbl>
1 FALSE A2 Physics S 0.11504926 0.10142568 -0.0136235735
2 FALSE A2 Physics A 0.21917110 0.22232824 0.0031571425
3 FALSE A2 Physics B 0.21107462 0.22008307 0.0090084479
4 FALSE A2 Physics C 0.17721662 0.18169062 0.0044739919
5 FALSE A2 Physics D 0.14233949 0.14150202 -0.0008374652
6 FALSE A2 Physics E 0.10168724 0.09878761 -0.0028996315
7 FALSE A2 Physics No.results 0.03346167 0.03418276 0.0007210880
8 FALSE AS Physics A 0.28410014 0.21054804 -0.0735520961
9 FALSE AS Physics B 0.20256997 0.17187016 -0.0306998106
10 FALSE AS Physics C 0.18037811 0.18152028 0.0011421698
# ... with 16 more rows
我有以下数据
# Example data (dput-style literal): a 52-row tibble with the columns year,
# newly_engaged, qualification, subject, grade, c, e and m.
# Each of the two years (2015, 2016) contributes 26 rows, and every row has
# subject "Physics".
# NOTE(review): m looks like c / e (e.g. 2032 / 17662 = 0.1150...) - confirm.
df <- structure(list(year = c(2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L,
2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L,
2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L,
2016L), newly_engaged = c(FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), qualification = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("A2", "AS"), class = "factor"), subject = structure(c(7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L), .Label = c("Biology", "Chemistry", "Mathematics",
"Mathematics (Further)", "Mathematics (Pure)", "Mathematics (Statistics)",
"Physics"), class = "factor"), grade = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 7L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 2L, 3L,
4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 2L, 3L, 4L, 5L, 6L,
7L), .Label = c("S", "A", "B", "C", "D", "E", "No.results"), class = "factor"),
c = c(2032L, 3871L, 3728L, 3130L, 2514L, 1796L, 591L, 7694L,
5486L, 4885L, 3790L, 2493L, 2734L, 1079L, 2142L, 2082L, 1703L,
1273L, 779L, 219L, 4096L, 2880L, 2366L, 1700L, 1139L, 1051L,
1807L, 3961L, 3921L, 3237L, 2521L, 1760L, 609L, 8160L, 6661L,
7035L, 5934L, 4811L, 6155L, 1009L, 2022L, 2127L, 1664L, 1224L,
779L, 192L, 4214L, 3350L, 3336L, 2701L, 2044L, 2280L), e = c(17662L,
17662L, 17662L, 17662L, 17662L, 17662L, 17662L, 27082L, 27082L,
27082L, 27082L, 27082L, 27082L, 9277L, 9277L, 9277L, 9277L,
9277L, 9277L, 9277L, 13232L, 13232L, 13232L, 13232L, 13232L,
13232L, 17816L, 17816L, 17816L, 17816L, 17816L, 17816L, 17816L,
38756L, 38756L, 38756L, 38756L, 38756L, 38756L, 9017L, 9017L,
9017L, 9017L, 9017L, 9017L, 9017L, 17925L, 17925L, 17925L,
17925L, 17925L, 17925L), m = c(0.115049258294644, 0.219171101800476,
0.211074623485449, 0.177216623258974, 0.142339485901936,
0.101687238138376, 0.0334616691201449, 0.2841001403146, 0.202569972675578,
0.180378110922384, 0.139945351155749, 0.0920537626467765,
0.100952662284912, 0.116309151665409, 0.230893607847364,
0.224425999784413, 0.183572275520103, 0.137221084402285,
0.0839711113506521, 0.0236067694297726, 0.309552599758162,
0.217654171704958, 0.178808948004837, 0.128476420798065,
0.0860792019347038, 0.0794286577992745, 0.101425684777728,
0.222328244274809, 0.220083071396498, 0.181690615177369,
0.14150202065559, 0.0987876066457117, 0.0341827570722946,
0.210548044173805, 0.171870162039426, 0.181520280730726,
0.153111776241098, 0.124135617710806, 0.158814119104139,
0.11189974492625, 0.224243096373517, 0.235887767550183, 0.184540312742597,
0.135743595430853, 0.0863923699678385, 0.0212931130087612,
0.235090655509066, 0.186889818688982, 0.186108786610879,
0.15068340306834, 0.114030683403068, 0.127196652719665)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -52L), .Names = c("year",
"newly_engaged", "qualification", "subject", "grade", "c", "e",
"m"))
我需要取2015年和2016年m
对应值的差值,来显示2015年到2016年分配的成绩比例的差异。我想我可以结合使用 reshape2::cast
和 dplyr::summarise
来计算差异,但我不确定首先应该如何使用 cast
。
如果我们将 plyr
库与 dplyr
一起加载,则会发生错误,因为两者中的函数名称相同,并且这些函数可能会被另一个包屏蔽
# Failing example (kept on purpose): mutate() is taken from plyr here, and
# row_number() ends up being called without its argument, which produces the
# error printed right after this snippet.
df %>%
group_by(year) %>%
plyr::mutate(n = row_number()) %>%
group_by(n) %>%
summarise(m = diff(m))
Error in rank(x, ties.method = "first", na.last = "keep") :
argument "x" is missing, with no default
在那种情况下,明确指定 dplyr::
# Pair rows by their position within each year: n is the row index inside a
# year, so regrouping by n puts the i-th row of every year in one group, and
# diff(m) then subtracts the earlier row's m from the later row's m.
# This relies on the rows being in the same order within every year.
df %>%
  dplyr::group_by(year) %>%
  dplyr::mutate(n = dplyr::row_number()) %>%
  dplyr::group_by(n) %>%
  dplyr::summarise(m = diff(m))
# A tibble: 26 × 2
# n m
# <int> <dbl>
#1 1 -0.0136235735
#2 2 0.0031571425
#3 3 0.0090084479
#4 4 0.0044739919
#5 5 -0.0008374652
#6 6 -0.0028996315
#7 7 0.0007210880
#8 8 -0.0735520961
#9 9 -0.0306998106
#10 10 0.0011421698
# ... with 16 more rows
使用 dplyr
和 tidyr
您可以轻松地重塑数据框,将 2015 年和 2016 年的 m 值并排列出,然后计算差值
library(dplyr)
library(tidyr)

# Put the 2015 and 2016 values of m side by side and take their difference.
# c and e are dropped first: they differ between the two years, so keeping
# them would leave each year's value on a separate row after reshaping.
# pivot_wider() is used in place of the superseded spread().
df2 <- df %>%
  select(-c(c, e)) %>%
  pivot_wider(names_from = year, values_from = m) %>%
  mutate(diff = `2016` - `2015`)
df2
# A tibble: 26 × 7
newly_engaged qualification subject grade `2015` `2016` diff
<lgl> <fctr> <fctr> <fctr> <dbl> <dbl> <dbl>
1 FALSE A2 Physics S 0.11504926 0.10142568 -0.0136235735
2 FALSE A2 Physics A 0.21917110 0.22232824 0.0031571425
3 FALSE A2 Physics B 0.21107462 0.22008307 0.0090084479
4 FALSE A2 Physics C 0.17721662 0.18169062 0.0044739919
5 FALSE A2 Physics D 0.14233949 0.14150202 -0.0008374652
6 FALSE A2 Physics E 0.10168724 0.09878761 -0.0028996315
7 FALSE A2 Physics No.results 0.03346167 0.03418276 0.0007210880
8 FALSE AS Physics A 0.28410014 0.21054804 -0.0735520961
9 FALSE AS Physics B 0.20256997 0.17187016 -0.0306998106
10 FALSE AS Physics C 0.18037811 0.18152028 0.0011421698
# ... with 16 more rows