R虚拟变量
R DUMMY VARIABLE
# Example input. NEW1 and NEW2 are the target columns described in the
# question: NEW1 flags the first row of each STUDENT/DATE pair, NEW2 is the
# number of days since the student's previous test (-99 on duplicate rows).
DATA <- data.frame(
  STUDENT = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2),
  SCORE = c(92, 64, 83, 78, 69, 52, 100, 69, 76, 100, 74, 75, 56),
  DATE = c(
    "1/21/2000", "1/21/2000", "5/29/2000", "7/10/2000", "7/22/2000",
    "7/22/2000", "8/10/2000", "2/20/2000", "8/29/2000", "9/15/2000",
    "3/7/2001", "7/7/2001", "8/18/2001"
  ),
  NEW1 = c(1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1),
  NEW2 = c(NA, -99, 129, 42, 12, -99, 19, NA, 191, 17, 173, 122, 42)
)
我有 'DATA',希望创建 'NEW1' 和 'NEW2'。对于 'STUDENT' 和 'DATE' 的唯一组合,'NEW1' 为 1;如果该组合是重复的,则为 0。'NEW2' 是在每个 'STUDENT' 内用相邻的日期相减,得到两次测试之间的天数;但 'NEW1' 为 0 的行要跳过,因为它们是重复的。
使用 dplyr:
# Flag the first row of every STUDENT/DATE pair (NEW1) and compute the number
# of days between consecutive tests of the same STUDENT (NEW2).
# NOTE(review): `df` is not defined in this snippet; presumably it holds the
# question's `DATA` data frame -- confirm before running.
df %>%
  mutate(DATE = as.Date(DATE, format = "%m/%d/%Y")) %>%
  arrange(DATE) %>%
  group_by(STUDENT, DATE) %>%
  # Within a STUDENT/DATE group the first row is the original, the rest are
  # duplicates.
  mutate(NEW1 = as.numeric(row_number() == 1)) %>%
  group_by(STUDENT) %>%
  # Duplicates get the sentinel -99. A student's first test has no
  # predecessor, so lag() yields NA there.
  mutate(NEW2 = if_else(NEW1 == 1, as.numeric(DATE - lag(DATE)), -99)) %>%
  # Drop the grouping so later verbs do not silently run per STUDENT.
  ungroup()
输出:
STUDENT SCORE DATE NEW1 NEW2
<dbl> <dbl> <date> <dbl> <dbl>
1 1 92 2000-01-21 1 NA
2 1 64 2000-01-21 0 -99
3 2 69 2000-02-20 1 NA
4 1 83 2000-05-29 1 129
5 1 78 2000-07-10 1 42
6 1 69 2000-07-22 1 12
7 1 52 2000-07-22 0 -99
8 1 100 2000-08-10 1 19
9 2 76 2000-08-29 1 191
10 2 100 2000-09-15 1 17
11 2 74 2001-03-07 1 173
12 2 75 2001-07-07 1 122
13 2 56 2001-08-18 1 42
这是一个相当冗长的 data.table 方法:
library(data.table)
# data.table version of the NEW1 / NEW2 computation. Uses only data.table
# functions: shift() instead of lag(), and plain `[` calls instead of a
# magrittr pipe, so nothing beyond library(data.table) is required.
# NOTE(review): `df` is not defined in this snippet; presumably it holds the
# question's `DATA` data frame -- confirm before running.
setDT(df) # converts df to a data.table by reference
df[, DATE := as.Date(DATE, format = "%m/%d/%Y")]
# Running index within each STUDENT/DATE pair; 1 marks the first occurrence.
df[order(STUDENT, DATE), id := seq_len(.N), by = .(STUDENT, DATE)]
df[, NEW1 := as.numeric(id == 1L)]
# Days since the student's previous row, evaluated in date order; duplicate
# rows get the sentinel -99. shift() yields NA for a student's first row.
df[
  order(STUDENT, DATE),
  NEW2 := fifelse(id == 1L, as.numeric(DATE - shift(DATE)), -99),
  by = STUDENT
]
df[, id := NULL] # drop the helper column
df[]
输出:
STUDENT SCORE DATE NEW1 NEW2
1: 1 92 2000-01-21 1 NA
2: 1 64 2000-01-21 0 -99
3: 1 83 2000-05-29 1 129
4: 1 78 2000-07-10 1 42
5: 1 69 2000-07-22 1 12
6: 1 52 2000-07-22 0 -99
7: 1 100 2000-08-10 1 19
8: 2 69 2000-02-20 1 NA
9: 2 76 2000-08-29 1 191
10: 2 100 2000-09-15 1 17
11: 2 74 2001-03-07 1 173
12: 2 75 2001-07-07 1 122
13: 2 56 2001-08-18 1 42
尝试使用 tibble(来自 library(tidyverse)):
# Same example input as a tibble; NEW1 and NEW2 are the target columns
# described in the question.
data <- tibble(
  STUDENT = rep(c(1, 2), times = c(7, 6)),
  SCORE = c(92, 64, 83, 78, 69, 52, 100, 69, 76, 100, 74, 75, 56),
  DATE = c(
    "1/21/2000", "1/21/2000", "5/29/2000", "7/10/2000", "7/22/2000",
    "7/22/2000", "8/10/2000", "2/20/2000", "8/29/2000", "9/15/2000",
    "3/7/2001", "7/7/2001", "8/18/2001"
  ),
  NEW1 = c(1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1),
  NEW2 = c(NA, -99, 129, 42, 12, -99, 19, NA, 191, 17, 173, 122, 42)
)
# Add a duplicate flag and the day gap to the previous row of the same
# student. Rows are compared with their immediate predecessor, so this relies
# on `data` already being ordered by student and date.
data %>%
  clean_names() %>%
  mutate(
    # 0 when the row repeats the previous row's student and date, 1 otherwise.
    # The first row has no predecessor (lag() is NA) and counts as
    # non-duplicate via `missing = 1`.
    dupe_indicator = if_else(
      student == lag(student) & date == lag(date), 0, 1,
      missing = 1
    ),
    # Numeric days since the previous row; a real NA (not the string "NA")
    # for duplicates and for each student's first row, so the column stays
    # numeric.
    time_diff = if_else(
      dupe_indicator == 1 & student == lag(student),
      as.numeric(mdy(date) - lag(mdy(date))),
      NA_real_
    )
  )
# Example input. NEW1 and NEW2 are the target columns described in the
# question: NEW1 flags the first row of each STUDENT/DATE pair, NEW2 is the
# number of days since the student's previous test (-99 on duplicate rows).
DATA <- data.frame(
  STUDENT = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2),
  SCORE = c(92, 64, 83, 78, 69, 52, 100, 69, 76, 100, 74, 75, 56),
  DATE = c(
    "1/21/2000", "1/21/2000", "5/29/2000", "7/10/2000", "7/22/2000",
    "7/22/2000", "8/10/2000", "2/20/2000", "8/29/2000", "9/15/2000",
    "3/7/2001", "7/7/2001", "8/18/2001"
  ),
  NEW1 = c(1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1),
  NEW2 = c(NA, -99, 129, 42, 12, -99, 19, NA, 191, 17, 173, 122, 42)
)
我有 'DATA',希望创建 'NEW1' 和 'NEW2'。对于 'STUDENT' 和 'DATE' 的唯一组合,'NEW1' 为 1;如果该组合是重复的,则为 0。'NEW2' 是在每个 'STUDENT' 内用相邻的日期相减,得到两次测试之间的天数;但 'NEW1' 为 0 的行要跳过,因为它们是重复的。
使用 dplyr:
# Flag the first row of every STUDENT/DATE pair (NEW1) and compute the number
# of days between consecutive tests of the same STUDENT (NEW2).
# NOTE(review): `df` is not defined in this snippet; presumably it holds the
# question's `DATA` data frame -- confirm before running.
df %>%
  mutate(DATE = as.Date(DATE, format = "%m/%d/%Y")) %>%
  arrange(DATE) %>%
  group_by(STUDENT, DATE) %>%
  # Within a STUDENT/DATE group the first row is the original, the rest are
  # duplicates.
  mutate(NEW1 = as.numeric(row_number() == 1)) %>%
  group_by(STUDENT) %>%
  # Duplicates get the sentinel -99. A student's first test has no
  # predecessor, so lag() yields NA there.
  mutate(NEW2 = if_else(NEW1 == 1, as.numeric(DATE - lag(DATE)), -99)) %>%
  # Drop the grouping so later verbs do not silently run per STUDENT.
  ungroup()
输出:
STUDENT SCORE DATE NEW1 NEW2
<dbl> <dbl> <date> <dbl> <dbl>
1 1 92 2000-01-21 1 NA
2 1 64 2000-01-21 0 -99
3 2 69 2000-02-20 1 NA
4 1 83 2000-05-29 1 129
5 1 78 2000-07-10 1 42
6 1 69 2000-07-22 1 12
7 1 52 2000-07-22 0 -99
8 1 100 2000-08-10 1 19
9 2 76 2000-08-29 1 191
10 2 100 2000-09-15 1 17
11 2 74 2001-03-07 1 173
12 2 75 2001-07-07 1 122
13 2 56 2001-08-18 1 42
这是一个相当冗长的 data.table 方法:
library(data.table)
# data.table version of the NEW1 / NEW2 computation. Uses only data.table
# functions: shift() instead of lag(), and plain `[` calls instead of a
# magrittr pipe, so nothing beyond library(data.table) is required.
# NOTE(review): `df` is not defined in this snippet; presumably it holds the
# question's `DATA` data frame -- confirm before running.
setDT(df) # converts df to a data.table by reference
df[, DATE := as.Date(DATE, format = "%m/%d/%Y")]
# Running index within each STUDENT/DATE pair; 1 marks the first occurrence.
df[order(STUDENT, DATE), id := seq_len(.N), by = .(STUDENT, DATE)]
df[, NEW1 := as.numeric(id == 1L)]
# Days since the student's previous row, evaluated in date order; duplicate
# rows get the sentinel -99. shift() yields NA for a student's first row.
df[
  order(STUDENT, DATE),
  NEW2 := fifelse(id == 1L, as.numeric(DATE - shift(DATE)), -99),
  by = STUDENT
]
df[, id := NULL] # drop the helper column
df[]
输出:
STUDENT SCORE DATE NEW1 NEW2
1: 1 92 2000-01-21 1 NA
2: 1 64 2000-01-21 0 -99
3: 1 83 2000-05-29 1 129
4: 1 78 2000-07-10 1 42
5: 1 69 2000-07-22 1 12
6: 1 52 2000-07-22 0 -99
7: 1 100 2000-08-10 1 19
8: 2 69 2000-02-20 1 NA
9: 2 76 2000-08-29 1 191
10: 2 100 2000-09-15 1 17
11: 2 74 2001-03-07 1 173
12: 2 75 2001-07-07 1 122
13: 2 56 2001-08-18 1 42
尝试使用 tibble(来自 library(tidyverse)):
# Same example input as a tibble; NEW1 and NEW2 are the target columns
# described in the question.
data <- tibble(
  STUDENT = rep(c(1, 2), times = c(7, 6)),
  SCORE = c(92, 64, 83, 78, 69, 52, 100, 69, 76, 100, 74, 75, 56),
  DATE = c(
    "1/21/2000", "1/21/2000", "5/29/2000", "7/10/2000", "7/22/2000",
    "7/22/2000", "8/10/2000", "2/20/2000", "8/29/2000", "9/15/2000",
    "3/7/2001", "7/7/2001", "8/18/2001"
  ),
  NEW1 = c(1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1),
  NEW2 = c(NA, -99, 129, 42, 12, -99, 19, NA, 191, 17, 173, 122, 42)
)
# Add a duplicate flag and the day gap to the previous row of the same
# student. Rows are compared with their immediate predecessor, so this relies
# on `data` already being ordered by student and date.
data %>%
  clean_names() %>%
  mutate(
    # 0 when the row repeats the previous row's student and date, 1 otherwise.
    # The first row has no predecessor (lag() is NA) and counts as
    # non-duplicate via `missing = 1`.
    dupe_indicator = if_else(
      student == lag(student) & date == lag(date), 0, 1,
      missing = 1
    ),
    # Numeric days since the previous row; a real NA (not the string "NA")
    # for duplicates and for each student's first row, so the column stays
    # numeric.
    time_diff = if_else(
      dupe_indicator == 1 & student == lag(student),
      as.numeric(mdy(date) - lag(mdy(date))),
      NA_real_
    )
  )