如何聚合具有不同日期的数据并考虑 R 中的其他列?
How to aggregate data with different dates while considering other columns in R?
我想汇总数据框 `TBCG2` 的行,即使这些行的 `DATA_INGRESSO_ORGAO` 不同(参见 `ID_SERVIDOR_PORTAL` 列中编号为 977、1089、1365、1666、2597、2779 和 3036 的行)。我想按照下面的代码保留最早的日期。但是,对于 ID 2789,不仅日期不同,`CARGO` 也不同;在这种情况下,我想在其中一行的 ID 后面添加一个 x,从而把这两行都保留下来。也就是说,我想保留一行 `ID_SERVIDOR_PORTAL = 2789`,以及另一行 `ID_SERVIDOR_PORTAL = 2789x`。这个数据框只是我数据库的一部分。我该如何进行?
# Open a connection to the sample CSV and read it into TBCG2.
url <- url("https://raw.githack.com/fsbmat/salarioDocente/master/Teste/TBCG2.csv")
TBCG2 <- read.csv2(file = url, header = TRUE, encoding = "ASCII")

# Convert the admission date from day/month/year text to Date.
TBCG2[["DATA_INGRESSO_ORGAO"]] <- as.Date(
  as.character(TBCG2[["DATA_INGRESSO_ORGAO"]]),
  format = "%d/%m/%Y"
)
>head(TBCG2)
ID_SERVIDOR_PORTAL NOME CPF CARGO DATA_INGRESSO_ORGAO BRU_Jan2013
1 3 MARGLIO ***.200.427-** ETTB 2014-09-12 NA
2 5 JACUIAR ***.614.234-** SM 2016-06-20 NA
3 12 ANDLEAL ***.609.150-** SM 2012-11-13 7627.02
4 69 GIZONCA ***.852.867-** SM 2016-07-04 NA
5 70 CARANNA ***.232.227-** SM 1997-03-10 12360.61
6 94 FERILVA ***.251.114-** ETTB 2008-12-29 3703.82
BRU_Fev2013 BRU_Mar2013
1 NA NA
2 NA NA
3 7627.02 8618.53
4 NA NA
5 12360.61 13896.89
6 3703.82 4282.41
# Collapse rows that share ID_SERVIDOR_PORTAL, NOME and CPF into a single row:
# keep the earliest DATA_INGRESSO_ORGAO and sum the three monthly BRU_* columns.
# CARGO is selected but is not part of GROUP BY, so each group returns only one
# CARGO value; rows with the same ID but a different CARGO are merged.
# NOTE(review): presumably SQLite takes CARGO from the row holding the minimum
# date -- confirm against the SQLite documentation on bare columns.
library(sqldf)
TBCG2 <- sqldf('select ID_SERVIDOR_PORTAL,NOME,CPF,CARGO,
min(DATA_INGRESSO_ORGAO) as DATA_INGRESSO_ORGAO,
sum(BRU_Jan2013 ) as BRU_Jan2013,
sum(BRU_Fev2013 ) as BRU_Fev2013,
sum(BRU_Mar2013 ) as BRU_Mar2013
from TBCG2
group by ID_SERVIDOR_PORTAL,NOME,CPF')
我似乎找到了一个解决方案;由于使用了循环,它可能不是最快的,但重要的是它有效。注意:下面的代码追加的是数字后缀 001(例如 2789 变为 2789001),而不是问题中所说的字母 x。代码如下:
# Load sqldf first so a missing package fails before any data is processed.
library(sqldf)

# Read the sample CSV and parse the admission date (day/month/year) as Date.
url <- url("https://raw.githack.com/fsbmat/salarioDocente/master/Teste/TBCG2.csv")
TBCG2 <- read.csv2(url, header = TRUE, encoding = "ASCII")
TBCG2$DATA_INGRESSO_ORGAO <- as.Date(
  as.character(TBCG2$DATA_INGRESSO_ORGAO),
  format = "%d/%m/%Y"
)

# Rows whose ID already appeared earlier in the table; an ID occurring k times
# is listed k - 1 times here, so it is visited k - 1 times by the loop below.
df <- TBCG2[duplicated(TBCG2$ID_SERVIDOR_PORTAL), ]
ID <- df$ID_SERVIDOR_PORTAL

# a[i] / b[i] record the first / last row that carried ID[i] when it was
# visited. Preallocated instead of grown inside the loop.
a <- rep(NA_integer_, length(ID))
b <- rep(NA_integer_, length(ID))

# seq_along() yields an empty sequence when there are no duplicated IDs,
# whereas 1:length(ID) would iterate over c(1, 0).
for (i in seq_along(ID)) {
  rows <- which(TBCG2$ID_SERVIDOR_PORTAL == ID[i])
  if (length(rows) == 0L) {
    next # e.g. ID[i] is NA; nothing to compare
  }
  a[i] <- min(rows)
  b[i] <- max(rows)

  # identical() on the character values gives a plain TRUE/FALSE even when a
  # CARGO is NA, so an NA can never be written into the ID column.
  same_cargo <- identical(
    as.character(TBCG2$CARGO[a[i]]),
    as.character(TBCG2$CARGO[b[i]])
  )

  if (!same_cargo) {
    # Different CARGO under the same ID: give the first row a distinct ID by
    # appending "001" so the aggregation below keeps both rows.
    # format(scientific = FALSE) prevents IDs such as 100000 from being
    # pasted as "1e+05" once the column has been coerced to double.
    # NOTE(review): the new ID could collide with a genuine ID (e.g. 2789001)
    # in the full database -- confirm before relying on it.
    id_txt <- format(
      TBCG2$ID_SERVIDOR_PORTAL[a[i]],
      scientific = FALSE,
      trim = TRUE
    )
    TBCG2$ID_SERVIDOR_PORTAL[a[i]] <- as.numeric(paste0(id_txt, "001"))
  }
}

# One row per (ID, NOME, CPF): earliest admission date, summed monthly values.
# CARGO is selected but not grouped, so one CARGO value is returned per group.
TBCG2 <- sqldf('select ID_SERVIDOR_PORTAL,NOME,CPF,CARGO,
min(DATA_INGRESSO_ORGAO) as DATA_INGRESSO_ORGAO,
sum(BRU_Jan2013 ) as BRU_Jan2013,
sum(BRU_Fev2013 ) as BRU_Fev2013,
sum(BRU_Mar2013 ) as BRU_Mar2013
from TBCG2
group by ID_SERVIDOR_PORTAL,NOME,CPF')
我想汇总数据框 `TBCG2` 的行,即使这些行的 `DATA_INGRESSO_ORGAO` 不同(参见 `ID_SERVIDOR_PORTAL` 列中编号为 977、1089、1365、1666、2597、2779 和 3036 的行)。我想按照下面的代码保留最早的日期。但是,对于 ID 2789,不仅日期不同,`CARGO` 也不同;在这种情况下,我想在其中一行的 ID 后面添加一个 x,从而把这两行都保留下来。也就是说,我想保留一行 `ID_SERVIDOR_PORTAL = 2789`,以及另一行 `ID_SERVIDOR_PORTAL = 2789x`。这个数据框只是我数据库的一部分。我该如何进行?
# Open a connection to the sample CSV and read it into TBCG2.
url <- url("https://raw.githack.com/fsbmat/salarioDocente/master/Teste/TBCG2.csv")
TBCG2 <- read.csv2(file = url, header = TRUE, encoding = "ASCII")

# Convert the admission date from day/month/year text to Date.
TBCG2[["DATA_INGRESSO_ORGAO"]] <- as.Date(
  as.character(TBCG2[["DATA_INGRESSO_ORGAO"]]),
  format = "%d/%m/%Y"
)
>head(TBCG2)
ID_SERVIDOR_PORTAL NOME CPF CARGO DATA_INGRESSO_ORGAO BRU_Jan2013
1 3 MARGLIO ***.200.427-** ETTB 2014-09-12 NA
2 5 JACUIAR ***.614.234-** SM 2016-06-20 NA
3 12 ANDLEAL ***.609.150-** SM 2012-11-13 7627.02
4 69 GIZONCA ***.852.867-** SM 2016-07-04 NA
5 70 CARANNA ***.232.227-** SM 1997-03-10 12360.61
6 94 FERILVA ***.251.114-** ETTB 2008-12-29 3703.82
BRU_Fev2013 BRU_Mar2013
1 NA NA
2 NA NA
3 7627.02 8618.53
4 NA NA
5 12360.61 13896.89
6 3703.82 4282.41
# Collapse rows that share ID_SERVIDOR_PORTAL, NOME and CPF into a single row:
# keep the earliest DATA_INGRESSO_ORGAO and sum the three monthly BRU_* columns.
# CARGO is selected but is not part of GROUP BY, so each group returns only one
# CARGO value; rows with the same ID but a different CARGO are merged.
# NOTE(review): presumably SQLite takes CARGO from the row holding the minimum
# date -- confirm against the SQLite documentation on bare columns.
library(sqldf)
TBCG2 <- sqldf('select ID_SERVIDOR_PORTAL,NOME,CPF,CARGO,
min(DATA_INGRESSO_ORGAO) as DATA_INGRESSO_ORGAO,
sum(BRU_Jan2013 ) as BRU_Jan2013,
sum(BRU_Fev2013 ) as BRU_Fev2013,
sum(BRU_Mar2013 ) as BRU_Mar2013
from TBCG2
group by ID_SERVIDOR_PORTAL,NOME,CPF')
我似乎找到了一个解决方案;由于使用了循环,它可能不是最快的,但重要的是它有效。注意:下面的代码追加的是数字后缀 001(例如 2789 变为 2789001),而不是问题中所说的字母 x。代码如下:
# Load sqldf first so a missing package fails before any data is processed.
library(sqldf)

# Read the sample CSV and parse the admission date (day/month/year) as Date.
url <- url("https://raw.githack.com/fsbmat/salarioDocente/master/Teste/TBCG2.csv")
TBCG2 <- read.csv2(url, header = TRUE, encoding = "ASCII")
TBCG2$DATA_INGRESSO_ORGAO <- as.Date(
  as.character(TBCG2$DATA_INGRESSO_ORGAO),
  format = "%d/%m/%Y"
)

# Rows whose ID already appeared earlier in the table; an ID occurring k times
# is listed k - 1 times here, so it is visited k - 1 times by the loop below.
df <- TBCG2[duplicated(TBCG2$ID_SERVIDOR_PORTAL), ]
ID <- df$ID_SERVIDOR_PORTAL

# a[i] / b[i] record the first / last row that carried ID[i] when it was
# visited. Preallocated instead of grown inside the loop.
a <- rep(NA_integer_, length(ID))
b <- rep(NA_integer_, length(ID))

# seq_along() yields an empty sequence when there are no duplicated IDs,
# whereas 1:length(ID) would iterate over c(1, 0).
for (i in seq_along(ID)) {
  rows <- which(TBCG2$ID_SERVIDOR_PORTAL == ID[i])
  if (length(rows) == 0L) {
    next # e.g. ID[i] is NA; nothing to compare
  }
  a[i] <- min(rows)
  b[i] <- max(rows)

  # identical() on the character values gives a plain TRUE/FALSE even when a
  # CARGO is NA, so an NA can never be written into the ID column.
  same_cargo <- identical(
    as.character(TBCG2$CARGO[a[i]]),
    as.character(TBCG2$CARGO[b[i]])
  )

  if (!same_cargo) {
    # Different CARGO under the same ID: give the first row a distinct ID by
    # appending "001" so the aggregation below keeps both rows.
    # format(scientific = FALSE) prevents IDs such as 100000 from being
    # pasted as "1e+05" once the column has been coerced to double.
    # NOTE(review): the new ID could collide with a genuine ID (e.g. 2789001)
    # in the full database -- confirm before relying on it.
    id_txt <- format(
      TBCG2$ID_SERVIDOR_PORTAL[a[i]],
      scientific = FALSE,
      trim = TRUE
    )
    TBCG2$ID_SERVIDOR_PORTAL[a[i]] <- as.numeric(paste0(id_txt, "001"))
  }
}

# One row per (ID, NOME, CPF): earliest admission date, summed monthly values.
# CARGO is selected but not grouped, so one CARGO value is returned per group.
TBCG2 <- sqldf('select ID_SERVIDOR_PORTAL,NOME,CPF,CARGO,
min(DATA_INGRESSO_ORGAO) as DATA_INGRESSO_ORGAO,
sum(BRU_Jan2013 ) as BRU_Jan2013,
sum(BRU_Fev2013 ) as BRU_Fev2013,
sum(BRU_Mar2013 ) as BRU_Mar2013
from TBCG2
group by ID_SERVIDOR_PORTAL,NOME,CPF')