R:有没有比对 2 个数据框做数据整理(data wrangling)更简单的方法
R: easier method than data wrangling 2 dataframes
我正在尝试根据任务数据框中的日期和逻辑(按 Order 编号匹配),在主数据框中创建 2 列。下面的 2 个解决方案都是先把任务数据框 spread(转成宽格式)并连接到主数据框来实现的。虽然下面的例子运行得相当快,但在我的实际代码中,这种做法非常占用内存,而且计算时间很长(主数据框有 800,000 行;任务数据框有 20 个左右的任务,大约有 150 万行)。
是否有更好的解决方案来获取 G1 和 G2 列(示例 1)以及 Gate1_End 和 Gate2_End 列(示例 2),而无需将任务数据框连接到主数据框?
示例 1
# Example 1 main table: one row per order with its location and animal.
# Built inside local() so the helper vectors do not leak into the workspace.
maindf <- local({
  order_id <- c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482)
  location <- c(
    "US", "US", "Canada", "US", "France",
    "US", "Mexico", "Mexico", "UK", "UK"
  )
  animal <- c(
    "Cow", "Dog", "Cat", "Dog", "Cat",
    "Cow", "Horse", "Dog", "Dog", "Cat"
  )
  data.frame(
    Order = order_id,
    Location = location,
    Animal = animal,
    stringsAsFactors = FALSE
  )
})
# Example 1 start tasks in long form: one row per (order, start task).
# Dates are kept as "m/d/yyyy" text, exactly as in the raw data.
Tasksdf_Start <- local({
  order_id <- c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482)
  tasks_per_order <- c(4, 2, 1, 1, 2, 4, 2, 1, 1, 2)
  task_code <- c(
    "RTQT", "RTQR", "QUOT", "QUOG", # 145
    "RTQT", "RTQR",                 # 158
    "QUOT",                         # 165
    "QUOG",                         # 148
    "RTQT", "QUOT",                 # 568
    "RTQT", "RTQR", "QUOT", "QUOG", # 465
    "RTQT", "RTQR",                 # 248
    "QUOT",                         # 693
    "QUOG",                         # 357
    "RTQT", "QUOT"                  # 482
  )
  # Every date falls in January 2020, so only the day of month varies.
  january_day <- c(
    1, 2, 6, 20,   # 145
    6, 9,          # 158
    14,            # 165
    17,            # 148
    19, 8,         # 568
    15, 3, 6, 19,  # 465
    22, 25,        # 248
    1,             # 693
    6,             # 357
    24, 14         # 482
  )
  data.frame(
    Tasks_Start = paste0(task_code, "_Start"),
    Dates = paste0("1/", january_day, "/2020"),
    Order = rep(order_id, times = tasks_per_order),
    stringsAsFactors = FALSE
  )
})
# Example 1 end tasks in long form: one row per (order, end task).
# Dates are kept as "m/d/yyyy" text, exactly as in the raw data.
Tasksdf_End <- local({
  order_id <- c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482)
  tasks_per_order <- c(3, 1, 1, 1, 2, 1, 1, 1, 1, 1)
  task_code <- c(
    "CRDT", "CUST", "VEND", # 145
    "CUST",                 # 158
    "CRDT",                 # 165
    "CUST",                 # 148
    "CRDT", "CUST",         # 568
    "VEND",                 # 465
    "CRDT",                 # 248
    "VEND",                 # 693
    "VEND",                 # 357
    "CRDT"                  # 482
  )
  # Every date falls in January 2020, so only the day of month varies.
  january_day <- c(
    22, 18, 5, # 145
    15,        # 158
    16,        # 165
    18,        # 148
    7, 24,     # 568
    1,         # 465
    18,        # 248
    8,         # 693
    4,         # 357
    6          # 482
  )
  data.frame(
    Tasks_End = paste0(task_code, "_End"),
    Dates = paste0("1/", january_day, "/2020"),
    Order = rep(order_id, times = tasks_per_order),
    stringsAsFactors = FALSE
  )
})
library(dplyr)
library(tidyr)

# Wide approach for example 1: attach every task date to its order, widen to
# one column per task, then take the earliest start (G1) and latest end (G2).
#
# Dates arrive as "m/d/yyyy" text. Comparing that text directly is
# lexicographic ("1/15/2020" sorts before "1/3/2020"), so the strings are
# parsed to Date before widening. G1 and G2 are therefore Date columns.
# pivot_wider() is the current replacement for the superseded spread().
dfS <- maindf %>%
  left_join(Tasksdf_Start, by = "Order") %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  pivot_wider(names_from = Tasks_Start, values_from = Dates)

# Keep only the join key plus the task columns so the final join does not
# duplicate maindf's descriptive columns (selected by name, not by position).
dfE <- maindf %>%
  left_join(Tasksdf_End, by = "Order") %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  pivot_wider(names_from = Tasks_End, values_from = Dates) %>%
  select(-all_of(setdiff(names(maindf), "Order")))

df <- left_join(dfS, dfE, by = "Order")

# Row-wise earliest start / latest end, ignoring tasks an order does not have.
df$G1 <- pmin(
  df$QUOG_Start, df$QUOT_Start, df$RTQR_Start, df$RTQT_Start,
  na.rm = TRUE
)
df$G2 <- pmax(df$CRDT_End, df$CUST_End, df$VEND_End, na.rm = TRUE)
示例 2(与示例 1 很相似,但逻辑更复杂)
# Example 2 main table: one row per order with its current gate and animal.
# Built inside local() so the helper vectors do not leak into the workspace.
maindf <- local({
  order_id <- c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482)
  gate <- c(
    "Gate1", "Gate2", "Gate2", "Gate3", "Gate2",
    "Gate4", "Gate1", "Gate1", "Gate2", "Gate3"
  )
  animal <- c(
    "Cow", "Dog", "Cat", "Dog", "Cat",
    "Cow", "Horse", "Dog", "Dog", "Cat"
  )
  data.frame(
    Order = order_id,
    Gates = gate,
    Animal = animal,
    stringsAsFactors = FALSE
  )
})
# Example 2 start tasks in long form: one row per (order, start task).
# Besides the RTQT/RTQR/QUOT/QUOG tasks this table also carries the
# CRDT/CUST/VEND start tasks. Dates are kept as "m/d/yyyy" text.
Tasksdf_Start <- local({
  order_id <- c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482)
  tasks_per_order <- c(7, 4, 2, 3, 4, 5, 3, 3, 2, 3)
  task_code <- c(
    "RTQT", "RTQR", "QUOT", "QUOG", "CRDT", "CUST", "VEND", # 145
    "RTQT", "RTQR", "CUST", "CRDT",                         # 158
    "QUOT", "CRDT",                                         # 165
    "QUOG", "CUST", "CRDT",                                 # 148
    "RTQT", "QUOT", "CRDT", "CUST",                         # 568
    "RTQT", "RTQR", "QUOT", "QUOG", "VEND",                 # 465
    "RTQT", "RTQR", "CRDT",                                 # 248
    "QUOT", "VEND", "CRDT",                                 # 693
    "QUOG", "VEND",                                         # 357
    "RTQT", "QUOT", "CRDT"                                  # 482
  )
  # Every date falls in January 2020, so only the day of month varies.
  january_day <- c(
    1, 2, 6, 20, 8, 19, 8, # 145
    6, 9, 1, 9,            # 158
    14, 1,                 # 165
    17, 1, 1,              # 148
    19, 8, 1, 1,           # 568
    15, 3, 6, 19, 1,       # 465
    22, 25, 1,             # 248
    1, 1, 1,               # 693
    6, 1,                  # 357
    24, 14, 1              # 482
  )
  data.frame(
    Tasks_Start = paste0(task_code, "_Start"),
    Dates = paste0("1/", january_day, "/2020"),
    Order = rep(order_id, times = tasks_per_order),
    stringsAsFactors = FALSE
  )
})
# Example 2 end tasks in long form: one row per (order, end task).
# Dates are kept as "m/d/yyyy" text, exactly as in the raw data.
Tasksdf_End <- local({
  order_id <- c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482)
  tasks_per_order <- c(3, 1, 1, 1, 2, 1, 1, 1, 1, 1)
  task_code <- c(
    "CRDT", "CUST", "VEND", # 145
    "CUST",                 # 158
    "CRDT",                 # 165
    "CUST",                 # 148
    "CRDT", "CUST",         # 568
    "VEND",                 # 465
    "CRDT",                 # 248
    "VEND",                 # 693
    "VEND",                 # 357
    "CRDT"                  # 482
  )
  # Every date falls in January 2020, so only the day of month varies.
  january_day <- c(
    22, 18, 5, # 145
    15,        # 158
    16,        # 165
    18,        # 148
    7, 24,     # 568
    1,         # 465
    18,        # 248
    8,         # 693
    4,         # 357
    6          # 482
  )
  data.frame(
    Tasks_End = paste0(task_code, "_End"),
    Dates = paste0("1/", january_day, "/2020"),
    Order = rep(order_id, times = tasks_per_order),
    stringsAsFactors = FALSE
  )
})
library(dplyr)
library(tidyr)

# Wide approach for example 2: widen the start and end tasks to one column
# per task, then derive Gate1_End and Gate2_End from the order's gate.
#
# Dates arrive as "m/d/yyyy" text. Comparing that text directly is
# lexicographic ("1/15/2020" sorts before "1/3/2020"), so the strings are
# parsed to Date before widening. Gate1_End and Gate2_End are Date columns.
# pivot_wider() is the current replacement for the superseded spread().
dfS <- maindf %>%
  left_join(Tasksdf_Start, by = "Order") %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  pivot_wider(names_from = Tasks_Start, values_from = Dates)

# Keep only the join key plus the task columns so the final join does not
# duplicate maindf's descriptive columns (selected by name, not by position).
dfE <- maindf %>%
  left_join(Tasksdf_End, by = "Order") %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  pivot_wider(names_from = Tasks_End, values_from = Dates) %>%
  select(-all_of(setdiff(names(maindf), "Order")))

df <- left_join(dfS, dfE, by = "Order")

# Gate 1 has no end date while the order is still at Gate0/Gate1; otherwise
# it is the earliest of the four gate-1 start tasks.
df$Gate1_End <- if_else(
  df$Gates == "Gate1" | df$Gates == "Gate0",
  as.Date(NA),
  pmin(
    df$QUOG_Start, df$QUOT_Start, df$RTQR_Start, df$RTQT_Start,
    na.rm = TRUE
  )
)

# Count how many gate-2 tasks were opened and how many were closed per order.
df <- df %>%
  mutate(
    Gate2Open = rowSums(
      !is.na(select(., one_of(c("CRDT_Start", "CUST_Start", "VEND_Start"))))
    ),
    Gate2Close = rowSums(
      !is.na(select(., one_of(c("CRDT_End", "CUST_End", "VEND_End"))))
    )
  )

# Gate 2 only has an end date once the order is past Gate2 and every opened
# gate-2 task has been closed; it is then the latest of those end dates.
df$Gate2_End <- if_else(
  df$Gates == "Gate2" | df$Gates == "Gate1" | df$Gates == "Gate0",
  as.Date(NA),
  if_else(
    df$Gate2Close == df$Gate2Open,
    pmax(df$CRDT_End, df$CUST_End, df$VEND_End, na.rm = TRUE),
    as.Date(NA)
  )
)
使用 mutate()
无需创建宽格式数据框,也可以添加汇总列:
示例 1:
library(dplyr)

# Long-format solution for example 1: summarise the task tables per order and
# join the two small summaries onto maindf, without widening anything.
#
# Dates arrive as "m/d/yyyy" text; min()/max() on that text would compare
# lexicographically ("1/15/2020" < "1/3/2020"), so parse to Date first.
# G1 and G2 are therefore Date columns.

# Earliest start date per order.
start <- Tasksdf_Start %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  group_by(Order) %>%
  summarise(G1 = min(Dates, na.rm = TRUE))

# Latest end date per order.
end <- Tasksdf_End %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  group_by(Order) %>%
  summarise(G2 = max(Dates, na.rm = TRUE))

# Joins take exactly two tables, so chain one join per summary. (Passing both
# to a single full_join() binds the second one to the `copy` argument and G2
# is never added.) left_join() keeps exactly the rows of maindf; orders
# without tasks get NA.
maindf <- maindf %>%
  left_join(start, by = "Order") %>%
  left_join(end, by = "Order")
示例 2:
library(dplyr)

# Long-format solution for example 2: build three small per-order summaries
# from the task tables, join them onto maindf and apply the gate rules.
#
# Dates arrive as "m/d/yyyy" text; min()/max() on that text would compare
# lexicographically ("1/15/2020" < "1/3/2020"), so parse to Date first.
# Gate1_End and Gate2_End are therefore Date columns.

# Earliest date among the four gate-1 start tasks only. The start table also
# holds the CRDT/CUST/VEND start tasks, which must not influence Gate1_End.
start <- Tasksdf_Start %>%
  filter(
    Tasks_Start %in% c("QUOG_Start", "QUOT_Start", "RTQR_Start", "RTQT_Start")
  ) %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  group_by(Order) %>%
  summarise(Gate1_End = min(Dates, na.rm = TRUE))

# Number of gate-2 tasks that were opened per order.
Gate2Open <- Tasksdf_Start %>%
  filter(Tasks_Start %in% c("CRDT_Start", "CUST_Start", "VEND_Start")) %>%
  count(Order, name = "Gate2Open")

# Latest gate-2 end date and number of gate-2 tasks that were closed per order.
end <- Tasksdf_End %>%
  filter(Tasks_End %in% c("CRDT_End", "CUST_End", "VEND_End")) %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  group_by(Order) %>%
  summarise(Gate2_End = max(Dates, na.rm = TRUE), Gate2Close = n())

# left_join() keeps exactly the rows of maindf. Orders without gate-2 tasks
# get a count of 0, matching what the wide approach computes with rowSums().
maindf <- maindf %>%
  left_join(end, by = "Order") %>%
  left_join(start, by = "Order") %>%
  left_join(Gate2Open, by = "Order") %>%
  mutate(
    Gate2Open = coalesce(Gate2Open, 0L),
    Gate2Close = coalesce(Gate2Close, 0L),
    # Gate 1 is still open while the order sits at Gate0/Gate1.
    Gate1_End = if_else(
      Gates == "Gate1" | Gates == "Gate0",
      as.Date(NA),
      Gate1_End
    ),
    # Gate 2 ends only once the order is past Gate2 and every opened gate-2
    # task has been closed.
    Gate2_End = if_else(
      Gates == "Gate2" | Gates == "Gate1" | Gates == "Gate0" |
        Gate2Close != Gate2Open,
      as.Date(NA),
      Gate2_End
    )
  )
也许还有更优雅的方式,但这样做可以避免使用 spread()。
我正在尝试根据任务数据框中的日期和逻辑(按 Order 编号匹配),在主数据框中创建 2 列。下面的 2 个解决方案都是先把任务数据框 spread(转成宽格式)并连接到主数据框来实现的。虽然下面的例子运行得相当快,但在我的实际代码中,这种做法非常占用内存,而且计算时间很长(主数据框有 800,000 行;任务数据框有 20 个左右的任务,大约有 150 万行)。
是否有更好的解决方案来获取 G1 和 G2 列(示例 1)以及 Gate1_End 和 Gate2_End 列(示例 2),而无需将任务数据框连接到主数据框?
示例 1
# Example 1 main table: one row per order with its location and animal.
# Built inside local() so the helper vectors do not leak into the workspace.
maindf <- local({
  order_id <- c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482)
  location <- c(
    "US", "US", "Canada", "US", "France",
    "US", "Mexico", "Mexico", "UK", "UK"
  )
  animal <- c(
    "Cow", "Dog", "Cat", "Dog", "Cat",
    "Cow", "Horse", "Dog", "Dog", "Cat"
  )
  data.frame(
    Order = order_id,
    Location = location,
    Animal = animal,
    stringsAsFactors = FALSE
  )
})
# Example 1 start tasks in long form: one row per (order, start task).
# Dates are kept as "m/d/yyyy" text, exactly as in the raw data.
Tasksdf_Start <- local({
  order_id <- c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482)
  tasks_per_order <- c(4, 2, 1, 1, 2, 4, 2, 1, 1, 2)
  task_code <- c(
    "RTQT", "RTQR", "QUOT", "QUOG", # 145
    "RTQT", "RTQR",                 # 158
    "QUOT",                         # 165
    "QUOG",                         # 148
    "RTQT", "QUOT",                 # 568
    "RTQT", "RTQR", "QUOT", "QUOG", # 465
    "RTQT", "RTQR",                 # 248
    "QUOT",                         # 693
    "QUOG",                         # 357
    "RTQT", "QUOT"                  # 482
  )
  # Every date falls in January 2020, so only the day of month varies.
  january_day <- c(
    1, 2, 6, 20,   # 145
    6, 9,          # 158
    14,            # 165
    17,            # 148
    19, 8,         # 568
    15, 3, 6, 19,  # 465
    22, 25,        # 248
    1,             # 693
    6,             # 357
    24, 14         # 482
  )
  data.frame(
    Tasks_Start = paste0(task_code, "_Start"),
    Dates = paste0("1/", january_day, "/2020"),
    Order = rep(order_id, times = tasks_per_order),
    stringsAsFactors = FALSE
  )
})
# Example 1 end tasks in long form: one row per (order, end task).
# Dates are kept as "m/d/yyyy" text, exactly as in the raw data.
Tasksdf_End <- local({
  order_id <- c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482)
  tasks_per_order <- c(3, 1, 1, 1, 2, 1, 1, 1, 1, 1)
  task_code <- c(
    "CRDT", "CUST", "VEND", # 145
    "CUST",                 # 158
    "CRDT",                 # 165
    "CUST",                 # 148
    "CRDT", "CUST",         # 568
    "VEND",                 # 465
    "CRDT",                 # 248
    "VEND",                 # 693
    "VEND",                 # 357
    "CRDT"                  # 482
  )
  # Every date falls in January 2020, so only the day of month varies.
  january_day <- c(
    22, 18, 5, # 145
    15,        # 158
    16,        # 165
    18,        # 148
    7, 24,     # 568
    1,         # 465
    18,        # 248
    8,         # 693
    4,         # 357
    6          # 482
  )
  data.frame(
    Tasks_End = paste0(task_code, "_End"),
    Dates = paste0("1/", january_day, "/2020"),
    Order = rep(order_id, times = tasks_per_order),
    stringsAsFactors = FALSE
  )
})
library(dplyr)
library(tidyr)

# Wide approach for example 1: attach every task date to its order, widen to
# one column per task, then take the earliest start (G1) and latest end (G2).
#
# Dates arrive as "m/d/yyyy" text. Comparing that text directly is
# lexicographic ("1/15/2020" sorts before "1/3/2020"), so the strings are
# parsed to Date before widening. G1 and G2 are therefore Date columns.
# pivot_wider() is the current replacement for the superseded spread().
dfS <- maindf %>%
  left_join(Tasksdf_Start, by = "Order") %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  pivot_wider(names_from = Tasks_Start, values_from = Dates)

# Keep only the join key plus the task columns so the final join does not
# duplicate maindf's descriptive columns (selected by name, not by position).
dfE <- maindf %>%
  left_join(Tasksdf_End, by = "Order") %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  pivot_wider(names_from = Tasks_End, values_from = Dates) %>%
  select(-all_of(setdiff(names(maindf), "Order")))

df <- left_join(dfS, dfE, by = "Order")

# Row-wise earliest start / latest end, ignoring tasks an order does not have.
df$G1 <- pmin(
  df$QUOG_Start, df$QUOT_Start, df$RTQR_Start, df$RTQT_Start,
  na.rm = TRUE
)
df$G2 <- pmax(df$CRDT_End, df$CUST_End, df$VEND_End, na.rm = TRUE)
示例 2(与示例 1 很相似,但逻辑更复杂)
# Example 2 main table: one row per order with its current gate and animal.
# Built inside local() so the helper vectors do not leak into the workspace.
maindf <- local({
  order_id <- c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482)
  gate <- c(
    "Gate1", "Gate2", "Gate2", "Gate3", "Gate2",
    "Gate4", "Gate1", "Gate1", "Gate2", "Gate3"
  )
  animal <- c(
    "Cow", "Dog", "Cat", "Dog", "Cat",
    "Cow", "Horse", "Dog", "Dog", "Cat"
  )
  data.frame(
    Order = order_id,
    Gates = gate,
    Animal = animal,
    stringsAsFactors = FALSE
  )
})
# Example 2 start tasks in long form: one row per (order, start task).
# Besides the RTQT/RTQR/QUOT/QUOG tasks this table also carries the
# CRDT/CUST/VEND start tasks. Dates are kept as "m/d/yyyy" text.
Tasksdf_Start <- local({
  order_id <- c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482)
  tasks_per_order <- c(7, 4, 2, 3, 4, 5, 3, 3, 2, 3)
  task_code <- c(
    "RTQT", "RTQR", "QUOT", "QUOG", "CRDT", "CUST", "VEND", # 145
    "RTQT", "RTQR", "CUST", "CRDT",                         # 158
    "QUOT", "CRDT",                                         # 165
    "QUOG", "CUST", "CRDT",                                 # 148
    "RTQT", "QUOT", "CRDT", "CUST",                         # 568
    "RTQT", "RTQR", "QUOT", "QUOG", "VEND",                 # 465
    "RTQT", "RTQR", "CRDT",                                 # 248
    "QUOT", "VEND", "CRDT",                                 # 693
    "QUOG", "VEND",                                         # 357
    "RTQT", "QUOT", "CRDT"                                  # 482
  )
  # Every date falls in January 2020, so only the day of month varies.
  january_day <- c(
    1, 2, 6, 20, 8, 19, 8, # 145
    6, 9, 1, 9,            # 158
    14, 1,                 # 165
    17, 1, 1,              # 148
    19, 8, 1, 1,           # 568
    15, 3, 6, 19, 1,       # 465
    22, 25, 1,             # 248
    1, 1, 1,               # 693
    6, 1,                  # 357
    24, 14, 1              # 482
  )
  data.frame(
    Tasks_Start = paste0(task_code, "_Start"),
    Dates = paste0("1/", january_day, "/2020"),
    Order = rep(order_id, times = tasks_per_order),
    stringsAsFactors = FALSE
  )
})
# Example 2 end tasks in long form: one row per (order, end task).
# Dates are kept as "m/d/yyyy" text, exactly as in the raw data.
Tasksdf_End <- local({
  order_id <- c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482)
  tasks_per_order <- c(3, 1, 1, 1, 2, 1, 1, 1, 1, 1)
  task_code <- c(
    "CRDT", "CUST", "VEND", # 145
    "CUST",                 # 158
    "CRDT",                 # 165
    "CUST",                 # 148
    "CRDT", "CUST",         # 568
    "VEND",                 # 465
    "CRDT",                 # 248
    "VEND",                 # 693
    "VEND",                 # 357
    "CRDT"                  # 482
  )
  # Every date falls in January 2020, so only the day of month varies.
  january_day <- c(
    22, 18, 5, # 145
    15,        # 158
    16,        # 165
    18,        # 148
    7, 24,     # 568
    1,         # 465
    18,        # 248
    8,         # 693
    4,         # 357
    6          # 482
  )
  data.frame(
    Tasks_End = paste0(task_code, "_End"),
    Dates = paste0("1/", january_day, "/2020"),
    Order = rep(order_id, times = tasks_per_order),
    stringsAsFactors = FALSE
  )
})
library(dplyr)
library(tidyr)

# Wide approach for example 2: widen the start and end tasks to one column
# per task, then derive Gate1_End and Gate2_End from the order's gate.
#
# Dates arrive as "m/d/yyyy" text. Comparing that text directly is
# lexicographic ("1/15/2020" sorts before "1/3/2020"), so the strings are
# parsed to Date before widening. Gate1_End and Gate2_End are Date columns.
# pivot_wider() is the current replacement for the superseded spread().
dfS <- maindf %>%
  left_join(Tasksdf_Start, by = "Order") %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  pivot_wider(names_from = Tasks_Start, values_from = Dates)

# Keep only the join key plus the task columns so the final join does not
# duplicate maindf's descriptive columns (selected by name, not by position).
dfE <- maindf %>%
  left_join(Tasksdf_End, by = "Order") %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  pivot_wider(names_from = Tasks_End, values_from = Dates) %>%
  select(-all_of(setdiff(names(maindf), "Order")))

df <- left_join(dfS, dfE, by = "Order")

# Gate 1 has no end date while the order is still at Gate0/Gate1; otherwise
# it is the earliest of the four gate-1 start tasks.
df$Gate1_End <- if_else(
  df$Gates == "Gate1" | df$Gates == "Gate0",
  as.Date(NA),
  pmin(
    df$QUOG_Start, df$QUOT_Start, df$RTQR_Start, df$RTQT_Start,
    na.rm = TRUE
  )
)

# Count how many gate-2 tasks were opened and how many were closed per order.
df <- df %>%
  mutate(
    Gate2Open = rowSums(
      !is.na(select(., one_of(c("CRDT_Start", "CUST_Start", "VEND_Start"))))
    ),
    Gate2Close = rowSums(
      !is.na(select(., one_of(c("CRDT_End", "CUST_End", "VEND_End"))))
    )
  )

# Gate 2 only has an end date once the order is past Gate2 and every opened
# gate-2 task has been closed; it is then the latest of those end dates.
df$Gate2_End <- if_else(
  df$Gates == "Gate2" | df$Gates == "Gate1" | df$Gates == "Gate0",
  as.Date(NA),
  if_else(
    df$Gate2Close == df$Gate2Open,
    pmax(df$CRDT_End, df$CUST_End, df$VEND_End, na.rm = TRUE),
    as.Date(NA)
  )
)
使用 mutate()
无需创建宽格式数据框,也可以添加汇总列:
示例 1:
library(dplyr)

# Long-format solution for example 1: summarise the task tables per order and
# join the two small summaries onto maindf, without widening anything.
#
# Dates arrive as "m/d/yyyy" text; min()/max() on that text would compare
# lexicographically ("1/15/2020" < "1/3/2020"), so parse to Date first.
# G1 and G2 are therefore Date columns.

# Earliest start date per order.
start <- Tasksdf_Start %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  group_by(Order) %>%
  summarise(G1 = min(Dates, na.rm = TRUE))

# Latest end date per order.
end <- Tasksdf_End %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  group_by(Order) %>%
  summarise(G2 = max(Dates, na.rm = TRUE))

# Joins take exactly two tables, so chain one join per summary. (Passing both
# to a single full_join() binds the second one to the `copy` argument and G2
# is never added.) left_join() keeps exactly the rows of maindf; orders
# without tasks get NA.
maindf <- maindf %>%
  left_join(start, by = "Order") %>%
  left_join(end, by = "Order")
示例 2:
library(dplyr)

# Long-format solution for example 2: build three small per-order summaries
# from the task tables, join them onto maindf and apply the gate rules.
#
# Dates arrive as "m/d/yyyy" text; min()/max() on that text would compare
# lexicographically ("1/15/2020" < "1/3/2020"), so parse to Date first.
# Gate1_End and Gate2_End are therefore Date columns.

# Earliest date among the four gate-1 start tasks only. The start table also
# holds the CRDT/CUST/VEND start tasks, which must not influence Gate1_End.
start <- Tasksdf_Start %>%
  filter(
    Tasks_Start %in% c("QUOG_Start", "QUOT_Start", "RTQR_Start", "RTQT_Start")
  ) %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  group_by(Order) %>%
  summarise(Gate1_End = min(Dates, na.rm = TRUE))

# Number of gate-2 tasks that were opened per order.
Gate2Open <- Tasksdf_Start %>%
  filter(Tasks_Start %in% c("CRDT_Start", "CUST_Start", "VEND_Start")) %>%
  count(Order, name = "Gate2Open")

# Latest gate-2 end date and number of gate-2 tasks that were closed per order.
end <- Tasksdf_End %>%
  filter(Tasks_End %in% c("CRDT_End", "CUST_End", "VEND_End")) %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  group_by(Order) %>%
  summarise(Gate2_End = max(Dates, na.rm = TRUE), Gate2Close = n())

# left_join() keeps exactly the rows of maindf. Orders without gate-2 tasks
# get a count of 0, matching what the wide approach computes with rowSums().
maindf <- maindf %>%
  left_join(end, by = "Order") %>%
  left_join(start, by = "Order") %>%
  left_join(Gate2Open, by = "Order") %>%
  mutate(
    Gate2Open = coalesce(Gate2Open, 0L),
    Gate2Close = coalesce(Gate2Close, 0L),
    # Gate 1 is still open while the order sits at Gate0/Gate1.
    Gate1_End = if_else(
      Gates == "Gate1" | Gates == "Gate0",
      as.Date(NA),
      Gate1_End
    ),
    # Gate 2 ends only once the order is past Gate2 and every opened gate-2
    # task has been closed.
    Gate2_End = if_else(
      Gates == "Gate2" | Gates == "Gate1" | Gates == "Gate0" |
        Gate2Close != Gate2Open,
      as.Date(NA),
      Gate2_End
    )
  )
也许还有更优雅的方式,但这样做可以避免使用 spread()。