如何对列表中的不同 tibbles 求和?
How to sum different tibbles in a list?
假设我有以下列表
# Reproducible example: a named list of four hourly day-ahead price tables.
# Elements a, b and c carry identical 2016 data; d covers the same ten hours
# of 01.01.2015 and has a price only in its first row (all others are NA).
My_list <- local({
  # Build one two-column, tibble-classed data frame for the first ten hours
  # of 1 January of `year`; the tibble package does not need to be attached.
  make_prices <- function(year, prices) {
    start_hour <- 0:9
    structure(
      list(
        MTU..UTC. = sprintf(
          "01.01.%d %02d:00 - 01.01.%d %02d:00",
          year, start_hour, year, start_hour + 1L
        ),
        Day.ahead.Price..EUR.MWh. = prices
      ),
      row.names = c(NA, -10L),
      class = c("tbl_df", "tbl", "data.frame")
    )
  }

  prices_2016 <- c(
    22.39, 20.59, 16.81, 17.41, 17.02, 15.86, 18.16, 17.73, 19.77, 23.75
  )

  list(
    a = make_prices(2016L, prices_2016),
    b = make_prices(2016L, prices_2016),
    c = make_prices(2016L, prices_2016),
    d = make_prices(2015L, c(5, rep(NA_real_, 9L)))
  )
})
我想把列表元素 a 和 d 相加(d 中包含很多 NA),从而得到下面的结果
$a
# A tibble: 5 x 2
MTU..UTC. Day.ahead.Price..EUR.MWh.
<chr> <dbl>
1 01.01.2016 00:00 - 01.01.2016 01:00 27.4 #New value
2 01.01.2016 01:00 - 01.01.2016 02:00 20.6
3 01.01.2016 02:00 - 01.01.2016 03:00 16.8
4 01.01.2016 03:00 - 01.01.2016 04:00 17.4
5 01.01.2016 04:00 - 01.01.2016 05:00 17.0
$b
# A tibble: 5 x 2
MTU..UTC. Day.ahead.Price..EUR.MWh.
<chr> <dbl>
1 01.01.2016 00:00 - 01.01.2016 01:00 22.4
2 01.01.2016 01:00 - 01.01.2016 02:00 20.6
3 01.01.2016 02:00 - 01.01.2016 03:00 16.8
4 01.01.2016 03:00 - 01.01.2016 04:00 17.4
5 01.01.2016 04:00 - 01.01.2016 05:00 17.0
$c
# A tibble: 5 x 2
MTU..UTC. Day.ahead.Price..EUR.MWh.
<chr> <dbl>
1 01.01.2016 00:00 - 01.01.2016 01:00 22.4
2 01.01.2016 01:00 - 01.01.2016 02:00 20.6
3 01.01.2016 02:00 - 01.01.2016 03:00 16.8
4 01.01.2016 03:00 - 01.01.2016 04:00 17.4
5 01.01.2016 04:00 - 01.01.2016 05:00 17.0
我尝试使用 reduce
获得结果,但是该函数对列表中的所有元素求和,我只想对这 2 个 tibbles 求和并保留其他。
有点繁琐,但可以这样做:
我建议首先使用 enframe
命令将列表转换为 tibble - 然后您可以使用 unnest
获得一个很好的数据结构。
因为日期不匹配(d 是 2015 年,其他元素都是 2016 年),我们要么 1) 给 d 的日期列加上一年(可以使用 lubridate 包),
要么 2) 直接把 d 的列按行位置拼接到新数据上。第二种做法并不理想,因为它只适用于行数相同的 data.frame,不过下面采用的就是这种做法。
我们首先把 d 的值筛选出来,存放在一个单独的 data.frame 中。
然后把它拼接回原来的数据;原数据已经用 pivot_wider 转换过,因此 a、b 和 c 各自成为单独的一列。
然后我们使用 mutate 命令对 a 和 d 求和(注意:这里我们把 d 中的缺失值当作 0,这样才能得到上面给出的数值)。
library(tidyverse)
# Pull the prices of list element "d" out into a one-column tibble named `d`.
d <- My_list %>%
  enframe() %>%                # one row per list element: name + value
  unnest(cols = value) %>%     # expand each nested tibble into plain rows
  janitor::clean_names() %>%   # snake_case column names (janitor package)
  filter(name == "d") %>%      # keep only the rows that came from element d
  select(d = day_ahead_price_eur_m_wh)  # keep just the price, renamed to d
# Add the prices held in `d` onto group a, leave b and c untouched, and
# return everything in long format (one row per time interval and group).
result <- My_list %>%
  enframe() %>%                # one row per list element: name + value
  unnest(cols = value) %>%     # expand each nested tibble into plain rows
  janitor::clean_names() %>%   # snake_case column names (janitor package)
  filter(name != "d") %>%      # everything except element d
  pivot_wider(                 # one price column per remaining group
    id_cols = mtu_utc,
    names_from = name,
    values_from = day_ahead_price_eur_m_wh
  ) %>%
  bind_cols(d) %>%             # attach d by row position, not by date
  mutate(
    d = coalesce(d, 0),        # a missing d contributes nothing to the sum
    a = a + d
  ) %>%
  select(-d) %>%               # drop the helper column again (optional)
  pivot_longer(cols = -mtu_utc) %>%  # back to long format (optional)
  arrange(name)                # order by group rather than by date
现在 result
看起来像这样:
> result
# A tibble: 30 x 3
mtu_utc name value
<chr> <chr> <dbl>
1 01.01.2016 00:00 - 01.01.2016 01:00 a 27.4
2 01.01.2016 01:00 - 01.01.2016 02:00 a 20.6
3 01.01.2016 02:00 - 01.01.2016 03:00 a 16.8
4 01.01.2016 03:00 - 01.01.2016 04:00 a 17.4
5 01.01.2016 04:00 - 01.01.2016 05:00 a 17.0
6 01.01.2016 05:00 - 01.01.2016 06:00 a 15.9
7 01.01.2016 06:00 - 01.01.2016 07:00 a 18.2
8 01.01.2016 07:00 - 01.01.2016 08:00 a 17.7
9 01.01.2016 08:00 - 01.01.2016 09:00 a 19.8
10 01.01.2016 09:00 - 01.01.2016 10:00 a 23.8
# ... with 20 more rows
如果您想将其作为列表返回,则需要执行以下操作:
# Split the long result into a list holding one tibble per value of `name`.
group_split(result, name)
基础 R 解决方案:
# Clean the column names of every tibble and store the renamed tibbles in a
# new list: cleaned_name_list => list of tibbles
#
# Backslashes must be doubled inside R string literals: "\W" and "\s" are
# unrecognized escapes and stop the script from parsing, so the patterns are
# written as "\\W+" and "\\s+|[[:punct:]]".
#
# For the sample data the names become "mtuutc" and "dayahead_price_eur_mwh",
# which are the column names the code further down accesses.
cleaned_name_list <- lapply(My_list, function(x) {
  setNames(
    x,
    tolower(
      gsub(
        "\\W+", "_",
        trimws(names(x), whitespace = "\\s+|[[:punct:]]")
      )
    )
  )
})
# Name of the list element whose prices get added onto the other elements:
# to_add => character scalar
to_add <- "d"

# Positions of every list element other than to_add, i.e. the elements that
# receive the adjustment: to_keep_idx => integer vector
to_keep_idx <- which(!(names(cleaned_name_list) == to_add))
# Add the prices of the to_add element onto every other list element and
# write the results back: cleaned_name_list => list of data.frames
# NOTE(review): rows are paired by exact match on the mtuutc string; where
# match() finds no partner the adjustment is NA, and na.rm = TRUE then leaves
# the original price unchanged.
cleaned_name_list[to_keep_idx] <- lapply(
  cleaned_name_list[to_keep_idx],
  function(target) {
    addend <- cleaned_name_list[[to_add]]
    # Row of addend that carries the same time interval as each target row.
    hit <- match(target$mtuutc, addend$mtuutc)
    both_prices <- cbind(
      adjustment = addend$dayahead_price_eur_mwh[hit],
      act_price = target$dayahead_price_eur_mwh
    )
    data.frame(
      mtu_utc = target$mtuutc,
      dayahead_price_eur_mwh = rowSums(both_prices, na.rm = TRUE)
    )
  }
)
假设我有以下列表
# Reproducible example: a named list of four hourly day-ahead price tables.
# Elements a, b and c carry identical 2016 data; d covers the same ten hours
# of 01.01.2015 and has a price only in its first row (all others are NA).
My_list <- local({
  # Build one two-column, tibble-classed data frame for the first ten hours
  # of 1 January of `year`; the tibble package does not need to be attached.
  make_prices <- function(year, prices) {
    start_hour <- 0:9
    structure(
      list(
        MTU..UTC. = sprintf(
          "01.01.%d %02d:00 - 01.01.%d %02d:00",
          year, start_hour, year, start_hour + 1L
        ),
        Day.ahead.Price..EUR.MWh. = prices
      ),
      row.names = c(NA, -10L),
      class = c("tbl_df", "tbl", "data.frame")
    )
  }

  prices_2016 <- c(
    22.39, 20.59, 16.81, 17.41, 17.02, 15.86, 18.16, 17.73, 19.77, 23.75
  )

  list(
    a = make_prices(2016L, prices_2016),
    b = make_prices(2016L, prices_2016),
    c = make_prices(2016L, prices_2016),
    d = make_prices(2015L, c(5, rep(NA_real_, 9L)))
  )
})
我想把列表元素 a 和 d 相加(d 中包含很多 NA),从而得到下面的结果
$a
# A tibble: 5 x 2
MTU..UTC. Day.ahead.Price..EUR.MWh.
<chr> <dbl>
1 01.01.2016 00:00 - 01.01.2016 01:00 27.4 #New value
2 01.01.2016 01:00 - 01.01.2016 02:00 20.6
3 01.01.2016 02:00 - 01.01.2016 03:00 16.8
4 01.01.2016 03:00 - 01.01.2016 04:00 17.4
5 01.01.2016 04:00 - 01.01.2016 05:00 17.0
$b
# A tibble: 5 x 2
MTU..UTC. Day.ahead.Price..EUR.MWh.
<chr> <dbl>
1 01.01.2016 00:00 - 01.01.2016 01:00 22.4
2 01.01.2016 01:00 - 01.01.2016 02:00 20.6
3 01.01.2016 02:00 - 01.01.2016 03:00 16.8
4 01.01.2016 03:00 - 01.01.2016 04:00 17.4
5 01.01.2016 04:00 - 01.01.2016 05:00 17.0
$c
# A tibble: 5 x 2
MTU..UTC. Day.ahead.Price..EUR.MWh.
<chr> <dbl>
1 01.01.2016 00:00 - 01.01.2016 01:00 22.4
2 01.01.2016 01:00 - 01.01.2016 02:00 20.6
3 01.01.2016 02:00 - 01.01.2016 03:00 16.8
4 01.01.2016 03:00 - 01.01.2016 04:00 17.4
5 01.01.2016 04:00 - 01.01.2016 05:00 17.0
我尝试使用 reduce
获得结果,但是该函数对列表中的所有元素求和,我只想对这 2 个 tibbles 求和并保留其他。
有点繁琐,但可以这样做:
我建议首先使用 enframe
命令将列表转换为 tibble - 然后您可以使用 unnest
获得一个很好的数据结构。
因为日期不匹配(d 是 2015 年,其他元素都是 2016 年),我们要么 1) 给 d 的日期列加上一年(可以使用 lubridate 包),
要么 2) 直接把 d 的列按行位置拼接到新数据上。第二种做法并不理想,因为它只适用于行数相同的 data.frame,不过下面采用的就是这种做法。
我们首先把 d 的值筛选出来,存放在一个单独的 data.frame 中。
然后把它拼接回原来的数据;原数据已经用 pivot_wider 转换过,因此 a、b 和 c 各自成为单独的一列。
然后我们使用 mutate 命令对 a 和 d 求和(注意:这里我们把 d 中的缺失值当作 0,这样才能得到上面给出的数值)。
library(tidyverse)
# Pull the prices of list element "d" out into a one-column tibble named `d`.
d <- My_list %>%
  enframe() %>%                # one row per list element: name + value
  unnest(cols = value) %>%     # expand each nested tibble into plain rows
  janitor::clean_names() %>%   # snake_case column names (janitor package)
  filter(name == "d") %>%      # keep only the rows that came from element d
  select(d = day_ahead_price_eur_m_wh)  # keep just the price, renamed to d
# Add the prices held in `d` onto group a, leave b and c untouched, and
# return everything in long format (one row per time interval and group).
result <- My_list %>%
  enframe() %>%                # one row per list element: name + value
  unnest(cols = value) %>%     # expand each nested tibble into plain rows
  janitor::clean_names() %>%   # snake_case column names (janitor package)
  filter(name != "d") %>%      # everything except element d
  pivot_wider(                 # one price column per remaining group
    id_cols = mtu_utc,
    names_from = name,
    values_from = day_ahead_price_eur_m_wh
  ) %>%
  bind_cols(d) %>%             # attach d by row position, not by date
  mutate(
    d = coalesce(d, 0),        # a missing d contributes nothing to the sum
    a = a + d
  ) %>%
  select(-d) %>%               # drop the helper column again (optional)
  pivot_longer(cols = -mtu_utc) %>%  # back to long format (optional)
  arrange(name)                # order by group rather than by date
现在 result
看起来像这样:
> result
# A tibble: 30 x 3
mtu_utc name value
<chr> <chr> <dbl>
1 01.01.2016 00:00 - 01.01.2016 01:00 a 27.4
2 01.01.2016 01:00 - 01.01.2016 02:00 a 20.6
3 01.01.2016 02:00 - 01.01.2016 03:00 a 16.8
4 01.01.2016 03:00 - 01.01.2016 04:00 a 17.4
5 01.01.2016 04:00 - 01.01.2016 05:00 a 17.0
6 01.01.2016 05:00 - 01.01.2016 06:00 a 15.9
7 01.01.2016 06:00 - 01.01.2016 07:00 a 18.2
8 01.01.2016 07:00 - 01.01.2016 08:00 a 17.7
9 01.01.2016 08:00 - 01.01.2016 09:00 a 19.8
10 01.01.2016 09:00 - 01.01.2016 10:00 a 23.8
# ... with 20 more rows
如果您想将其作为列表返回,则需要执行以下操作:
# Split the long result into a list holding one tibble per value of `name`.
group_split(result, name)
基础 R 解决方案:
# Clean the column names of every tibble and store the renamed tibbles in a
# new list: cleaned_name_list => list of tibbles
#
# Backslashes must be doubled inside R string literals: "\W" and "\s" are
# unrecognized escapes and stop the script from parsing, so the patterns are
# written as "\\W+" and "\\s+|[[:punct:]]".
#
# For the sample data the names become "mtuutc" and "dayahead_price_eur_mwh",
# which are the column names the code further down accesses.
cleaned_name_list <- lapply(My_list, function(x) {
  setNames(
    x,
    tolower(
      gsub(
        "\\W+", "_",
        trimws(names(x), whitespace = "\\s+|[[:punct:]]")
      )
    )
  )
})
# Name of the list element whose prices get added onto the other elements:
# to_add => character scalar
to_add <- "d"

# Positions of every list element other than to_add, i.e. the elements that
# receive the adjustment: to_keep_idx => integer vector
to_keep_idx <- which(!(names(cleaned_name_list) == to_add))
# Add the prices of the to_add element onto every other list element and
# write the results back: cleaned_name_list => list of data.frames
# NOTE(review): rows are paired by exact match on the mtuutc string; where
# match() finds no partner the adjustment is NA, and na.rm = TRUE then leaves
# the original price unchanged.
cleaned_name_list[to_keep_idx] <- lapply(
  cleaned_name_list[to_keep_idx],
  function(target) {
    addend <- cleaned_name_list[[to_add]]
    # Row of addend that carries the same time interval as each target row.
    hit <- match(target$mtuutc, addend$mtuutc)
    both_prices <- cbind(
      adjustment = addend$dayahead_price_eur_mwh[hit],
      act_price = target$dayahead_price_eur_mwh
    )
    data.frame(
      mtu_utc = target$mtuutc,
      dayahead_price_eur_mwh = rowSums(both_prices, na.rm = TRUE)
    )
  }
)