如何对列表中的不同 tibbles 求和?

How to sum different tibbles in a list?

假设我有以下列表

# Example data: a named list of four 10-row tibbles (a, b, c, d). Each has an
# hourly time-window label column and a day-ahead price column. Elements a, b
# and c are identical and dated 2016; d is dated 2015 and has a price only in
# its first row (the rest are NA).
My_list <- local({
  # Labels for the ten consecutive one-hour windows starting at 00:00 on
  # 1 January of `year`, e.g. "01.01.2016 00:00 - 01.01.2016 01:00".
  hour_windows <- function(year) {
    start_hour <- 0:9
    sprintf(
      "01.01.%d %02d:00 - 01.01.%d %02d:00",
      year, start_hour, year, start_hour + 1L
    )
  }

  # Assemble the two columns into an object carrying the tibble class vector,
  # using the same attributes that dput() emits for a 10-row tibble.
  as_price_tbl <- function(mtu, price) {
    structure(
      list(MTU..UTC. = mtu, Day.ahead.Price..EUR.MWh. = price),
      row.names = c(NA, -10L),
      class = c("tbl_df", "tbl", "data.frame")
    )
  }

  prices_2016 <- c(
    22.39, 20.59, 16.81, 17.41, 17.02, 15.86, 18.16, 17.73, 19.77, 23.75
  )
  prices_2015 <- c(5, rep(NA_real_, 9L))

  list(
    a = as_price_tbl(hour_windows(2016L), prices_2016),
    b = as_price_tbl(hour_windows(2016L), prices_2016),
    c = as_price_tbl(hour_windows(2016L), prices_2016),
    d = as_price_tbl(hour_windows(2015L), prices_2015)
  )
})

我想把 tibble a 和 d 相加(d 中包含很多 NA),这样就可以得到下面的结果:

$a
# A tibble: 5 x 2
  MTU..UTC.                           Day.ahead.Price..EUR.MWh.
  <chr>                                                   <dbl>
1 01.01.2016 00:00 - 01.01.2016 01:00                      27.4 #New value
2 01.01.2016 01:00 - 01.01.2016 02:00                      20.6
3 01.01.2016 02:00 - 01.01.2016 03:00                      16.8
4 01.01.2016 03:00 - 01.01.2016 04:00                      17.4
5 01.01.2016 04:00 - 01.01.2016 05:00                      17.0

$b
# A tibble: 5 x 2
  MTU..UTC.                           Day.ahead.Price..EUR.MWh.
  <chr>                                                   <dbl>
1 01.01.2016 00:00 - 01.01.2016 01:00                      22.4
2 01.01.2016 01:00 - 01.01.2016 02:00                      20.6
3 01.01.2016 02:00 - 01.01.2016 03:00                      16.8
4 01.01.2016 03:00 - 01.01.2016 04:00                      17.4
5 01.01.2016 04:00 - 01.01.2016 05:00                      17.0

$c
# A tibble: 5 x 2
  MTU..UTC.                           Day.ahead.Price..EUR.MWh.
  <chr>                                                   <dbl>
1 01.01.2016 00:00 - 01.01.2016 01:00                      22.4
2 01.01.2016 01:00 - 01.01.2016 02:00                      20.6
3 01.01.2016 02:00 - 01.01.2016 03:00                      16.8
4 01.01.2016 03:00 - 01.01.2016 04:00                      17.4
5 01.01.2016 04:00 - 01.01.2016 05:00                      17.0

我尝试使用 reduce 获得结果,但是该函数对列表中的所有元素求和,我只想对这 2 个 tibbles 求和并保留其他。

有点繁琐,但可以这样做:

我建议首先使用 enframe 命令将列表转换为 tibble - 然后您可以使用 unnest 获得一个很好的数据结构。

因为日期不匹配(d 是 2015 年,其他都是 2016 年),我们要么 1) 给 d 的日期列加上一年(可以使用 lubridate 包),要么 2) 直接把 d 的列按行的位置拼接到新数据上。第二种做法并不理想,因为它只适用于行数相同的 data.frame,下面的代码采用的就是这种做法。

我们首先把 d 的值筛选出来,保存到一个单独的 data.frame 中。

然后我们用 pivot_wider 把原来的数据变宽,使 a、b 和 c 各占一列,再把 d 这一列添加进去。

然后我们使用 mutate 命令对 a 和 d 求和(注意:这里假设 d 缺失时取 0,这样才能得到上面期望的值)。

library(tidyverse)
# Extract the prices of list element "d" as a one-column tibble named `d`,
# so they can be column-bound onto the other elements afterwards.
d <- My_list %>%
  enframe() %>% # list -> tibble with a `name` column and a list column `value`
  unnest(value) %>% # expand the list column into ordinary rows
  janitor::clean_names() %>% # tidy the column names (optional - janitor package)
  filter(name == "d") %>% # keep only the rows that came from element "d"
  select(d = day_ahead_price_eur_m_wh) # keep just the price column, renamed to `d`

# Build `result`: the prices of a, b and c in long format, where the prices of
# `a` have had `d` added to them (a missing `d` counts as 0).
# NOTE: `d` is attached by row position via bind_cols(), not by timestamp, so
# it must have the same number of rows, in the same order, as the wide table.
result <- My_list %>%
  enframe() %>% # list -> tibble
  unnest(value) %>% # list column -> rows
  janitor::clean_names() %>% # tidy the column names (optional - janitor package)
  filter(name != "d") %>% # every group except d
  pivot_wider(
    id_cols = mtu_utc,
    names_from = name,
    values_from = day_ahead_price_eur_m_wh
  ) %>% # one column per group: a, b, c
  bind_cols(d) %>% # attach the d column by row position
  mutate(a = a + coalesce(d, 0)) %>% # add d to a, treating missing d as 0
  select(-d) %>% # drop the helper column (optional)
  pivot_longer(-mtu_utc) %>% # back to long format (optional)
  arrange(name) # order rows by group rather than by date

现在 result 看起来像这样:

> result
# A tibble: 30 x 3
   mtu_utc                             name  value
   <chr>                               <chr> <dbl>
 1 01.01.2016 00:00 - 01.01.2016 01:00 a      27.4
 2 01.01.2016 01:00 - 01.01.2016 02:00 a      20.6
 3 01.01.2016 02:00 - 01.01.2016 03:00 a      16.8
 4 01.01.2016 03:00 - 01.01.2016 04:00 a      17.4
 5 01.01.2016 04:00 - 01.01.2016 05:00 a      17.0
 6 01.01.2016 05:00 - 01.01.2016 06:00 a      15.9
 7 01.01.2016 06:00 - 01.01.2016 07:00 a      18.2
 8 01.01.2016 07:00 - 01.01.2016 08:00 a      17.7
 9 01.01.2016 08:00 - 01.01.2016 09:00 a      19.8
10 01.01.2016 09:00 - 01.01.2016 10:00 a      23.8
# ... with 20 more rows

如果您想将其作为列表返回,则需要执行以下操作:

# Split `result` into a list of tibbles, one per distinct value of `name`.
group_split(result, name)

基础 R 解决方案:

# Clean the column names of every tibble and store the renamed tibbles in a
# new list: cleaned_name_list => list of tibbles.
#
# Backslashes in the regular expressions must be doubled inside R string
# literals ("\\W", "\\s"); a single backslash such as "\W" is an unrecognised
# escape and stops the script from parsing.
#
# trimws() removes only the first and the trailing punctuation run, gsub()
# turns the remaining non-word runs into "_", and tolower() lower-cases the
# result. For the example data this gives
#   "MTU..UTC."                 -> "mtuutc"
#   "Day.ahead.Price..EUR.MWh." -> "dayahead_price_eur_mwh"
# which are the names the code further down refers to.
cleaned_name_list <- lapply(My_list, function(x) {
  setNames(
    x,
    tolower(
      gsub("\\W+", "_", trimws(names(x), whitespace = "\\s+|[[:punct:]]"))
    )
  )
})

# Name of the list element whose prices are added onto the other elements:
# to_add => character scalar
to_add <- "d"

# Positions of all list elements other than `to_add`; these are the elements
# that receive the adjustment: to_keep_idx => integer vector
to_keep_idx <- which(names(cleaned_name_list) != to_add)

# Replace each of those elements by a data.frame holding its time window and
# its price plus the `to_add` price for the same time window.
# Rows are paired by exact match on the time-window string; where `to_add` has
# no matching window the adjustment is NA, and rowSums(na.rm = TRUE) then
# ignores it (a row in which both values are NA sums to 0).
# NOTE: in the example data `d` is dated 2015 and the others 2016, so no
# window matches and the prices come out unchanged.
cleaned_name_list[to_keep_idx] <- lapply(
  cleaned_name_list[to_keep_idx],
  function(tbl) {
    adj_tbl <- cleaned_name_list[[to_add]]
    # Row of `adj_tbl` carrying the same time window as each row of `tbl`
    adj_row <- match(tbl$mtuutc, adj_tbl$mtuutc)
    price_parts <- cbind(
      adjustment = adj_tbl$dayahead_price_eur_mwh[adj_row],
      act_price = tbl$dayahead_price_eur_mwh
    )
    data.frame(
      mtu_utc = tbl$mtuutc,
      dayahead_price_eur_mwh = rowSums(price_parts, na.rm = TRUE)
    )
  }
)