不能在嵌套的 data.frame 中对列进行子集化

Cannot subset columns inside a nested data.frame

我有一个 data.frame,其中包含多个字符列,还有一个作为列嵌套在其中的 data.frame。也就是说,我的 data.frame 里面还有一个 data.frame。我的目标是同时选取其中一个字符列和嵌套 data.frame 内的一列。但是,每当我尝试按名称选取嵌套列时,R 都会报错说该列不存在。您可以在此处查看 data.frame:

# Reproducible example data: a 3-row data.frame with three character columns
# (`$id`, Id, Name), a nested data.frame column (SumPeriod) and a list column
# (Series).
df = structure(
  list(
    `$id` = c("21", "22", "23"),
    Id = c("159347",
           "161863", "22646"),
    Name = c("159347", "161863", "22646"),
    # SumPeriod: a 3-row data.frame stored as a single column of df.
    SumPeriod = structure(
      list(
        AccPeriodBasTwrAtMarketPrice = c(0.0969367972082358, 0.537983489472227,-0.107066381156318),
        AccPeriodLocTwrAtMarketPrice = c(0.0969367972082358,
                                         0.537983489472227,-0.107066381156318),
        BopDate = c(
          "2022-02-28T00:00:00",
          "2022-02-28T00:00:00",
          "2022-02-28T00:00:00"
        ),
        BopBasHoldingValueAtMarketPrice = c(7592266.52,
                                            5135960.59, 7166815.5),
        BopBasInterestAccrual = c(0, 0, 0),
        EopDate = c(
          "2022-02-28T00:00:00",
          "2022-02-28T00:00:00",
          "2022-02-28T00:00:00"
        ),
        EopBasHoldingValueAtMarketPrice = c(7599626.22,
                                            5163591.21, 7159142.25),
        EopBasInterestAccrual = c(0, 0,
                                  0),
        AccPeriodBasTwrAtExposureValue = c(0.0969367972082358,
                                           0.537983489472227,-0.107066381156318),
        AccPeriodLocTwrAtExposureValue = c(0.0969367972082358,
                                           0.537983489472227,-0.107066381156318),
        AccBasIrr = c(0.0969367972082358,
                      0.537983489472227,-0.107066381156318),
        AccLocIrr = c(0.096936797208258,
                      0.537983489472227,-0.107066381156318),
        AccBasMwr = c(0.0484449181280957,
                      0.268270120259021,-0.0535618639528656),
        PeriodBasIrr = c(0.0969367972082358,
                         0.537983489472227,-0.107066381156318),
        PeriodLocIrr = c(0.096936797208258,
                         0.537983489472227,-0.107066381156318),
        PeriodBasTwrAtMarketPrice = c(0.0969367972082358,
                                      0.537983489472227,-0.107066381156318),
        PeriodLocTwrAtMarketPrice = c(0.0969367972082358,
                                      0.537983489472227,-0.107066381156318),
        PeriodBasTwrDeposit = c(0,
                                0, 0),
        PeriodBasTwrWithdrawal = c(0, 0, 0),
        PeriodBasTwrDepositWithdrawal = c(0,
                                          0, 0),
        PeriodBasTwrDividendTax = c(0, 0, 0),
        PeriodBasTwr = c(7359.70000000112,
                         27630.6200000001,-7673.25),
        PeriodBasMwr = c(0.0484449181280957,
                         0.268270120259021,-0.0535618639528656),
        BenchmarkCalcType = c(
          "BenchmarkNotCalculated",
          "BenchmarkNotCalculated",
          "BenchmarkNotCalculated"
        ),
        EopBenchmarkName = c("",
                             "", ""),
        AccBasBenchmarkReturnPct = c(0, 0, 0),
        PeriodBasBenchmarkReturnPct = c(0,
                                        0, 0)
      ),
      class = "data.frame",
      row.names = c(NA, 3L)
    ),
    # Series: a list column holding one single-row data.frame per row of df.
    Series = list(
      structure(
        list(
          AccPeriodBasTwrAtMarketPrice = 0.0969367972082358,
          AccPeriodLocTwrAtMarketPrice = 0.0969367972082358,
          BopDate = "2022-02-28T00:00:00",
          BopBasHoldingValueAtMarketPrice = 7592266.52,
          BopBasInterestAccrual = 0,
          EopDate = "2022-02-28T00:00:00",
          EopBasHoldingValueAtMarketPrice = 7599626.22,
          EopBasInterestAccrual = 0,
          AccPeriodBasTwrAtExposureValue = 0.0969367972082358,
          AccPeriodLocTwrAtExposureValue = 0.0969367972082358,
          AccBasIrr = 0,
          AccLocIrr = 0,
          AccBasMwr = 0.0968429207825055,
          PeriodBasIrr = 0,
          PeriodLocIrr = 0,
          PeriodBasTwrAtMarketPrice = 0.0969367972082358,
          PeriodLocTwrAtMarketPrice = 0.0969367972082358,
          PeriodBasTwrDeposit = 0,
          PeriodBasTwrWithdrawal = 0,
          PeriodBasTwrDepositWithdrawal = 0,
          PeriodBasTwrDividendTax = 0,
          PeriodBasTwr = 7359.70000000112,
          PeriodBasMwr = 0.0484449181280957,
          BenchmarkCalcType = "BenchmarkNotCalculated",
          EopBenchmarkName = "",
          AccBasBenchmarkReturnPct = 0,
          PeriodBasBenchmarkReturnPct = 0
        ),
        class = "data.frame",
        row.names = 1L
      ),
      structure(
        list(
          AccPeriodBasTwrAtMarketPrice = 0.537983489472227,
          AccPeriodLocTwrAtMarketPrice = 0.537983489472227,
          BopDate = "2022-02-28T00:00:00",
          BopBasHoldingValueAtMarketPrice = 5135960.59,
          BopBasInterestAccrual = 0,
          EopDate = "2022-02-28T00:00:00",
          EopBasHoldingValueAtMarketPrice = 5163591.21,
          EopBasInterestAccrual = 0,
          AccPeriodBasTwrAtExposureValue = 0.537983489472227,
          AccPeriodLocTwrAtExposureValue = 0.537983489472227,
          AccBasIrr = 0,
          AccLocIrr = 0,
          AccBasMwr = 0.535104714457055,
          PeriodBasIrr = 0,
          PeriodLocIrr = 0,
          PeriodBasTwrAtMarketPrice = 0.537983489472227,
          PeriodLocTwrAtMarketPrice = 0.537983489472227,
          PeriodBasTwrDeposit = 0,
          PeriodBasTwrWithdrawal = 0,
          PeriodBasTwrDepositWithdrawal = 0,
          PeriodBasTwrDividendTax = 0,
          PeriodBasTwr = 27630.6200000001,
          PeriodBasMwr = 0.26827012025902,
          BenchmarkCalcType = "BenchmarkNotCalculated",
          EopBenchmarkName = "",
          AccBasBenchmarkReturnPct = 0,
          PeriodBasBenchmarkReturnPct = 0
        ),
        class = "data.frame",
        row.names = 1L
      ),
      structure(
        list(
          AccPeriodBasTwrAtMarketPrice = -0.107066381156318,
          AccPeriodLocTwrAtMarketPrice = -0.107066381156318,
          BopDate = "2022-02-28T00:00:00",
          BopBasHoldingValueAtMarketPrice = 7166815.5,
          BopBasInterestAccrual = 0,
          EopDate = "2022-02-28T00:00:00",
          EopBasHoldingValueAtMarketPrice = 7159142.25,
          EopBasInterestAccrual = 0,
          AccPeriodBasTwrAtExposureValue = -0.107066381156318,
          AccPeriodLocTwrAtExposureValue = -0.107066381156318,
          AccBasIrr = 0,
          AccLocIrr = 0,
          AccBasMwr = -0.107181136120043,
          PeriodBasIrr = 0,
          PeriodLocIrr = 0,
          PeriodBasTwrAtMarketPrice = -0.107066381156318,
          PeriodLocTwrAtMarketPrice = -0.107066381156318,
          PeriodBasTwrDeposit = 0,
          PeriodBasTwrWithdrawal = 0,
          PeriodBasTwrDepositWithdrawal = 0,
          PeriodBasTwrDividendTax = 0,
          PeriodBasTwr = -7673.25,
          PeriodBasMwr = -0.0535618639528656,
          BenchmarkCalcType = "BenchmarkNotCalculated",
          EopBenchmarkName = "",
          AccBasBenchmarkReturnPct = 0,
          PeriodBasBenchmarkReturnPct = 0
        ),
        class = "data.frame",
        row.names = 1L
      )
    )
  ),
  class = "data.frame",
  row.names = c(NA,
                3L)
)

所以在上面的 data.frame 中,我唯一感兴趣的列是“Id”和“EopBasHoldingValueAtMarketPrice”,其中后者是名为“SumPeriod”的嵌套 data.frame 中的一列。我尝试通过以下方式实现这一目标:

# Failing attempt: the second name is a quoted string, so subset() looks for a
# top-level column literally called "SumPeriod$EopBasHoldingValueAtMarketPrice",
# which df does not have.
df_subset = subset(df, select = c("Id", "SumPeriod$EopBasHoldingValueAtMarketPrice"))

但是我得到错误:

Error in `[.data.frame`(x, r, vars, drop = drop) : undefined columns selected

大家知道如何实现吗?

SumPeriod 是嵌套在您的数据框中的一个数据框。既然您在问题的标签中提到了 dplyr,可以用下面这种方法解决:

library(dplyr)
library(tidyr)
# Convert df to a tibble, lift the columns of the nested SumPeriod data frame
# to the top level, then keep only the two columns of interest.
as_tibble(df) %>%
  unnest(cols = SumPeriod) %>%
  select(Id, EopBasHoldingValueAtMarketPrice)

输出为:

# A tibble: 3 × 2
  Id     EopBasHoldingValueAtMarketPrice
  <chr>                            <dbl>
1 159347                        7599626.
2 161863                        5163591.
3 22646                         7159142.

data.table 方法

library(data.table)
# Convert the example data.frame to a data.table.
DT <- as.data.table(df)
# Extract the value from each element of the Series list column.
# vapply(..., numeric(1)) returns a plain numeric vector, so the new column is
# an ordinary numeric column; lapply() would return a list and therefore create
# a list column that numeric functions such as sum() cannot use directly.
DT[, .(Id,
       EopBasHoldingValueAtMarketPrice = vapply(
         Series,
         function(x) x$EopBasHoldingValueAtMarketPrice,
         numeric(1)
       ))]
#        Id EopBasHoldingValueAtMarketPrice
# 1: 159347                         7599626
# 2: 161863                         5163591
# 3:  22646                         7159142

如果您愿意使用 dplyr,请使用 Stephan 的解决方案。如果您需要 base R 的解决方案,可以先把嵌套数据框中所需的变量赋值到父数据框中,然后再使用 subset():

library(tidyverse)
# Lift the nested column to the top level of df under the same name.
df[["EopBasHoldingValueAtMarketPrice"]] <-
  df[["SumPeriod"]][["EopBasHoldingValueAtMarketPrice"]]
# Both columns are now top-level, so subset() can select them by name.
df_subset <- subset(
  df,
  select = c("Id", "EopBasHoldingValueAtMarketPrice")
)
df_subset
#>       Id EopBasHoldingValueAtMarketPrice
#> 1 159347                         7599626
#> 2 161863                         5163591
#> 3  22646                         7159142

reprex package (v2.0.1)

于 2022-03-28 创建

您的数据框中并没有名为 "SumPeriod$EopBasHoldingValueAtMarketPrice" 的列;那是一个从 SumPeriod 中提取列的表达式,而不是列名。@Stephan 给了您一个 dplyr 解决方案;下面是一个 base R 解决方案:

# Keep the identifier column together with the whole nested data frame.
df1 <- subset(
  df,
  select = c("Id", "SumPeriod")
)
# Then trim the nested data frame down to the single column of interest.
df1$SumPeriod <- subset(
  df1[["SumPeriod"]],
  select = "EopBasHoldingValueAtMarketPrice"
)

这会将结构保留为嵌套数据框。

数据框里面嵌套数据框有什么特别的原因吗?如果没有,您可以把这些内容都放进一个简单的扁平数据框中:

# Flatten the nested structures of df into ordinary top-level columns.
x <- df$SumPeriod
# Spell the column name out in full: `df$Serie` only resolved to `Series`
# through the partial matching that `$` performs.
y <- df$Series
# Stack the single-row data frames of the list column into one data frame,
# without hard-coding how many rows df has.
y <- do.call(rbind, y)

# x and y have the same column names, so each of those names appears twice in
# the combined data frame.
df <- cbind(df[, c("Id", "Name", "$id")], x, y)

那么,您建议的子集就可以工作了

# After flattening, both columns are top-level and can be selected by name.
df_subset <- subset(
  df,
  select = c("Id", "EopBasHoldingValueAtMarketPrice")
)