不能在嵌套的 data.frame 中对列进行子集化
Cannot subset columns inside a nested data.frame
我有一个 data.frame,其中包含多个字符列,还有一个 data.frame。因此,我的 data.frame 里面有一个 data.frame。我的目标是将一个字符列与嵌套 data.frame 内的一列子集化。但是,每当我尝试按名称对嵌套列进行子集化时,它都会声明它不存在。您可以在此处查看 data.frame:
# Example data: a 3-row data.frame with character columns `$id`, Id and Name,
# a data.frame-valued column SumPeriod, and a list column Series.
df = structure(
list(
`$id` = c("21", "22", "23"),
Id = c("159347",
"161863", "22646"),
Name = c("159347", "161863", "22646"),
# SumPeriod is itself a 3-row data.frame stored as a single column of df.
SumPeriod = structure(
list(
AccPeriodBasTwrAtMarketPrice = c(0.0969367972082358, 0.537983489472227,-0.107066381156318),
AccPeriodLocTwrAtMarketPrice = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
BopDate = c(
"2022-02-28T00:00:00",
"2022-02-28T00:00:00",
"2022-02-28T00:00:00"
),
BopBasHoldingValueAtMarketPrice = c(7592266.52,
5135960.59, 7166815.5),
BopBasInterestAccrual = c(0, 0, 0),
EopDate = c(
"2022-02-28T00:00:00",
"2022-02-28T00:00:00",
"2022-02-28T00:00:00"
),
EopBasHoldingValueAtMarketPrice = c(7599626.22,
5163591.21, 7159142.25),
EopBasInterestAccrual = c(0, 0,
0),
AccPeriodBasTwrAtExposureValue = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
AccPeriodLocTwrAtExposureValue = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
AccBasIrr = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
AccLocIrr = c(0.096936797208258,
0.537983489472227,-0.107066381156318),
AccBasMwr = c(0.0484449181280957,
0.268270120259021,-0.0535618639528656),
PeriodBasIrr = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
PeriodLocIrr = c(0.096936797208258,
0.537983489472227,-0.107066381156318),
PeriodBasTwrAtMarketPrice = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
PeriodLocTwrAtMarketPrice = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
PeriodBasTwrDeposit = c(0,
0, 0),
PeriodBasTwrWithdrawal = c(0, 0, 0),
PeriodBasTwrDepositWithdrawal = c(0,
0, 0),
PeriodBasTwrDividendTax = c(0, 0, 0),
PeriodBasTwr = c(7359.70000000112,
27630.6200000001,-7673.25),
PeriodBasMwr = c(0.0484449181280957,
0.268270120259021,-0.0535618639528656),
BenchmarkCalcType = c(
"BenchmarkNotCalculated",
"BenchmarkNotCalculated",
"BenchmarkNotCalculated"
),
EopBenchmarkName = c("",
"", ""),
AccBasBenchmarkReturnPct = c(0, 0, 0),
PeriodBasBenchmarkReturnPct = c(0,
0, 0)
),
class = "data.frame",
row.names = c(NA, 3L)
),
# Series is a list of three one-row data.frames with the same column names
# as SumPeriod.
Series = list(
structure(
list(
AccPeriodBasTwrAtMarketPrice = 0.0969367972082358,
AccPeriodLocTwrAtMarketPrice = 0.0969367972082358,
BopDate = "2022-02-28T00:00:00",
BopBasHoldingValueAtMarketPrice = 7592266.52,
BopBasInterestAccrual = 0,
EopDate = "2022-02-28T00:00:00",
EopBasHoldingValueAtMarketPrice = 7599626.22,
EopBasInterestAccrual = 0,
AccPeriodBasTwrAtExposureValue = 0.0969367972082358,
AccPeriodLocTwrAtExposureValue = 0.0969367972082358,
AccBasIrr = 0,
AccLocIrr = 0,
AccBasMwr = 0.0968429207825055,
PeriodBasIrr = 0,
PeriodLocIrr = 0,
PeriodBasTwrAtMarketPrice = 0.0969367972082358,
PeriodLocTwrAtMarketPrice = 0.0969367972082358,
PeriodBasTwrDeposit = 0,
PeriodBasTwrWithdrawal = 0,
PeriodBasTwrDepositWithdrawal = 0,
PeriodBasTwrDividendTax = 0,
PeriodBasTwr = 7359.70000000112,
PeriodBasMwr = 0.0484449181280957,
BenchmarkCalcType = "BenchmarkNotCalculated",
EopBenchmarkName = "",
AccBasBenchmarkReturnPct = 0,
PeriodBasBenchmarkReturnPct = 0
),
class = "data.frame",
row.names = 1L
),
structure(
list(
AccPeriodBasTwrAtMarketPrice = 0.537983489472227,
AccPeriodLocTwrAtMarketPrice = 0.537983489472227,
BopDate = "2022-02-28T00:00:00",
BopBasHoldingValueAtMarketPrice = 5135960.59,
BopBasInterestAccrual = 0,
EopDate = "2022-02-28T00:00:00",
EopBasHoldingValueAtMarketPrice = 5163591.21,
EopBasInterestAccrual = 0,
AccPeriodBasTwrAtExposureValue = 0.537983489472227,
AccPeriodLocTwrAtExposureValue = 0.537983489472227,
AccBasIrr = 0,
AccLocIrr = 0,
AccBasMwr = 0.535104714457055,
PeriodBasIrr = 0,
PeriodLocIrr = 0,
PeriodBasTwrAtMarketPrice = 0.537983489472227,
PeriodLocTwrAtMarketPrice = 0.537983489472227,
PeriodBasTwrDeposit = 0,
PeriodBasTwrWithdrawal = 0,
PeriodBasTwrDepositWithdrawal = 0,
PeriodBasTwrDividendTax = 0,
PeriodBasTwr = 27630.6200000001,
PeriodBasMwr = 0.26827012025902,
BenchmarkCalcType = "BenchmarkNotCalculated",
EopBenchmarkName = "",
AccBasBenchmarkReturnPct = 0,
PeriodBasBenchmarkReturnPct = 0
),
class = "data.frame",
row.names = 1L
),
structure(
list(
AccPeriodBasTwrAtMarketPrice = -0.107066381156318,
AccPeriodLocTwrAtMarketPrice = -0.107066381156318,
BopDate = "2022-02-28T00:00:00",
BopBasHoldingValueAtMarketPrice = 7166815.5,
BopBasInterestAccrual = 0,
EopDate = "2022-02-28T00:00:00",
EopBasHoldingValueAtMarketPrice = 7159142.25,
EopBasInterestAccrual = 0,
AccPeriodBasTwrAtExposureValue = -0.107066381156318,
AccPeriodLocTwrAtExposureValue = -0.107066381156318,
AccBasIrr = 0,
AccLocIrr = 0,
AccBasMwr = -0.107181136120043,
PeriodBasIrr = 0,
PeriodLocIrr = 0,
PeriodBasTwrAtMarketPrice = -0.107066381156318,
PeriodLocTwrAtMarketPrice = -0.107066381156318,
PeriodBasTwrDeposit = 0,
PeriodBasTwrWithdrawal = 0,
PeriodBasTwrDepositWithdrawal = 0,
PeriodBasTwrDividendTax = 0,
PeriodBasTwr = -7673.25,
PeriodBasMwr = -0.0535618639528656,
BenchmarkCalcType = "BenchmarkNotCalculated",
EopBenchmarkName = "",
AccBasBenchmarkReturnPct = 0,
PeriodBasBenchmarkReturnPct = 0
),
class = "data.frame",
row.names = 1L
)
)
),
class = "data.frame",
row.names = c(NA,
3L)
)
所以在上面的 data.frame 中,我唯一感兴趣的列是“Id”和“EopBasHoldingValueAtMarketPrice”,其中后者是嵌套 data.frame 中的一个列,称为“SumPeriod”。我尝试通过以下方式实现这一目标:
# This call fails: `select` is matched against the column names of `df`, and
# "SumPeriod$EopBasHoldingValueAtMarketPrice" is not one of them.
df_subset = subset(df, select = c("Id", "SumPeriod$EopBasHoldingValueAtMarketPrice"))
但是我得到错误:
Error in `[.data.frame`(x, r, vars, drop = drop) : undefined columns selected
大家知道如何实现吗?
SumPeriod
是您的数据框中的一个数据框。既然您在标签中提到了 dplyr
,就可以用下面的方法解决:
library(dplyr)
library(tidyr)
# Flatten the SumPeriod data-frame column into top-level columns, then keep
# only Id and EopBasHoldingValueAtMarketPrice.
select(
  unnest(as_tibble(df), SumPeriod),
  Id, EopBasHoldingValueAtMarketPrice
)
输出为:
# A tibble: 3 × 2
Id EopBasHoldingValueAtMarketPrice
<chr> <dbl>
1 159347 7599626.
2 161863 5163591.
3 22646 7159142.
data.table
方法
library(data.table)
DT <- as.data.table(df)
# Each element of Series is a one-row data.frame, so vapply() can pull the
# value out as a plain numeric column; lapply() would leave a list column.
DT[, .(Id,
       EopBasHoldingValueAtMarketPrice = vapply(
         Series,
         function(x) x$EopBasHoldingValueAtMarketPrice,
         numeric(1)
       ))]
# Id EopBasHoldingValueAtMarketPrice
# 1: 159347 7599626
# 2: 161863 5163591
# 3: 22646 7159142
如果您习惯使用 dplyr
,请使用 Stephan 的解决方案。如果您需要 base R 解决方案,可以先把嵌套数据框中所需的变量赋给父数据框,然后再使用 subset()
:
library(tidyverse)
# Copy the nested column up to the top level of `df`, then keep the two
# columns of interest and show the result.
df[["EopBasHoldingValueAtMarketPrice"]] <-
  df[["SumPeriod"]][["EopBasHoldingValueAtMarketPrice"]]
df_subset <- df[, c("Id", "EopBasHoldingValueAtMarketPrice"), drop = FALSE]
df_subset
#> Id EopBasHoldingValueAtMarketPrice
#> 1 159347 7599626
#> 2 161863 5163591
#> 3 22646 7159142
由 reprex package (v2.0.1)
于 2022-03-28 创建
您的数据框中没有名为 "SumPeriod$EopBasHoldingValueAtMarketPrice"
的列;它是一个从 SumPeriod
中提取列的表达式,而不是列名。@Stephan 给了你一个 dplyr
解决方案;这是一个 base R 解决方案:
# Keep Id and the nested SumPeriod frame, then trim the nested frame down to
# the single column of interest (it stays a one-column data.frame).
df1 <- df[, c("Id", "SumPeriod"), drop = FALSE]
df1[["SumPeriod"]] <-
  df1[["SumPeriod"]][, "EopBasHoldingValueAtMarketPrice", drop = FALSE]
这会将结构保留为嵌套数据框。
数据框里面嵌套数据框有什么特别的原因吗?如果没有,你可以把这些内容都放进一个普通的数据框中:
# Flatten the nested pieces of `df` into one ordinary data frame.
x <- df$SumPeriod
# Use the full column name (`df$Serie` only worked through `$` partial
# matching) and bind every element of the list, however many there are.
y <- do.call(rbind, df$Series)
# NOTE: x and y have the same column names, so the result carries duplicated
# names; selecting by name afterwards picks the first match (the x columns).
df <- cbind(df[, c("Id", "Name", "$id")], x, y)
那么,您建议的子集就可以工作了
# Keep only the identifier and the holding-value column of the flattened frame.
df_subset <- df[, c("Id", "EopBasHoldingValueAtMarketPrice"), drop = FALSE]
我有一个 data.frame,其中包含多个字符列,还有一个 data.frame。因此,我的 data.frame 里面有一个 data.frame。我的目标是将一个字符列与嵌套 data.frame 内的一列子集化。但是,每当我尝试按名称对嵌套列进行子集化时,它都会声明它不存在。您可以在此处查看 data.frame:
# Example data: a 3-row data.frame with character columns `$id`, Id and Name,
# a data.frame-valued column SumPeriod, and a list column Series.
df = structure(
list(
`$id` = c("21", "22", "23"),
Id = c("159347",
"161863", "22646"),
Name = c("159347", "161863", "22646"),
# SumPeriod is itself a 3-row data.frame stored as a single column of df.
SumPeriod = structure(
list(
AccPeriodBasTwrAtMarketPrice = c(0.0969367972082358, 0.537983489472227,-0.107066381156318),
AccPeriodLocTwrAtMarketPrice = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
BopDate = c(
"2022-02-28T00:00:00",
"2022-02-28T00:00:00",
"2022-02-28T00:00:00"
),
BopBasHoldingValueAtMarketPrice = c(7592266.52,
5135960.59, 7166815.5),
BopBasInterestAccrual = c(0, 0, 0),
EopDate = c(
"2022-02-28T00:00:00",
"2022-02-28T00:00:00",
"2022-02-28T00:00:00"
),
EopBasHoldingValueAtMarketPrice = c(7599626.22,
5163591.21, 7159142.25),
EopBasInterestAccrual = c(0, 0,
0),
AccPeriodBasTwrAtExposureValue = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
AccPeriodLocTwrAtExposureValue = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
AccBasIrr = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
AccLocIrr = c(0.096936797208258,
0.537983489472227,-0.107066381156318),
AccBasMwr = c(0.0484449181280957,
0.268270120259021,-0.0535618639528656),
PeriodBasIrr = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
PeriodLocIrr = c(0.096936797208258,
0.537983489472227,-0.107066381156318),
PeriodBasTwrAtMarketPrice = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
PeriodLocTwrAtMarketPrice = c(0.0969367972082358,
0.537983489472227,-0.107066381156318),
PeriodBasTwrDeposit = c(0,
0, 0),
PeriodBasTwrWithdrawal = c(0, 0, 0),
PeriodBasTwrDepositWithdrawal = c(0,
0, 0),
PeriodBasTwrDividendTax = c(0, 0, 0),
PeriodBasTwr = c(7359.70000000112,
27630.6200000001,-7673.25),
PeriodBasMwr = c(0.0484449181280957,
0.268270120259021,-0.0535618639528656),
BenchmarkCalcType = c(
"BenchmarkNotCalculated",
"BenchmarkNotCalculated",
"BenchmarkNotCalculated"
),
EopBenchmarkName = c("",
"", ""),
AccBasBenchmarkReturnPct = c(0, 0, 0),
PeriodBasBenchmarkReturnPct = c(0,
0, 0)
),
class = "data.frame",
row.names = c(NA, 3L)
),
# Series is a list of three one-row data.frames with the same column names
# as SumPeriod.
Series = list(
structure(
list(
AccPeriodBasTwrAtMarketPrice = 0.0969367972082358,
AccPeriodLocTwrAtMarketPrice = 0.0969367972082358,
BopDate = "2022-02-28T00:00:00",
BopBasHoldingValueAtMarketPrice = 7592266.52,
BopBasInterestAccrual = 0,
EopDate = "2022-02-28T00:00:00",
EopBasHoldingValueAtMarketPrice = 7599626.22,
EopBasInterestAccrual = 0,
AccPeriodBasTwrAtExposureValue = 0.0969367972082358,
AccPeriodLocTwrAtExposureValue = 0.0969367972082358,
AccBasIrr = 0,
AccLocIrr = 0,
AccBasMwr = 0.0968429207825055,
PeriodBasIrr = 0,
PeriodLocIrr = 0,
PeriodBasTwrAtMarketPrice = 0.0969367972082358,
PeriodLocTwrAtMarketPrice = 0.0969367972082358,
PeriodBasTwrDeposit = 0,
PeriodBasTwrWithdrawal = 0,
PeriodBasTwrDepositWithdrawal = 0,
PeriodBasTwrDividendTax = 0,
PeriodBasTwr = 7359.70000000112,
PeriodBasMwr = 0.0484449181280957,
BenchmarkCalcType = "BenchmarkNotCalculated",
EopBenchmarkName = "",
AccBasBenchmarkReturnPct = 0,
PeriodBasBenchmarkReturnPct = 0
),
class = "data.frame",
row.names = 1L
),
structure(
list(
AccPeriodBasTwrAtMarketPrice = 0.537983489472227,
AccPeriodLocTwrAtMarketPrice = 0.537983489472227,
BopDate = "2022-02-28T00:00:00",
BopBasHoldingValueAtMarketPrice = 5135960.59,
BopBasInterestAccrual = 0,
EopDate = "2022-02-28T00:00:00",
EopBasHoldingValueAtMarketPrice = 5163591.21,
EopBasInterestAccrual = 0,
AccPeriodBasTwrAtExposureValue = 0.537983489472227,
AccPeriodLocTwrAtExposureValue = 0.537983489472227,
AccBasIrr = 0,
AccLocIrr = 0,
AccBasMwr = 0.535104714457055,
PeriodBasIrr = 0,
PeriodLocIrr = 0,
PeriodBasTwrAtMarketPrice = 0.537983489472227,
PeriodLocTwrAtMarketPrice = 0.537983489472227,
PeriodBasTwrDeposit = 0,
PeriodBasTwrWithdrawal = 0,
PeriodBasTwrDepositWithdrawal = 0,
PeriodBasTwrDividendTax = 0,
PeriodBasTwr = 27630.6200000001,
PeriodBasMwr = 0.26827012025902,
BenchmarkCalcType = "BenchmarkNotCalculated",
EopBenchmarkName = "",
AccBasBenchmarkReturnPct = 0,
PeriodBasBenchmarkReturnPct = 0
),
class = "data.frame",
row.names = 1L
),
structure(
list(
AccPeriodBasTwrAtMarketPrice = -0.107066381156318,
AccPeriodLocTwrAtMarketPrice = -0.107066381156318,
BopDate = "2022-02-28T00:00:00",
BopBasHoldingValueAtMarketPrice = 7166815.5,
BopBasInterestAccrual = 0,
EopDate = "2022-02-28T00:00:00",
EopBasHoldingValueAtMarketPrice = 7159142.25,
EopBasInterestAccrual = 0,
AccPeriodBasTwrAtExposureValue = -0.107066381156318,
AccPeriodLocTwrAtExposureValue = -0.107066381156318,
AccBasIrr = 0,
AccLocIrr = 0,
AccBasMwr = -0.107181136120043,
PeriodBasIrr = 0,
PeriodLocIrr = 0,
PeriodBasTwrAtMarketPrice = -0.107066381156318,
PeriodLocTwrAtMarketPrice = -0.107066381156318,
PeriodBasTwrDeposit = 0,
PeriodBasTwrWithdrawal = 0,
PeriodBasTwrDepositWithdrawal = 0,
PeriodBasTwrDividendTax = 0,
PeriodBasTwr = -7673.25,
PeriodBasMwr = -0.0535618639528656,
BenchmarkCalcType = "BenchmarkNotCalculated",
EopBenchmarkName = "",
AccBasBenchmarkReturnPct = 0,
PeriodBasBenchmarkReturnPct = 0
),
class = "data.frame",
row.names = 1L
)
)
),
class = "data.frame",
row.names = c(NA,
3L)
)
所以在上面的 data.frame 中,我唯一感兴趣的列是“Id”和“EopBasHoldingValueAtMarketPrice”,其中后者是嵌套 data.frame 中的一个列,称为“SumPeriod”。我尝试通过以下方式实现这一目标:
# This call fails: `select` is matched against the column names of `df`, and
# "SumPeriod$EopBasHoldingValueAtMarketPrice" is not one of them.
df_subset = subset(df, select = c("Id", "SumPeriod$EopBasHoldingValueAtMarketPrice"))
但是我得到错误:
Error in `[.data.frame`(x, r, vars, drop = drop) : undefined columns selected
大家知道如何实现吗?
SumPeriod
是您的数据框中的一个数据框。既然您在标签中提到了 dplyr
,就可以用下面的方法解决:
library(dplyr)
library(tidyr)
# Flatten the SumPeriod data-frame column into top-level columns, then keep
# only Id and EopBasHoldingValueAtMarketPrice.
select(
  unnest(as_tibble(df), SumPeriod),
  Id, EopBasHoldingValueAtMarketPrice
)
输出为:
# A tibble: 3 × 2
Id EopBasHoldingValueAtMarketPrice
<chr> <dbl>
1 159347 7599626.
2 161863 5163591.
3 22646 7159142.
data.table
方法
library(data.table)
DT <- as.data.table(df)
# Each element of Series is a one-row data.frame, so vapply() can pull the
# value out as a plain numeric column; lapply() would leave a list column.
DT[, .(Id,
       EopBasHoldingValueAtMarketPrice = vapply(
         Series,
         function(x) x$EopBasHoldingValueAtMarketPrice,
         numeric(1)
       ))]
# Id EopBasHoldingValueAtMarketPrice
# 1: 159347 7599626
# 2: 161863 5163591
# 3: 22646 7159142
如果您习惯使用 dplyr
,请使用 Stephan 的解决方案。如果您需要 base R 解决方案,可以先把嵌套数据框中所需的变量赋给父数据框,然后再使用 subset()
:
library(tidyverse)
# Copy the nested column up to the top level of `df`, then keep the two
# columns of interest and show the result.
df[["EopBasHoldingValueAtMarketPrice"]] <-
  df[["SumPeriod"]][["EopBasHoldingValueAtMarketPrice"]]
df_subset <- df[, c("Id", "EopBasHoldingValueAtMarketPrice"), drop = FALSE]
df_subset
#> Id EopBasHoldingValueAtMarketPrice
#> 1 159347 7599626
#> 2 161863 5163591
#> 3 22646 7159142
由 reprex package (v2.0.1)
于 2022-03-28 创建
您的数据框中没有名为 "SumPeriod$EopBasHoldingValueAtMarketPrice"
的列;它是一个从 SumPeriod
中提取列的表达式,而不是列名。@Stephan 给了你一个 dplyr
解决方案;这是一个 base R 解决方案:
# Keep Id and the nested SumPeriod frame, then trim the nested frame down to
# the single column of interest (it stays a one-column data.frame).
df1 <- df[, c("Id", "SumPeriod"), drop = FALSE]
df1[["SumPeriod"]] <-
  df1[["SumPeriod"]][, "EopBasHoldingValueAtMarketPrice", drop = FALSE]
这会将结构保留为嵌套数据框。
数据框里面嵌套数据框有什么特别的原因吗?如果没有,你可以把这些内容都放进一个普通的数据框中:
# Flatten the nested pieces of `df` into one ordinary data frame.
x <- df$SumPeriod
# Use the full column name (`df$Serie` only worked through `$` partial
# matching) and bind every element of the list, however many there are.
y <- do.call(rbind, df$Series)
# NOTE: x and y have the same column names, so the result carries duplicated
# names; selecting by name afterwards picks the first match (the x columns).
df <- cbind(df[, c("Id", "Name", "$id")], x, y)
那么,您建议的子集就可以工作了
# Keep only the identifier and the holding-value column of the flattened frame.
df_subset <- df[, c("Id", "EopBasHoldingValueAtMarketPrice"), drop = FALSE]