从数据框中提取最高的正值和负值,并使用 R 将它们填充到格式化文本中
Extract top positive and negative values from dataframe and fill them into a formatted text using R
我想提取 pct_change 列中价格涨幅和跌幅分别位居前 3 的 material 信息。
数据:
# Sample data: eight commodities with their price and their price change
# expressed as a fraction (0.025449102 is reported as 2.54% in the expected text).
df <- structure(list(material = c("Copper", "Aluminum", "Iron", "Zinc",
"Nickel", "Silver", "Gold", "Tin"), price = c(17125, 8312, 2228.5,
2934, 4315, 8178, 4411, 680), pct_change = c(0.025449102, 0,
-0.024939838, 0.062470043, -0.043873255, -0.004625122, 0.045031392,
-0.037508846)), class = "data.frame", row.names = c(NA, -8L))
我的预期结果是一段文字如下:
text <- 'The top 3 commodities that price rise most are: Zinc (6.25%), Gold (4.5%), and Copper (2.54%),
the top 3 commodities that fall most are: Nickel (-4.39%), Tin (-3.75%) and Iron (-2.49%).'
我的试用代码有效,但不够简洁,有人可以分享其他更有效的解决方案吗?谢谢。
# Three largest increases: sort descending, format pct_change as percent
# strings, then keep the first three rows.
top3 <- df %>%
arrange(desc(pct_change)) %>%
mutate(pct_change=scales::percent(pct_change)) %>%
slice_head(n=3)
# Three largest decreases: the same steps with an ascending sort.
tail3 <- df %>%
arrange(pct_change) %>%
mutate(pct_change=scales::percent(pct_change)) %>%
slice_head(n=3)
# Copy every name and formatted percentage into its own scalar so the
# template below can refer to each one by name.
com_name_up1 <- top3$material[1]
com_pct_up1 <- top3$pct_change[1]
com_name_up2 <- top3$material[2]
com_pct_up2 <- top3$pct_change[2]
com_name_up3 <- top3$material[3]
com_pct_up3 <- top3$pct_change[3]
com_name_down1 <- tail3$material[1]
com_pct_down1 <- tail3$pct_change[1]
com_name_down2 <- tail3$material[2]
com_pct_down2 <- tail3$pct_change[2]
com_name_down3 <- tail3$material[3]
com_pct_down3 <- tail3$pct_change[3]
# Interpolate the twelve scalars into the sentence; the line breaks inside the
# literal are part of the string handed to glue().
text <- glue('The top 3 commodities that price rose most are: {com_name_up1} ({com_pct_up1}),
{com_name_up2} ({com_pct_up2}), and {com_name_up3} ({com_pct_up3}),
the top 3 commodities that fell most are: {com_name_down1} ({com_pct_down1}),
{com_name_down2} ({com_pct_down2}) and {com_name_down3} ({com_pct_down3}).')
更新:文本模板还需要考虑 pct_change 全部为负值或全部为正值,或者正值、负值的个数少于 3 个的情况:
如果 pct_change
值都是正数:
'All {n} commodities price rose, the top 3 rose most are: {top3[[1L]]}, {top3[[2L]]}, and {top3[[3L]]}.'
如果pct_change
值都是负数:
'All {n} commodities price fell, the top 3 fell most are: {top3[[1L]]}, {top3[[2L]]}, and {top3[[3L]]}'
如果正值、零值和负值pct_change
都存在,我将用 n1_1、n2_1 和 n3_1 表示各类商品的数量,用 n1_2、n2_2 和 n3_2 表示要列出的前 n 个(最多为 3):
'我们有{n1_1}个价格上涨的商品,前{n1_2}涨幅最大的是:{top3[[1L]]}, {top3[[2L]]} , 和 {top3[[3L]]}; \
{n2_1}种价格不变的商品,包括:{same3[[1L]]}、{same3[[2L]]}、{same3[[3L]]}; \
价格下跌的{n3_1}商品,跌幅最大的{n3_2}是:{bot3[[1L]]}、{bot3[[2L]]}和{bot3[[3L]]}.'
最终代码,@ekoam 贡献:
#' Collapse a vector into one human-readable enumeration.
#'
#' @param x Vector of items to join.
#' @param sep Separator placed between all items except the last two.
#' @return `x` unchanged when it has fewer than two elements; otherwise a single
#'   string in which the last two items are joined by " and " and the earlier
#'   ones by `sep`.
to_string <- function(x, sep = ", ") {
  n_items <- length(x)
  if (n_items < 2L) {
    return(x)
  }
  # Join everything but the last item with `sep`, then attach the last item.
  leading <- paste0(x[-n_items], collapse = sep)
  paste0(leading, " and ", x[[n_items]])
}
#' Format representative rows of one sign group as "material (percent)" labels.
#'
#' @param df Rows of a single sign group with columns `sign`, `material` and
#'   `pct_change`; only the first `sign` value is inspected.
#' @return A glue vector with one label per selected row: rows chosen by
#'   `slice_min` for sign -1, `slice_sample` for sign 0 and `slice_max` for
#'   sign 1, each requested with `n = 3L`.
text_summary <- function(df) {
  # Position sign + 2 maps -1 / 0 / 1 onto the 1st / 2nd / 3rd picker.
  pickers <- list(
    function(rows) dplyr::slice_min(rows, pct_change, n = 3L),
    function(rows) dplyr::slice_sample(rows, n = 3L),
    function(rows) dplyr::slice_max(rows, pct_change, n = 3L)
  )
  pick_rows <- pickers[[df$sign[[1L]] + 2L]]
  picked <- pick_rows(df[, c("material", "pct_change")])
  labelled <- dplyr::mutate(picked, pct_change = scales::percent(pct_change))
  glue::glue_data(labelled, "{material} ({pct_change})")
}
#' Pick the singular or plural form of a known word.
#'
#' @param w Word in singular form; "commodity" and "is" have plural forms here.
#' @param n Count that decides the form.
#' @return `w` itself when `n` equals 1, otherwise the plural form of `w`.
plural <- function(w, n) {
  if (n != 1L) {
    forms <- c(commodity = "commodities", is = "are")
    forms[[w]]
  } else {
    w
  }
}
# Group the rows by the sign of pct_change (-1, 0, 1); for each group keep its
# size `n` and the labels produced by text_summary(). The grouping is retained
# (.groups = "keep") because the templates below are rendered once per group.
content <- df |>
  dplyr::group_by(sign = sign(pct_change)) |>
  dplyr::summarize(
    n = dplyr::n(),
    text = text_summary(dplyr::cur_data_all()),
    .groups = "keep"
  )
s1 <- all(content$sign < 0L) # every change is negative
s2 <- all(content$sign > 0L) # every change is positive
# One glue template per sign, stored at index sign + 2 (fell, unchanged, rose).
# plural() supplies the noun/verb agreement, so the templates must not also
# hard-code "are:" -- doing both rendered "are: are:" in the final sentence.
tmpl <-
  if (s1) {
    c("commodities price fell, the top {length(text)} fell most {plural('is', length(text))}: {to_string(text)}", "", "")
  } else if (s2) {
    c("", "", "commodities price rose, the top {length(text)} rose most {plural('is', length(text))}: {to_string(text)}")
  } else {
    c("{n[[1L]]} {plural('commodity',n[[1L]])} that price fell, the {plural('commodity', length(text))} with larger price declines {plural('is', length(text))}: {to_string(text)}",
      "{n[[1L]]} {plural('commodity',n[[1L]])} that price kept the same, including: {to_string(text)}",
      "{n[[1L]]} {plural('commodity',n[[1L]])} that price rose, the {plural('commodity', length(text))} with larger increases {plural('is', length(text))}: {to_string(text)}")
  }
# "All the " when every change has the same direction, otherwise "We have ".
prefix <- c("We have ", "All the ")[[(s1 || s2) + 1L]]
# Attach each group's template, render it once per group, order the groups from
# rising to falling, then join the rendered pieces into one sentence.
content <- content |>
  dplyr::mutate(tmpl = tmpl[sign + 2L]) |>
  dplyr::summarize(text = glue::glue(tmpl[[1L]])) |>
  dplyr::arrange(-sign) |>
  dplyr::summarize(text = paste0(prefix, to_string(text, "; "), "."))
content$text
输出:
"We have 3 commodities that price rose, the commodities with larger increases are: Zinc (6.2%), Gold (4.5%) and Copper (2.5%); 1 commodity that price kept the same, including: Aluminum (0%) and 4 commodities that price fell, the commodities with larger price declines are: Nickel (-4.39%), Tin (-3.75%) and Iron (-2.49%)."
老实说,我不确定它是否真的更短,但您可以先在表中把 material 和 pct_change 拼接成一个字符串。
然后我将其分组并折叠字符串
# Label every row as "material (x.xx%)", tag the three largest changes as "Top"
# and the three smallest as "Bot" (all other rows get NA), then collapse the
# labels of each tag into one comma-separated string.
df %>%
  arrange(desc(pct_change)) %>%
  mutate(t1 = sprintf("%s (%.2f%%)", material, 100 * pct_change)) %>%
  mutate(
    position = row_number(),
    rank1 = case_when(
      position <= 3 ~ "Top",
      position > n() - 3 ~ "Bot"
    )
  ) %>%
  group_by(rank1) %>%
  summarise(t2 = toString(t1))
rank1 t2
<chr> <chr>
1 Bot Iron (-2.49%), Tin (-3.75%), Nickel (-4.39%)
2 Top Zinc (6.25%), Gold (4.50%), Copper (2.54%)
3 NA Aluminum (0.72%), Silver (-0.46%)
两条一般性建议:
- glue 具有非常灵活的语法,允许您将任何有效的 R 表达式传递到 "{...}"。利用此功能将有助于缩短您的代码。
- 你应该把结构相似的部分抽象出来,总结成一个函数。
这是代码
#' Format the rows selected by a slicing verb as "material (percent)" labels.
#'
#' @param df Data frame with `material` and `pct_change` columns.
#' @param f Slicing function called as `f(df, pct_change, n = n)`, e.g.
#'   `dplyr::slice_max` or `dplyr::slice_min`.
#' @param n Number of rows requested from `f`. Defaults to 3, so existing
#'   two-argument calls behave exactly as before.
#' @return A glue vector with one label per selected row.
report3 <- function(df, f, n = 3L) {
  df |>
    f(pct_change, n = n) |>
    dplyr::mutate(pct_change = scales::percent(pct_change)) |>
    glue::glue_data("{material} ({pct_change})")
}
top3 <- report3(df, dplyr::slice_max)
bot3 <- report3(df, dplyr::slice_min)
# glue() concatenates its unnamed arguments, so the sentence is built from
# separate pieces. A single backslash before a line break inside an R string
# literal is not glue's "\\" line continuation and would not yield one line.
text <- glue::glue(
  "The top 3 commodities that price rose most are: ",
  "{top3[[1L]]}, {top3[[2L]]}, and {top3[[3L]]}; ",
  "the top 3 commodities that fell most are: ",
  "{bot3[[1L]]}, {bot3[[2L]]} and {bot3[[3L]]}."
)
输出
> text
The top 3 commodities that price rose most are: Zinc (6.2%), Gold (4.5%), and Copper (2.5%); the top 3 commodities that fell most are: Nickel (-4.39%), Tin (-3.75%) and Iron (-2.49%).
下面的代码应该足以涵盖您提供的所有案例
#' Collapse a vector into one human-readable enumeration.
#'
#' @param x Vector of items to join.
#' @param sep Separator placed between all items except the last two.
#' @return `x` unchanged when it has fewer than two elements; otherwise a single
#'   string in which the last two items are joined by " and " and the earlier
#'   ones by `sep`.
to_string <- function(x, sep = ", ") {
  n_items <- length(x)
  if (n_items < 2L) {
    return(x)
  }
  # Join everything but the last item with `sep`, then attach the last item.
  leading <- paste0(x[-n_items], collapse = sep)
  paste0(leading, " and ", x[[n_items]])
}
#' Format representative rows of one sign group as "material (percent)" labels.
#'
#' @param df Rows of a single sign group with columns `sign`, `material` and
#'   `pct_change`; only the first `sign` value is inspected.
#' @return A glue vector with one label per selected row: rows chosen by
#'   `slice_min` for sign -1, `slice_sample` for sign 0 and `slice_max` for
#'   sign 1, each requested with `n = 3L`.
text_summary <- function(df) {
  # Position sign + 2 maps -1 / 0 / 1 onto the 1st / 2nd / 3rd picker.
  pickers <- list(
    function(rows) dplyr::slice_min(rows, pct_change, n = 3L),
    function(rows) dplyr::slice_sample(rows, n = 3L),
    function(rows) dplyr::slice_max(rows, pct_change, n = 3L)
  )
  pick_rows <- pickers[[df$sign[[1L]] + 2L]]
  picked <- pick_rows(df[, c("material", "pct_change")])
  labelled <- dplyr::mutate(picked, pct_change = scales::percent(pct_change))
  glue::glue_data(labelled, "{material} ({pct_change})")
}
#' Pick the singular or plural form of a known word.
#'
#' @param w Word in singular form; "commodity" and "is" have plural forms here.
#' @param n Count that decides the form.
#' @return `w` itself when `n` equals 1, otherwise the plural form of `w`.
plural <- function(w, n) {
  if (n != 1L) {
    forms <- c(commodity = "commodities", is = "are")
    forms[[w]]
  } else {
    w
  }
}
# Group the rows by the sign of pct_change (-1, 0, 1); for each group keep its
# size `n` and the labels returned by text_summary(). The result stays grouped
# by `sign` (.groups = "keep").
# NOTE(review): recent dplyr releases deprecate cur_data_all() and summarize()
# results with more than one row per group -- confirm against the installed
# version before reusing this.
content <- df |>
dplyr::group_by(sign = sign(pct_change)) |>
dplyr::summarize(
n = dplyr::n(),
text = text_summary(dplyr::cur_data_all()),
.groups = "keep"
)
# s1: every group has a negative sign; s2: every group has a positive sign.
s1 <- all(content$sign < 0L)
s2 <- all(content$sign > 0L)
# One glue template per sign, stored at index sign + 2 (fell, unchanged, rose).
# In the all-negative / all-positive cases the unused slots are empty strings.
tmpl <-
if (s1) {
c("commodities price fell, the top {length(text)} fell most {plural('is', length(text))}: {to_string(text)}", "", "")
} else if (s2) {
c("", "", "commodities price rose, the top {length(text)} rose most {plural('is', length(text))}: {to_string(text)}")
} else {
c("{n[[1L]]} {plural('commodity',n[[1L]])} that price fell, the top {length(text)} fell most {plural('is', length(text))}: {to_string(text)}",
"{n[[1L]]} {plural('commodity',n[[1L]])} that price kept the same, including: {to_string(text)}",
"{n[[1L]]} {plural('commodity',n[[1L]])} that price rose, the top {length(text)} rose most {plural('is', length(text))}: {to_string(text)}")
}
# "All the " when every change has the same direction, otherwise "We have ".
prefix <- c("We have ", "All the ")[[(s1 || s2) + 1L]]
# Attach each group's template, render it once per group, order the groups from
# rising to falling, then join the rendered pieces into one sentence.
content <- content |>
dplyr::mutate(tmpl = tmpl[sign + 2L]) |>
dplyr::summarize(text = glue::glue(tmpl[[1L]])) |>
dplyr::arrange(-sign) |>
dplyr::summarize(text = paste0(prefix, to_string(text, "; "), "."))
# The finished sentence.
content$text
另一种可能的解决方案(除了 stringr::str_c,您也可以使用更方便的 stringr::str_glue,正如 @ekoam 所建议的那样):
library(tidyverse)
# Sample data: eight commodities with their price and fractional price change.
df <- structure(list(material = c("Copper", "Aluminum", "Iron", "Zinc",
"Nickel", "Silver", "Gold", "Tin"), price = c(17125, 8312, 2228.5,
2934, 4315, 8178, 4411, 680), pct_change = c(0.025449102, 0.007166746,
-0.024939838, 0.062470043, -0.043873255, -0.004625122, 0.045031392,
-0.037508846)), class = "data.frame", row.names = c(NA, -8L))
top3 <- slice_max(df, pct_change, n = 3)
bottom3 <- slice_min(df, pct_change, n = 3)
# Every percentage is rounded to 2 decimals (the third item of each list used
# to be rounded to 0 decimals), and each string piece stays on one source line
# so that no line break leaks into the sentence.
str_c(
  "The top 3 commodities that price rise most are: ",
  top3$material[1], " (", round(100 * top3$pct_change[1], 2), "%), ",
  top3$material[2], " (", round(100 * top3$pct_change[2], 2), "%), and ",
  top3$material[3], " (", round(100 * top3$pct_change[3], 2), "%), ",
  "the top 3 commodities that fall most are: ",
  bottom3$material[1], " (", round(100 * bottom3$pct_change[1], 2), "%), ",
  bottom3$material[2], " (", round(100 * bottom3$pct_change[2], 2), "%), and ",
  bottom3$material[3], " (", round(100 * bottom3$pct_change[3], 2), "%)."
)
#> [1] "The top 3 commodities that price rise most are: Zinc (6.25%), Gold (4.5%), and Copper (2.54%), the top 3 commodities that fall most are: Nickel (-4.39%), Tin (-3.75%), and Iron (-2.49%)."
我想提取 pct_change 列中价格涨幅和跌幅分别位居前 3 的 material 信息。
数据:
# Sample data: eight commodities with their price and their price change
# expressed as a fraction (0.025449102 is reported as 2.54% in the expected text).
df <- structure(list(material = c("Copper", "Aluminum", "Iron", "Zinc",
"Nickel", "Silver", "Gold", "Tin"), price = c(17125, 8312, 2228.5,
2934, 4315, 8178, 4411, 680), pct_change = c(0.025449102, 0,
-0.024939838, 0.062470043, -0.043873255, -0.004625122, 0.045031392,
-0.037508846)), class = "data.frame", row.names = c(NA, -8L))
我的预期结果是一段文字如下:
text <- 'The top 3 commodities that price rise most are: Zinc (6.25%), Gold (4.5%), and Copper (2.54%),
the top 3 commodities that fall most are: Nickel (-4.39%), Tin (-3.75%) and Iron (-2.49%).'
我的试用代码有效,但不够简洁,有人可以分享其他更有效的解决方案吗?谢谢。
# Three largest increases: sort descending, format pct_change as percent
# strings, then keep the first three rows.
top3 <- df %>%
arrange(desc(pct_change)) %>%
mutate(pct_change=scales::percent(pct_change)) %>%
slice_head(n=3)
# Three largest decreases: the same steps with an ascending sort.
tail3 <- df %>%
arrange(pct_change) %>%
mutate(pct_change=scales::percent(pct_change)) %>%
slice_head(n=3)
# Copy every name and formatted percentage into its own scalar so the
# template below can refer to each one by name.
com_name_up1 <- top3$material[1]
com_pct_up1 <- top3$pct_change[1]
com_name_up2 <- top3$material[2]
com_pct_up2 <- top3$pct_change[2]
com_name_up3 <- top3$material[3]
com_pct_up3 <- top3$pct_change[3]
com_name_down1 <- tail3$material[1]
com_pct_down1 <- tail3$pct_change[1]
com_name_down2 <- tail3$material[2]
com_pct_down2 <- tail3$pct_change[2]
com_name_down3 <- tail3$material[3]
com_pct_down3 <- tail3$pct_change[3]
# Interpolate the twelve scalars into the sentence; the line breaks inside the
# literal are part of the string handed to glue().
text <- glue('The top 3 commodities that price rose most are: {com_name_up1} ({com_pct_up1}),
{com_name_up2} ({com_pct_up2}), and {com_name_up3} ({com_pct_up3}),
the top 3 commodities that fell most are: {com_name_down1} ({com_pct_down1}),
{com_name_down2} ({com_pct_down2}) and {com_name_down3} ({com_pct_down3}).')
更新:文本模板还需要考虑 pct_change 全部为负值或全部为正值,或者正值、负值的个数少于 3 个的情况:
如果
pct_change
值都是正数:'All {n} commodities price rose, the top 3 rose most are: {top3[[1L]]}, {top3[[2L]]}, and {top3[[3L]]}.'
如果
pct_change
值都是负数:'All {n} commodities price fell, the top 3 fell most are: {top3[[1L]]}, {top3[[2L]]}, and {top3[[3L]]}'
如果正值、零值和负值
pct_change
都存在,我将用 n1_1、n2_1 和 n3_1 表示各类商品的数量,用 n1_2、n2_2 和 n3_2 表示要列出的前 n 个(最多为 3):'我们有{n1_1}个价格上涨的商品,前{n1_2}涨幅最大的是:{top3[[1L]]}, {top3[[2L]]} , 和 {top3[[3L]]}; \ {n2_1}种价格不变的商品,包括:{same3[[1L]]}、{same3[[2L]]}、{same3[[3L]]}; \ 价格下跌的{n3_1}商品,跌幅最大的{n3_2}是:{bot3[[1L]]}、{bot3[[2L]]}和{bot3[[3L]]}.'
最终代码,@ekoam 贡献:
#' Collapse a vector into one human-readable enumeration.
#'
#' @param x Vector of items to join.
#' @param sep Separator placed between all items except the last two.
#' @return `x` unchanged when it has fewer than two elements; otherwise a single
#'   string in which the last two items are joined by " and " and the earlier
#'   ones by `sep`.
to_string <- function(x, sep = ", ") {
  n_items <- length(x)
  if (n_items < 2L) {
    return(x)
  }
  # Join everything but the last item with `sep`, then attach the last item.
  leading <- paste0(x[-n_items], collapse = sep)
  paste0(leading, " and ", x[[n_items]])
}
#' Format representative rows of one sign group as "material (percent)" labels.
#'
#' @param df Rows of a single sign group with columns `sign`, `material` and
#'   `pct_change`; only the first `sign` value is inspected.
#' @return A glue vector with one label per selected row: rows chosen by
#'   `slice_min` for sign -1, `slice_sample` for sign 0 and `slice_max` for
#'   sign 1, each requested with `n = 3L`.
text_summary <- function(df) {
  # Position sign + 2 maps -1 / 0 / 1 onto the 1st / 2nd / 3rd picker.
  pickers <- list(
    function(rows) dplyr::slice_min(rows, pct_change, n = 3L),
    function(rows) dplyr::slice_sample(rows, n = 3L),
    function(rows) dplyr::slice_max(rows, pct_change, n = 3L)
  )
  pick_rows <- pickers[[df$sign[[1L]] + 2L]]
  picked <- pick_rows(df[, c("material", "pct_change")])
  labelled <- dplyr::mutate(picked, pct_change = scales::percent(pct_change))
  glue::glue_data(labelled, "{material} ({pct_change})")
}
#' Pick the singular or plural form of a known word.
#'
#' @param w Word in singular form; "commodity" and "is" have plural forms here.
#' @param n Count that decides the form.
#' @return `w` itself when `n` equals 1, otherwise the plural form of `w`.
plural <- function(w, n) {
  if (n != 1L) {
    forms <- c(commodity = "commodities", is = "are")
    forms[[w]]
  } else {
    w
  }
}
# Group the rows by the sign of pct_change (-1, 0, 1); for each group keep its
# size `n` and the labels produced by text_summary(). The grouping is retained
# (.groups = "keep") because the templates below are rendered once per group.
content <- df |>
  dplyr::group_by(sign = sign(pct_change)) |>
  dplyr::summarize(
    n = dplyr::n(),
    text = text_summary(dplyr::cur_data_all()),
    .groups = "keep"
  )
s1 <- all(content$sign < 0L) # every change is negative
s2 <- all(content$sign > 0L) # every change is positive
# One glue template per sign, stored at index sign + 2 (fell, unchanged, rose).
# plural() supplies the noun/verb agreement, so the templates must not also
# hard-code "are:" -- doing both rendered "are: are:" in the final sentence.
tmpl <-
  if (s1) {
    c("commodities price fell, the top {length(text)} fell most {plural('is', length(text))}: {to_string(text)}", "", "")
  } else if (s2) {
    c("", "", "commodities price rose, the top {length(text)} rose most {plural('is', length(text))}: {to_string(text)}")
  } else {
    c("{n[[1L]]} {plural('commodity',n[[1L]])} that price fell, the {plural('commodity', length(text))} with larger price declines {plural('is', length(text))}: {to_string(text)}",
      "{n[[1L]]} {plural('commodity',n[[1L]])} that price kept the same, including: {to_string(text)}",
      "{n[[1L]]} {plural('commodity',n[[1L]])} that price rose, the {plural('commodity', length(text))} with larger increases {plural('is', length(text))}: {to_string(text)}")
  }
# "All the " when every change has the same direction, otherwise "We have ".
prefix <- c("We have ", "All the ")[[(s1 || s2) + 1L]]
# Attach each group's template, render it once per group, order the groups from
# rising to falling, then join the rendered pieces into one sentence.
content <- content |>
  dplyr::mutate(tmpl = tmpl[sign + 2L]) |>
  dplyr::summarize(text = glue::glue(tmpl[[1L]])) |>
  dplyr::arrange(-sign) |>
  dplyr::summarize(text = paste0(prefix, to_string(text, "; "), "."))
content$text
输出:
"We have 3 commodities that price rose, the commodities with larger increases are: Zinc (6.2%), Gold (4.5%) and Copper (2.5%); 1 commodity that price kept the same, including: Aluminum (0%) and 4 commodities that price fell, the commodities with larger price declines are: Nickel (-4.39%), Tin (-3.75%) and Iron (-2.49%)."
老实说,我不确定它是否真的更短,但您可以先在表中把 material 和 pct_change 拼接成一个字符串。
然后我将其分组并折叠字符串
# Label every row as "material (x.xx%)", tag the three largest changes as "Top"
# and the three smallest as "Bot" (all other rows get NA), then collapse the
# labels of each tag into one comma-separated string.
df %>%
  arrange(desc(pct_change)) %>%
  mutate(t1 = sprintf("%s (%.2f%%)", material, 100 * pct_change)) %>%
  mutate(
    position = row_number(),
    rank1 = case_when(
      position <= 3 ~ "Top",
      position > n() - 3 ~ "Bot"
    )
  ) %>%
  group_by(rank1) %>%
  summarise(t2 = toString(t1))
rank1 t2
<chr> <chr>
1 Bot Iron (-2.49%), Tin (-3.75%), Nickel (-4.39%)
2 Top Zinc (6.25%), Gold (4.50%), Copper (2.54%)
3 NA Aluminum (0.72%), Silver (-0.46%)
两条一般性建议:
- glue 具有非常灵活的语法,允许您将任何有效的 R 表达式传递到 "{...}"。利用此功能将有助于缩短您的代码。
- 你应该把结构相似的部分抽象出来,总结成一个函数。
这是代码
#' Format the rows selected by a slicing verb as "material (percent)" labels.
#'
#' @param df Data frame with `material` and `pct_change` columns.
#' @param f Slicing function called as `f(df, pct_change, n = n)`, e.g.
#'   `dplyr::slice_max` or `dplyr::slice_min`.
#' @param n Number of rows requested from `f`. Defaults to 3, so existing
#'   two-argument calls behave exactly as before.
#' @return A glue vector with one label per selected row.
report3 <- function(df, f, n = 3L) {
  df |>
    f(pct_change, n = n) |>
    dplyr::mutate(pct_change = scales::percent(pct_change)) |>
    glue::glue_data("{material} ({pct_change})")
}
top3 <- report3(df, dplyr::slice_max)
bot3 <- report3(df, dplyr::slice_min)
# glue() concatenates its unnamed arguments, so the sentence is built from
# separate pieces. A single backslash before a line break inside an R string
# literal is not glue's "\\" line continuation and would not yield one line.
text <- glue::glue(
  "The top 3 commodities that price rose most are: ",
  "{top3[[1L]]}, {top3[[2L]]}, and {top3[[3L]]}; ",
  "the top 3 commodities that fell most are: ",
  "{bot3[[1L]]}, {bot3[[2L]]} and {bot3[[3L]]}."
)
输出
> text
The top 3 commodities that price rose most are: Zinc (6.2%), Gold (4.5%), and Copper (2.5%); the top 3 commodities that fell most are: Nickel (-4.39%), Tin (-3.75%) and Iron (-2.49%).
下面的代码应该足以涵盖您提供的所有案例
#' Collapse a vector into one human-readable enumeration.
#'
#' @param x Vector of items to join.
#' @param sep Separator placed between all items except the last two.
#' @return `x` unchanged when it has fewer than two elements; otherwise a single
#'   string in which the last two items are joined by " and " and the earlier
#'   ones by `sep`.
to_string <- function(x, sep = ", ") {
  n_items <- length(x)
  if (n_items < 2L) {
    return(x)
  }
  # Join everything but the last item with `sep`, then attach the last item.
  leading <- paste0(x[-n_items], collapse = sep)
  paste0(leading, " and ", x[[n_items]])
}
#' Format representative rows of one sign group as "material (percent)" labels.
#'
#' @param df Rows of a single sign group with columns `sign`, `material` and
#'   `pct_change`; only the first `sign` value is inspected.
#' @return A glue vector with one label per selected row: rows chosen by
#'   `slice_min` for sign -1, `slice_sample` for sign 0 and `slice_max` for
#'   sign 1, each requested with `n = 3L`.
text_summary <- function(df) {
  # Position sign + 2 maps -1 / 0 / 1 onto the 1st / 2nd / 3rd picker.
  pickers <- list(
    function(rows) dplyr::slice_min(rows, pct_change, n = 3L),
    function(rows) dplyr::slice_sample(rows, n = 3L),
    function(rows) dplyr::slice_max(rows, pct_change, n = 3L)
  )
  pick_rows <- pickers[[df$sign[[1L]] + 2L]]
  picked <- pick_rows(df[, c("material", "pct_change")])
  labelled <- dplyr::mutate(picked, pct_change = scales::percent(pct_change))
  glue::glue_data(labelled, "{material} ({pct_change})")
}
#' Pick the singular or plural form of a known word.
#'
#' @param w Word in singular form; "commodity" and "is" have plural forms here.
#' @param n Count that decides the form.
#' @return `w` itself when `n` equals 1, otherwise the plural form of `w`.
plural <- function(w, n) {
  if (n != 1L) {
    forms <- c(commodity = "commodities", is = "are")
    forms[[w]]
  } else {
    w
  }
}
# Group the rows by the sign of pct_change (-1, 0, 1); for each group keep its
# size `n` and the labels returned by text_summary(). The result stays grouped
# by `sign` (.groups = "keep").
# NOTE(review): recent dplyr releases deprecate cur_data_all() and summarize()
# results with more than one row per group -- confirm against the installed
# version before reusing this.
content <- df |>
dplyr::group_by(sign = sign(pct_change)) |>
dplyr::summarize(
n = dplyr::n(),
text = text_summary(dplyr::cur_data_all()),
.groups = "keep"
)
# s1: every group has a negative sign; s2: every group has a positive sign.
s1 <- all(content$sign < 0L)
s2 <- all(content$sign > 0L)
# One glue template per sign, stored at index sign + 2 (fell, unchanged, rose).
# In the all-negative / all-positive cases the unused slots are empty strings.
tmpl <-
if (s1) {
c("commodities price fell, the top {length(text)} fell most {plural('is', length(text))}: {to_string(text)}", "", "")
} else if (s2) {
c("", "", "commodities price rose, the top {length(text)} rose most {plural('is', length(text))}: {to_string(text)}")
} else {
c("{n[[1L]]} {plural('commodity',n[[1L]])} that price fell, the top {length(text)} fell most {plural('is', length(text))}: {to_string(text)}",
"{n[[1L]]} {plural('commodity',n[[1L]])} that price kept the same, including: {to_string(text)}",
"{n[[1L]]} {plural('commodity',n[[1L]])} that price rose, the top {length(text)} rose most {plural('is', length(text))}: {to_string(text)}")
}
# "All the " when every change has the same direction, otherwise "We have ".
prefix <- c("We have ", "All the ")[[(s1 || s2) + 1L]]
# Attach each group's template, render it once per group, order the groups from
# rising to falling, then join the rendered pieces into one sentence.
content <- content |>
dplyr::mutate(tmpl = tmpl[sign + 2L]) |>
dplyr::summarize(text = glue::glue(tmpl[[1L]])) |>
dplyr::arrange(-sign) |>
dplyr::summarize(text = paste0(prefix, to_string(text, "; "), "."))
# The finished sentence.
content$text
另一种可能的解决方案(除了 stringr::str_c,您也可以使用更方便的 stringr::str_glue,正如 @ekoam 所建议的那样):
library(tidyverse)
# Sample data: eight commodities with their price and fractional price change.
df <- structure(list(material = c("Copper", "Aluminum", "Iron", "Zinc",
"Nickel", "Silver", "Gold", "Tin"), price = c(17125, 8312, 2228.5,
2934, 4315, 8178, 4411, 680), pct_change = c(0.025449102, 0.007166746,
-0.024939838, 0.062470043, -0.043873255, -0.004625122, 0.045031392,
-0.037508846)), class = "data.frame", row.names = c(NA, -8L))
top3 <- slice_max(df, pct_change, n = 3)
bottom3 <- slice_min(df, pct_change, n = 3)
# Every percentage is rounded to 2 decimals (the third item of each list used
# to be rounded to 0 decimals), and each string piece stays on one source line
# so that no line break leaks into the sentence.
str_c(
  "The top 3 commodities that price rise most are: ",
  top3$material[1], " (", round(100 * top3$pct_change[1], 2), "%), ",
  top3$material[2], " (", round(100 * top3$pct_change[2], 2), "%), and ",
  top3$material[3], " (", round(100 * top3$pct_change[3], 2), "%), ",
  "the top 3 commodities that fall most are: ",
  bottom3$material[1], " (", round(100 * bottom3$pct_change[1], 2), "%), ",
  bottom3$material[2], " (", round(100 * bottom3$pct_change[2], 2), "%), and ",
  bottom3$material[3], " (", round(100 * bottom3$pct_change[3], 2), "%)."
)
#> [1] "The top 3 commodities that price rise most are: Zinc (6.25%), Gold (4.5%), and Copper (2.54%), the top 3 commodities that fall most are: Nickel (-4.39%), Tin (-3.75%), and Iron (-2.49%)."