识别具有上升趋势的股票
Identify Stocks With Increasing Trends
我有一个“长”格式的数据框。第一列包含日期,第二列是股票名称,最后是收盘价。绘图时,这种格式非常简单。您可以使用股票名称列在单独的图上创建不同颜色的线条或分面。太好了。
示例数据如下:
dat <- structure(list(Date = structure(c(1592611200, 1592611200, 1592611200,
1592611200, 1592697600, 1592697600,
1592697600, 1592697600, 1592784000,
1592784000, 1592784000, 1592784000,
1592870400, 1592870400, 1592870400,
1592870400, 1592956800, 1592956800,
1592956800, 1592956800, 1593043200,
1593043200, 1593043200, 1593043200,
1593129600, 1593129600, 1593129600,
1593129600, 1593216000, 1593216000,
1593216000, 1593216000, 1593302400,
1593302400, 1593302400, 1593302400,
1593388800, 1593388800, 1593388800,
1593388800),
tzone = "UTC", class = c("POSIXct", "POSIXt")),
stock_name = c("AAPL", "AMZN", "HTZ", "NFLX", "AAPL",
"AMZN", "HTZ", "NFLX", "AAPL", "AMZN",
"HTZ", "NFLX", "AAPL", "AMZN", "HTZ",
"NFLX", "AAPL", "AMZN", "HTZ", "NFLX",
"AAPL", "AMZN", "HTZ", "NFLX", "AAPL",
"AMZN", "HTZ", "NFLX", "AAPL", "AMZN",
"HTZ", "NFLX", "AAPL", "AMZN", "HTZ",
"NFLX", "AAPL", "AMZN", "HTZ", "NFLX"),
closing_price = c(200, 1900, 100, 150, 210,
1950, 90, 160, 211, 1975, 75, 150,
213, 1980, 60, 140, 211, 1990, 50,
150, 213, 1991, 45, 160, 214, 1990,
40, 150, 215, 1998, 38, 140, 217,
2010, 30, 150, 216, 2020, 20, 150)),
row.names = c(NA, -40L), class = c("tbl_df", "tbl", "data.frame"))
然而,我们的目标是确定以下哪些股票具有上涨趋势。我的想法是对每只股票应用线性模型,然后提取斜率并根据哪些为正进行过滤。我遇到的问题是如何使用“长”形式的数据框来完成此操作。
实际上,数据框有额外的列,这些列不能很好地转换为数据框的“宽”格式。所以,在我看来,它需要保持“长”形式。
您将如何确定哪些股票具有上涨趋势?
目标数据框:
dat <- structure(list(Date = structure(c(1592611200, 1592611200, 1592611200,
1592611200, 1592697600, 1592697600,
1592697600, 1592697600, 1592784000,
1592784000, 1592784000, 1592784000,
1592870400, 1592870400, 1592870400,
1592870400, 1592956800, 1592956800,
1592956800, 1592956800, 1593043200,
1593043200, 1593043200, 1593043200,
1593129600, 1593129600, 1593129600,
1593129600, 1593216000, 1593216000,
1593216000, 1593216000, 1593302400,
1593302400, 1593302400, 1593302400,
1593388800, 1593388800, 1593388800,
1593388800),
tzone = "UTC", class = c("POSIXct", "POSIXt")),
stock_name = c("AAPL", "AMZN", "HTZ", "NFLX", "AAPL",
"AMZN", "HTZ", "NFLX", "AAPL", "AMZN",
"HTZ", "NFLX", "AAPL", "AMZN", "HTZ",
"NFLX", "AAPL", "AMZN", "HTZ", "NFLX",
"AAPL", "AMZN", "HTZ", "NFLX", "AAPL",
"AMZN", "HTZ", "NFLX", "AAPL", "AMZN",
"HTZ", "NFLX", "AAPL", "AMZN", "HTZ",
"NFLX", "AAPL", "AMZN", "HTZ", "NFLX"),
closing_price = c(200, 1900, 100, 150, 210,
1950, 90, 160, 211, 1975, 75, 150,
213, 1980, 60, 140, 211, 1990, 50,
150, 213, 1991, 45, 160, 214, 1990,
40, 150, 215, 1998, 38, 140, 217,
2010, 30, 150, 216, 2020, 20, 150),
trend = c("increasing", "increasing", "", "",
"increasing", "increasing", "", "",
"increasing", "increasing", "", "",
"increasing", "increasing", "", "",
"increasing", "increasing", "", "",
"increasing", "increasing", "", "",
"increasing", "increasing", "", "",
"increasing", "increasing", "", "")),
row.names = c(NA, -40L), class = c("tbl_df", "tbl", "data.frame"))
这是我目前得到的:
# Label a single stock's price trend as increasing or not.
#
# Fits a linear model of `closing_price` on `Date` and inspects the sign of
# the fitted slope.
#
# Args:
#   stck_df: data frame holding the rows of ONE stock, with the columns
#     `Date` and `closing_price`.
#
# Returns:
#   "increasing" when the fitted slope is positive, otherwise "" (the blank
#   label used for non-increasing stocks). The result is always a length-1
#   character vector, so it can be stored directly in a character column.
label_increasing <- function(stck_df) {
  # create a model using the date as a predictor
  mdl <- lm(closing_price ~ Date, data = stck_df)
  slope <- coef(mdl)[["Date"]]
  # isTRUE() treats an NA slope (e.g. a single distinct date) as "not
  # increasing" instead of letting `if (NA)` raise an error.
  if (isTRUE(slope > 0)) {
    "increasing"
  } else {
    ""
  }
}
# filter just the apple stock
apple_dat <- dat %>%
  filter(stock_name == "AAPL")
# label the filtered AAPL rows (`apple_dat` is the frame created just above)
apple_label <- label_increasing(apple_dat)
apple_label
# works for a single stock
# Attempt 1: label every stock inside one grouped mutate().
# NOTE(review): with the magrittr pipe, `.` stands for the whole data frame
# piped into mutate(), not the rows of the current group, so the model is
# presumably fit once on all stocks pooled together -- confirm.
labeled_dat <- dat %>%
group_by(stock_name) %>%
mutate(trend = label_increasing(.))
labeled_dat
# does not work for the full data frame
# Attempt 2: map label_increasing() over `.` inside the grouped mutate().
# NOTE(review): `.` is again the entire piped-in data frame; map() over a data
# frame presumably visits its columns one at a time, so label_increasing()
# would receive a bare column instead of one stock's rows -- confirm.
labeled_dat <- dat %>%
group_by(stock_name) %>%
mutate(trend = map(., label_increasing))
labeled_dat
# I have a feeling I need to do some mapping but this isn't quite right
最后,这个问题的灵感来自《纽约时报》Covid-19 仪表板中按州显示病例增加和减少的部分,可在 here 找到。
用ggplot可视化数据,看看!
# Install ggplot2 only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
# One panel per stock; the y axis is freed per panel because the price levels
# of the stocks differ widely.
ggplot(data = dat) +
  geom_line(mapping = aes(x = Date, y = closing_price)) +
  facet_wrap(~stock_name, scales = "free_y")
以下是对您的问题“您将如何确定这些股票中哪些具有上涨趋势?”的回答。(如果您想进行正式的财务分析,我建议聘请专业人士。)
您可以按 stock_name 对数据进行 nest(嵌套),然后将您的函数映射到每个分组上。
编辑:我不得不修改 label_increasing(),使响应变量名为 closing_price。
library(tidyverse)
# Label a single stock's price trend as increasing or not.
#
# Args:
#   stck_df: data frame holding the rows of ONE stock, with the columns
#     `Date` and `closing_price`.
#
# Returns:
#   "increasing" when the slope of closing_price ~ Date is positive,
#   otherwise "". Always a length-1 character vector, so map_chr() can
#   collect the labels of all stocks.
label_increasing <- function(stck_df) {
  # create a model using the date as a predictor
  mdl <- lm(closing_price ~ Date, data = stck_df)
  slope <- coef(mdl)[["Date"]]
  # isTRUE() treats an NA slope (e.g. a single distinct date) as "not
  # increasing" instead of letting `if (NA)` raise an error.
  if (isTRUE(slope > 0)) {
    "increasing"
  } else {
    ""
  }
}

# Nest the rows of each stock into a list-column, then label every nested
# data frame. map_chr() yields a plain character column, so no unnest() step
# is needed and non-increasing stocks are kept with a blank label.
dat %>%
  group_by(stock_name) %>%
  nest() %>%
  mutate(trend = map_chr(data, label_increasing)) %>%
  ungroup()
#-----
# Resulting labels, one row per stock (the `data` list-column is omitted):
#   stock_name trend
#   AAPL       "increasing"
#   AMZN       "increasing"
#   HTZ        ""
#   NFLX       ""
好吧,如果你想考察整个时期,可以让所有股票从同一起点开始,跟踪其增长量或增长百分比;然后用一个简单的 filter 语句,只显示那些从头到尾有所增长(无论增幅多小)的股票。
library(dplyr)
library(ggplot2)
# Track each stock's growth relative to its own first closing price and plot
# only the stocks that finished the period above where they started.
dat %>%
  group_by(stock_name) %>%
  arrange(Date) %>%
  mutate(
    growth = closing_price - first(closing_price),
    growth_percent = growth / first(closing_price) * 100
  ) %>%
  # strictly positive: a stock that ends exactly where it began (zero net
  # growth) has not grown and must not be shown
  filter(last(growth) > 0) %>%
  ungroup() %>%
  ggplot(aes(x = Date, y = growth, group = stock_name, color = stock_name)) +
  geom_line()
您的原始数据
dat <- structure(list(Date = structure(c(1592611200, 1592611200, 1592611200,
1592611200, 1592697600, 1592697600,
1592697600, 1592697600, 1592784000,
1592784000, 1592784000, 1592784000,
1592870400, 1592870400, 1592870400,
1592870400, 1592956800, 1592956800,
1592956800, 1592956800, 1593043200,
1593043200, 1593043200, 1593043200,
1593129600, 1593129600, 1593129600,
1593129600, 1593216000, 1593216000,
1593216000, 1593216000, 1593302400,
1593302400, 1593302400, 1593302400,
1593388800, 1593388800, 1593388800,
1593388800),
tzone = "UTC", class = c("POSIXct", "POSIXt")),
stock_name = c("AAPL", "AMZN", "HTZ", "NFLX", "AAPL",
"AMZN", "HTZ", "NFLX", "AAPL", "AMZN",
"HTZ", "NFLX", "AAPL", "AMZN", "HTZ",
"NFLX", "AAPL", "AMZN", "HTZ", "NFLX",
"AAPL", "AMZN", "HTZ", "NFLX", "AAPL",
"AMZN", "HTZ", "NFLX", "AAPL", "AMZN",
"HTZ", "NFLX", "AAPL", "AMZN", "HTZ",
"NFLX", "AAPL", "AMZN", "HTZ", "NFLX"),
closing_price = c(200, 1900, 100, 150, 210,
1950, 90, 160, 211, 1975, 75, 150,
213, 1980, 60, 140, 211, 1990, 50,
150, 213, 1991, 45, 160, 214, 1990,
40, 150, 215, 1998, 38, 140, 217,
2010, 30, 150, 216, 2020, 20, 150)),
row.names = c(NA, -40L), class = c("tbl_df", "tbl", "data.frame"))
# dat
我有一个“长”格式的数据框。第一列包含日期,第二列是股票名称,最后是收盘价。绘图时,这种格式非常简单。您可以使用股票名称列在单独的图上创建不同颜色的线条或分面。太好了。
示例数据如下:
dat <- structure(list(Date = structure(c(1592611200, 1592611200, 1592611200,
1592611200, 1592697600, 1592697600,
1592697600, 1592697600, 1592784000,
1592784000, 1592784000, 1592784000,
1592870400, 1592870400, 1592870400,
1592870400, 1592956800, 1592956800,
1592956800, 1592956800, 1593043200,
1593043200, 1593043200, 1593043200,
1593129600, 1593129600, 1593129600,
1593129600, 1593216000, 1593216000,
1593216000, 1593216000, 1593302400,
1593302400, 1593302400, 1593302400,
1593388800, 1593388800, 1593388800,
1593388800),
tzone = "UTC", class = c("POSIXct", "POSIXt")),
stock_name = c("AAPL", "AMZN", "HTZ", "NFLX", "AAPL",
"AMZN", "HTZ", "NFLX", "AAPL", "AMZN",
"HTZ", "NFLX", "AAPL", "AMZN", "HTZ",
"NFLX", "AAPL", "AMZN", "HTZ", "NFLX",
"AAPL", "AMZN", "HTZ", "NFLX", "AAPL",
"AMZN", "HTZ", "NFLX", "AAPL", "AMZN",
"HTZ", "NFLX", "AAPL", "AMZN", "HTZ",
"NFLX", "AAPL", "AMZN", "HTZ", "NFLX"),
closing_price = c(200, 1900, 100, 150, 210,
1950, 90, 160, 211, 1975, 75, 150,
213, 1980, 60, 140, 211, 1990, 50,
150, 213, 1991, 45, 160, 214, 1990,
40, 150, 215, 1998, 38, 140, 217,
2010, 30, 150, 216, 2020, 20, 150)),
row.names = c(NA, -40L), class = c("tbl_df", "tbl", "data.frame"))
然而,我们的目标是确定以下哪些股票具有上涨趋势。我的想法是对每只股票应用线性模型,然后提取斜率并根据哪些为正进行过滤。我遇到的问题是如何使用“长”形式的数据框来完成此操作。
实际上,数据框有额外的列,这些列不能很好地转换为数据框的“宽”格式。所以,在我看来,它需要保持“长”形式。
您将如何确定哪些股票具有上涨趋势?
目标数据框:
dat <- structure(list(Date = structure(c(1592611200, 1592611200, 1592611200,
1592611200, 1592697600, 1592697600,
1592697600, 1592697600, 1592784000,
1592784000, 1592784000, 1592784000,
1592870400, 1592870400, 1592870400,
1592870400, 1592956800, 1592956800,
1592956800, 1592956800, 1593043200,
1593043200, 1593043200, 1593043200,
1593129600, 1593129600, 1593129600,
1593129600, 1593216000, 1593216000,
1593216000, 1593216000, 1593302400,
1593302400, 1593302400, 1593302400,
1593388800, 1593388800, 1593388800,
1593388800),
tzone = "UTC", class = c("POSIXct", "POSIXt")),
stock_name = c("AAPL", "AMZN", "HTZ", "NFLX", "AAPL",
"AMZN", "HTZ", "NFLX", "AAPL", "AMZN",
"HTZ", "NFLX", "AAPL", "AMZN", "HTZ",
"NFLX", "AAPL", "AMZN", "HTZ", "NFLX",
"AAPL", "AMZN", "HTZ", "NFLX", "AAPL",
"AMZN", "HTZ", "NFLX", "AAPL", "AMZN",
"HTZ", "NFLX", "AAPL", "AMZN", "HTZ",
"NFLX", "AAPL", "AMZN", "HTZ", "NFLX"),
closing_price = c(200, 1900, 100, 150, 210,
1950, 90, 160, 211, 1975, 75, 150,
213, 1980, 60, 140, 211, 1990, 50,
150, 213, 1991, 45, 160, 214, 1990,
40, 150, 215, 1998, 38, 140, 217,
2010, 30, 150, 216, 2020, 20, 150),
trend = c("increasing", "increasing", "", "",
"increasing", "increasing", "", "",
"increasing", "increasing", "", "",
"increasing", "increasing", "", "",
"increasing", "increasing", "", "",
"increasing", "increasing", "", "",
"increasing", "increasing", "", "",
"increasing", "increasing", "", "")),
row.names = c(NA, -40L), class = c("tbl_df", "tbl", "data.frame"))
这是我目前得到的:
# Label a single stock's price trend as increasing or not.
#
# Fits a linear model of `closing_price` on `Date` and inspects the sign of
# the fitted slope.
#
# Args:
#   stck_df: data frame holding the rows of ONE stock, with the columns
#     `Date` and `closing_price`.
#
# Returns:
#   "increasing" when the fitted slope is positive, otherwise "" (the blank
#   label used for non-increasing stocks). The result is always a length-1
#   character vector, so it can be stored directly in a character column.
label_increasing <- function(stck_df) {
  # create a model using the date as a predictor
  mdl <- lm(closing_price ~ Date, data = stck_df)
  slope <- coef(mdl)[["Date"]]
  # isTRUE() treats an NA slope (e.g. a single distinct date) as "not
  # increasing" instead of letting `if (NA)` raise an error.
  if (isTRUE(slope > 0)) {
    "increasing"
  } else {
    ""
  }
}
# filter just the apple stock
apple_dat <- dat %>%
  filter(stock_name == "AAPL")
# label the filtered AAPL rows (`apple_dat` is the frame created just above)
apple_label <- label_increasing(apple_dat)
apple_label
# works for a single stock
# Attempt 1: label every stock inside one grouped mutate().
# NOTE(review): with the magrittr pipe, `.` stands for the whole data frame
# piped into mutate(), not the rows of the current group, so the model is
# presumably fit once on all stocks pooled together -- confirm.
labeled_dat <- dat %>%
group_by(stock_name) %>%
mutate(trend = label_increasing(.))
labeled_dat
# does not work for the full data frame
# Attempt 2: map label_increasing() over `.` inside the grouped mutate().
# NOTE(review): `.` is again the entire piped-in data frame; map() over a data
# frame presumably visits its columns one at a time, so label_increasing()
# would receive a bare column instead of one stock's rows -- confirm.
labeled_dat <- dat %>%
group_by(stock_name) %>%
mutate(trend = map(., label_increasing))
labeled_dat
# I have a feeling I need to do some mapping but this isn't quite right
最后,这个问题的灵感来自《纽约时报》Covid-19 仪表板中按州显示病例增加和减少的部分,可在 here 找到。
用ggplot可视化数据,看看!
# Install ggplot2 only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
# One panel per stock; the y axis is freed per panel because the price levels
# of the stocks differ widely.
ggplot(data = dat) +
  geom_line(mapping = aes(x = Date, y = closing_price)) +
  facet_wrap(~stock_name, scales = "free_y")
以下是对您的问题“您将如何确定这些股票中哪些具有上涨趋势?”的回答。(如果您想进行正式的财务分析,我建议聘请专业人士。)
您可以按 stock_name 对数据进行 nest(嵌套),然后将您的函数映射到每个分组上。
编辑:我不得不修改 label_increasing(),使响应变量名为 closing_price。
library(tidyverse)
# Label a single stock's price trend as increasing or not.
#
# Args:
#   stck_df: data frame holding the rows of ONE stock, with the columns
#     `Date` and `closing_price`.
#
# Returns:
#   "increasing" when the slope of closing_price ~ Date is positive,
#   otherwise "". Always a length-1 character vector, so map_chr() can
#   collect the labels of all stocks.
label_increasing <- function(stck_df) {
  # create a model using the date as a predictor
  mdl <- lm(closing_price ~ Date, data = stck_df)
  slope <- coef(mdl)[["Date"]]
  # isTRUE() treats an NA slope (e.g. a single distinct date) as "not
  # increasing" instead of letting `if (NA)` raise an error.
  if (isTRUE(slope > 0)) {
    "increasing"
  } else {
    ""
  }
}

# Nest the rows of each stock into a list-column, then label every nested
# data frame. map_chr() yields a plain character column, so no unnest() step
# is needed and non-increasing stocks are kept with a blank label.
dat %>%
  group_by(stock_name) %>%
  nest() %>%
  mutate(trend = map_chr(data, label_increasing)) %>%
  ungroup()
#-----
# Resulting labels, one row per stock (the `data` list-column is omitted):
#   stock_name trend
#   AAPL       "increasing"
#   AMZN       "increasing"
#   HTZ        ""
#   NFLX       ""
好吧,如果你想考察整个时期,可以让所有股票从同一起点开始,跟踪其增长量或增长百分比;然后用一个简单的 filter 语句,只显示那些从头到尾有所增长(无论增幅多小)的股票。
library(dplyr)
library(ggplot2)
# Track each stock's growth relative to its own first closing price and plot
# only the stocks that finished the period above where they started.
dat %>%
  group_by(stock_name) %>%
  arrange(Date) %>%
  mutate(
    growth = closing_price - first(closing_price),
    growth_percent = growth / first(closing_price) * 100
  ) %>%
  # strictly positive: a stock that ends exactly where it began (zero net
  # growth) has not grown and must not be shown
  filter(last(growth) > 0) %>%
  ungroup() %>%
  ggplot(aes(x = Date, y = growth, group = stock_name, color = stock_name)) +
  geom_line()
您的原始数据
dat <- structure(list(Date = structure(c(1592611200, 1592611200, 1592611200,
1592611200, 1592697600, 1592697600,
1592697600, 1592697600, 1592784000,
1592784000, 1592784000, 1592784000,
1592870400, 1592870400, 1592870400,
1592870400, 1592956800, 1592956800,
1592956800, 1592956800, 1593043200,
1593043200, 1593043200, 1593043200,
1593129600, 1593129600, 1593129600,
1593129600, 1593216000, 1593216000,
1593216000, 1593216000, 1593302400,
1593302400, 1593302400, 1593302400,
1593388800, 1593388800, 1593388800,
1593388800),
tzone = "UTC", class = c("POSIXct", "POSIXt")),
stock_name = c("AAPL", "AMZN", "HTZ", "NFLX", "AAPL",
"AMZN", "HTZ", "NFLX", "AAPL", "AMZN",
"HTZ", "NFLX", "AAPL", "AMZN", "HTZ",
"NFLX", "AAPL", "AMZN", "HTZ", "NFLX",
"AAPL", "AMZN", "HTZ", "NFLX", "AAPL",
"AMZN", "HTZ", "NFLX", "AAPL", "AMZN",
"HTZ", "NFLX", "AAPL", "AMZN", "HTZ",
"NFLX", "AAPL", "AMZN", "HTZ", "NFLX"),
closing_price = c(200, 1900, 100, 150, 210,
1950, 90, 160, 211, 1975, 75, 150,
213, 1980, 60, 140, 211, 1990, 50,
150, 213, 1991, 45, 160, 214, 1990,
40, 150, 215, 1998, 38, 140, 217,
2010, 30, 150, 216, 2020, 20, 150)),
row.names = c(NA, -40L), class = c("tbl_df", "tbl", "data.frame"))
# dat