使用 R 从两个数据帧中获取相关系数
Get Correlation coefficient from Two Dataframes Using R
我需要使用两个数据帧获得相关系数。
- 第一个数据帧
## ML
generate1 <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex1 <- date(generate1)
generate2 <- seq(ymd_h("2021-11-02-00"), ymd_h("2021-11-02-03"), by = "hours")
datex2 <- date(generate2)
hourx <- hour(generate1)
method <- c(rep("ARIMA",8),rep("LSTM",8))
books <- c(390,154,154,153,352,170,229,124,458,224,196,485,492,235,139,116)
shirts <- c(312,397,119,357,464,444,453,155,484,454,282,288,141,262,148,258)
shoes <- c(306,274,480,330,143,190,213,477,141,323,316,473,269,149,333,145)
hats <- c(107,101,363,436,282,377,435,381,427,102,100,471,475,134,479,250)
data.predicted <- data.frame(datex = c(datex1,datex2,datex1,datex2),
hour = rep(hourx,4), method = method,
books, shirts, shoes, hats)
#data.predicted
# datex hour method books shirts shoes hats
#1 2021-11-01 0 ARIMA 390 312 306 107
#2 2021-11-01 1 ARIMA 154 397 274 101
#3 2021-11-01 2 ARIMA 154 119 480 363
#4 2021-11-01 3 ARIMA 153 357 330 436
#5 2021-11-02 0 ARIMA 352 464 143 282
#6 2021-11-02 1 ARIMA 170 444 190 377
#7 2021-11-02 2 ARIMA 229 453 213 435
#8 2021-11-02 3 ARIMA 124 155 477 381
#9 2021-11-01 0 LSTM 458 484 141 427
#10 2021-11-01 1 LSTM 224 454 323 102
#11 2021-11-01 2 LSTM 196 282 316 100
#12 2021-11-01 3 LSTM 485 288 473 471
#13 2021-11-02 0 LSTM 492 141 269 475
#14 2021-11-02 1 LSTM 235 262 149 134
#15 2021-11-02 2 LSTM 139 148 333 479
#16 2021-11-02 3 LSTM 116 258 145 250
- 第二个数据帧
## real
generate <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex <- date(generate)
hourx <- hour(generate)
books <- c(220,120,150,114)
shirts <- c(319,400,130,360)
shoes <- c(300,280,300,330)
hats <- c(120,140,370,400)
data.real <- data.frame(datex, hourx, books, shirts, shoes, hats)
#data.real
# datex hourx books shirts shoes hats
#1 2021-11-01 0 220 319 300 120
#2 2021-11-01 1 120 400 280 140
#3 2021-11-01 2 150 130 300 370
#4 2021-11-01 3 114 360 330 400
我想要这个数据框的结果。相关性是基于真实数据,如果真实数据只有1天的数据,那么预测数据会有所调整。
## Result
metrics <-c("books","books","shirts","shirts","shoes","shoes","hats","hats")
method <-c("ARIMA","LSTM","ARIMA","LSTM","ARIMA","LSTM","ARIMA","LSTM")
correlation <- c(0.946898292,0.294308358,0.999957355,0.535718183,
0.167424749,0.547561054,0.993560612,0.085661117)
result.cor <- data.frame(metrics, method, correlation)
#result.cor
# metrics method correlation
#1 books ARIMA 0.94689829
#2 books LSTM 0.29430836
#3 shirts ARIMA 0.99995736
#4 shirts LSTM 0.53571818
#5 shoes ARIMA 0.16742475
#6 shoes LSTM 0.54756105
#7 hats ARIMA 0.99356061
#8 hats LSTM 0.08566112
我们可以看到ARIMA的值为0.94689829,来自
ARIMA.pred <- subset(data.predicted, method == "ARIMA" & datex == "2021-11-01")
#ARIMA.pred
# datex hour method books shirts shoes hats
#1 2021-11-01 0 ARIMA 390 312 306 107
#2 2021-11-01 1 ARIMA 154 397 274 101
#3 2021-11-01 2 ARIMA 154 119 480 363
#4 2021-11-01 3 ARIMA 153 357 330 436
data.real$books
#220 120 150 114
cor(ARIMA.pred$books, data.real$books)
#0.9468983
我如何创建函数来简化并获得结果?
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
library(tidyverse)
generate1 <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex1 <- date(generate1)
generate2 <- seq(ymd_h("2021-11-02-00"), ymd_h("2021-11-02-03"), by = "hours")
datex2 <- date(generate2)
hourx <- hour(generate1)
method <- c(rep("ARIMA", 8), rep("LSTM", 8))
books <- c(390, 154, 154, 153, 352, 170, 229, 124, 458, 224, 196, 485, 492, 235, 139, 116)
shirts <- c(312, 397, 119, 357, 464, 444, 453, 155, 484, 454, 282, 288, 141, 262, 148, 258)
shoes <- c(306, 274, 480, 330, 143, 190, 213, 477, 141, 323, 316, 473, 269, 149, 333, 145)
hats <- c(107, 101, 363, 436, 282, 377, 435, 381, 427, 102, 100, 471, 475, 134, 479, 250)
data.predicted <- data.frame(
datex = c(datex1, datex2, datex1, datex2),
hour = rep(hourx, 4), method = method,
books, shirts, shoes, hats
)
generate <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex <- date(generate)
hourx <- hour(generate)
books <- c(220, 120, 150, 114)
shirts <- c(319, 400, 130, 360)
shoes <- c(300, 280, 300, 330)
hats <- c(120, 140, 370, 400)
data.real <- data.frame(datex, hour = hourx, books, shirts, shoes, hats)
data.real
#> datex hour books shirts shoes hats
#> 1 2021-11-01 0 220 319 300 120
#> 2 2021-11-01 1 120 400 280 140
#> 3 2021-11-01 2 150 130 300 370
#> 4 2021-11-01 3 114 360 330 400
data.predicted
#> datex hour method books shirts shoes hats
#> 1 2021-11-01 0 ARIMA 390 312 306 107
#> 2 2021-11-01 1 ARIMA 154 397 274 101
#> 3 2021-11-01 2 ARIMA 154 119 480 363
#> 4 2021-11-01 3 ARIMA 153 357 330 436
#> 5 2021-11-02 0 ARIMA 352 464 143 282
#> 6 2021-11-02 1 ARIMA 170 444 190 377
#> 7 2021-11-02 2 ARIMA 229 453 213 435
#> 8 2021-11-02 3 ARIMA 124 155 477 381
#> 9 2021-11-01 0 LSTM 458 484 141 427
#> 10 2021-11-01 1 LSTM 224 454 323 102
#> 11 2021-11-01 2 LSTM 196 282 316 100
#> 12 2021-11-01 3 LSTM 485 288 473 471
#> 13 2021-11-02 0 LSTM 492 141 269 475
#> 14 2021-11-02 1 LSTM 235 262 149 134
#> 15 2021-11-02 2 LSTM 139 148 333 479
#> 16 2021-11-02 3 LSTM 116 258 145 250
covariates <- c("books", "shirts", "shoes", "hats")
methods <- c("ARIMA", "LSTM")
list(
data.real %>% mutate(type = "real"),
data.predicted %>% mutate(type = "predicted")
) %>%
bind_rows() %>%
nest(-datex) %>%
expand_grid(
covariate = covariates,
method = methods
) %>%
mutate(cor = list(data, covariate, method) %>% pmap_dbl(possibly(~ {
real <- .x %>%
filter(type == "real") %>%
pluck(.y)
predicted <- .x %>%
filter(method == ..3) %>%
pluck(.y)
cor(real, predicted)
}, NA))) %>%
select(-data)
#> Warning: All elements of `...` must be named.
#> Did you want `data = c(hour, books, shirts, shoes, hats, type, method)`?
#> # A tibble: 16 x 4
#> datex covariate method cor
#> <date> <chr> <chr> <dbl>
#> 1 2021-11-01 books ARIMA 0.947
#> 2 2021-11-01 books LSTM 0.294
#> 3 2021-11-01 shirts ARIMA 1.00
#> 4 2021-11-01 shirts LSTM 0.536
#> 5 2021-11-01 shoes ARIMA 0.167
#> 6 2021-11-01 shoes LSTM 0.548
#> 7 2021-11-01 hats ARIMA 0.994
#> 8 2021-11-01 hats LSTM 0.0857
#> 9 2021-11-02 books ARIMA NA
#> 10 2021-11-02 books LSTM NA
#> 11 2021-11-02 shirts ARIMA NA
#> 12 2021-11-02 shirts LSTM NA
#> 13 2021-11-02 shoes ARIMA NA
#> 14 2021-11-02 shoes LSTM NA
#> 15 2021-11-02 hats ARIMA NA
#> 16 2021-11-02 hats LSTM NA
由 reprex package (v2.0.1)
于 2021-11-25 创建
一种data.table方法
使用 setDT()
将其转换为 data.table 对象
library(data.table)
setDT(data.predicted)
setDT(data.real)
第一步是融化data.predicted和data.real。
data.predicted <- melt(data.predicted, id.vars = c('datex', 'hourx', 'method'), measure.vars = c('books', 'shirts', 'shoes', 'hats'), variable.name = 'metrics', value.name = 'Value_Pred')
data.real <- melt(data.real, id.vars = c('datex', 'hourx'), measure.vars = c('books', 'shirts', 'shoes', 'hats'), variable.name = 'metrics', value.name = 'Value_Real')
下一步是加入 datex、hourx 和 metrics 上的数据集。假设 data.predicted 应用左连接包含 data.real.
中存在的与日期和小时相关的所有数据点
data.predicted <- merge(data.predicted, data.real, by = c('datex', 'hourx', 'metrics'), all.x = TRUE)
最后一步是对每个 datex、指标和方法 值进行关联。
data.predicted <- data.predicted[,.(Cor_Col = cor(Value_Pred, Value_Real)), by = .(datex, metrics, method)]
data.predicted
datex metrics method Cor_Col
1: 2021-11-01 books ARIMA 0.94689829
2: 2021-11-01 books LSTM 0.29430836
3: 2021-11-01 shirts ARIMA 0.99995736
4: 2021-11-01 shirts LSTM 0.53571818
5: 2021-11-01 shoes ARIMA 0.16742475
6: 2021-11-01 shoes LSTM 0.54756105
7: 2021-11-01 hats ARIMA 0.99356061
8: 2021-11-01 hats LSTM 0.08566112
9: 2021-11-02 books ARIMA NA
10: 2021-11-02 books LSTM NA
11: 2021-11-02 shirts ARIMA NA
12: 2021-11-02 shirts LSTM NA
13: 2021-11-02 shoes ARIMA NA
14: 2021-11-02 shoes LSTM NA
15: 2021-11-02 hats ARIMA NA
16: 2021-11-02 hats LSTM NA
我需要使用两个数据帧获得相关系数。
- 第一个数据帧
## ML
generate1 <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex1 <- date(generate1)
generate2 <- seq(ymd_h("2021-11-02-00"), ymd_h("2021-11-02-03"), by = "hours")
datex2 <- date(generate2)
hourx <- hour(generate1)
method <- c(rep("ARIMA",8),rep("LSTM",8))
books <- c(390,154,154,153,352,170,229,124,458,224,196,485,492,235,139,116)
shirts <- c(312,397,119,357,464,444,453,155,484,454,282,288,141,262,148,258)
shoes <- c(306,274,480,330,143,190,213,477,141,323,316,473,269,149,333,145)
hats <- c(107,101,363,436,282,377,435,381,427,102,100,471,475,134,479,250)
data.predicted <- data.frame(datex = c(datex1,datex2,datex1,datex2),
hour = rep(hourx,4), method = method,
books, shirts, shoes, hats)
#data.predicted
# datex hour method books shirts shoes hats
#1 2021-11-01 0 ARIMA 390 312 306 107
#2 2021-11-01 1 ARIMA 154 397 274 101
#3 2021-11-01 2 ARIMA 154 119 480 363
#4 2021-11-01 3 ARIMA 153 357 330 436
#5 2021-11-02 0 ARIMA 352 464 143 282
#6 2021-11-02 1 ARIMA 170 444 190 377
#7 2021-11-02 2 ARIMA 229 453 213 435
#8 2021-11-02 3 ARIMA 124 155 477 381
#9 2021-11-01 0 LSTM 458 484 141 427
#10 2021-11-01 1 LSTM 224 454 323 102
#11 2021-11-01 2 LSTM 196 282 316 100
#12 2021-11-01 3 LSTM 485 288 473 471
#13 2021-11-02 0 LSTM 492 141 269 475
#14 2021-11-02 1 LSTM 235 262 149 134
#15 2021-11-02 2 LSTM 139 148 333 479
#16 2021-11-02 3 LSTM 116 258 145 250
- 第二个数据帧
## real
generate <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex <- date(generate)
hourx <- hour(generate)
books <- c(220,120,150,114)
shirts <- c(319,400,130,360)
shoes <- c(300,280,300,330)
hats <- c(120,140,370,400)
data.real <- data.frame(datex, hourx, books, shirts, shoes, hats)
#data.real
# datex hourx books shirts shoes hats
#1 2021-11-01 0 220 319 300 120
#2 2021-11-01 1 120 400 280 140
#3 2021-11-01 2 150 130 300 370
#4 2021-11-01 3 114 360 330 400
我想要这个数据框的结果。相关性是基于真实数据,如果真实数据只有1天的数据,那么预测数据会有所调整。
## Result
metrics <-c("books","books","shirts","shirts","shoes","shoes","hats","hats")
method <-c("ARIMA","LSTM","ARIMA","LSTM","ARIMA","LSTM","ARIMA","LSTM")
correlation <- c(0.946898292,0.294308358,0.999957355,0.535718183,
0.167424749,0.547561054,0.993560612,0.085661117)
result.cor <- data.frame(metrics, method, correlation)
#result.cor
# metrics method correlation
#1 books ARIMA 0.94689829
#2 books LSTM 0.29430836
#3 shirts ARIMA 0.99995736
#4 shirts LSTM 0.53571818
#5 shoes ARIMA 0.16742475
#6 shoes LSTM 0.54756105
#7 hats ARIMA 0.99356061
#8 hats LSTM 0.08566112
我们可以看到ARIMA的值为0.94689829,来自
ARIMA.pred <- subset(data.predicted, method == "ARIMA" & datex == "2021-11-01")
#ARIMA.pred
# datex hour method books shirts shoes hats
#1 2021-11-01 0 ARIMA 390 312 306 107
#2 2021-11-01 1 ARIMA 154 397 274 101
#3 2021-11-01 2 ARIMA 154 119 480 363
#4 2021-11-01 3 ARIMA 153 357 330 436
data.real$books
#220 120 150 114
cor(ARIMA.pred$books, data.real$books)
#0.9468983
我如何创建函数来简化并获得结果?
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
library(tidyverse)
generate1 <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex1 <- date(generate1)
generate2 <- seq(ymd_h("2021-11-02-00"), ymd_h("2021-11-02-03"), by = "hours")
datex2 <- date(generate2)
hourx <- hour(generate1)
method <- c(rep("ARIMA", 8), rep("LSTM", 8))
books <- c(390, 154, 154, 153, 352, 170, 229, 124, 458, 224, 196, 485, 492, 235, 139, 116)
shirts <- c(312, 397, 119, 357, 464, 444, 453, 155, 484, 454, 282, 288, 141, 262, 148, 258)
shoes <- c(306, 274, 480, 330, 143, 190, 213, 477, 141, 323, 316, 473, 269, 149, 333, 145)
hats <- c(107, 101, 363, 436, 282, 377, 435, 381, 427, 102, 100, 471, 475, 134, 479, 250)
data.predicted <- data.frame(
datex = c(datex1, datex2, datex1, datex2),
hour = rep(hourx, 4), method = method,
books, shirts, shoes, hats
)
generate <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex <- date(generate)
hourx <- hour(generate)
books <- c(220, 120, 150, 114)
shirts <- c(319, 400, 130, 360)
shoes <- c(300, 280, 300, 330)
hats <- c(120, 140, 370, 400)
data.real <- data.frame(datex, hour = hourx, books, shirts, shoes, hats)
data.real
#> datex hour books shirts shoes hats
#> 1 2021-11-01 0 220 319 300 120
#> 2 2021-11-01 1 120 400 280 140
#> 3 2021-11-01 2 150 130 300 370
#> 4 2021-11-01 3 114 360 330 400
data.predicted
#> datex hour method books shirts shoes hats
#> 1 2021-11-01 0 ARIMA 390 312 306 107
#> 2 2021-11-01 1 ARIMA 154 397 274 101
#> 3 2021-11-01 2 ARIMA 154 119 480 363
#> 4 2021-11-01 3 ARIMA 153 357 330 436
#> 5 2021-11-02 0 ARIMA 352 464 143 282
#> 6 2021-11-02 1 ARIMA 170 444 190 377
#> 7 2021-11-02 2 ARIMA 229 453 213 435
#> 8 2021-11-02 3 ARIMA 124 155 477 381
#> 9 2021-11-01 0 LSTM 458 484 141 427
#> 10 2021-11-01 1 LSTM 224 454 323 102
#> 11 2021-11-01 2 LSTM 196 282 316 100
#> 12 2021-11-01 3 LSTM 485 288 473 471
#> 13 2021-11-02 0 LSTM 492 141 269 475
#> 14 2021-11-02 1 LSTM 235 262 149 134
#> 15 2021-11-02 2 LSTM 139 148 333 479
#> 16 2021-11-02 3 LSTM 116 258 145 250
covariates <- c("books", "shirts", "shoes", "hats")
methods <- c("ARIMA", "LSTM")
list(
data.real %>% mutate(type = "real"),
data.predicted %>% mutate(type = "predicted")
) %>%
bind_rows() %>%
nest(-datex) %>%
expand_grid(
covariate = covariates,
method = methods
) %>%
mutate(cor = list(data, covariate, method) %>% pmap_dbl(possibly(~ {
real <- .x %>%
filter(type == "real") %>%
pluck(.y)
predicted <- .x %>%
filter(method == ..3) %>%
pluck(.y)
cor(real, predicted)
}, NA))) %>%
select(-data)
#> Warning: All elements of `...` must be named.
#> Did you want `data = c(hour, books, shirts, shoes, hats, type, method)`?
#> # A tibble: 16 x 4
#> datex covariate method cor
#> <date> <chr> <chr> <dbl>
#> 1 2021-11-01 books ARIMA 0.947
#> 2 2021-11-01 books LSTM 0.294
#> 3 2021-11-01 shirts ARIMA 1.00
#> 4 2021-11-01 shirts LSTM 0.536
#> 5 2021-11-01 shoes ARIMA 0.167
#> 6 2021-11-01 shoes LSTM 0.548
#> 7 2021-11-01 hats ARIMA 0.994
#> 8 2021-11-01 hats LSTM 0.0857
#> 9 2021-11-02 books ARIMA NA
#> 10 2021-11-02 books LSTM NA
#> 11 2021-11-02 shirts ARIMA NA
#> 12 2021-11-02 shirts LSTM NA
#> 13 2021-11-02 shoes ARIMA NA
#> 14 2021-11-02 shoes LSTM NA
#> 15 2021-11-02 hats ARIMA NA
#> 16 2021-11-02 hats LSTM NA
由 reprex package (v2.0.1)
于 2021-11-25 创建一种data.table方法
使用 setDT()
将其转换为 data.table 对象library(data.table)
setDT(data.predicted)
setDT(data.real)
第一步是融化data.predicted和data.real。
data.predicted <- melt(data.predicted, id.vars = c('datex', 'hourx', 'method'), measure.vars = c('books', 'shirts', 'shoes', 'hats'), variable.name = 'metrics', value.name = 'Value_Pred')
data.real <- melt(data.real, id.vars = c('datex', 'hourx'), measure.vars = c('books', 'shirts', 'shoes', 'hats'), variable.name = 'metrics', value.name = 'Value_Real')
下一步是加入 datex、hourx 和 metrics 上的数据集。假设 data.predicted 应用左连接包含 data.real.
中存在的与日期和小时相关的所有数据点data.predicted <- merge(data.predicted, data.real, by = c('datex', 'hourx', 'metrics'), all.x = TRUE)
最后一步是对每个 datex、指标和方法 值进行关联。
data.predicted <- data.predicted[,.(Cor_Col = cor(Value_Pred, Value_Real)), by = .(datex, metrics, method)]
data.predicted
datex metrics method Cor_Col
1: 2021-11-01 books ARIMA 0.94689829
2: 2021-11-01 books LSTM 0.29430836
3: 2021-11-01 shirts ARIMA 0.99995736
4: 2021-11-01 shirts LSTM 0.53571818
5: 2021-11-01 shoes ARIMA 0.16742475
6: 2021-11-01 shoes LSTM 0.54756105
7: 2021-11-01 hats ARIMA 0.99356061
8: 2021-11-01 hats LSTM 0.08566112
9: 2021-11-02 books ARIMA NA
10: 2021-11-02 books LSTM NA
11: 2021-11-02 shirts ARIMA NA
12: 2021-11-02 shirts LSTM NA
13: 2021-11-02 shoes ARIMA NA
14: 2021-11-02 shoes LSTM NA
15: 2021-11-02 hats ARIMA NA
16: 2021-11-02 hats LSTM NA