In R how to run Correlation or simple linear Regression between two variables of unequal lengths from different data frames

In R, I want to run a correlation or simple linear regression between two variables from different data frames, lm(userScoreDF$Score ~ Stock$Adj.Close), but because they are of unequal lengths I get an error. I have not merged the data because I am not sure how to merge it in a way that matches the two variables by date.

Is there a way to run a correlation or simple linear regression between two variables of unequal lengths from different data frames? Is there a way to combine the variables into one data frame so that they are matched by date? Here is my data:

    dput(userScoreDF)
structure(list(Group.date = structure(c(15737, 15746, 15747, 
15748, 15749, 15750, 15751, 15752, 15753, 15754, 15755, 15738, 
15756, 15757, 15758, 15759, 15760, 15761, 15762, 15763, 15764, 
15739, 15740, 15741, 15742, 15743, 15744, 15745, 15765, 15774, 
15775, 15776, 15777, 15778, 15779, 15780, 15781, 15782, 15783, 
15766, 15784, 15785, 15786, 15787, 15788, 15789, 15790, 15791, 
15792, 15793, 15767, 15794, 15795, 15768, 15769, 15770, 15771, 
15772, 15773, 15796, 15805, 15806, 15807, 15808, 15809, 15810, 
15811, 15812, 15813, 15814, 15797, 15815, 15816, 15817, 15818, 
15819, 15820, 15821, 15822, 15823, 15824, 15798, 15825, 15799, 
15800, 15801, 15802, 15803, 15804, 15826, 15835, 15836, 15837, 
15838, 15839, 15840, 15841, 15842, 15843, 15844, 15827, 15845, 
15846, 15847, 15848, 15849, 15850, 15851, 15852, 15853, 15854, 
15828, 15855, 15856, 15829, 15830, 15831, 15832, 15833, 15834, 
15857, 15866, 15867, 15868, 15869, 15870, 15871, 15872, 15873, 
15874, 15875, 15858, 15876, 15877, 15878, 15879, 15880, 15881, 
15882, 15883, 15884, 15885, 15859, 15886, 15860, 15861, 15862, 
15863, 15864, 15865, 15887, 15896, 15897, 15898, 15899, 15900, 
15901, 15902, 15903, 15904, 15905, 15888, 15906, 15907, 15908, 
15909, 15910, 15911, 15912, 15913, 15914, 15915, 15889, 15916, 
15917, 15890, 15891, 15892, 15893, 15894, 15895, 15918, 15919, 
15920), class = "Date"), Score = c(-1.13, -0.93, -1.14, -1.04, 
-0.81, -0.64, -1.12, -1.01, -0.6, -0.82, -1.05, -1.34, -0.86, 
-0.93, -0.99, -0.9, -0.76, -0.91, -1.03, -0.95, -1.22, -0.74, 
-0.95, -0.98, -0.96, -0.97, -0.95, -0.79, -1.27, -0.72, -1.06, 
-0.95, -1.05, -1.02, -0.67, -0.9, -0.7, -1.1, -0.95, -1.14, -1.07, 
-1.02, -0.88, -0.79, -1.05, -0.97, -0.9, -1.13, -1.05, -0.8, 
-0.84, -0.82, -0.53, -0.96, -0.84, -0.95, -0.99, -1.06, -0.98, 
-0.91, -0.94, -0.98, -1.03, -0.77, -0.75, -1.17, -1.02, -0.96, 
-0.95, -0.81, -0.96, -1.32, -0.9, -1.11, -1.05, -1.08, -0.8, 
-1.14, -0.82, -0.92, -0.96, -1.14, -1, -0.96, -1.14, -0.84, -0.83, 
-1.13, -1.11, -0.96, -1.06, -0.94, -0.85, -1.21, -0.95, -0.98, 
-0.99, -1.15, -1.18, -0.86, -0.9, -1.09, -1.04, -1.05, -1.07, 
-1.11, -1.18, -1.07, -0.99, -1.43, -1.02, -0.96, -1.18, -1.05, 
-0.88, -0.84, -1.11, -1.15, -1.18, -1.14, -1.4, -1.6, -1.16, 
-1.28, -1.33, -1.07, -0.98, -1.24, -0.81, -1.23, -1.05, -0.99, 
-1.53, -1.06, -1.26, -1.18, -1.46, -1.25, -1.31, -1.12, -0.98, 
-1.08, -1.13, -1.24, -1, -1.3, -1.04, -1.02, -1.19, -1.09, -1.21, 
-0.99, -1.07, -1.21, -1.06, -0.96, -1.05, -1.47, -1.52, -1.36, 
-1.22, -1.33, -1.36, -1.27, -1.16, -1.36, -1.25, -1.27, -1.3, 
-1.04, -0.71, -1.34, -1.19, -1.26, -1.55, -1.53, -1.59, -1.17, 
-1, -1.26, -1.14, -1.19, -1.17, -1.12)), .Names = c("Group.date", 
"Score"), row.names = c(NA, -184L), class = "data.frame")

    dput(Stock)
structure(list(Date = structure(c(15737, 15740, 15741, 15742, 
15743, 15744, 15747, 15748, 15749, 15750, 15751, 15755, 15756, 
15757, 15758, 15761, 15762, 15763, 15764, 15765, 15768, 15769, 
15770, 15771, 15772, 15775, 15776, 15777, 15778, 15779, 15782, 
15783, 15784, 15785, 15786, 15789, 15790, 15791, 15792, 15796, 
15797, 15798, 15799, 15800, 15803, 15804, 15805, 15806, 15807, 
15810, 15811, 15812, 15813, 15814, 15817, 15818, 15819, 15820, 
15821, 15824, 15825, 15826, 15827, 15828, 15831, 15832, 15833, 
15834, 15835, 15838, 15839, 15840, 15841, 15842, 15845, 15846, 
15847, 15848, 15849, 15853, 15854, 15855, 15856, 15859, 15860, 
15861, 15862, 15863, 15866, 15867, 15868, 15869, 15870, 15873, 
15874, 15875, 15876, 15877, 15880, 15881, 15882, 15883, 15884, 
15887, 15888, 15889, 15891, 15894, 15895, 15896, 15897, 15898, 
15901, 15902, 15903, 15904, 15905, 15908, 15909, 15910, 15911, 
15912, 15915, 15916, 15917, 15918, 15919), class = "Date"), Adj.Close = c(5.69, 
5.74, 5.71, 5.77, 5.74, 5.77, 5.79, 5.91, 5.86, 5.87, 5.91, 5.9, 
5.79, 5.79, 5.82, 5.73, 5.78, 5.86, 5.8, 5.8, 5.83, 5.87, 5.87, 
5.85, 5.88, 5.86, 5.92, 5.88, 5.86, 5.81, 5.87, 6.03, 6.03, 6.06, 
6.14, 6.03, 6.05, 6.04, 6.21, 6.25, 6.23, 6.16, 6.21, 6.23, 6.3, 
6.28, 6.25, 6.26, 6.22, 7.06, 7.2, 7.09, 7.19, 7.17, 7.17, 7.1, 
7.09, 7.14, 7.12, 7.12, 7.05, 7.06, 7.1, 7.15, 7.2, 7.22, 7.32, 
7.35, 7.36, 7.18, 7.26, 7.25, 7.28, 7.32, 7.29, 7.39, 7.3, 7.31, 
7.33, 7.27, 7.28, 7.34, 7.3, 7.22, 7.26, 7.2, 7.34, 7.24, 7.18, 
7.35, 7.35, 7.32, 7.32, 7.22, 7.32, 7, 7.07, 6.97, 6.86, 6.88, 
6.97, 6.98, 7.02, 7.07, 7.15, 7.19, 7.16, 7.07, 7.06, 7.18, 6.28, 
6.45, 6.72, 6.48, 6.25, 6.05, 6.07, 5.92, 5.85, 5.77, 5.82, 5.74, 
5.74, 6.16, 5.96, 6.38, 6.67)), .Names = c("Date", "Adj.Close"
), row.names = c(NA, 127L), class = "data.frame", na.action = structure(128:231, .Names = c("128", 
"129", "130", "131", "132", "133", "134", "135", "136", "137", 
"138", "139", "140", "141", "142", "143", "144", "145", "146", 
"147", "148", "149", "150", "151", "152", "153", "154", "155", 
"156", "157", "158", "159", "160", "161", "162", "163", "164", 
"165", "166", "167", "168", "169", "170", "171", "172", "173", 
"174", "175", "176", "177", "178", "179", "180", "181", "182", 
"183", "184", "185", "186", "187", "188", "189", "190", "191", 
"192", "193", "194", "195", "196", "197", "198", "199", "200", 
"201", "202", "203", "204", "205", "206", "207", "208", "209", 
"210", "211", "212", "213", "214", "215", "216", "217", "218", 
"219", "220", "221", "222", "223", "224", "225", "226", "227", 
"228", "229", "230", "231"), class = "omit"))

Merge the data frames on their respective date columns and run the regression (by = 1 merges on the first column of each data frame, i.e. Date and Group.date):

M <- merge(Stock, userScoreDF, by = 1)
lm(Score ~ Adj.Close, M)

Or compute the correlation:

with(M, cor(Score, Adj.Close))
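
As a usage sketch (the merged data frame M is from above; the object name fit is just illustrative, and nrow(), summary(), and cor.test() are base R), you can check how many dates actually matched and inspect the fit:

    M <- merge(Stock, userScoreDF, by = 1)   # inner join on the first column (Date / Group.date)
    nrow(M)                                  # number of dates present in both data frames
    fit <- lm(Score ~ Adj.Close, data = M)
    summary(fit)                             # coefficients, R-squared, p-values
    with(M, cor.test(Score, Adj.Close))      # correlation estimate with a significance test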

From your description alone I would normally say this is a bad idea, but you simply neglected to mention that the two data frames have overlapping dates. You just need to merge them.

Here I have named your first df x and your second df y:

x2 <- merge(x[which(x$Group.date %in% y$Date), ], y, by.x = "Group.date", by.y = "Date")
lm(Score ~ Adj.Close, data = x2)
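
One note on this merge: merge() already performs an inner join by default (all = FALSE), so the which(... %in% ...) subsetting is not strictly necessary; a minimal equivalent sketch with the same assumed names x and y:

    # merge keeps only rows whose dates appear in both data frames
    x2 <- merge(x, y, by.x = "Group.date", by.y = "Date")
    fit2 <- lm(Score ~ Adj.Close, data = x2)   # fit2 is just an illustrative name
    summary(fit2)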

Of course, the better question might be why you are using lm on time-series data at all (i.e., data with a correlated error structure); that is to say, you're doing it wrong. But hey, you didn't ask about the statistical validity of your approach.
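
If you do want a quick check on that concern, here is a minimal sketch assuming the x2 model from above (acf() and residuals() are base R; dwtest() comes from the lmtest package, which would need to be installed separately):

    fit <- lm(Score ~ Adj.Close, data = x2)
    acf(residuals(fit))            # visual check for autocorrelation in the residuals
    # install.packages("lmtest")   # assumed available; install if needed
    lmtest::dwtest(fit)            # Durbin-Watson test for serial correlation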