在 R 中如何对来自不同数据框的两个长度不等的变量进行相关性分析或简单线性回归
In R how to run Correlation or simple linear Regression between two variables of unequal lengths from different data frames
在 R 中,我想对来自不同数据框的两个变量计算相关性或做简单线性回归: lm(userScoreDF$Score ~ Stock$Adj.Close)
,但由于它们的长度不等,我得到了一个错误。我没有合并数据,因为我不确定如何按日期匹配的方式把这两个变量合并起来。
有没有办法对来自不同数据框的两个长度不等的变量进行相关性分析或简单线性回归?有没有办法按日期匹配这两个变量,并将它们合并到同一个数据框中?这是我的数据:
dput(userScoreDF)
structure(list(Group.date = structure(c(15737, 15746, 15747,
15748, 15749, 15750, 15751, 15752, 15753, 15754, 15755, 15738,
15756, 15757, 15758, 15759, 15760, 15761, 15762, 15763, 15764,
15739, 15740, 15741, 15742, 15743, 15744, 15745, 15765, 15774,
15775, 15776, 15777, 15778, 15779, 15780, 15781, 15782, 15783,
15766, 15784, 15785, 15786, 15787, 15788, 15789, 15790, 15791,
15792, 15793, 15767, 15794, 15795, 15768, 15769, 15770, 15771,
15772, 15773, 15796, 15805, 15806, 15807, 15808, 15809, 15810,
15811, 15812, 15813, 15814, 15797, 15815, 15816, 15817, 15818,
15819, 15820, 15821, 15822, 15823, 15824, 15798, 15825, 15799,
15800, 15801, 15802, 15803, 15804, 15826, 15835, 15836, 15837,
15838, 15839, 15840, 15841, 15842, 15843, 15844, 15827, 15845,
15846, 15847, 15848, 15849, 15850, 15851, 15852, 15853, 15854,
15828, 15855, 15856, 15829, 15830, 15831, 15832, 15833, 15834,
15857, 15866, 15867, 15868, 15869, 15870, 15871, 15872, 15873,
15874, 15875, 15858, 15876, 15877, 15878, 15879, 15880, 15881,
15882, 15883, 15884, 15885, 15859, 15886, 15860, 15861, 15862,
15863, 15864, 15865, 15887, 15896, 15897, 15898, 15899, 15900,
15901, 15902, 15903, 15904, 15905, 15888, 15906, 15907, 15908,
15909, 15910, 15911, 15912, 15913, 15914, 15915, 15889, 15916,
15917, 15890, 15891, 15892, 15893, 15894, 15895, 15918, 15919,
15920), class = "Date"), Score = c(-1.13, -0.93, -1.14, -1.04,
-0.81, -0.64, -1.12, -1.01, -0.6, -0.82, -1.05, -1.34, -0.86,
-0.93, -0.99, -0.9, -0.76, -0.91, -1.03, -0.95, -1.22, -0.74,
-0.95, -0.98, -0.96, -0.97, -0.95, -0.79, -1.27, -0.72, -1.06,
-0.95, -1.05, -1.02, -0.67, -0.9, -0.7, -1.1, -0.95, -1.14, -1.07,
-1.02, -0.88, -0.79, -1.05, -0.97, -0.9, -1.13, -1.05, -0.8,
-0.84, -0.82, -0.53, -0.96, -0.84, -0.95, -0.99, -1.06, -0.98,
-0.91, -0.94, -0.98, -1.03, -0.77, -0.75, -1.17, -1.02, -0.96,
-0.95, -0.81, -0.96, -1.32, -0.9, -1.11, -1.05, -1.08, -0.8,
-1.14, -0.82, -0.92, -0.96, -1.14, -1, -0.96, -1.14, -0.84, -0.83,
-1.13, -1.11, -0.96, -1.06, -0.94, -0.85, -1.21, -0.95, -0.98,
-0.99, -1.15, -1.18, -0.86, -0.9, -1.09, -1.04, -1.05, -1.07,
-1.11, -1.18, -1.07, -0.99, -1.43, -1.02, -0.96, -1.18, -1.05,
-0.88, -0.84, -1.11, -1.15, -1.18, -1.14, -1.4, -1.6, -1.16,
-1.28, -1.33, -1.07, -0.98, -1.24, -0.81, -1.23, -1.05, -0.99,
-1.53, -1.06, -1.26, -1.18, -1.46, -1.25, -1.31, -1.12, -0.98,
-1.08, -1.13, -1.24, -1, -1.3, -1.04, -1.02, -1.19, -1.09, -1.21,
-0.99, -1.07, -1.21, -1.06, -0.96, -1.05, -1.47, -1.52, -1.36,
-1.22, -1.33, -1.36, -1.27, -1.16, -1.36, -1.25, -1.27, -1.3,
-1.04, -0.71, -1.34, -1.19, -1.26, -1.55, -1.53, -1.59, -1.17,
-1, -1.26, -1.14, -1.19, -1.17, -1.12)), .Names = c("Group.date",
"Score"), row.names = c(NA, -184L), class = "data.frame")
dput(Stock)
structure(list(Date = structure(c(15737, 15740, 15741, 15742,
15743, 15744, 15747, 15748, 15749, 15750, 15751, 15755, 15756,
15757, 15758, 15761, 15762, 15763, 15764, 15765, 15768, 15769,
15770, 15771, 15772, 15775, 15776, 15777, 15778, 15779, 15782,
15783, 15784, 15785, 15786, 15789, 15790, 15791, 15792, 15796,
15797, 15798, 15799, 15800, 15803, 15804, 15805, 15806, 15807,
15810, 15811, 15812, 15813, 15814, 15817, 15818, 15819, 15820,
15821, 15824, 15825, 15826, 15827, 15828, 15831, 15832, 15833,
15834, 15835, 15838, 15839, 15840, 15841, 15842, 15845, 15846,
15847, 15848, 15849, 15853, 15854, 15855, 15856, 15859, 15860,
15861, 15862, 15863, 15866, 15867, 15868, 15869, 15870, 15873,
15874, 15875, 15876, 15877, 15880, 15881, 15882, 15883, 15884,
15887, 15888, 15889, 15891, 15894, 15895, 15896, 15897, 15898,
15901, 15902, 15903, 15904, 15905, 15908, 15909, 15910, 15911,
15912, 15915, 15916, 15917, 15918, 15919), class = "Date"), Adj.Close = c(5.69,
5.74, 5.71, 5.77, 5.74, 5.77, 5.79, 5.91, 5.86, 5.87, 5.91, 5.9,
5.79, 5.79, 5.82, 5.73, 5.78, 5.86, 5.8, 5.8, 5.83, 5.87, 5.87,
5.85, 5.88, 5.86, 5.92, 5.88, 5.86, 5.81, 5.87, 6.03, 6.03, 6.06,
6.14, 6.03, 6.05, 6.04, 6.21, 6.25, 6.23, 6.16, 6.21, 6.23, 6.3,
6.28, 6.25, 6.26, 6.22, 7.06, 7.2, 7.09, 7.19, 7.17, 7.17, 7.1,
7.09, 7.14, 7.12, 7.12, 7.05, 7.06, 7.1, 7.15, 7.2, 7.22, 7.32,
7.35, 7.36, 7.18, 7.26, 7.25, 7.28, 7.32, 7.29, 7.39, 7.3, 7.31,
7.33, 7.27, 7.28, 7.34, 7.3, 7.22, 7.26, 7.2, 7.34, 7.24, 7.18,
7.35, 7.35, 7.32, 7.32, 7.22, 7.32, 7, 7.07, 6.97, 6.86, 6.88,
6.97, 6.98, 7.02, 7.07, 7.15, 7.19, 7.16, 7.07, 7.06, 7.18, 6.28,
6.45, 6.72, 6.48, 6.25, 6.05, 6.07, 5.92, 5.85, 5.77, 5.82, 5.74,
5.74, 6.16, 5.96, 6.38, 6.67)), .Names = c("Date", "Adj.Close"
), row.names = c(NA, 127L), class = "data.frame", na.action = structure(128:231, .Names = c("128",
"129", "130", "131", "132", "133", "134", "135", "136", "137",
"138", "139", "140", "141", "142", "143", "144", "145", "146",
"147", "148", "149", "150", "151", "152", "153", "154", "155",
"156", "157", "158", "159", "160", "161", "162", "163", "164",
"165", "166", "167", "168", "169", "170", "171", "172", "173",
"174", "175", "176", "177", "178", "179", "180", "181", "182",
"183", "184", "185", "186", "187", "188", "189", "190", "191",
"192", "193", "194", "195", "196", "197", "198", "199", "200",
"201", "202", "203", "204", "205", "206", "207", "208", "209",
"210", "211", "212", "213", "214", "215", "216", "217", "218",
"219", "220", "221", "222", "223", "224", "225", "226", "227",
"228", "229", "230", "231"), class = "omit"))
沿着各自的日期合并数据框并执行回归:
# Inner-join the two data frames on their date columns (the first column of
# each), then regress Score on Adj.Close using only the matched dates.
M <- merge(x = Stock, y = userScoreDF, by.x = "Date", by.y = "Group.date")
lm(formula = Score ~ Adj.Close, data = M)
或计算相关性:
# Correlation between the date-matched Score and Adj.Close columns of M.
cor(M$Score, M$Adj.Close)
根据您的描述,我通常会说这是个糟糕的主意。但您只是没有说明这两组数据的日期是有重叠的。您只需要把它们合并起来。
在这里,我将您的第一个数据框命名为 x,将您的第二个数据框命名为 y。
# Inner-join x (scores) and y (stock prices) on their date columns. merge()
# keeps only keys present in both inputs by default (all = FALSE), so no
# manual pre-filtering of x is needed.
x2 <- merge(x, y, by.x = "Group.date", by.y = "Date")
# Regress Score on Adj.Close over the matched dates.
lm(Score ~ Adj.Close, data = x2)
当然,更值得问的问题可能是:你为什么要对时间序列数据(即误差存在相关结构的数据)使用 lm?
也就是说,你的做法本身就是错的。不过,嘿,你并没有问你的方法在统计上是否有效。
在 R 中,我想对来自不同数据框的两个变量计算相关性或做简单线性回归: lm(userScoreDF$Score ~ Stock$Adj.Close)
,但由于它们的长度不等,我得到了一个错误。我没有合并数据,因为我不确定如何按日期匹配的方式把这两个变量合并起来。
有没有办法对来自不同数据框的两个长度不等的变量进行相关性分析或简单线性回归?有没有办法按日期匹配这两个变量,并将它们合并到同一个数据框中?这是我的数据:
dput(userScoreDF)
structure(list(Group.date = structure(c(15737, 15746, 15747,
15748, 15749, 15750, 15751, 15752, 15753, 15754, 15755, 15738,
15756, 15757, 15758, 15759, 15760, 15761, 15762, 15763, 15764,
15739, 15740, 15741, 15742, 15743, 15744, 15745, 15765, 15774,
15775, 15776, 15777, 15778, 15779, 15780, 15781, 15782, 15783,
15766, 15784, 15785, 15786, 15787, 15788, 15789, 15790, 15791,
15792, 15793, 15767, 15794, 15795, 15768, 15769, 15770, 15771,
15772, 15773, 15796, 15805, 15806, 15807, 15808, 15809, 15810,
15811, 15812, 15813, 15814, 15797, 15815, 15816, 15817, 15818,
15819, 15820, 15821, 15822, 15823, 15824, 15798, 15825, 15799,
15800, 15801, 15802, 15803, 15804, 15826, 15835, 15836, 15837,
15838, 15839, 15840, 15841, 15842, 15843, 15844, 15827, 15845,
15846, 15847, 15848, 15849, 15850, 15851, 15852, 15853, 15854,
15828, 15855, 15856, 15829, 15830, 15831, 15832, 15833, 15834,
15857, 15866, 15867, 15868, 15869, 15870, 15871, 15872, 15873,
15874, 15875, 15858, 15876, 15877, 15878, 15879, 15880, 15881,
15882, 15883, 15884, 15885, 15859, 15886, 15860, 15861, 15862,
15863, 15864, 15865, 15887, 15896, 15897, 15898, 15899, 15900,
15901, 15902, 15903, 15904, 15905, 15888, 15906, 15907, 15908,
15909, 15910, 15911, 15912, 15913, 15914, 15915, 15889, 15916,
15917, 15890, 15891, 15892, 15893, 15894, 15895, 15918, 15919,
15920), class = "Date"), Score = c(-1.13, -0.93, -1.14, -1.04,
-0.81, -0.64, -1.12, -1.01, -0.6, -0.82, -1.05, -1.34, -0.86,
-0.93, -0.99, -0.9, -0.76, -0.91, -1.03, -0.95, -1.22, -0.74,
-0.95, -0.98, -0.96, -0.97, -0.95, -0.79, -1.27, -0.72, -1.06,
-0.95, -1.05, -1.02, -0.67, -0.9, -0.7, -1.1, -0.95, -1.14, -1.07,
-1.02, -0.88, -0.79, -1.05, -0.97, -0.9, -1.13, -1.05, -0.8,
-0.84, -0.82, -0.53, -0.96, -0.84, -0.95, -0.99, -1.06, -0.98,
-0.91, -0.94, -0.98, -1.03, -0.77, -0.75, -1.17, -1.02, -0.96,
-0.95, -0.81, -0.96, -1.32, -0.9, -1.11, -1.05, -1.08, -0.8,
-1.14, -0.82, -0.92, -0.96, -1.14, -1, -0.96, -1.14, -0.84, -0.83,
-1.13, -1.11, -0.96, -1.06, -0.94, -0.85, -1.21, -0.95, -0.98,
-0.99, -1.15, -1.18, -0.86, -0.9, -1.09, -1.04, -1.05, -1.07,
-1.11, -1.18, -1.07, -0.99, -1.43, -1.02, -0.96, -1.18, -1.05,
-0.88, -0.84, -1.11, -1.15, -1.18, -1.14, -1.4, -1.6, -1.16,
-1.28, -1.33, -1.07, -0.98, -1.24, -0.81, -1.23, -1.05, -0.99,
-1.53, -1.06, -1.26, -1.18, -1.46, -1.25, -1.31, -1.12, -0.98,
-1.08, -1.13, -1.24, -1, -1.3, -1.04, -1.02, -1.19, -1.09, -1.21,
-0.99, -1.07, -1.21, -1.06, -0.96, -1.05, -1.47, -1.52, -1.36,
-1.22, -1.33, -1.36, -1.27, -1.16, -1.36, -1.25, -1.27, -1.3,
-1.04, -0.71, -1.34, -1.19, -1.26, -1.55, -1.53, -1.59, -1.17,
-1, -1.26, -1.14, -1.19, -1.17, -1.12)), .Names = c("Group.date",
"Score"), row.names = c(NA, -184L), class = "data.frame")
dput(Stock)
structure(list(Date = structure(c(15737, 15740, 15741, 15742,
15743, 15744, 15747, 15748, 15749, 15750, 15751, 15755, 15756,
15757, 15758, 15761, 15762, 15763, 15764, 15765, 15768, 15769,
15770, 15771, 15772, 15775, 15776, 15777, 15778, 15779, 15782,
15783, 15784, 15785, 15786, 15789, 15790, 15791, 15792, 15796,
15797, 15798, 15799, 15800, 15803, 15804, 15805, 15806, 15807,
15810, 15811, 15812, 15813, 15814, 15817, 15818, 15819, 15820,
15821, 15824, 15825, 15826, 15827, 15828, 15831, 15832, 15833,
15834, 15835, 15838, 15839, 15840, 15841, 15842, 15845, 15846,
15847, 15848, 15849, 15853, 15854, 15855, 15856, 15859, 15860,
15861, 15862, 15863, 15866, 15867, 15868, 15869, 15870, 15873,
15874, 15875, 15876, 15877, 15880, 15881, 15882, 15883, 15884,
15887, 15888, 15889, 15891, 15894, 15895, 15896, 15897, 15898,
15901, 15902, 15903, 15904, 15905, 15908, 15909, 15910, 15911,
15912, 15915, 15916, 15917, 15918, 15919), class = "Date"), Adj.Close = c(5.69,
5.74, 5.71, 5.77, 5.74, 5.77, 5.79, 5.91, 5.86, 5.87, 5.91, 5.9,
5.79, 5.79, 5.82, 5.73, 5.78, 5.86, 5.8, 5.8, 5.83, 5.87, 5.87,
5.85, 5.88, 5.86, 5.92, 5.88, 5.86, 5.81, 5.87, 6.03, 6.03, 6.06,
6.14, 6.03, 6.05, 6.04, 6.21, 6.25, 6.23, 6.16, 6.21, 6.23, 6.3,
6.28, 6.25, 6.26, 6.22, 7.06, 7.2, 7.09, 7.19, 7.17, 7.17, 7.1,
7.09, 7.14, 7.12, 7.12, 7.05, 7.06, 7.1, 7.15, 7.2, 7.22, 7.32,
7.35, 7.36, 7.18, 7.26, 7.25, 7.28, 7.32, 7.29, 7.39, 7.3, 7.31,
7.33, 7.27, 7.28, 7.34, 7.3, 7.22, 7.26, 7.2, 7.34, 7.24, 7.18,
7.35, 7.35, 7.32, 7.32, 7.22, 7.32, 7, 7.07, 6.97, 6.86, 6.88,
6.97, 6.98, 7.02, 7.07, 7.15, 7.19, 7.16, 7.07, 7.06, 7.18, 6.28,
6.45, 6.72, 6.48, 6.25, 6.05, 6.07, 5.92, 5.85, 5.77, 5.82, 5.74,
5.74, 6.16, 5.96, 6.38, 6.67)), .Names = c("Date", "Adj.Close"
), row.names = c(NA, 127L), class = "data.frame", na.action = structure(128:231, .Names = c("128",
"129", "130", "131", "132", "133", "134", "135", "136", "137",
"138", "139", "140", "141", "142", "143", "144", "145", "146",
"147", "148", "149", "150", "151", "152", "153", "154", "155",
"156", "157", "158", "159", "160", "161", "162", "163", "164",
"165", "166", "167", "168", "169", "170", "171", "172", "173",
"174", "175", "176", "177", "178", "179", "180", "181", "182",
"183", "184", "185", "186", "187", "188", "189", "190", "191",
"192", "193", "194", "195", "196", "197", "198", "199", "200",
"201", "202", "203", "204", "205", "206", "207", "208", "209",
"210", "211", "212", "213", "214", "215", "216", "217", "218",
"219", "220", "221", "222", "223", "224", "225", "226", "227",
"228", "229", "230", "231"), class = "omit"))
沿着各自的日期合并数据框并执行回归:
# Inner-join the two data frames on their date columns (the first column of
# each), then regress Score on Adj.Close using only the matched dates.
M <- merge(x = Stock, y = userScoreDF, by.x = "Date", by.y = "Group.date")
lm(formula = Score ~ Adj.Close, data = M)
或计算相关性:
# Correlation between the date-matched Score and Adj.Close columns of M.
cor(M$Score, M$Adj.Close)
根据您的描述,我通常会说这是个糟糕的主意。但您只是没有说明这两组数据的日期是有重叠的。您只需要把它们合并起来。
在这里,我将您的第一个数据框命名为 x,将您的第二个数据框命名为 y。
# Inner-join x (scores) and y (stock prices) on their date columns. merge()
# keeps only keys present in both inputs by default (all = FALSE), so no
# manual pre-filtering of x is needed.
x2 <- merge(x, y, by.x = "Group.date", by.y = "Date")
# Regress Score on Adj.Close over the matched dates.
lm(Score ~ Adj.Close, data = x2)
当然,更值得问的问题可能是:你为什么要对时间序列数据(即误差存在相关结构的数据)使用 lm?
也就是说,你的做法本身就是错的。不过,嘿,你并没有问你的方法在统计上是否有效。