'data' 必须是矢量类型,'NULL' R - PCA 和 as.matrix
'data' must be of a vector type, was 'NULL' R - PCA and as.matrix
我正在做一个能找到的最简单的示例:对 iris 数据框做 PCA,但在处理 PCA 矩阵时总是得到同样的错误:
iris_tbl <- copy_to(sc, iris, "iris", overwrite = TRUE)
> pca_model <- tbl(sc, "iris") %>%
+ select(-Species) %>%
+ ml_pca()
> print(pca_model)
Explained variance:
PC1 PC2 PC3 PC4
0.924618723 0.053066483 0.017102610 0.005212184
Rotation:
PC1 PC2 PC3 PC4
Sepal_Length -0.36138659 -0.65658877 0.58202985 0.3154872
Sepal_Width 0.08452251 -0.73016143 -0.59791083 -0.3197231
Petal_Length -0.85667061 0.17337266 -0.07623608 -0.4798390
Petal_Width -0.35828920 0.07548102 -0.54583143 0.7536574
> D <- as.matrix(iris[1:4])
> E <- as.matrix(pca_model$components)
Error in array(x, c(length(x), 1L), if (!is.null(names(x))) list(names(x), :
'data' must be of a vector type, was 'NULL'
谁能指出错误在哪里?我想不通。
谢谢
对您的问题的简短回答是: ml_pca 返回的是模型对象,而不是结果对象(这些并不是严格的官方术语)。只要检查一下 pca_model(例如 str(pca_model))就能看到这一点。举例来说,可以把 pca_model 看作更接近 lm 的返回值,而不是 prcomp 的返回值
...基本上,您需要做的是用训练时所用的同一份数据,让模型进行 'predict'(我把它放在引号中而不是反引号中,因为在这种情况下不能使用 ml_predict,原因不明),从而得到您想要的输出。对于 ml_pca 模型(ml_pca_models),有两个方便的包装函数: 先用 tidy,再用 augment,就能得到您需要的结果。 注意:我也不清楚使用者该如何知道 augment 表示预测、tidy 表示收集主成分。
不确定您想要的是主成分(即载荷)还是旋转结果,所以两者我都给出了。
# End-to-end sparklyr PCA example on the iris data set: fit the model,
# inspect the returned model object, then extract the loadings and the
# per-row principal-component values.

# Install the required packages (typically only needed once per environment).
install.packages("Rcpp")
install.packages("sparklyr")
library(sparklyr)
library(dplyr)
# Open the Spark connection used by every step below.
sc <- spark_connect(method="databricks") ## change this for your cluster/Spark deployment
# Copy the local iris data frame into Spark as table "iris", replacing any
# existing table of that name.
iris_tbl <- copy_to(sc, iris, "iris", overwrite = TRUE)
# Fit PCA on every column except Species.
pca_model <- tbl(sc, "iris") %>%
select(-Species) %>%
ml_pca()
print(pca_model)
# Explained variance:
#
# PC1 PC2 PC3 PC4
# 0.924618723 0.053066483 0.017102610 0.005212184
#
# Rotation:
# PC1 PC2 PC3 PC4
# Sepal_Length -0.36138659 -0.65658877 0.58202985 0.3154872
# Sepal_Width 0.08452251 -0.73016143 -0.59791083 -0.3197231
# Petal_Length -0.85667061 0.17337266 -0.07623608 -0.4798390
# Petal_Width -0.35828920 0.07548102 -0.54583143 0.7536574
# ml_pca() returns a model object (class "ml_model_pca"), not a plain matrix.
class(pca_model)
#[1] "ml_model_pca" "ml_model"
# Inspect the structure of the model object (the output recorded below is
# truncated).
str(pca_model)
#List of 8
# $ pipeline_model :List of 5
# ..$ uid : chr "pipeline_9bc1b484009"
# ..$ param_map : Named list()
# ..$ stages :List of 2
# .. ..$ :List of 3
# .. .. ..$ uid : chr "vector_assembler_9bc188edeed"
# .. .. ..$ param_map:List of 3
# .. .. .. ..$ input_cols :List of 4
# .. .. .. .. ..$ : chr "Sepal_Length"
# .. .. .. .. ..$ : chr "Sepal_Width"
# .. .. .. .. ..$ : chr "Petal_Length"
# .. .. .. .. ..$ : chr "Petal_Width"
# .. .. .. ..$ output_col : chr "assembled9bc3ab7e7e1"
# .. .. .. ..$ handle_invalid: chr "error"
# .. .. ..$ .jobj :Classes 'spark_jobj', 'shell_jobj'
# .. .. ..- attr(*, "class")= chr [1:3] "ml_vector_assembler" "ml_transformer" "ml_pipeline_stage"
# .. ..$ :List of 5
# .. .. ..$ uid : chr "pca_9bc60d84696"
# tidy() returns the loadings as a tibble, one row per feature; the values
# match the "Rotation:" matrix printed above (rounded for display).
loadings <- tidy(pca_model)
loadings
# A tibble: 4 x 5
# features PC1 PC2 PC3 PC4
#
#1 Sepal_Length -0.361 -0.657 0.582 0.315
#2 Sepal_Width 0.0845 -0.730 -0.598 -0.320
#3 Petal_Length -0.857 0.173 -0.0762 -0.480
#4 Petal_Width -0.358 0.0755 -0.546 0.754
# augment() applies the fitted model to the supplied data and appends the
# PC1..PC4 columns to the original columns; collect() then brings the result
# into a local tibble.
# NOTE(review): despite its name, `rot` appears to hold the per-observation
# projections (scores), not the rotation matrix -- confirm intended naming.
rot <- augment(pca_model, iris_tbl) %>% collect() # augment predicts given a model and "new" data.
rot
# A tibble: 150 x 9
# Sepal_Length Sepal_Width Petal_Length Petal_Width Species PC1 PC2 PC3
#
# 1 5.1 3.5 1.4 0.2 setosa -2.82 -5.65 0.660
# 2 4.9 3 1.4 0.2 setosa -2.79 -5.15 0.842
# 3 4.7 3.2 1.3 0.2 setosa -2.61 -5.18 0.614
# 4 4.6 3.1 1.5 0.2 setosa -2.76 -5.01 0.600
# 5 5 3.6 1.4 0.2 setosa -2.77 -5.65 0.542
# 6 5.4 3.9 1.7 0.4 setosa -3.22 -6.07 0.463
# 7 4.6 3.4 1.4 0.3 setosa -2.68 -5.24 0.374
# 8 5 3.4 1.5 0.2 setosa -2.88 -5.49 0.654
# 9 4.4 2.9 1.4 0.2 setosa -2.62 -4.75 0.611
#10 4.9 3.1 1.5 0.1 setosa -2.83 -5.21 0.829
# ... with 140 more rows, and 1 more variable: PC4
我正在做一个能找到的最简单的示例:对 iris 数据框做 PCA,但在处理 PCA 矩阵时总是得到同样的错误:
iris_tbl <- copy_to(sc, iris, "iris", overwrite = TRUE)
> pca_model <- tbl(sc, "iris") %>%
+ select(-Species) %>%
+ ml_pca()
> print(pca_model)
Explained variance:
PC1 PC2 PC3 PC4
0.924618723 0.053066483 0.017102610 0.005212184
Rotation:
PC1 PC2 PC3 PC4
Sepal_Length -0.36138659 -0.65658877 0.58202985 0.3154872
Sepal_Width 0.08452251 -0.73016143 -0.59791083 -0.3197231
Petal_Length -0.85667061 0.17337266 -0.07623608 -0.4798390
Petal_Width -0.35828920 0.07548102 -0.54583143 0.7536574
> D <- as.matrix(iris[1:4])
> E <- as.matrix(pca_model$components)
Error in array(x, c(length(x), 1L), if (!is.null(names(x))) list(names(x), :
'data' must be of a vector type, was 'NULL'
谁能指出错误在哪里?我想不通。 谢谢
对您的问题的简短回答是: ml_pca 返回的是模型对象,而不是结果对象(这些并不是严格的官方术语)。只要检查一下 pca_model(例如 str(pca_model))就能看到这一点。举例来说,可以把 pca_model 看作更接近 lm 的返回值,而不是 prcomp 的返回值
...基本上,您需要做的是用训练时所用的同一份数据,让模型进行 'predict'(我把它放在引号中而不是反引号中,因为在这种情况下不能使用 ml_predict,原因不明),从而得到您想要的输出。对于 ml_pca 模型(ml_pca_models),有两个方便的包装函数: 先用 tidy,再用 augment,就能得到您需要的结果。 注意:我也不清楚使用者该如何知道 augment 表示预测、tidy 表示收集主成分。
不确定您想要的是主成分(即载荷)还是旋转结果,所以两者我都给出了。
# End-to-end sparklyr PCA example on the iris data set: fit the model,
# inspect the returned model object, then extract the loadings and the
# per-row principal-component values.

# Install the required packages (typically only needed once per environment).
install.packages("Rcpp")
install.packages("sparklyr")
library(sparklyr)
library(dplyr)
# Open the Spark connection used by every step below.
sc <- spark_connect(method="databricks") ## change this for your cluster/Spark deployment
# Copy the local iris data frame into Spark as table "iris", replacing any
# existing table of that name.
iris_tbl <- copy_to(sc, iris, "iris", overwrite = TRUE)
# Fit PCA on every column except Species.
pca_model <- tbl(sc, "iris") %>%
select(-Species) %>%
ml_pca()
print(pca_model)
# Explained variance:
#
# PC1 PC2 PC3 PC4
# 0.924618723 0.053066483 0.017102610 0.005212184
#
# Rotation:
# PC1 PC2 PC3 PC4
# Sepal_Length -0.36138659 -0.65658877 0.58202985 0.3154872
# Sepal_Width 0.08452251 -0.73016143 -0.59791083 -0.3197231
# Petal_Length -0.85667061 0.17337266 -0.07623608 -0.4798390
# Petal_Width -0.35828920 0.07548102 -0.54583143 0.7536574
# ml_pca() returns a model object (class "ml_model_pca"), not a plain matrix.
class(pca_model)
#[1] "ml_model_pca" "ml_model"
# Inspect the structure of the model object (the output recorded below is
# truncated).
str(pca_model)
#List of 8
# $ pipeline_model :List of 5
# ..$ uid : chr "pipeline_9bc1b484009"
# ..$ param_map : Named list()
# ..$ stages :List of 2
# .. ..$ :List of 3
# .. .. ..$ uid : chr "vector_assembler_9bc188edeed"
# .. .. ..$ param_map:List of 3
# .. .. .. ..$ input_cols :List of 4
# .. .. .. .. ..$ : chr "Sepal_Length"
# .. .. .. .. ..$ : chr "Sepal_Width"
# .. .. .. .. ..$ : chr "Petal_Length"
# .. .. .. .. ..$ : chr "Petal_Width"
# .. .. .. ..$ output_col : chr "assembled9bc3ab7e7e1"
# .. .. .. ..$ handle_invalid: chr "error"
# .. .. ..$ .jobj :Classes 'spark_jobj', 'shell_jobj'
# .. .. ..- attr(*, "class")= chr [1:3] "ml_vector_assembler" "ml_transformer" "ml_pipeline_stage"
# .. ..$ :List of 5
# .. .. ..$ uid : chr "pca_9bc60d84696"
# tidy() returns the loadings as a tibble, one row per feature; the values
# match the "Rotation:" matrix printed above (rounded for display).
loadings <- tidy(pca_model)
loadings
# A tibble: 4 x 5
# features PC1 PC2 PC3 PC4
#
#1 Sepal_Length -0.361 -0.657 0.582 0.315
#2 Sepal_Width 0.0845 -0.730 -0.598 -0.320
#3 Petal_Length -0.857 0.173 -0.0762 -0.480
#4 Petal_Width -0.358 0.0755 -0.546 0.754
# augment() applies the fitted model to the supplied data and appends the
# PC1..PC4 columns to the original columns; collect() then brings the result
# into a local tibble.
# NOTE(review): despite its name, `rot` appears to hold the per-observation
# projections (scores), not the rotation matrix -- confirm intended naming.
rot <- augment(pca_model, iris_tbl) %>% collect() # augment predicts given a model and "new" data.
rot
# A tibble: 150 x 9
# Sepal_Length Sepal_Width Petal_Length Petal_Width Species PC1 PC2 PC3
#
# 1 5.1 3.5 1.4 0.2 setosa -2.82 -5.65 0.660
# 2 4.9 3 1.4 0.2 setosa -2.79 -5.15 0.842
# 3 4.7 3.2 1.3 0.2 setosa -2.61 -5.18 0.614
# 4 4.6 3.1 1.5 0.2 setosa -2.76 -5.01 0.600
# 5 5 3.6 1.4 0.2 setosa -2.77 -5.65 0.542
# 6 5.4 3.9 1.7 0.4 setosa -3.22 -6.07 0.463
# 7 4.6 3.4 1.4 0.3 setosa -2.68 -5.24 0.374
# 8 5 3.4 1.5 0.2 setosa -2.88 -5.49 0.654
# 9 4.4 2.9 1.4 0.2 setosa -2.62 -4.75 0.611
#10 4.9 3.1 1.5 0.1 setosa -2.83 -5.21 0.829
# ... with 140 more rows, and 1 more variable: PC4