Kernel PCA Implementation in Julia
I am trying to implement kernel principal component analysis (kernel PCA) in a Julia notebook. More specifically, I am trying to replicate the process from this tutorial: https://sebastianraschka.com/Articles/2014_kernel_pca.html#References
However, the tutorial is in Python, so I am having trouble reproducing the method in Julia.
Here is the Julia code I have so far:
using LinearAlgebra, CSV, Plots, DataFrames
function sq_norm(X, rows, cols)
    # X is a rows x cols matrix; returns the rows x rows matrix of
    # pairwise squared Euclidean distances between the rows of X
    result = zeros(rows, rows)
    for i in 1:rows
        for j in 1:rows
            s = 0.0
            for k in 1:cols
                s += (X[i, k] - X[j, k])^2   # accumulate; `s = ...` kept only the last term
            end
            result[i, j] = s
        end
    end
    return result
end
function kernel_mat_maker(gamma, data, rows)
    # data must be the square symmetric matrix of squared distances;
    # applies the RBF kernel k(x, y) = exp(-gamma * ||x - y||^2) elementwise
    result = zeros(rows, rows)
    for i in 1:rows
        for j in 1:rows
            result[i, j] = exp(-gamma * data[i, j])
        end
    end
    return result
end
function center_k(K, rows)
    # center the kernel matrix in feature space:
    # K' = K - 1_N K - K 1_N + 1_N K 1_N, where 1_N is an N x N matrix of 1/N
    one_N = ones(rows, rows) / rows
    return K - one_N*K - K*one_N + one_N*K*one_N
end
function data_splitter(data, flags)
    # data: a table with N rows; flags: an N-vector of 1's and 0's
    # rows flagged 1 go to output1, the rest to output2
    output1 = similar(data, 0)   # empty DataFrames with the same columns
    output2 = similar(data, 0)
    for i in 1:size(data, 1)
        if flags[i] == 1
            push!(output1, data[i, :])   # append; `output1 = data[i, :]` overwrote it each pass
        else
            push!(output2, data[i, :])
        end
    end
    return output1, output2
end
# raw strings are needed: "\U" in a normal string literal is an invalid escape,
# and recent CSV.jl requires a sink argument such as DataFrame
# data1 = CSV.read(raw"C:\Users\JP-27\Desktop\X1data.csv", DataFrame; header=false)
# data2 = CSV.read(raw"C:\Users\JP-27\Desktop\X2data.csv", DataFrame; header=false)
data = CSV.read(raw"C:\Users\JP-27\Desktop\data.csv", DataFrame; header=true)
gdf = groupby(data, :a)
plot(gdf[1].x, gdf[1].y, seriestype=:scatter, legend=nothing)
plot!(gdf[2].x, gdf[2].y, seriestype=:scatter)
# labels = data.a               # class labels, usable with data_splitter
newData = select(data, 2:3)     # keep the x and y columns; column 1 (:a) is the class label
X = Matrix(newData)             # plain 100x2 matrix for the numeric routines below
# CSV.write(raw"C:\Users\JP-27\Desktop\ju_data_preprocessing.csv", newData)
step1 = sq_norm(X, 100, 2)               # 100x100 pairwise squared distances (on X, not the labeled DataFrame)
# CSV.write(raw"C:\Users\JP-27\Desktop\ju_sq_dists.csv", DataFrame(step1, :auto))
step2 = kernel_mat_maker(15, step1, 100) # RBF kernel matrix, gamma = 15 as in the tutorial
# CSV.write(raw"C:\Users\JP-27\Desktop\ju_mat_sq_dists.csv", DataFrame(step2, :auto))
step3 = center_k(step2, 100)             # centered kernel matrix
# CSV.write(raw"C:\Users\JP-27\Desktop\juliaK.csv", DataFrame(step3, :auto))
# wrapping in Symmetric guarantees real eigenpairs, with eigenvalues sorted ascending
e_vals = eigvals(Symmetric(step3))
e_vcts = eigvecs(Symmetric(step3))
# CSV.write(raw"C:\Users\JP-27\Desktop\juliaE_vcts.csv", DataFrame(e_vcts, :auto))
# the top two principal components are the eigenvectors of the two largest
# eigenvalues, i.e. the last two columns
result = DataFrame(e_vcts[:, 99:100], :auto)
# Alternative: process the two 50-point classes separately
# step11 = sq_norm(Matrix(data1), 50, 2)
# step12 = kernel_mat_maker(15, step11, 50)
# step13 = center_k(step12, 50)
# step21 = sq_norm(Matrix(data2), 50, 2)
# step22 = kernel_mat_maker(15, step21, 50)
# step23 = center_k(step22, 50)
# vals1 = eigvals(Symmetric(step13))
# vals2 = eigvals(Symmetric(step23))
# evects1 = eigvecs(Symmetric(step13))
# evects2 = eigvecs(Symmetric(step23))
# now extract the last two columns (top two components) of the two 50x50 eigenvector matrices
# dataevect1 = DataFrame(evects1[:, 49:50], :auto)
# dataevect2 = DataFrame(evects2[:, 49:50], :auto)
# plot(dataevect1[:, 1], dataevect1[:, 2], seriestype = :scatter, title = "My Scatter Plot")
# plot!(dataevect2[:, 1], dataevect2[:, 2], seriestype = :scatter)
Can anyone help me correct the implementation above? If you know of a simpler way to do this that does not involve calling a ready-made kernel PCA function, that would also be very helpful.
For your information, a kernel PCA method is implemented in MultivariateStats (https://multivariatestatsjl.readthedocs.io/en/stable/kpca.html).
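A minimal sketch of calling it, assuming the API from the linked docs (fit with a kernel keyword, then transform) and reusing the gamma = 15 and the 100x2 matrix X from your script:

using MultivariateStats

γ = 15.0
rbf = (x, y) -> exp(-γ * sum(abs2, x - y))   # same RBF kernel the manual code builds

Xcols = permutedims(X)                        # MultivariateStats expects observations as columns (2x100)
M = fit(KernelPCA, Xcols; maxoutdim=2, kernel=rbf)
Y = transform(M, Xcols)                       # 2x100 matrix of projected points

transform returns the projections directly, so there is no need to pick out eigenvector columns by hand.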
If you are interested in the details, here is a from-scratch implementation:
https://github.com/Alexander-Barth/MachineLearningNotebooks/blob/master/kernel-pca.ipynb
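For comparison, the whole pipeline in your script collapses to a few vectorized lines. This is only a sketch of the standard steps (squared distances, RBF kernel, centering, eigendecomposition), not the notebook's exact code:

using LinearAlgebra

function kernel_pca(X, γ; outdim = 2)
    # X: n x d matrix with observations as rows
    n = size(X, 1)
    sq = sum(abs2, X; dims = 2)               # squared row norms, n x 1
    D2 = sq .+ sq' .- 2 .* (X * X')           # pairwise squared distances
    K = exp.(-γ .* D2)                        # RBF kernel matrix
    J = fill(1 / n, n, n)                     # the one_N matrix from center_k
    Kc = K - J * K - K * J + J * K * J        # centered kernel matrix
    F = eigen(Symmetric(Kc))                  # real eigenpairs, eigenvalues ascending
    return F.vectors[:, end - outdim + 1:end] # components for the largest eigenvalues
end

proj = kernel_pca(X, 15.0)                    # 100x2 projection of the half-moon data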