如何使用自动编码器可视化降维? (Python | 张量流)
How to use an autoencoder to visualize dimensionality reduction? (Python | TensorFlow)
我正在尝试调整 Aymeric Damien's code 以可视化 TensorFlow
中实现的自动编码器执行的降维。我见过的所有示例都在 mnist
数字数据集上工作,但我想使用这种方法在二维中可视化 iris 数据集作为玩具示例,这样我就可以弄清楚如何针对我的现实世界调整它数据集。
我的问题是:如何才能使样本特定的二维嵌入可视化?
例如,鸢尾花数据集有 150 samples
和 4 attributes
。我添加了 4 noise attributes
得到总共 8 attributes
。 encoding/decoding 如下: [8, 4, 2, 4, 8]
但我不确定如何提取形状数组 (150, 2)
来可视化嵌入。我还没有找到任何关于如何使用 TensorFlow
可视化降维的教程。
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
# Set random seeds
np.random.seed(0)
tf.set_random_seed(0)
# Load data
iris = load_iris()
# Original Iris : (150,4)
X_iris = iris.data
# Iris with noise : (150,8)
X_iris_with_noise = np.concatenate([X_iris, np.random.random(size=X_iris.shape)], axis=1).astype(np.float32)
y_iris = iris.target
# PCA
pca_xy = PCA(n_components=2).fit_transform(X_iris_with_noise)
with plt.style.context("seaborn-white"):
fig, ax = plt.subplots()
ax.scatter(pca_xy[:,0], pca_xy[:,1], c=y_iris, cmap=plt.cm.Set2)
ax.set_title("PCA | Iris with noise")
# Training Parameters
learning_rate = 0.01
num_steps = 1000
batch_size = 10
display_step = 250
examples_to_show = 10
# Network Parameters
num_hidden_1 = 4 # 1st layer num features
num_hidden_2 = 2 # 2nd layer num features (the latent dim)
num_input = 8 # Iris data input
# tf Graph input
X = tf.placeholder(tf.float32, [None, num_input], name="input")
weights = {
'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1]), dtype=tf.float32, name="encoder_h1"),
'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2]), dtype=tf.float32, name="encoder_h2"),
'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1]), dtype=tf.float32, name="decoder_h1"),
'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input]), dtype=tf.float32, name="decoder_h2"),
}
biases = {
'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1]), dtype=tf.float32, name="encoder_b1"),
'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2]), dtype=tf.float32, name="encoder_b2"),
'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1]), dtype=tf.float32, name="decoder_b1"),
'decoder_b2': tf.Variable(tf.random_normal([num_input]), dtype=tf.float32, name="decoder_b2"),
}
# Building the encoder
def encoder(x):
# Encoder Hidden layer with sigmoid activation #1
layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']),
biases['encoder_b1']))
# Encoder Hidden layer with sigmoid activation #2
layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']),
biases['encoder_b2']))
return layer_2
# Building the decoder
def decoder(x):
# Decoder Hidden layer with sigmoid activation #1
layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']),
biases['decoder_b1']))
# Decoder Hidden layer with sigmoid activation #2
layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']),
biases['decoder_b2']))
return layer_2
# Construct model
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)
# Prediction
y_pred = decoder_op
# Targets (Labels) are the input data.
y_true = X
# Define loss and optimizer, minimize the squared error
loss = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
# Start Training
# Start a new TF session
with tf.Session() as sess:
# Run the initializer
sess.run(init)
# Training
for i in range(1, num_steps+1):
# Prepare Data
# Get the next batch of Iris data
idx_train = np.random.RandomState(i).choice(np.arange(X_iris_with_noise.shape[0]), size=batch_size)
batch_x = X_iris_with_noise[idx_train,:]
# Run optimization op (backprop) and cost op (to get loss value)
_, l = sess.run([optimizer, loss], feed_dict={X: batch_x})
# Display logs per step
if i % display_step == 0 or i == 1:
print('Step %i: Minibatch Loss: %f' % (i, l))
您的嵌入可以通过 h = encoder(X)
访问。然后,对于每个批次,您可以获得如下值:
_, l, embedding = sess.run([optimizer, loss, h], feed_dict={X: batch_x})
使用嵌入可视化 (https://www.tensorflow.org/programmers_guide/embedding) 的 TensorBoard 有一个更好的解决方案:
from tensorflow.contrib.tensorboard.plugins import projector
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = h.name
# Use the same LOG_DIR where you stored your checkpoint.
summary_writer = tf.summary.FileWriter(LOG_DIR)
projector.visualize_embeddings(summary_writer, config)
我正在尝试调整 Aymeric Damien's code 以可视化 TensorFlow
中实现的自动编码器执行的降维。我见过的所有示例都在 mnist
数字数据集上工作,但我想使用这种方法在二维中可视化 iris 数据集作为玩具示例,这样我就可以弄清楚如何针对我的现实世界调整它数据集。
我的问题是:如何才能使样本特定的二维嵌入可视化?
例如,鸢尾花数据集有 150 samples
和 4 attributes
。我添加了 4 noise attributes
得到总共 8 attributes
。 encoding/decoding 如下: [8, 4, 2, 4, 8]
但我不确定如何提取形状数组 (150, 2)
来可视化嵌入。我还没有找到任何关于如何使用 TensorFlow
可视化降维的教程。
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
# Set random seeds
np.random.seed(0)
tf.set_random_seed(0)
# Load data
iris = load_iris()
# Original Iris : (150,4)
X_iris = iris.data
# Iris with noise : (150,8)
X_iris_with_noise = np.concatenate([X_iris, np.random.random(size=X_iris.shape)], axis=1).astype(np.float32)
y_iris = iris.target
# PCA
pca_xy = PCA(n_components=2).fit_transform(X_iris_with_noise)
with plt.style.context("seaborn-white"):
fig, ax = plt.subplots()
ax.scatter(pca_xy[:,0], pca_xy[:,1], c=y_iris, cmap=plt.cm.Set2)
ax.set_title("PCA | Iris with noise")
# Training Parameters
learning_rate = 0.01
num_steps = 1000
batch_size = 10
display_step = 250
examples_to_show = 10
# Network Parameters
num_hidden_1 = 4 # 1st layer num features
num_hidden_2 = 2 # 2nd layer num features (the latent dim)
num_input = 8 # Iris data input
# tf Graph input
X = tf.placeholder(tf.float32, [None, num_input], name="input")
weights = {
'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1]), dtype=tf.float32, name="encoder_h1"),
'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2]), dtype=tf.float32, name="encoder_h2"),
'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1]), dtype=tf.float32, name="decoder_h1"),
'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input]), dtype=tf.float32, name="decoder_h2"),
}
biases = {
'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1]), dtype=tf.float32, name="encoder_b1"),
'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2]), dtype=tf.float32, name="encoder_b2"),
'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1]), dtype=tf.float32, name="decoder_b1"),
'decoder_b2': tf.Variable(tf.random_normal([num_input]), dtype=tf.float32, name="decoder_b2"),
}
# Building the encoder
def encoder(x):
# Encoder Hidden layer with sigmoid activation #1
layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']),
biases['encoder_b1']))
# Encoder Hidden layer with sigmoid activation #2
layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']),
biases['encoder_b2']))
return layer_2
# Building the decoder
def decoder(x):
# Decoder Hidden layer with sigmoid activation #1
layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']),
biases['decoder_b1']))
# Decoder Hidden layer with sigmoid activation #2
layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']),
biases['decoder_b2']))
return layer_2
# Construct model
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)
# Prediction
y_pred = decoder_op
# Targets (Labels) are the input data.
y_true = X
# Define loss and optimizer, minimize the squared error
loss = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
# Start Training
# Start a new TF session
with tf.Session() as sess:
# Run the initializer
sess.run(init)
# Training
for i in range(1, num_steps+1):
# Prepare Data
# Get the next batch of Iris data
idx_train = np.random.RandomState(i).choice(np.arange(X_iris_with_noise.shape[0]), size=batch_size)
batch_x = X_iris_with_noise[idx_train,:]
# Run optimization op (backprop) and cost op (to get loss value)
_, l = sess.run([optimizer, loss], feed_dict={X: batch_x})
# Display logs per step
if i % display_step == 0 or i == 1:
print('Step %i: Minibatch Loss: %f' % (i, l))
您的嵌入可以通过 h = encoder(X)
访问。然后,对于每个批次,您可以获得如下值:
_, l, embedding = sess.run([optimizer, loss, h], feed_dict={X: batch_x})
使用嵌入可视化 (https://www.tensorflow.org/programmers_guide/embedding) 的 TensorBoard 有一个更好的解决方案:
from tensorflow.contrib.tensorboard.plugins import projector
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = h.name
# Use the same LOG_DIR where you stored your checkpoint.
summary_writer = tf.summary.FileWriter(LOG_DIR)
projector.visualize_embeddings(summary_writer, config)