如何确保 TensorFlow Generator 上采样过程创建完全覆盖随机噪声的种子?
How to ensure TensorFlow Generator upsampling process creates seed with full coverage of random noise?
我正在调整来自 TensorFlow 2.0 dcGAN 教程 (https://www.tensorflow.org/beta/tutorials/generative/dcgan) 的代码,以处理音频信号的频谱图。我使用 librosa 的 chroma_cqt 将原始音频数据转换为 WxHx2 矩阵,并将其作为输入。当我尝试通过对随机噪声进行上采样来创建种子矩阵时,得到的结果是在时间维度上随机噪声与 0 交替出现的条带,以及顶部的一条细黑条(见图)。
我已经对原始教程代码进行了调整,以处理各种尺寸的图像,并在种子图像和最终输出方面取得了良好的效果,但相同的原则并没有引导我处理 3 维数据。我如何确保我制作的种子具有适当的覆盖率,而不是在实际训练模型时继续这个问题?
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
# Expression statement only; in a notebook this echoes the TF version.
tf.__version__
import numpy as np
import os
from tensorflow.keras import layers
import librosa
import librosa.display
import matplotlib.pyplot as plt
# Hide all CUDA devices so TensorFlow runs on CPU only.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# Half of CD-quality rate (22050 Hz); passed to specshow for axis scaling.
sr = 44100/2
# Figures are saved relative to the current working directory.
sample_path = os.getcwd()
def make_generator_model():
    # Generator under discussion: 361-dim noise -> (2, 252, 361, 1) volume
    # (two channels are later read as real/imaginary parts of a spectrogram).
    # NOTE(review): this is the PROBLEMATIC version the question is about.
    model = tf.keras.Sequential()
    model.add(layers.Dense(2*7*19*128, use_bias=False, dtype='float32', input_shape=(361,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Reshape((2 ,7, 19, 128)))
    assert model.output_shape == (None,2, 7, 19, 128) # Note: None is the batch size
    # NOTE(review): kernel height 5 < stride 6 — the transposed convolution
    # cannot cover every output row, leaving the periodic zero bands.
    model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 6, 1), padding='same', use_bias=False))
    assert model.output_shape == (None, 2, 42, 19, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    # NOTE(review): kernel width 5 < stride 19 — same under-coverage problem.
    model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 3, 19), padding='same', use_bias=False))
    assert model.output_shape == (None, 2, 126, 361, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Conv3DTranspose(1, (2, 5, 5), strides=(1, 2, 1), padding='same', use_bias=False, activation='tanh'))
    assert model.output_shape == (None, 2, 252, 361, 1)
    return model
generator = make_generator_model()
noise = tf.random.normal([1, 361])
generated_audio = generator(noise, training=False)

# Fold the two output channels into one complex-valued matrix:
# channel 0 carries the real parts, channel 1 the imaginary parts.
# BUG FIX: np.complex was a deprecated alias removed in NumPy 1.20+;
# the builtin complex() is the exact drop-in replacement.
real_chan = generated_audio[0][0]
imag_chan = generated_audio[0][1]
D = np.asarray([
    [complex(real_chan[x][y], imag_chan[x][y])
     for y in range(len(real_chan[x]))]
    for x in range(len(real_chan))
])
librosa.display.specshow(librosa.amplitude_to_db(np.abs(D), ref=np.max),
                         sr=sr, x_axis='time', y_axis='cqt_note')
plt.axis('off')
# BUG FIX: '\image_...' relied on a literal backslash that is not a path
# separator outside Windows ('\i' is no escape, but the path is still wrong
# on POSIX); os.path.join is portable.
plt.savefig(os.path.join(sample_path, 'image_at_epoch_fuzz.png'))
plt.show()
print(D.shape)
我正在输出音频噪声的视觉表示,它看起来应该像一张完全模糊的图像。相反,我得到了交替的噪音和大的黑色垂直条以及顶部的细黑条。
问题最终是我需要遵循什么规则来匹配生成器种子、内核大小和步幅?有人可以提供一个示例,说明如何以编程方式确保给定层数下的步幅和内核大小相互匹配(从而避免不匹配)吗?
当你的步幅太大时,就会发生这种情况。尝试使用更大的 Dense
层和更小的步幅,或更多 Conv3DTranspose
层。类似于:
def make_generator_model():
    # Suggested alternative: start from a larger Dense/Reshape grid and use
    # small strides (2) with kernel_size (3) >= stride, then crop at the end.
    model = tf.keras.Sequential()
    model.add(layers.Dense(2*32*46*128, use_bias=False, dtype='float32', input_shape=(361,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Reshape((2, 32, 46, 128)))
    # assert model.output_shape == (None,2, 7, 19, 128) # Note: None is the batch size
    model.add(layers.Conv3DTranspose(128, (2, 3, 3), strides=(1, 2, 2), padding='same', use_bias=False))
    # assert model.output_shape == (None, 2, 42, 19, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Conv3DTranspose(128, (2, 3, 3), strides=(1, 2, 2), padding='same', use_bias=False))
    # assert model.output_shape == (None, 2, 126, 361, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Conv3DTranspose(1, (2, 3, 3), strides=(1, 2, 2), padding='same', use_bias=False, activation='tanh'))
    # assert model.output_shape == (None, 2, 252, 361, 1)
    # Crop the oversized (2, 256, 368, 1) output down to (2, 252, 361, 1).
    model.add(layers.Lambda(lambda x: x[:, :, :252, :361, :]))
    return model
所以问题最终是关于卷积 kernel_size 和步幅之间的关系(有关每个术语的更好解释,请参见此处的 Conv3DTranspose 部分 https://keras.io/layers/convolutional/)。致密层一开始就很好。
在原始代码中,以下 Conv3DTranspose 行的 kernel_size 无法覆盖高度方向 (5<6) 和宽度方向 (5<19) 上的步幅:
model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 6, 1), padding='same', use_bias=False))
model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 3, 19), padding='same', use_bias=False))
通过确保 kernel_size 的最小尺寸与所选步幅尺寸相匹配,问题已解决。这是固定的代码:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
tf.__version__
import numpy as np
import os
from tensorflow.keras import layers
import librosa
import librosa.display
import matplotlib.pyplot as plt
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
sr = 44100/2
sample_path = os.getcwd()
def make_generator_model():
    """Generator: 50-dim noise -> (2, 252, 361, 1) spectrogram tensor.

    Every Conv3DTranspose uses kernel_size >= strides per dimension, so each
    upsampling step fully covers its output and no zero bands appear.
    """
    model = tf.keras.Sequential()
    model.add(layers.Dense(2*7*19*128, use_bias=False, dtype='float64', input_shape=(50,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Reshape((2, 7, 19, 128)))
    # (None, 2, 7, 19, 128) -> (None, 2, 42, 19, 128): height x6, kernel == stride
    model.add(layers.Conv3DTranspose(128, (1, 6, 1), strides=(1, 6, 1), padding='same', use_bias=False))
    model.add(layers.BatchNormalization())
    # BUG FIX: original called layers.LeakyReL(), which does not exist and
    # raises AttributeError when the model is built.
    model.add(layers.LeakyReLU())
    # (None, 2, 42, 19, 128) -> (None, 2, 126, 361, 128): height x3, width x19
    model.add(layers.Conv3DTranspose(128, (1, 3, 19), strides=(1, 3, 19), padding='same', use_bias=False))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    # (None, 2, 126, 361, 128) -> (None, 2, 252, 361, 1): height x2, tanh output
    model.add(layers.Conv3DTranspose(1, (1, 2, 1), strides=(1, 2, 1), padding='same', use_bias=False, activation='tanh'))
    return model
generator = make_generator_model()
noise = tf.random.normal([1, 50])
generated_audio = generator(noise, training=False)

# Fold the two output channels into one complex-valued matrix:
# channel 0 carries the real parts, channel 1 the imaginary parts.
# BUG FIX: np.complex was a deprecated alias removed in NumPy 1.20+;
# the builtin complex() is the exact drop-in replacement.
real_chan = generated_audio[0][0]
imag_chan = generated_audio[0][1]
D = np.asarray([
    [complex(real_chan[x][y], imag_chan[x][y])
     for y in range(len(real_chan[x]))]
    for x in range(len(real_chan))
])
librosa.display.specshow(librosa.amplitude_to_db(np.abs(D), ref=np.max),
                         sr=sr, x_axis='time', y_axis='cqt_note')
plt.axis('off')
# BUG FIX: '\image_...' relied on a literal backslash that is not a portable
# path separator; os.path.join works on every OS.
plt.savefig(os.path.join(sample_path, 'image_at_epoch_fuzz.png'))
plt.show()
print(D.shape)
结果:
我正在调整来自 TensorFlow 2.0 dcGAN 教程 (https://www.tensorflow.org/beta/tutorials/generative/dcgan) 的代码,以处理音频信号的频谱图。我使用 librosa 的 chroma_cqt 将原始音频数据转换为 WxHx2 矩阵,并将其作为输入。当我尝试通过对随机噪声进行上采样来创建种子矩阵时,得到的结果是在时间维度上随机噪声与 0 交替出现的条带,以及顶部的一条细黑条(见图)。
我已经对原始教程代码进行了调整,以处理各种尺寸的图像,并在种子图像和最终输出方面取得了良好的效果,但相同的原则并没有引导我处理 3 维数据。我如何确保我制作的种子具有适当的覆盖率,而不是在实际训练模型时继续这个问题?
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
# Expression statement only; in a notebook this echoes the TF version.
tf.__version__
import numpy as np
import os
from tensorflow.keras import layers
import librosa
import librosa.display
import matplotlib.pyplot as plt
# Hide all CUDA devices so TensorFlow runs on CPU only.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# Half of CD-quality rate (22050 Hz); passed to specshow for axis scaling.
sr = 44100/2
# Figures are saved relative to the current working directory.
sample_path = os.getcwd()
def make_generator_model():
    # Generator under discussion: 361-dim noise -> (2, 252, 361, 1) volume
    # (two channels are later read as real/imaginary parts of a spectrogram).
    # NOTE(review): this is the PROBLEMATIC version the question is about.
    model = tf.keras.Sequential()
    model.add(layers.Dense(2*7*19*128, use_bias=False, dtype='float32', input_shape=(361,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Reshape((2 ,7, 19, 128)))
    assert model.output_shape == (None,2, 7, 19, 128) # Note: None is the batch size
    # NOTE(review): kernel height 5 < stride 6 — the transposed convolution
    # cannot cover every output row, leaving the periodic zero bands.
    model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 6, 1), padding='same', use_bias=False))
    assert model.output_shape == (None, 2, 42, 19, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    # NOTE(review): kernel width 5 < stride 19 — same under-coverage problem.
    model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 3, 19), padding='same', use_bias=False))
    assert model.output_shape == (None, 2, 126, 361, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Conv3DTranspose(1, (2, 5, 5), strides=(1, 2, 1), padding='same', use_bias=False, activation='tanh'))
    assert model.output_shape == (None, 2, 252, 361, 1)
    return model
generator = make_generator_model()
noise = tf.random.normal([1, 361])
generated_audio = generator(noise, training=False)

# Fold the two output channels into one complex-valued matrix:
# channel 0 carries the real parts, channel 1 the imaginary parts.
# BUG FIX: np.complex was a deprecated alias removed in NumPy 1.20+;
# the builtin complex() is the exact drop-in replacement.
real_chan = generated_audio[0][0]
imag_chan = generated_audio[0][1]
D = np.asarray([
    [complex(real_chan[x][y], imag_chan[x][y])
     for y in range(len(real_chan[x]))]
    for x in range(len(real_chan))
])
librosa.display.specshow(librosa.amplitude_to_db(np.abs(D), ref=np.max),
                         sr=sr, x_axis='time', y_axis='cqt_note')
plt.axis('off')
# BUG FIX: '\image_...' relied on a literal backslash that is not a portable
# path separator; os.path.join works on every OS.
plt.savefig(os.path.join(sample_path, 'image_at_epoch_fuzz.png'))
plt.show()
print(D.shape)
我正在输出音频噪声的视觉表示,它看起来应该像一张完全模糊的图像。相反,我得到了交替的噪音和大的黑色垂直条以及顶部的细黑条。
问题最终是我需要遵循什么规则来匹配生成器种子、内核大小和步幅?有人可以提供一个示例,说明如何以编程方式确保给定层数下的步幅和内核大小相互匹配(从而避免不匹配)吗?
当你的步幅太大时,就会发生这种情况。尝试使用更大的 Dense
层和更小的步幅,或更多 Conv3DTranspose
层。类似于:
def make_generator_model():
    # Suggested alternative: start from a larger Dense/Reshape grid and use
    # small strides (2) with kernel_size (3) >= stride, then crop at the end.
    model = tf.keras.Sequential()
    model.add(layers.Dense(2*32*46*128, use_bias=False, dtype='float32', input_shape=(361,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Reshape((2, 32, 46, 128)))
    # assert model.output_shape == (None,2, 7, 19, 128) # Note: None is the batch size
    model.add(layers.Conv3DTranspose(128, (2, 3, 3), strides=(1, 2, 2), padding='same', use_bias=False))
    # assert model.output_shape == (None, 2, 42, 19, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Conv3DTranspose(128, (2, 3, 3), strides=(1, 2, 2), padding='same', use_bias=False))
    # assert model.output_shape == (None, 2, 126, 361, 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Conv3DTranspose(1, (2, 3, 3), strides=(1, 2, 2), padding='same', use_bias=False, activation='tanh'))
    # assert model.output_shape == (None, 2, 252, 361, 1)
    # Crop the oversized (2, 256, 368, 1) output down to (2, 252, 361, 1).
    model.add(layers.Lambda(lambda x: x[:, :, :252, :361, :]))
    return model
所以问题最终是关于卷积 kernel_size 和步幅之间的关系(有关每个术语的更好解释,请参见此处的 Conv3DTranspose 部分 https://keras.io/layers/convolutional/)。致密层一开始就很好。在原始代码中,以下 Conv3DTranspose 行的 kernel_size 无法覆盖高度方向 (5<6) 和宽度方向 (5<19) 上的步幅:
model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 6, 1), padding='same', use_bias=False))
model.add(layers.Conv3DTranspose(128, (2, 5, 5), strides=(1, 3, 19), padding='same', use_bias=False))
通过确保 kernel_size 的最小尺寸与所选步幅尺寸相匹配,问题已解决。这是固定的代码:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
tf.__version__
import numpy as np
import os
from tensorflow.keras import layers
import librosa
import librosa.display
import matplotlib.pyplot as plt
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
sr = 44100/2
sample_path = os.getcwd()
def make_generator_model():
    """Generator: 50-dim noise -> (2, 252, 361, 1) spectrogram tensor.

    Every Conv3DTranspose uses kernel_size >= strides per dimension, so each
    upsampling step fully covers its output and no zero bands appear.
    """
    model = tf.keras.Sequential()
    model.add(layers.Dense(2*7*19*128, use_bias=False, dtype='float64', input_shape=(50,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Reshape((2, 7, 19, 128)))
    # (None, 2, 7, 19, 128) -> (None, 2, 42, 19, 128): height x6, kernel == stride
    model.add(layers.Conv3DTranspose(128, (1, 6, 1), strides=(1, 6, 1), padding='same', use_bias=False))
    model.add(layers.BatchNormalization())
    # BUG FIX: original called layers.LeakyReL(), which does not exist and
    # raises AttributeError when the model is built.
    model.add(layers.LeakyReLU())
    # (None, 2, 42, 19, 128) -> (None, 2, 126, 361, 128): height x3, width x19
    model.add(layers.Conv3DTranspose(128, (1, 3, 19), strides=(1, 3, 19), padding='same', use_bias=False))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    # (None, 2, 126, 361, 128) -> (None, 2, 252, 361, 1): height x2, tanh output
    model.add(layers.Conv3DTranspose(1, (1, 2, 1), strides=(1, 2, 1), padding='same', use_bias=False, activation='tanh'))
    return model
generator = make_generator_model()
noise = tf.random.normal([1, 50])
generated_audio = generator(noise, training=False)

# Fold the two output channels into one complex-valued matrix:
# channel 0 carries the real parts, channel 1 the imaginary parts.
# BUG FIX: np.complex was a deprecated alias removed in NumPy 1.20+;
# the builtin complex() is the exact drop-in replacement.
real_chan = generated_audio[0][0]
imag_chan = generated_audio[0][1]
D = np.asarray([
    [complex(real_chan[x][y], imag_chan[x][y])
     for y in range(len(real_chan[x]))]
    for x in range(len(real_chan))
])
librosa.display.specshow(librosa.amplitude_to_db(np.abs(D), ref=np.max),
                         sr=sr, x_axis='time', y_axis='cqt_note')
plt.axis('off')
# BUG FIX: '\image_...' relied on a literal backslash that is not a portable
# path separator; os.path.join works on every OS.
plt.savefig(os.path.join(sample_path, 'image_at_epoch_fuzz.png'))
plt.show()
print(D.shape)
结果: