import numpy as np
from nnmnkwii import preprocessing as P

# Corner cases: each (x, y) pair maps an input sample to its expected bin
for mu in [128, 256, 512]:
    for x, y in [(-1.0, 0), (0.0, mu // 2), (0.99999, mu - 1)]:
        y_hat = P.mulaw_quantize(x, mu)
        err = np.abs(x - P.inv_mulaw_quantize(y_hat, mu))
        print(y, y_hat, err)
        assert np.allclose(y, y_hat)
        # the round-trip quantization error should be small
        assert err <= 0.1
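For reference, here is a minimal NumPy sketch of the standard mu-law companding transform these helpers test. The names mu_law, inv_mu_law, and mu_law_quantize are hypothetical stand-ins, not the library's functions, and nnmnkwii's exact scaling may differ slightly:

import numpy as np

def mu_law(x, mu=256):
    # Companding: sign(x) * log(1 + mu*|x|) / log(1 + mu), maps [-1, 1] -> [-1, 1]
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def inv_mu_law(y, mu=256):
    # Inverse companding: sign(y) * ((1 + mu)**|y| - 1) / mu
    return np.sign(y) * ((1 + mu) ** np.abs(y) - 1) / mu

def mu_law_quantize(x, mu=256):
    # Scale the companded value from [-1, 1] to integer bins; for |x| < 1
    # this lands in [0, mu - 1], matching the corner cases asserted above
    y = mu_law(x, mu)
    return ((y + 1) / 2 * mu).astype(np.int64)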
# ndarray input
for mu in [128, 256, 512]:
    x = np.random.rand(10)
    y = P.mulaw(x, mu)
    x_hat = P.inv_mulaw(y, mu)
    assert np.allclose(x, x_hat)
P.inv_mulaw_quantize(P.mulaw_quantize(x))
# torch tensor input
import torch

torch.manual_seed(1234)
for mu in [128, 256, 512]:
    x = torch.rand(10)
    y = P.mulaw(x, mu)
    x_hat = P.inv_mulaw(y, mu)
    assert np.allclose(x.numpy(), x_hat.numpy())
P.inv_mulaw_quantize(P.mulaw_quantize(x))
if returns_power:
    # (1 x N')
    # note: librosa renamed `rmse` to `rms` in 0.7, and the keyword `y` is required
    p = librosa.feature.rms(y=x, frame_length=256, hop_length=128)
    upsample_factor = x.size // p.size
    # (1 x N)
    p = np.repeat(p, upsample_factor, axis=-1)
    if p.size < x.size:
        # pad along the time axis
        p = np.pad(p, [(0, 0), (0, x.size - p.size)],
                   mode="constant", constant_values=0)
    # adjust shape
    p = p.reshape(1, 1, -1)

# (T,)
if mulaw:
    x = P.mulaw_quantize(x)
    x_org = P.inv_mulaw_quantize(x)
    # (C, T)
    x = to_categorical(x, num_classes=256).T
    # (1, C, T)
    x = x.reshape(1, 256, -1).astype(np.float32)
else:
    x_org = x
    x = x.reshape(1, 1, -1)

if returns_power:
    return x, x_org, p
return x, x_org
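The repeat-and-pad logic above aligns the frame-level power envelope with the sample-level waveform. A minimal standalone illustration with made-up shapes (1000 samples, 9 frames):

import numpy as np

x = np.zeros(1000)                  # dummy waveform, shape (T,)
p = np.random.rand(1, 9)            # dummy per-frame power, shape (1, N')

upsample_factor = x.size // p.size  # 1000 // 9 = 111
p = np.repeat(p, upsample_factor, axis=-1)  # (1, 999)

# integer division undershoots when T is not a multiple of N';
# pad the remainder along the time axis with zeros
if p.size < x.size:
    p = np.pad(p, [(0, 0), (0, x.size - p.size)],
               mode="constant", constant_values=0)
assert p.shape == (1, 1000)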
if c is not None:
    if hparams.upsample_conditional_features:
        c = c[idx, :, :length // audio.get_hop_size()].unsqueeze(0)
    else:
        c = c[idx, :, :length].unsqueeze(0)
    assert c.dim() == 3
    print("Shape of local conditioning features: {}".format(c.size()))
if g is not None:
    # TODO: test
    g = g[idx]
    print("Shape of global conditioning features: {}".format(g.size()))

# Dummy silence
if is_mulaw_quantize(hparams.input_type):
    initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
elif is_mulaw(hparams.input_type):
    initial_value = P.mulaw(0.0, hparams.quantize_channels)
else:
    initial_value = 0.0
print("Initial value:", initial_value)

# (C,)
if is_mulaw_quantize(hparams.input_type):
    initial_input = np_utils.to_categorical(
        initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
    initial_input = torch.from_numpy(initial_input).view(
        1, 1, hparams.quantize_channels)
else:
    initial_input = torch.zeros(1, 1, 1).fill_(initial_value)
initial_input = initial_input.to(device)
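np_utils.to_categorical comes from Keras; if you want to avoid that dependency, a plain-NumPy one-hot sketch equivalent to what the block above builds (the values quantize_channels = 256 and initial_value = 128 are assumptions, taken from the mu-law corner cases earlier):

import numpy as np

quantize_channels = 256
initial_value = 128  # mu-law code for silence (x = 0) with mu = 256

# One-hot encode the scalar into shape (quantize_channels,)
one_hot = np.eye(quantize_channels, dtype=np.float32)[initial_value]
assert one_hot[initial_value] == 1.0 and one_hot.sum() == 1.0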
# Trim silences
start, end = audio.start_and_end_indices(quantized, silence_threshold)
quantized = quantized[start:end]
wav = wav[start:end]

# Compute a mel-scale spectrogram from the trimmed wav:
# (N, D)
mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

# lws pads zeros internally before performing stft;
# this is needed to adjust time resolution between audio and mel-spectrogram
l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size())

# zero pad the quantized signal accordingly
quantized = np.pad(quantized, (l, r), mode="constant",
                   constant_values=P.mulaw_quantize(0))
N = mel_spectrogram.shape[0]
assert len(quantized) >= N * audio.get_hop_size()

# time resolution adjustment:
# ensure the length of the raw audio is a multiple of hop_size so that we can
# use transposed convolution to upsample
quantized = quantized[:N * audio.get_hop_size()]
assert len(quantized) % audio.get_hop_size() == 0
timesteps = len(quantized)

wav_id = wav_path.split('/')[-1].split('.')[0]

# Write the audio and spectrogram to disk:
audio_filename = '{}-audio.npy'.format(wav_id)
mel_filename = '{}-mel.npy'.format(wav_id)
np.save(os.path.join(out_dir, audio_filename),
        quantized.astype(np.int16))
# Load the audio to a numpy array:
wav = audio.load_wav(wav_path)

if hparams.rescaling:
    wav = wav / np.abs(wav).max() * hparams.rescaling_max

# Mu-law quantize
if is_mulaw_quantize(hparams.input_type):
    # [0, quantize_channels)
    out = P.mulaw_quantize(wav, hparams.quantize_channels)

    # Trim silences
    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    wav = wav[start:end]
    out = out[start:end]
    constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
    out_dtype = np.int16
elif is_mulaw(hparams.input_type):
    # [-1, 1]
    out = P.mulaw(wav, hparams.quantize_channels)
    constant_values = P.mulaw(0.0, hparams.quantize_channels)
    out_dtype = np.float32
else:
    # [-1, 1]
    out = wav
    constant_values = 0.0
    out_dtype = np.float32

# Compute a mel-scale spectrogram from the trimmed wav:
# (N, D)
mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
# lws pads zeros internally before performing stft
s = np.random.randint(0, len(x) - max_time_steps)
if local_conditioning:
    x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :]
else:
    x = x[s:s + max_time_steps]
new_batch.append((x, c, g))
batch = new_batch

# Lengths
input_lengths = [len(x[0]) for x in batch]
max_input_len = max(input_lengths)

# (B, T, C)
# pad along the time axis
if is_mulaw_quantize(hparams.input_type):
    padding_value = P.mulaw_quantize(0, mu=hparams.quantize_channels)
    x_batch = np.array([_pad_2d(np_utils.to_categorical(
        x[0], num_classes=hparams.quantize_channels),
        max_input_len, 0, padding_value) for x in batch], dtype=np.float32)
else:
    x_batch = np.array([_pad_2d(x[0].reshape(-1, 1), max_input_len)
                        for x in batch], dtype=np.float32)
assert len(x_batch.shape) == 3

# (B, T)
if is_mulaw_quantize(hparams.input_type):
    padding_value = P.mulaw_quantize(0, mu=hparams.quantize_channels)
    # np.int was removed in NumPy 1.24; use a concrete integer dtype
    y_batch = np.array([_pad(x[0], max_input_len, constant_values=padding_value)
                        for x in batch], dtype=np.int64)
else:
    y_batch = np.array([_pad(x[0], max_input_len) for x in batch], dtype=np.float32)
assert len(y_batch.shape) == 2
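The _pad and _pad_2d helpers are not shown in this excerpt. Below is a plausible sketch consistent with how they are called above (a guess at the signatures, not necessarily the repository's exact code):

import numpy as np

def _pad(seq, max_len, constant_values=0):
    # Right-pad a 1D sequence to max_len with a constant value
    return np.pad(seq, (0, max_len - len(seq)),
                  mode="constant", constant_values=constant_values)

def _pad_2d(x, max_len, b_pad=0, constant_values=0):
    # Pad a (T, C) array along the time axis to max_len,
    # optionally with b_pad frames of leading padding
    return np.pad(x, [(b_pad, max_len - len(x) - b_pad), (0, 0)],
                  mode="constant", constant_values=constant_values)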
# (T,)
y_target = y[idx].view(-1).data.cpu().numpy()[:length]

if c is not None:
    c = c[idx, :, :length].unsqueeze(0)
    assert c.dim() == 3
    print("Shape of local conditioning features: {}".format(c.size()))
if g is not None:
    # TODO: test
    g = g[idx]
    print("Shape of global conditioning features: {}".format(g.size()))

# Dummy silence
if is_mulaw_quantize(hparams.input_type):
    initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
elif is_mulaw(hparams.input_type):
    initial_value = P.mulaw(0.0, hparams.quantize_channels)
else:
    initial_value = 0.0
print("Initial value:", initial_value)

# (C,)
# Variable is a no-op wrapper in PyTorch >= 0.4; kept here as in the original snippet
if is_mulaw_quantize(hparams.input_type):
    initial_input = np_utils.to_categorical(
        initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
    initial_input = Variable(torch.from_numpy(initial_input)).view(
        1, 1, hparams.quantize_channels)
else:
    initial_input = Variable(torch.zeros(1, 1, 1).fill_(initial_value))
initial_input = initial_input.cuda() if use_cuda else initial_input

y_hat, c_hat = model.incremental_forward(