# Convert network output for one utterance in the batch (index idx) back to a
# floating-point waveform, then zero out everything past its true length.
length = input_lengths[idx].data.cpu().item()

# (B, C, T)
if y_hat.dim() == 4:
    y_hat = y_hat.squeeze(-1)

if is_mulaw_quantize(hparams.input_type):
    # (B, T): pick the most likely class per timestep
    y_hat = F.softmax(y_hat, dim=1).max(1)[1]

    # (T,)
    y_hat = y_hat[idx].data.cpu().long().numpy()
    y = y[idx].view(-1).data.cpu().long().numpy()

    y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    y = P.inv_mulaw_quantize(y, hparams.quantize_channels)
else:
    # (B, T)
    y_hat = sample_from_discretized_mix_logistic(
        y_hat, log_scale_min=hparams.log_scale_min)
    # (T,)
    y_hat = y_hat[idx].view(-1).data.cpu().numpy()
    y = y[idx].view(-1).data.cpu().numpy()

    if is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(y_hat, hparams.quantize_channels)
        y = P.inv_mulaw(y, hparams.quantize_channels)

# Mask by length
y_hat[length:] = 0
y[length:] = 0
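# For reference, a minimal self-contained sketch of the mu-law companding that
# P.mulaw / P.inv_mulaw implement (standard formulas; nnmnkwii's versions also
# handle the integer encode/decode that the *_quantize variants add on top).
# mu here mirrors the quantize_channels value this code passes through:
def _mulaw_sketch(x, mu=256):
    # Non-linear compression of [-1, 1]: quiet samples get finer resolution.
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def _inv_mulaw_sketch(y, mu=256):
    # Exact inverse of the compression above.
    return np.sign(y) * ((1 + mu) ** np.abs(y) - 1) / mu

_x = np.linspace(-1, 1, 5)
assert np.allclose(_inv_mulaw_sketch(_mulaw_sketch(_x)), _x)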
# Upsample local conditioning features to the audio time scale.
assert c.ndim == 2
Tc = c.shape[0]
upsample_factor = audio.get_hop_size()
# Overwrite length according to feature size
length = Tc * upsample_factor

# (Tc, D) -> (Tc', D)
# Repeat features before feeding them to the network
if not hparams.upsample_conditional_features:
    c = np.repeat(c, upsample_factor, axis=0)

# B x C x T
c = Variable(torch.FloatTensor(c.T).unsqueeze(0))
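# A tiny worked example of the repeat-upsampling above, assuming a hop size of
# 4: every conditioning frame is duplicated hop-size times so the feature rate
# matches the audio sample rate.
_demo = np.arange(6).reshape(3, 2)        # (Tc, D) = (3, 2)
_demo_up = np.repeat(_demo, 4, axis=0)    # (Tc * 4, D) = (12, 2)
assert _demo_up.shape == (12, 2)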
if initial_value is None:
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
    else:
        initial_value = 0.0

if is_mulaw_quantize(hparams.input_type):
    assert 0 <= initial_value < hparams.quantize_channels
    initial_input = np_utils.to_categorical(
        initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
    initial_input = Variable(torch.from_numpy(initial_input)).view(
        1, 1, hparams.quantize_channels)
else:
    initial_input = Variable(torch.zeros(1, 1, 1)).fill_(initial_value)

g = None if g is None else Variable(torch.LongTensor([g]))
if use_cuda:
    initial_input = initial_input.cuda()
    g = None if g is None else g.cuda()
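# If Keras' np_utils is unavailable, the same one-hot initial input can be
# built with plain numpy (a sketch; the helper name is ours, not the repo's):
def _one_hot_sketch(value, num_classes):
    v = np.zeros(num_classes, dtype=np.float32)
    v[int(value)] = 1.0
    return v
# e.g. _one_hot_sketch(initial_value, hparams.quantize_channels).reshape(1, 1, -1)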
# (T,)
y_target = y[idx].view(-1).data.cpu().numpy()[:length]

if c is not None:
    c = c[idx, :, :length].unsqueeze(0)
    assert c.dim() == 3
    print("Shape of local conditioning features: {}".format(c.size()))
if g is not None:
    # TODO: test
    g = g[idx]
    print("Shape of global conditioning features: {}".format(g.size()))
# Dummy silence
if is_mulaw_quantize(hparams.input_type):
    initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
elif is_mulaw(hparams.input_type):
    initial_value = P.mulaw(0.0, hparams.quantize_channels)
else:
    initial_value = 0.0
print("Initial value:", initial_value)

# (C,)
if is_mulaw_quantize(hparams.input_type):
    initial_input = np_utils.to_categorical(
        initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
    initial_input = Variable(torch.from_numpy(initial_input)).view(
        1, 1, hparams.quantize_channels)
else:
    initial_input = Variable(torch.zeros(1, 1, 1).fill_(initial_value))
initial_input = initial_input.cuda() if use_cuda else initial_input
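# Note: mu-law maps 0.0 to 0.0 and quantization places it at mid-scale, so
# with quantize_channels = 256 the printed initial value is 128, i.e. digital
# silence.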
def save_log(sess, step, model, plot_dir, audio_dir, hp):
    predicts, targets = sess.run([model.log_outputs, model.targets])
    y_hat = P.inv_mulaw_quantize(predicts[0], hp.quantize_channels)
    y = P.inv_mulaw_quantize(targets[0], hp.quantize_channels)

    pred_wav_path = os.path.join(audio_dir, 'step-{}-pred.wav'.format(step))
    target_wav_path = os.path.join(audio_dir, 'step-{}-real.wav'.format(step))
    plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(step))

    # Save audio
    librosa.output.write_wav(pred_wav_path, y_hat, sr=hp.sample_rate)
    librosa.output.write_wav(target_wav_path, y, sr=hp.sample_rate)

    # Save figure
    waveplot(plot_path, y_hat, y, hp)
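# Compatibility note: librosa.output.write_wav was removed in librosa 0.8; on
# newer versions, soundfile offers an equivalent call, e.g.:
#   import soundfile as sf
#   sf.write(pred_wav_path, y_hat, hp.sample_rate)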
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32
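# audio.start_and_end_indices is not shown here; a sketch of the behaviour its
# call site implies (scan both ends of the quantized signal for the first
# sample outside the mid-scale silence band; names are our assumptions):
def _start_and_end_indices_sketch(quantized, silence_threshold=2, mid=128):
    start, end = 0, len(quantized)
    while start < end and abs(int(quantized[start]) - mid) <= silence_threshold:
        start += 1
    while end > start and abs(int(quantized[end - 1]) - mid) <= silence_threshold:
        end -= 1
    return start, end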
# (B, T, C)
# Pad along the time axis
if is_mulaw_quantize(hparams.input_type):
    padding_value = P.mulaw_quantize(0, mu=hparams.quantize_channels)
    x_batch = np.array([_pad_2d(np_utils.to_categorical(
        x[0], num_classes=hparams.quantize_channels),
        max_input_len, 0, padding_value) for x in batch], dtype=np.float32)
else:
    x_batch = np.array([_pad_2d(x[0].reshape(-1, 1), max_input_len)
                        for x in batch], dtype=np.float32)
assert len(x_batch.shape) == 3

# (B, T)
if is_mulaw_quantize(hparams.input_type):
    padding_value = P.mulaw_quantize(0, mu=hparams.quantize_channels)
    y_batch = np.array([_pad(x[0], max_input_len, constant_values=padding_value)
                        for x in batch], dtype=np.int64)
else:
    y_batch = np.array([_pad(x[0], max_input_len) for x in batch],
                       dtype=np.float32)
assert len(y_batch.shape) == 2

# (B, T, D)
if local_conditioning:
    max_len = max([len(x[1]) for x in batch])
    c_batch = np.array([_pad_2d(x[1], max_len) for x in batch], dtype=np.float32)
    assert len(c_batch.shape) == 3
    # (B x C x T)
    c_batch = torch.FloatTensor(c_batch).transpose(1, 2).contiguous()
else:
    c_batch = None
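# The _pad / _pad_2d helpers used above are not included in this snippet; a
# minimal sketch consistent with how they are called (constant padding along
# the time axis; the signatures are our assumptions):
def _pad_sketch(seq, max_len, constant_values=0):
    return np.pad(seq, (0, max_len - len(seq)),
                  mode="constant", constant_values=constant_values)

def _pad_2d_sketch(x, max_len, b_pad=0, constant_values=0):
    return np.pad(x, [(b_pad, max_len - len(x) - b_pad), (0, 0)],
                  mode="constant", constant_values=constant_values)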
# Evaluation-time variant of the length lookup; when a per-utterance `mu` is
# provided, select the entry for this index as well.
length = input_lengths[idx].data.cpu().numpy()
if mu is not None:
    mu = mu[idx]
# Save audio
audio_dir = join(checkpoint_dir, "audio")
os.makedirs(audio_dir, exist_ok=True)
path = join(audio_dir, "step{:09d}_predicted.wav".format(global_step))
librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
path = join(audio_dir, "step{:09d}_target.wav".format(global_step))
librosa.output.write_wav(path, y, sr=hparams.sample_rate)
    # (continuation of _process_utterance above)
    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft; this is needed to
    # adjust the time resolution between the audio and the mel-spectrogram.
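    # The step that usually follows (a sketch under the same helper names):
    # trim the quantized signal to an exact multiple of the hop size so audio
    # samples and mel frames stay aligned one hop per frame.
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()
    out = out[:N * audio.get_hop_size()]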