from os.path import join

from nnmnkwii.io import hts
from nnmnkwii.frontend import merlin as fe

# DATA_DIR points to the test data directory (defined elsewhere in the test module).

def test_labels_number_of_frames():
    # https://github.com/r9y9/nnmnkwii/issues/85
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "jp.hed"))
    labels = hts.load(join(DATA_DIR, "BASIC5000_0619.lab"))
    linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict, add_frame_features=True)
    assert labels.num_frames() == linguistic_features.shape[0]
def test_succeeding_times():
    l = hts.HTSLabelFile()
    l.append((0, 1000000, "OK"))
    l.append((1000000, 2000000, "OK"))
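The (start, end) fields in these label tuples are HTK-style times counted in 100 ns units, which is why the scripts below multiply by 1e-7 to convert to seconds. A minimal sketch of that conversion; the helper name to_seconds is ours, not part of nnmnkwii:

def to_seconds(htk_time):
    # HTK label times are integers in 100 ns (1e-7 s) units.
    return htk_time * 1e-7

assert abs(to_seconds(1000000) - 0.1) < 1e-12  # each "OK" label above spans 0.1 s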
import argparse
import os
import sys

from nnmnkwii.io import hts


def get_parser():
    parser = argparse.ArgumentParser(
        description='Prepare segments from HTS-style alignment files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('wav_scp', type=str, help='wav scp file')
    return parser


if __name__ == "__main__":
    args = get_parser().parse_args(sys.argv[1:])
    with open(args.wav_scp) as f:
        for l in f:
            recording_id, path = l.split()
            lab_path = path.replace("wav/", "lab/").replace(".wav", ".lab")
            assert os.path.exists(lab_path)
            labels = hts.load(lab_path)
            # Utterances are expected to start and end with silence.
            assert "sil" in labels[0][-1]
            assert "sil" in labels[-1][-1]
            # Label times are in 100 ns units; convert to seconds.
            segment_begin = "{:.3f}".format(labels[0][1] * 1e-7)
            segment_end = "{:.3f}".format(labels[-1][0] * 1e-7)
            # recording_id = "{}_{}_{}".format(utt_id, segment_begin, segment_end)
            # As we assume that there's only a single utterance per recording,
            # utt_id is the same as recording_id.
            # https://kaldi-asr.org/doc/data_prep.html
            utt_id = recording_id
            sys.stdout.write("{} {} {} {}\n".format(
                utt_id, recording_id, segment_begin, segment_end))
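As a concrete illustration of the I/O format (the recording id, path, and times below are made up): a wav.scp line whose label file has leading silence ending at 2250000 and trailing silence starting at 39870000 would produce the Kaldi-style segments line shown second:

# wav.scp input:   BASIC5000_0001 /data/wav/BASIC5000_0001.wav
# segments output: BASIC5000_0001 BASIC5000_0001 0.225 3.987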
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed.
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO: label-based trimming is intentionally disabled for now by `and False`.
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
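P above is nnmnkwii.preprocessing. A minimal round-trip sketch of the quantization step, assuming quantize_channels is 256 (the usual WaveNet setting; the actual hparams value is not shown in this excerpt):

import numpy as np
from nnmnkwii import preprocessing as P

wav = np.sin(np.linspace(0, 2 * np.pi, 16000)).astype(np.float32)
out = P.mulaw_quantize(wav, 256)          # integer class ids in [0, 256)
wav_hat = P.inv_mulaw_quantize(out, 256)  # approximate floats back in [-1, 1]
assert out.min() >= 0 and out.max() < 256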
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'vctk-spec-%05d.npy' % index
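start_at and end_at are helpers defined elsewhere in these preprocessing scripts and are not part of this excerpt. A plausible minimal sketch of their intent, assuming leading/trailing silence labels as in the other snippets on this page (treat this as an illustration, not the original implementation):

def start_at(labels):
    # End of any leading silence, i.e. where speech starts (100 ns units).
    return labels[0][1] if "sil" in labels[0][-1] else labels[0][0]

def end_at(labels):
    # Start of any trailing silence, i.e. where speech ends (100 ns units).
    return labels[-1][0] if "sil" in labels[-1][-1] else labels[-1][1]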
def _process_utterance(out_dir, index, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = dv3.audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        assert labels[0][-1] == "silB"
        assert labels[-1][-1] == "silE"
        b = int(labels[0][1] * 1e-7 * sr)
        e = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[b:e]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = dv3.audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = dv3.audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
def load_labels_with_phone_alignment(hts_labels,
                                     binary_dict,
                                     continuous_dict,
                                     subphone_features=None,
                                     add_frame_features=False,
                                     frame_shift_in_micro_sec=50000):
    dict_size = len(binary_dict) + len(continuous_dict)
    frame_feature_size = get_frame_feature_size(subphone_features)
    dimension = frame_feature_size + dict_size

    assert isinstance(hts_labels, hts.HTSLabelFile)
    if add_frame_features:
        label_feature_matrix = np.empty((hts_labels.num_frames(), dimension))
    else:
        label_feature_matrix = np.empty((hts_labels.num_phones(), dimension))

    label_feature_index = 0

    if subphone_features == "coarse_coding":
        cc_features = compute_coarse_coding_features()

    for idx, (start_time, end_time, full_label) in enumerate(hts_labels):
        frame_number = int(end_time / frame_shift_in_micro_sec) - \
            int(start_time / frame_shift_in_micro_sec)

        label_binary_vector = pattern_matching_binary(
            binary_dict, full_label)
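A worked example of the frame_number computation above: despite its name, frame_shift_in_micro_sec is counted in the same 100 ns units as the label times, so the default of 50000 corresponds to a 5 ms frame shift:

start_time, end_time = 0, 1000000  # 100 ns units: 0.0 s to 0.1 s
frame_shift = 50000                # 5 ms per frame, in 100 ns units
frame_number = int(end_time / frame_shift) - int(start_time / frame_shift)
assert frame_number == 20          # 0.1 s / 5 ms = 20 frames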
def _process_utterance(out_dir, index, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        assert labels[0][-1] == "silB"
        assert labels[-1][-1] == "silE"
        b = int(labels[0][1] * 1e-7 * sr)
        e = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[b:e]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
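For completeness, a minimal sketch of the label access pattern these functions rely on, using the HTSLabelFile API from the test snippet at the top of this page (the phone names and times are made up):

from nnmnkwii.io import hts

labels = hts.HTSLabelFile()
labels.append((0, 500000, "silB"))          # 0.00-0.05 s leading silence
labels.append((500000, 9500000, "a"))       # 0.05-0.95 s speech
labels.append((9500000, 10000000, "silE"))  # 0.95-1.00 s trailing silence
# Indexing yields (start, end, label) triples, as the asserts above rely on:
assert labels[0][-1] == "silB" and labels[-1][-1] == "silE"
assert abs(labels[0][1] * 1e-7 - 0.05) < 1e-12  # times are in 100 ns units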