Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
'round_to_power_of_two', 'snip_edges', 'subtract_mean', 'use_energy', 'use_log_fbank',
'use_power', 'vtln_high', 'vtln_low', 'vtln_warp', 'window_type']
fn_split = fn.split('-')
assert len(fn_split) == len(arr), ('Len mismatch: %d and %d' % (len(fn_split), len(arr)))
inputs = {arr[i]: utils.parse(fn_split[i]) for i in range(len(arr))}
# print flags for C++
s = ' '.join(['--' + arr[i].replace('_', '-') + '=' + fn_split[i] for i in range(len(arr))])
logging.info(exe_path + ' --dither=0.0 --debug-mel=true ' + s + ' ' + scp_path + ' ' + out_fn)
logging.info()
# print args for python
inputs['dither'] = 0.0
logging.info(inputs)
sound, sample_rate = torchaudio.load_wav(sound_path)
kaldi_output_dict = {k: v for k, v in torchaudio.kaldi_io.read_mat_ark(out_fn)}
res = torchaudio.compliance.kaldi.fbank(sound, **inputs)
torch.set_printoptions(precision=10, sci_mode=False)
logging.info(res)
logging.info(kaldi_output_dict['my_id'])
def get_output_fn(sound, args):
output = kaldi.fbank(
sound,
blackman_coeff=args[1],
dither=0.0,
energy_floor=args[2],
frame_length=args[3],
frame_shift=args[4],
high_freq=args[5],
htk_compat=args[6],
low_freq=args[7],
num_mel_bins=args[8],
preemphasis_coefficient=args[9],
raw_energy=args[10],
remove_dc_offset=args[11],
round_to_power_of_two=args[12],
snip_edges=args[13],
subtract_mean=args[14],
def __init__(self, mode="fbank", num_mel_bins=40, **kwargs):
    """Configure which Kaldi-compatible feature extractor this object uses.

    Args:
        mode: ``"fbank"`` selects ``torchaudio.compliance.kaldi.fbank``;
            any other value selects ``torchaudio.compliance.kaldi.mfcc``.
        num_mel_bins: Stored on the instance as ``num_mel_bins``.
        **kwargs: Extra keyword options, stored unchanged on the instance
            as ``kwargs``.
    """
    super(ExtractAudioFeature, self).__init__()
    self.mode = mode
    # Resolve the extraction callable once, at construction time.
    if mode == "fbank":
        self.extract_fn = torchaudio.compliance.kaldi.fbank
    else:
        self.extract_fn = torchaudio.compliance.kaldi.mfcc
    self.num_mel_bins = num_mel_bins
    self.kwargs = kwargs
def __getitem__(self, index):
    """Return the normalized fbank features and target for one example.

    Args:
        index: Position of the example in ``self.aud_paths`` (and in
            ``self.tgt`` when targets are present).

    Returns:
        dict: ``{"id": index, "data": [features, target]}`` where
        ``features`` is the detached output of
        ``data_utils.apply_mv_norm`` applied to the fbank matrix and
        ``target`` is ``self.tgt[index]``, or ``None`` when ``self.tgt``
        is ``None``.

    Raises:
        FileNotFoundError: If the audio path for ``index`` does not exist.
    """
    # Imported lazily inside the method, as in the original code.
    import torchaudio
    import torchaudio.compliance.kaldi as kaldi

    if self.tgt is None:
        target = None
    else:
        target = self.tgt[index]

    audio_path = self.aud_paths[index]
    if not os.path.exists(audio_path):
        raise FileNotFoundError("Audio file not found: {}".format(audio_path))

    # The sample rate returned by the loader is not used below.
    waveform, _ = torchaudio.load_wav(audio_path)
    features = kaldi.fbank(
        waveform,
        num_mel_bins=self.num_mel_bins,
        frame_length=self.frame_length,
        frame_shift=self.frame_shift,
    )
    # NOTE(review): apply_mv_norm is presumably mean/variance
    # normalization -- confirm against data_utils.
    normalized = data_utils.apply_mv_norm(features)
    return {"id": index, "data": [normalized.detach(), target]}
def speech_to_features(self, speech_in: Tuple[numpy.array, int]):
    """
    Turn an utterance into filterbank features padded with zero pitch columns.

    Args:
        speech_in (tuple(np.array, int)): The utterance samples as a numpy
            array (index 0) and the sampling rate (index 1).

    Returns:
        dict: ``{'speech_features': features}`` where ``features`` is a numpy
        array holding the 80-bin filterbank output with three all-zero
        columns appended along dimension 1.
    """
    sampling_rate = speech_in[1]
    # Add a leading dimension of size 1 before handing the samples to fbank.
    waveform = torch.unsqueeze(torch.from_numpy(speech_in[0]), 0)
    # Default ASR model uses 16kHz, but different models are possible; then
    # the sampling rate only needs to be changed in the recorder.
    mel_features = torchaudio.compliance.kaldi.fbank(
        waveform,
        num_mel_bins=80,
        sample_frequency=sampling_rate,
    )
    # Placeholder pitch features: one row per filterbank frame, three zeros.
    # TODO: check if torchaudio pitch function is better
    n_frames = mel_features.shape[0]
    zero_pitch = torch.zeros(n_frames, 3)
    combined = torch.cat([mel_features, zero_pitch], 1)
    return {'speech_features': combined.numpy()}
def speech_to_fbank(self, speech_in):
    """
    Extract filterbank features from the input utterance.

    ``fbank`` is called without ``num_mel_bins``, so the library default
    applies (23 according to the torchaudio documentation -- confirm for the
    installed version).

    Args:
        speech_in (tuple(np.array, int)): The utterance samples as a numpy
            array (index 0) and the sampling rate (index 1).

    Returns:
        dict: ``{'fbank': features}`` where ``features`` is the tensor
        returned by ``torchaudio.compliance.kaldi.fbank``.
    """
    # Add a leading dimension of size 1 before handing the samples to fbank.
    waveform = torch.unsqueeze(torch.from_numpy(speech_in[0]), 0)
    features = torchaudio.compliance.kaldi.fbank(waveform, sample_frequency=speech_in[1])
    return {'fbank': features}