if subset and speech:
    rand_idx = np.arange(len(files))
    n_files = min(subset, len(files))
    self.rng.shuffle(rand_idx)
    files = [files[i] for i in rand_idx[:n_files]]

self.size_by_samples[word] = len(files)

# add each file to the corpus
for filename in files:
    file_loc = os.path.join(self.basedir, word, os.path.basename(filename))

    # could also add score of original model for each word?
    if speech:
        meta = Meta(word=word, speech=speech, file_loc=file_loc)
    else:
        noise_type = os.path.basename(filename).split(".")[0]
        meta = Meta(word="NA", noise_type=noise_type, speech=speech, file_loc=file_loc)

    if meta.match(**kwargs):
        self.add_sample(GoogleSample(filename, **meta.as_dict()))
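The subsetting idiom above, shuffling an index array and keeping the first n_files entries, yields a reproducible random subset whenever self.rng is seeded. A standalone sketch of the same idiom, assuming a seeded NumPy generator (the function name random_subset is illustrative, not part of the snippet):

import numpy as np

def random_subset(files, n, seed=0):
    # Shuffle indices rather than the file list itself, then keep the
    # first n; a fixed seed makes the selection reproducible.
    rng = np.random.default_rng(seed)
    idx = np.arange(len(files))
    rng.shuffle(idx)
    return [files[i] for i in idx[:min(n, len(files))]]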
def __init__(self, data, **kwargs):
    ''' Dummy init method '''
    self.data = data
    self.meta = Meta(**kwargs)
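Every snippet here hinges on Meta.match(**kwargs) acting as an attribute filter and Meta.as_dict() flattening the metadata. A minimal sketch of that contract, assuming equality matching with a list treated as a set of accepted values; the library's actual Meta implementation may differ in details:

class Meta:
    # Minimal stand-in for the metadata class used above (assumption:
    # match compares each keyword to the stored attribute, and a
    # list/tuple value means "any of these is acceptable").
    def __init__(self, **attr):
        self.attr = attr

    def as_dict(self):
        # Flat dict of all metadata fields, as forwarded to the samples.
        return dict(self.attr)

    def match(self, **kwargs):
        for key, val in kwargs.items():
            stored = self.attr.get(key)
            if isinstance(val, (list, tuple)):
                if stored not in val:
                    return False
            elif stored != val:
                return False
        return True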
def build_corpus(self, **kwargs):
    '''
    Build the corpus with some filters (sex, lang, accent, sentence_tag, sentence)
    '''
    # Check all the sentences
    for tag, info in cmu_arctic_sentences.items():
        # And all speakers for each sentence
        for speaker, path in info['paths'].items():
            # This is the metadata for this sample
            meta = Meta(speaker=speaker, tag=tag, text=info['text'], **cmu_arctic_speakers[speaker])
            # If there is a match, add it.
            # We run the match before creating the Sentence object so that
            # we don't read the file from disk when there is no match.
            if meta.match(**kwargs):
                self.add_sample(CMUArcticSentence(path, **meta.as_dict()))
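A hedged usage sketch of the filters named in the docstring; the constructor arguments here are assumptions about the surrounding class, not a verified API, and the accepted filter values depend on the cmu_arctic_speakers table:

# Hypothetical usage; CMUArcticCorpus and its constructor arguments are
# assumed from context.
corpus = CMUArcticCorpus(download=True, build=False)
corpus.build_corpus(sex='female', lang='US English')
print(len(corpus.samples), 'sentences matched the filters')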
self.recordings = self.samples  # this is for convenience

eval_dir = os.path.join(self.basedir, 'eval')
dev_dir = os.path.join(self.basedir, 'dev')
if not os.path.exists(eval_dir) or not os.path.exists(dev_dir):
    warnings.warn(
        "The 'eval' and/or 'dev' folders are missing. "
        "Please check the structure of the dataset directory."
    )

for path, dirs, files in os.walk(self.basedir):
    m = RE_PATH.search(path)
    if m:
        dev = m.group(1) != 'eval'
        task, rec, array = int(m.group(2)), int(m.group(3)), m.group(4)
        meta = Meta(task=task, rec=rec, array=array, dev=dev)
        if array in locata_arrays and meta.match(**kwargs):
            if verbose:
                print(path)
            self.samples.append(
                LocataRecording(path, task=task, rec=rec, array=array, dev=dev)
            )

if len(self) == 0:
    warnings.warn('Nothing was imported. The dataset can be downloaded at ' + url + '.')
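RE_PATH is referenced above but not shown. A hypothetical reconstruction consistent with how its groups are consumed: group 1 is the 'dev'/'eval' split, group 2 the task number, group 3 the recording number, and group 4 the array name:

import re

# Hypothetical pattern; the real RE_PATH may differ, but it must expose
# the four groups consumed by m.group(1) through m.group(4) above.
RE_PATH = re.compile(r'(dev|eval)/task(\d+)/recording(\d+)/(\w+)$')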