# Imports assumed for this excerpt (older fairseq layout, where `dictionary`
# and `Tokenizer` live at the package top level).
import os
from itertools import zip_longest

from fairseq import dictionary
from fairseq.tokenizer import Tokenizer

make_all(args, make_dataset, args.target_lang)
print('| Wrote preprocessed data to {}'.format(args.destdir))

# Build a source->target co-occurrence frequency map from the word alignments.
if args.alignfile:
    assert args.trainpref, "--trainpref must be set if --alignfile is specified"
    src_file_name = '{}.{}'.format(args.trainpref, args.source_lang)
    tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang)
    src_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)))
    tgt_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)))
    freq_map = {}
    with open(args.alignfile, 'r') as align_file:
        with open(src_file_name, 'r') as src_file:
            with open(tgt_file_name, 'r') as tgt_file:
                for a, s, t in zip_longest(align_file, src_file, tgt_file):
                    si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                    ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                    # Alignments come as "srcpos-tgtpos" pairs.
                    ai = list(map(lambda x: tuple(x.split('-')), a.split()))
                    for sai, tai in ai:
                        srcidx = si[int(sai)]
                        tgtidx = ti[int(tai)]
                        if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                            assert srcidx != src_dict.pad()
                            assert srcidx != src_dict.eos()
                            assert tgtidx != tgt_dict.pad()
                            assert tgtidx != tgt_dict.eos()
                            if srcidx not in freq_map:
                                freq_map[srcidx] = {}
                            if tgtidx not in freq_map[srcidx]:
                                freq_map[srcidx][tgtidx] = 1
                            else:
                                freq_map[srcidx][tgtidx] += 1
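# A minimal sketch of how freq_map is typically collapsed afterwards: keep the
# most frequent target index per source index and dump the symbol pairs.
# `align_dict` and the output filename are illustrative, not from the excerpt.
align_dict = {
    srcidx: max(tgt_counts, key=tgt_counts.get)
    for srcidx, tgt_counts in freq_map.items()
}
with open(os.path.join(args.destdir, 'alignment.txt'), 'w') as f:
    for k, v in align_dict.items():
        print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)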
# Imports and the Batch record assumed by the snippets below (namedtuple as
# in fairseq's interactive.py).
from collections import namedtuple

import numpy as np
from fairseq import data, tokenizer

Batch = namedtuple('Batch', 'srcs tokens lengths')


def make_batches(lines, args, task, max_positions):
    tokens = [
        tokenizer.Tokenizer.tokenize(src_str, task.source_dictionary, add_if_not_exist=False).long()
        for src_str in lines
    ]
    lengths = np.array([t.numel() for t in tokens])
    itr = task.get_batch_iterator(
        dataset=data.LanguagePairDataset(tokens, lengths, task.source_dictionary),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    for batch in itr:
        yield Batch(
            srcs=[lines[i] for i in batch['id']],
            tokens=batch['net_input']['src_tokens'],
            lengths=batch['net_input']['src_lengths'],
        ), batch['id']
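# Hypothetical driver loop for the generator above; `lines`, `args`, `task`
# and `max_positions` are assumed to come from the surrounding script.
for batch, batch_ids in make_batches(lines, args, task, max_positions):
    print(batch_ids, batch.tokens.size())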
def make_batches(lines, src_dict, max_positions):
    tokens = [
        tokenizer.Tokenizer.tokenize(src_str, src_dict, add_if_not_exist=False).long()
        for src_str in lines
    ]
    lengths = np.array([t.numel() for t in tokens])
    itr = data.EpochBatchIterator(
        dataset=data.MonolingualDataset([(s[:-1], s[1:]) for s in tokens], lengths, src_dict, False),
        max_tokens=100,
        max_sentences=5,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    return itr
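# Sketch of consuming the iterator returned above; the batch layout mirrors
# the 'net_input' dicts used elsewhere on this page.
itr = make_batches(lines, src_dict, max_positions)
for batch in itr:
    src_tokens = batch['net_input']['src_tokens']  # (batch, seq) LongTensor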
def make_batches(self, templates, deleted_words, src_dict, max_positions):
    temps = [
        tokenizer.Tokenizer.tokenize(temp, src_dict, add_if_not_exist=False, tokenize=lambda x: x).long()
        for temp in templates
    ]
    deleted = [
        tokenizer.Tokenizer.tokenize(word, src_dict, add_if_not_exist=False, tokenize=lambda x: x).long()
        for word in deleted_words
    ]
    inputs = [
        {'template': temp, 'deleted': dw}
        for temp, dw in zip(temps, deleted)
    ]
    lengths = np.array([t['template'].numel() for t in inputs])
    dataset = EditDataset(inputs, lengths, src_dict, insert=self.model_args.insert, combine=self.model_args.combine)
    itr = self.task.get_batch_iterator(
        dataset=dataset,
        max_tokens=100,
        max_sentences=5,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    return itr
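# Illustrative call: because tokenize=lambda x: x, templates and deleted
# words are passed pre-split; `self` is assumed to carry `task` and
# `model_args` as in the method above.
templates = [['the', '<unk>', 'sat', '.']]
deleted_words = [['cat']]
itr = self.make_batches(templates, deleted_words, src_dict, max_positions)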
def make_batches(lines, args, src_dict, max_positions, tgt_str=None, tgt_dict=None):
    tokens = [
        tokenizer.Tokenizer.tokenize(src_str, src_dict, add_if_not_exist=False).long()
        for src_str in lines
    ]
    if tgt_str is not None:
        tgt_tokens = [
            tokenizer.Tokenizer.tokenize(tgt_str, tgt_dict, add_if_not_exist=False).long()
        ]
    else:
        tgt_tokens = None
    lengths = np.array([t.numel() for t in tokens])
    itr = data.EpochBatchIterator(
        dataset=data.LanguagePairDataset(tokens, lengths, src_dict, tgt=tgt_tokens, tgt_sizes=None, tgt_dict=tgt_dict),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    for batch in itr:
        if tgt_str is not None:
            yield Batch(
                srcs=[lines[i] for i in batch['id']],
                tokens=batch['net_input']['src_tokens'],
                lengths=batch['net_input']['src_lengths'],
            ), batch['id']
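# Hypothetical driver for the variant above; `reference` stands in for a
# target-side string supplied by the caller.
for batch, batch_ids in make_batches(lines, args, src_dict, max_positions,
                                     tgt_str=reference, tgt_dict=tgt_dict):
    print(batch_ids, batch.tokens.size())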
if args.print_alignment:
    print('A-{}\t{}'.format(
        sample_id,
        ' '.join(map(lambda x: str(utils.item(x)), alignment))
    ))

# Compare best scores
max_fluency_score = max(hypo_fluency_score_list)
max_idx = hypo_fluency_score_list.index(max_fluency_score)
max_hypo_str = hypo_str_list[max_idx]
if max_fluency_score <= best_fluency_score:
    # Score only the top hypothesis
    if align_dict is not None or args.remove_bpe is not None:
        # Convert back to tokens for evaluation with unk replacement and/or without BPE
        target_tokens = tokenizer.Tokenizer.tokenize(target_str, tgt_dict, add_if_not_exist=True)
    max_tokens = hypo_tokens_list[max_idx]
    scorer.add(target_tokens, max_tokens)
    hypoths.append(max_hypo_str)
    break
else:
    # Keep boosting
    iteration += 1
    curr_src_str = max_hypo_str
    best_fluency_score = max_fluency_score
    best_hypo_str = max_hypo_str

wps_meter.update(src_tokens.size(0))
t.log({'wps': round(wps_meter.avg)})
num_sentences += 1
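# Self-contained toy of the boost-until-no-improvement control flow above;
# rewrite_fn and score_fn are stand-ins for the real decoder and fluency scorer.
def fluency_boost(src, rewrite_fn, score_fn, max_iter=5):
    best_str, best_score = src, score_fn(src)
    for _ in range(max_iter):
        cand = rewrite_fn(best_str)
        cand_score = score_fn(cand)
        if cand_score <= best_score:
            break  # no fluency gain: stop, mirroring the break above
        best_str, best_score = cand, cand_score  # keep boosting
    return best_str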
def make_batches(self, lines, src_dict, max_positions, tokenize=str.split):
    tokens = [
        tokenizer.Tokenizer.tokenize(src_str, src_dict, add_if_not_exist=False, tokenize=tokenize).long()
        for src_str in lines
    ]
    lengths = np.array([t.numel() for t in tokens])
    # Load dataset
    # MonolingualDataset[i] = source, future_target, past_target
    # all targets are effectively ignored during inference
    dataset = data.MonolingualDataset(
        dataset=[(s[:-1], s[1:], None) for s in tokens],
        sizes=lengths, src_vocab=src_dict, tgt_vocab=src_dict,
        add_eos_for_other_targets=False, shuffle=False)
    itr = self.task.get_batch_iterator(
        dataset=dataset,
        max_tokens=100,
        max_sentences=5,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    return itr
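# Illustrative use, assuming whitespace tokenization and that `self.task` is
# a fairseq language-modeling task as in the method above.
itr = self.make_batches(['the cat sat .'], src_dict, max_positions)
for batch in itr:
    src_tokens = batch['net_input']['src_tokens']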
def post_process_prediction(hypo_tokens, src_str, alignment, align_dict, tgt_dict, remove_bpe):
    from fairseq import tokenizer
    hypo_str = tgt_dict.string(hypo_tokens, remove_bpe)
    if align_dict is not None:
        hypo_str = replace_unk(hypo_str, src_str, alignment, align_dict, tgt_dict.unk_string())
    if align_dict is not None or remove_bpe is not None:
        # Convert back to tokens for evaluating with unk replacement or without BPE
        # Note that the dictionary can be modified inside the method.
        hypo_tokens = tokenizer.Tokenizer.tokenize(hypo_str, tgt_dict, add_if_not_exist=True)
    return hypo_tokens, hypo_str, alignment
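# Typical call site in a generation loop (as in fairseq's generate scripts);
# `hypo` is one entry of the decoder's hypothesis list.
hypo_tokens, hypo_str, alignment = post_process_prediction(
    hypo['tokens'].int().cpu(), src_str, hypo['alignment'],
    align_dict, tgt_dict, args.remove_bpe,
)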