        *[m.max_positions() for m in models],
    ),
    ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
    required_batch_size_multiple=args.required_batch_size_multiple,
    seed=args.seed,
    num_workers=args.num_workers,
).next_epoch_itr(shuffle=False)
progress = progress_bar.build_progress_bar(
    args, itr,
    prefix='valid on \'{}\' subset'.format(subset),
    no_progress_bar='simple'
)

log_outputs = []
for i, sample in enumerate(progress):
    sample = utils.move_to_cuda(sample) if use_cuda else sample
    _loss, _sample_size, log_output = task.valid_step(sample, model, criterion)
    progress.log(log_output, step=i)
    log_outputs.append(log_output)

log_output = task.aggregate_logging_outputs(log_outputs, criterion)
progress.print(log_output, tag=subset, step=i)

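Every snippet on this page follows the same pattern: a collated batch is a nested dict of tensors, and fairseq's utils.move_to_cuda walks that structure and moves each tensor to the GPU before the forward pass. Below is a minimal, self-contained sketch of that idea; move_sample_to_cuda is a hypothetical stand-in for the library helper, not its actual implementation.

import torch

def move_sample_to_cuda(sample):
    # Recursively move every tensor in a nested dict/list/tuple to the GPU,
    # leaving non-tensor values (ids, strings, ints) untouched.
    if torch.is_tensor(sample):
        return sample.cuda()
    if isinstance(sample, dict):
        return {k: move_sample_to_cuda(v) for k, v in sample.items()}
    if isinstance(sample, (list, tuple)):
        return type(sample)(move_sample_to_cuda(x) for x in sample)
    return sample

batch = {
    'id': torch.tensor([0, 1]),
    'net_input': {'src_tokens': torch.zeros(2, 5, dtype=torch.long)},
}
if torch.cuda.is_available():
    batch = move_sample_to_cuda(batch)
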
def disambiguate_pronoun(self, model, sentence, use_cuda=False):
    sample_json = wsc_utils.convert_sentence_to_json(sentence)
    dataset = self.build_dataset_for_inference(sample_json)
    sample = dataset.collater([dataset[0]])
    if use_cuda:
        sample = utils.move_to_cuda(sample)

    def get_masked_input(tokens, mask):
        masked_tokens = tokens.clone()
        masked_tokens[mask.bool()] = self.mask
        return masked_tokens

    def get_lprobs(tokens, mask):
        logits, _ = model(src_tokens=get_masked_input(tokens, mask))
        lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float)
        scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1)
        mask = mask.type_as(scores)
        scores = (scores * mask).sum(dim=-1) / mask.sum(dim=-1)
        return scores

    cand_lprobs = get_lprobs(
        sample['candidate_tokens'][0],

def _prepare_sample(self, sample):
    if sample is None or len(sample) == 0:
        return None

    if self.cuda:
        sample = utils.move_to_cuda(sample)

    def apply_half(t):
        if t.dtype is torch.float32:
            return t.half()
        return t

    if self.args.fp16:
        sample = utils.apply_to_sample(apply_half, sample)

    return sample

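A standalone illustration of the apply_half step above (not tied to any particular trainer): only float32 tensors are downcast to fp16, so integer tensors such as token ids keep their dtype.

import torch

def apply_half(t):
    if t.dtype is torch.float32:
        return t.half()
    return t

assert apply_half(torch.zeros(3, dtype=torch.long)).dtype is torch.long        # ids stay integral
assert apply_half(torch.zeros(3, dtype=torch.float32)).dtype is torch.float16  # activations go to fp16
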
        [{'id': 1, 'source': 'hello world', 'target': 'hallo welt'}]

    Args:
        samples (List[dict]): samples to backtranslate. Individual samples are
            expected to have a 'source' key, which will become the 'target'
            after backtranslation.
        collate_fn (callable): function to collate samples into a mini-batch
        generate_fn (callable): function to generate backtranslations
        cuda (bool): use GPU for generation (default: ``True``)

    Returns:
        List[dict]: an updated list of samples with a backtranslated source
    """
    collated_samples = collate_fn(samples)
    s = utils.move_to_cuda(collated_samples) if cuda else collated_samples
    generated_sources = generate_fn(s)

    id_to_src = {
        sample['id']: sample['source'] for sample in samples
    }

    # Go through each tgt sentence in batch and its corresponding best
    # generated hypothesis and create a backtranslation data pair
    # {id: id, source: generated backtranslation, target: original tgt}
    return [
        {'id': id.item(), 'target': id_to_src[id.item()], 'source': hypos[0]['tokens'].cpu()}
        for id, hypos in zip(collated_samples['id'], generated_sources)
    ]

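A hedged usage sketch of the function above (in fairseq this is backtranslate_samples(samples, collate_fn, generate_fn, cuda=True) from the backtranslation dataset). The toy collate/generate callables below are illustrative stand-ins that only follow the contract described in the docstring; a real caller would pass a dataset collater and a SequenceGenerator-backed function.

import torch
# Import path as in fairseq at the time of these snippets; it may vary by version.
from fairseq.data.backtranslation_dataset import backtranslate_samples

samples = [{'id': 1, 'source': torch.tensor([7, 8, 2])}]

def toy_collate(batch):
    # Stand-in collater: stack ids and source tokens into a mini-batch dict.
    return {
        'id': torch.LongTensor([s['id'] for s in batch]),
        'net_input': {'src_tokens': torch.stack([s['source'] for s in batch])},
    }

def toy_generate(collated):
    # Stand-in generator: one list of hypotheses per sentence, best first.
    return [[{'tokens': torch.tensor([9, 10, 2])}] for _ in collated['id']]

backtranslated = backtranslate_samples(samples, toy_collate, toy_generate, cuda=False)
# -> [{'id': 1, 'target': tensor([7, 8, 2]), 'source': tensor([ 9, 10,  2])}]
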
# Generate and compute score
coco = task.dataset(args.gen_subset).coco
iou_types = ['bbox']
scorer = CocoEvaluator(coco, iou_types)

num_images = 0
with progress_bar.build_progress_bar(
    args, itr,
    prefix='inference on \'{}\' subset'.format(args.gen_subset),
    no_progress_bar='simple',
) as progress:
    wps_meter = TimeMeter()
    for sample in progress:
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        gen_timer.start()
        hypos = task.inference_step(generator, models, sample)
        num_generated_boxes = sum(len(h['scores']) for h in hypos)
        gen_timer.stop(num_generated_boxes)

        result = {}
        for i, sample_id in enumerate(sample['id'].tolist()):
            result[sample_id] = hypos[i]
        scorer.update(result)

        wps_meter.update(num_generated_boxes)
        progress.log({'wps': round(wps_meter.avg)})
        num_images += sample['nsentences']

print('| Detected {} images ({} tokens) in {:.1f}s ({:.2f} images/s, {:.2f} tokens/s)'.format(

Args:
    maxlen_a/b: generate sequences of maximum length ax + b,
        where x is the source sentence length.
    cuda: use GPU for generation
    timer: StopwatchMeter for timing generations.
"""
if maxlen_b is None:
    maxlen_b = self.maxlen

for sample in data_itr:
    if "net_input" not in sample:
        continue
    if cuda:
        s = utils.move_to_cuda(sample)
    else:
        s = sample
    input = s["net_input"]
    srclen = input["src_tokens"].size(1)
    if self.use_char_source:
        encoder_input = {
            k: v
            for k, v in input.items()
            if k in ["src_tokens", "src_lengths", "char_inds", "word_lengths"]
        }
    else:
        encoder_input = {
            k: v for k, v in input.items() if k in ["src_tokens", "src_lengths"]
        }
    if timer is not None:
        timer.start()

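The maxlen_a/maxlen_b arguments documented above define a per-sentence length budget: the maximum target length is maxlen_a * x + maxlen_b, where x is the source length. A small illustration with arbitrary numbers:

maxlen_a, maxlen_b = 1.5, 10
srclen = 20
max_target_len = int(maxlen_a * srclen + maxlen_b)  # 1.5 * 20 + 10 -> 40
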
def _prepare_sample(self, sample):
    if sample is None or len(sample) == 0:
        return None
    return utils.move_to_cuda(sample)

    prefix_size=0,
):
    """Iterate over a batched dataset and yield individual translations.

    Args:
        maxlen_a/b: generate sequences of maximum length ax + b,
            where x is the source sentence length.
        cuda: use GPU for generation
        timer: StopwatchMeter for timing generations.
    """
    if maxlen_b is None:
        maxlen_b = self.maxlen

    for sample in data_itr:
        if cuda:
            s = utils.move_to_cuda(sample)
        else:
            s = sample
        input = s["net_input"]
        # Take the max source length to compute the max target length
        srclen = input["src_tokens"].size(1)
        # FIXME: handle characters properly
        if self.use_char_source:
            raise ValueError(
                "Character level encoder is not supported yet for "
                "multisource sentences."
            )
        encoder_inputs = (input["src_tokens"], input["src_lengths"])
        if timer is not None:
            timer.start()
        with torch.no_grad():
            hypos = self.generate(
                encoder_inputs,
                srcs_ids=input["src_ids"],

itr = progress_bar.build_progress_bar(
    args=progress_bar_args,
    iterator=itr,
    prefix=f"top-k probs eval",
    no_progress_bar="simple",
)
for sample in itr:
    sentence_ids = sample["id"]
    target_lengths = (
        (sample["net_input"]["prev_output_tokens"] != dataset.tgt_dict.pad())
        .sum(axis=1)
        .numpy()
    )
    if use_cuda:
        sample = utils.move_to_cuda(sample)

    avg_probs = None
    for model in models:
        with torch.no_grad():
            net_output = model(**sample["net_input"])
            probs = model.get_normalized_probs(net_output, log_probs=False)
        if avg_probs is None:
            avg_probs = probs
        else:
            avg_probs.add_(probs)
    avg_probs.div_(len(models))

    top_k_avg_probs, indices = torch.topk(avg_probs, k=k)
    top_k_probs_normalized = F.normalize(top_k_avg_probs, p=1, dim=2).cpu()
    indices = indices.cpu()

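A self-contained illustration of the last two steps above: keep the k most probable tokens at each target position, then L1-normalize so the kept probabilities again sum to 1 (shapes and k here are arbitrary).

import torch
import torch.nn.functional as F

probs = torch.softmax(torch.randn(2, 4, 10), dim=-1)           # (batch, target_len, vocab)
top_k_probs, indices = torch.topk(probs, k=3)                   # k best tokens per position
top_k_probs_normalized = F.normalize(top_k_probs, p=1, dim=2)   # renormalize over the kept k
assert torch.allclose(top_k_probs_normalized.sum(dim=2), torch.ones(2, 4))
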
def generate_batched_itr(
    self, data_itr, beam_size=None, maxlen_a=0.0, maxlen_b=None,
    cuda=False, timer=None, prefix_size=0,
):
    """Iterate over a batched dataset and yield individual translations.

    Args:
        maxlen_a/b: generate sequences of maximum length ax + b,
            where x is the source sentence length.
        cuda: use GPU for generation
        timer: StopwatchMeter for timing generations.
    """
    if maxlen_b is None:
        maxlen_b = self.maxlen

    for sample in data_itr:
        s = utils.move_to_cuda(sample) if cuda else sample
        if 'net_input' not in s:
            continue
        input = s['net_input']
        # model.forward normally channels prev_output_tokens into the decoder
        # separately, but SequenceGenerator directly calls model.encoder
        encoder_input = {
            k: v for k, v in input.items()
            if k != 'prev_output_tokens'
        }
        srclen = encoder_input['src_tokens'].size(1)
        # set desired length to batch instances
        if self.desired_length > -1:
            encoder_input['target_length'].fill_(self.desired_length + 1)  # +1 for EOS
        if timer is not None:
            timer.start()
        with torch.no_grad():