        logger.debug(f'Loaded {binpath}')
        return word2vec, dim
    except IOError:
        pass

dim = None
word2vec = dict()
# parse the plain-text embedding file: each line holds a word followed by its vector
with open(realpath, encoding='utf-8', errors='ignore') as f:
    for idx, line in enumerate(f):
        line = line.rstrip().split(delimiter)
        if len(line) > 2:
            if dim is None:
                dim = len(line)
            else:
                if len(line) != dim:
                    logger.warning(f'{path}#{idx + 1}: length does not match the expected {dim}')
                    continue
            word, vec = line[0], line[1:]
            word2vec[word] = np.array(vec, dtype=np.float32)
# the first column is the word itself, so the actual vector dimension is one less
dim -= 1
if cache:
    save_pickle((word2vec, dim), binpath)
    logger.debug(f'Cached {binpath}')
return word2vec, dim
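
# --- Hedged usage sketch, not part of the source above. Assuming a loader with the shape
# seen in the fragment (returning a word-to-vector dict plus the vector dimension), this
# shows how the result could be turned into an embedding matrix for a hypothetical
# index-to-string vocabulary list `itos`; the uniform initialisation for unknown words is
# an illustrative choice, not something the fragment prescribes.
import numpy as np

def build_embedding_matrix(word2vec: dict, dim: int, itos: list) -> np.ndarray:
    matrix = np.zeros((len(itos), dim), dtype=np.float32)
    for i, token in enumerate(itos):
        vec = word2vec.get(token)
        # fall back to small random vectors for words absent from the pretrained file
        matrix[i] = vec if vec is not None else np.random.uniform(-0.1, 0.1, dim)
    return matrix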
raw_batch = [[], [], [], []]
max_len = len(max([corpus[i] for i in indices], key=len))
for idx in indices:
    # per-sentence arc (boolean) and relation-id matrices, sized to the longest sentence in the batch
    arc = np.zeros((max_len, max_len), dtype=bool)  # plain bool: np.bool was removed in recent NumPy
    rel = np.zeros((max_len, max_len), dtype=np.int64)
    for b in raw_batch[:2]:
        b.append([])
    for m, cells in enumerate(corpus[idx]):
        for b, c, v in zip(raw_batch, cells,
                           [self.form_vocab, self.cpos_vocab]):
            b[-1].append(v.get_idx_without_add(c))
        for n, r in zip(cells[2], cells[3]):
            arc[m, n] = True
            rid = self.rel_vocab.get_idx_without_add(r)
            if rid is None:
                logger.warning(f'Relation OOV: {r} does not exist in the training set')
                continue
            rel[m, n] = rid
    raw_batch[-2].append(arc)
    raw_batch[-1].append(rel)
batch = []
for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab]):
    b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post',
                                                      value=v.safe_pad_token_idx,
                                                      dtype='int64')
    batch.append(b)
batch += raw_batch[2:]
assert len(batch) == 4
yield (batch[0], batch[1]), (batch[2], batch[3])
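
# --- Hedged illustration, not from the source above. A minimal, standalone example of the
# padding step the fragment relies on: tf.keras.preprocessing.sequence.pad_sequences pads
# ragged lists of token ids on the right ('post') with a chosen pad index, which is exactly
# what happens to the form/CPOS id lists before the batch is yielded. The ids and pad value
# here are made up for the example.
import tensorflow as tf

ids = [[3, 7, 2], [5, 9], [4]]  # token ids for three sentences of unequal length
padded = tf.keras.preprocessing.sequence.pad_sequences(
    ids, padding='post', value=0, dtype='int64')
print(padded)
# [[3 7 2]
#  [5 9 0]
#  [4 0 0]]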
tokens = []
label_ids = []
for word, label in zip(words, labels):
    word_tokens = tokenizer.tokenize(word)
    if not word_tokens:
        # some weird characters cause the tokenizer to return an empty list
        word_tokens = [unk_token] * len(word)
    tokens.extend(word_tokens)
    # Use the real label id for the first token of the word, and padding ids for the remaining tokens
    label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

# Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
special_tokens_count = 3 if sep_token_extra else 2
if len(tokens) > max_seq_length - special_tokens_count:
    logger.warning(
        f'Input tokens {words} exceed the max sequence length of {max_seq_length - special_tokens_count}. '
        f'The exceeded part will be truncated and ignored. '
        f'It is recommended to split long text into several sentences of at most '
        f'{max_seq_length - special_tokens_count} tokens beforehand.')
    tokens = tokens[: (max_seq_length - special_tokens_count)]
    label_ids = label_ids[: (max_seq_length - special_tokens_count)]

# The convention in BERT is:
# (a) For sequence pairs:
#  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
#  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1  1   1
# (b) For single sequences:
#  tokens:   [CLS] the dog is hairy . [SEP]
#  type_ids:   0   0   0   0  0     0   0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence.