import torch
from transformers import RobertaModel, RobertaTokenizer

import flair


def test_roberta_embeddings():
    roberta_model: str = "roberta-base"

    tokenizer = RobertaTokenizer.from_pretrained(roberta_model)
    model = RobertaModel.from_pretrained(
        pretrained_model_name_or_path=roberta_model, output_hidden_states=True
    )
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        # wrap the sentence in RoBERTa's <s>/</s> special tokens before encoding
        tokens = tokenizer.tokenize("<s> " + s + " </s>")
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        # with output_hidden_states=True, the last element of the model output
        # is the tuple of hidden states (embedding layer + one per transformer layer)
        hidden_states = model(tokens_tensor)[-1]

        # first transformer layer, first (and only) batch element
        first_layer = hidden_states[1][0]

    # one vector per (sub)token
    assert len(first_layer) == len(tokens)
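For orientation, here is a minimal standalone sketch (not part of the original test) of what the hidden-states tuple looks like for roberta-base: with output_hidden_states=True the model returns 13 hidden states, the embedding-layer output plus one per transformer layer, each of shape (batch, sequence_length, 768).

import torch
from transformers import RobertaModel, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base", output_hidden_states=True)
model.eval()

tokens = tokenizer.tokenize("<s> Berlin </s>")
ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
with torch.no_grad():
    hidden_states = model(ids)[-1]

assert len(hidden_states) == 13                    # embedding layer + 12 transformer layers
assert hidden_states[1].shape == (1, len(tokens), 768)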
# fragment of flair's ELMoEmbeddings.__init__ (requires `import re` and the
# allennlp package): select option/weight files for the chosen ELMo variant
if model in ["large", "5.5B"]:
    options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json"
    weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5"
if model == "pt" or model == "portuguese":
    options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_options.json"
    weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_weights.hdf5"
if model == "pubmed":
    options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pubmed/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pubmed/elmo_2x4096_512_2048cnn_2xhighway_weights_PubMed_only.hdf5"

# put on CUDA if available
from flair import device

if re.fullmatch(r"cuda:[0-9]+", str(device)):
    cuda_device = int(str(device).split(":")[-1])
elif str(device) == "cpu":
    cuda_device = -1
else:
    cuda_device = 0

self.ee = allennlp.commands.elmo.ElmoEmbedder(
    options_file=options_file, weight_file=weight_file, cuda_device=cuda_device
)

# embed a dummy sentence to determine embedding_length
dummy_sentence: Sentence = Sentence()
dummy_sentence.add_token(Token("hello"))
embedded_dummy = self.embed(dummy_sentence)
self.__embedding_length: int = len(
    embedded_dummy[0].get_token(1).get_embedding()
)
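In normal use this selection logic stays hidden behind flair's public ELMoEmbeddings class; a minimal usage sketch (assuming the allennlp dependency is installed) looks like this:

from flair.data import Sentence
from flair.embeddings import ELMoEmbeddings

# "pt" and "pubmed" select the contributed models whose URLs appear above
embedding = ELMoEmbeddings("small")

sentence = Sentence("Berlin and Munich are cities .")
embedding.embed(sentence)
for token in sentence:
    print(token.text, token.get_embedding().shape)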
import re

import numpy as np

import allennlp.commands.elmo
from flair import device


def get_Elmo_embeddings(vocab, dim):
    _embeddings = np.zeros([len(vocab), dim])
    options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
    weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"

    # map flair's device string onto allennlp's cuda_device convention
    if re.fullmatch(r"cuda:[0-9]+", str(device)):
        cuda_device = int(str(device).split(":")[-1])
    elif str(device) == "cpu":
        cuda_device = -1
    else:
        cuda_device = 0

    elmo_embeddings = allennlp.commands.elmo.ElmoEmbedder(
        options_file=options_file, weight_file=weight_file, cuda_device=cuda_device
    )

    # treat the whole vocabulary as one "sentence" so every word gets a vector
    sentences_words = [list(vocab)]
    embeddings = elmo_embeddings.embed_batch(sentences_words)

    # embed_batch returns one (3, num_words, dim) array per sentence; average
    # the three ELMo layers to obtain one vector per word (the original snippet
    # ends before this reduction, so layer-averaging is an assumption here)
    for idx in range(len(vocab)):
        _embeddings[idx] = embeddings[0][:, idx, :].mean(axis=0)
    return _embeddings
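A hypothetical call to the function above; dim=256 matches the output size of the small 2x1024_128_2048cnn_1xhighway model whose URLs are hard-coded in it:

vocab = ["Berlin", "Munich", "puppeteer"]
embedding_matrix = get_Elmo_embeddings(vocab, dim=256)
print(embedding_matrix.shape)  # (3, 256)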
    def _labels_to_indices(self, sentences: List[Sentence]):
        # one float value per sentence label, concatenated into a single
        # 1-D target tensor on the flair device
        indices = [
            torch.tensor(
                [float(label.value) for label in sentence.labels], dtype=torch.float
            )
            for sentence in sentences
        ]
        vec = torch.cat(indices, 0).to(flair.device)
        return vec
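The same reduction can be sketched with plain tensors (hypothetical label values): every sentence contributes its float label values, and the results are concatenated into one flat target vector.

import torch

per_sentence_labels = [[0.5], [1.0, 2.0]]  # hypothetical label values
vec = torch.cat(
    [torch.tensor(values, dtype=torch.float) for values in per_sentence_labels], 0
)
print(vec)  # tensor([0.5000, 1.0000, 2.0000])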
    def set_embedding(self, name: str, vector: torch.Tensor):
        device = flair.device
        # with "cpu" storage mode, keep new vectors on the same device as the
        # embeddings already stored (i.e. on the CPU)
        if (flair.embedding_storage_mode == "cpu") and len(self._embeddings.keys()) > 0:
            device = next(iter(self._embeddings.values())).device
        if device != vector.device:
            vector = vector.to(device)
        self._embeddings[name] = vector
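A usage sketch, assuming this is the set_embedding method on flair's data points: attach an arbitrary named vector to a token and read it back via get_embedding.

import torch
from flair.data import Sentence

sentence = Sentence("hello world")
sentence.tokens[0].set_embedding("my_custom_embedding", torch.rand(8))  # name is arbitrary
print(sentence.tokens[0].get_embedding().shape)  # torch.Size([8])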
    def __init__(
        self,
        source_embeddings: Embeddings,
        target_embeddings: Embeddings,
        similarity_measure: SimilarityMeasure,
        similarity_loss: SimilarityLoss,
        eval_device=flair.device,
        source_mapping: torch.nn.Module = None,
        target_mapping: torch.nn.Module = None,
        recall_at_points: List[int] = [1, 5, 10, 20],
        recall_at_points_weights: List[float] = [0.4, 0.3, 0.2, 0.1],
        interleave_embedding_updates: bool = False,
    ):
        super(SimilarityLearner, self).__init__()
        self.source_embeddings: Embeddings = source_embeddings
        self.target_embeddings: Embeddings = target_embeddings
        # optional trainable projections from each embedding space into the
        # shared similarity space (None means the embeddings are used directly)
        self.source_mapping: torch.nn.Module = source_mapping
        self.target_mapping: torch.nn.Module = target_mapping
        self.similarity_measure: SimilarityMeasure = similarity_measure
        self.similarity_loss: SimilarityLoss = similarity_loss
        self.eval_device = eval_device
        # recall@k cut-offs and their weights for the aggregated evaluation score
        self.recall_at_points: List[int] = recall_at_points
        self.recall_at_points_weights: List[float] = recall_at_points_weights
        self.interleave_embedding_updates = interleave_embedding_updates
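A construction sketch, assuming flair's CosineSimilarity measure and RankingLoss (defined alongside SimilarityLearner in the flair source) and a simple document-level embedding; any pair of Embeddings instances works:

from flair.embeddings import DocumentPoolEmbeddings, WordEmbeddings
from flair.models.similarity_learning_model import (
    CosineSimilarity,
    RankingLoss,
    SimilarityLearner,
)

embeddings = DocumentPoolEmbeddings([WordEmbeddings("glove")])
learner = SimilarityLearner(
    source_embeddings=embeddings,
    target_embeddings=embeddings,
    similarity_measure=CosineSimilarity(),
    similarity_loss=RankingLoss(),
)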
        # tokens_char_indices holds one list of character indices per token;
        # sort by decreasing length, as pack_padded_sequence requires
        tokens_sorted_by_length = sorted(
            tokens_char_indices, key=lambda p: len(p), reverse=True
        )
        # remember each sorted token's original position in the sentence
        d = {}
        for i, ci in enumerate(tokens_char_indices):
            for j, cj in enumerate(tokens_sorted_by_length):
                if ci == cj:
                    d[j] = i
                    continue
        chars2_length = [len(c) for c in tokens_sorted_by_length]
        longest_token_in_sentence = max(chars2_length)
        tokens_mask = torch.zeros(
            (len(tokens_sorted_by_length), longest_token_in_sentence),
            dtype=torch.long,
            device=flair.device,
        )
        for i, c in enumerate(tokens_sorted_by_length):
            tokens_mask[i, : chars2_length[i]] = torch.tensor(
                c, dtype=torch.long, device=flair.device
            )

        # chars for rnn processing
        chars = tokens_mask

        character_embeddings = self.char_embedding(chars).transpose(0, 1)

        # pack the padded batch so the LSTM skips the padding positions
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            character_embeddings, chars2_length
        )

        lstm_out, self.hidden = self.char_rnn(packed)

        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(lstm_out)
        outputs = outputs.transpose(0, 1)
        # take the last LSTM output of each token as its character embedding
        # (the original snippet is truncated at this call; shape completed here)
        chars_embeds_temp = torch.zeros(
            (outputs.size(0), outputs.size(2)),
            dtype=torch.float,
            device=flair.device,
        )
        for i, index in enumerate(output_lengths):
            chars_embeds_temp[i] = outputs[i, index - 1]
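To see why the sequences are packed before the LSTM and padded again afterwards, here is a toy round-trip in plain PyTorch (hypothetical sizes, unrelated to flair's actual dimensions):

import torch

embedding = torch.nn.Embedding(10, 4)
lstm = torch.nn.LSTM(4, 3)

batch = torch.tensor([[1, 2, 3], [4, 5, 0]])  # padded index rows
lengths = [3, 2]                              # sorted in decreasing order

x = embedding(batch).transpose(0, 1)          # to (time, batch, dim)
packed = torch.nn.utils.rnn.pack_padded_sequence(x, lengths)
out, _ = lstm(packed)
unpacked, out_lengths = torch.nn.utils.rnn.pad_packed_sequence(out)
print(unpacked.shape, out_lengths)            # torch.Size([3, 2, 3]) tensor([3, 2])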
        # read an optional per-sentence weight from the first token's "weight"
        # tag, defaulting to 1.0 (assumes an enclosing loop over `sentences`;
        # the weight_list initialization is restored around the fragment)
        weight_list = []
        for sentence in sentences:
            first_token_tags = sentence.tokens[0].tags
            if "weight" in first_token_tags:
                weight_list.append(float(first_token_tags["weight"].value))
            else:
                weight_list.append(1.0)

        if self.use_crf:
            # pad tags if using batch-CRF decoder
            tags, _ = pad_tensors(tag_list)

            forward_score = self._forward_alg(features, lengths)
            gold_score = self._score_sentence(features, tags, lengths)

            score = forward_score - gold_score

            # scale each sentence's loss by its weight before averaging
            weight_list = torch.tensor(weight_list, device=flair.device)
            score = score * weight_list
            return score.mean()

        else:
            score = 0
            for sentence_feats, sentence_tags, sentence_length in zip(
                features, tag_list, lengths
            ):
                sentence_feats = sentence_feats[:sentence_length]
                score += torch.nn.functional.cross_entropy(
                    sentence_feats, sentence_tags
                )
            score /= len(features)
            return score
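The weighting itself is just element-wise scaling of the per-sentence scores before averaging; a toy illustration with hypothetical numbers:

import torch

scores = torch.tensor([2.0, 4.0, 6.0])   # per-sentence losses
weights = torch.tensor([1.0, 0.5, 2.0])  # per-sentence weights
print((scores * weights).mean())         # tensor(5.3333)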
    @classmethod
    def load(cls, model: Union[str, Path]):
        """
        Loads the model from the given file.
        :param model: the model file, or the name of a pre-trained model
        :return: the loaded text classifier model
        """
        model_file = cls._fetch_model(str(model))

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            # load_big_file is a workaround by https://github.com/highway11git to load models on some Mac/Windows setups
            # see https://github.com/zalandoresearch/flair/issues/351
            f = flair.file_utils.load_big_file(str(model_file))
            state = torch.load(f, map_location=flair.device)

        model = cls._init_model_with_state_dict(state)

        model.eval()
        model.to(flair.device)

        return model
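This classmethod backs the usual one-line loading of pre-trained flair models; a usage sketch (the model name "ner" is resolved by _fetch_model to a downloadable file):

from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load("ner")
sentence = Sentence("George Washington went to Washington .")
tagger.predict(sentence)
print(sentence.to_tagged_string())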