layers=layers,
pooling_operation=pooling_operation,
subword_start_idx=offset,
subword_end_idx=offset + len_subwords,
use_scalar_mix=use_scalar_mix,
)
offset += len_subwords
final_subtoken_embedding = torch.cat(subtoken_embeddings)
token.set_embedding(name, final_subtoken_embedding)
return sentences
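# The fragment above concatenates per-subword vectors into a single token embedding.
# Below is a minimal, self-contained sketch of the usual subword pooling choices
# ('first', 'last', 'first_last', 'mean'); an illustrative helper, not flair's internal code.
import torch

def pool_subwords(subword_embeddings: torch.Tensor, pooling_operation: str = "first_last") -> torch.Tensor:
    """Reduce a (num_subwords, hidden_size) tensor to one token vector."""
    if pooling_operation == "first":
        return subword_embeddings[0]
    if pooling_operation == "last":
        return subword_embeddings[-1]
    if pooling_operation == "first_last":
        # concatenation doubles the embedding width
        return torch.cat([subword_embeddings[0], subword_embeddings[-1]])
    if pooling_operation == "mean":
        return subword_embeddings.mean(dim=0)
    raise ValueError(f"unknown pooling operation: {pooling_operation}")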
class TransformerXLEmbeddings(TokenEmbeddings):
def __init__(
self,
pretrained_model_name_or_path: str = "transfo-xl-wt103",
layers: str = "1,2,3",
use_scalar_mix: bool = False,
):
"""Transformer-XL embeddings, as proposed in Dai et al., 2019.
:param pretrained_model_name_or_path: name or path of Transformer-XL model
:param layers: comma-separated list of layers
:param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
"""
super().__init__()
self.tokenizer = TransfoXLTokenizer.from_pretrained(
pretrained_model_name_or_path
)
if not self.fine_tune:
embedding = embedding.detach()
token.set_embedding(self.name, embedding.clone())
all_hidden_states_in_lm = all_hidden_states_in_lm.detach()
del all_hidden_states_in_lm
return sentences
def __str__(self):
return self.name
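# Usage sketch for TransformerXLEmbeddings, assuming the flair Sentence/embed API of the
# release these snippets come from:
from flair.data import Sentence
from flair.embeddings import TransformerXLEmbeddings

txl_embeddings = TransformerXLEmbeddings("transfo-xl-wt103", layers="1,2,3")
sentence = Sentence("The grass is green .")
txl_embeddings.embed(sentence)

for token in sentence:
    # each token now carries the concatenated representations of layers 1-3
    print(token.text, token.get_embedding().shape)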
class PooledFlairEmbeddings(TokenEmbeddings):
def __init__(
self,
contextual_embeddings: Union[str, FlairEmbeddings],
pooling: str = "min",
only_capitalized: bool = False,
**kwargs,
):
super().__init__()
# use the character language model embeddings as basis
if type(contextual_embeddings) is str:
self.context_embeddings: FlairEmbeddings = FlairEmbeddings(
contextual_embeddings, **kwargs
)
else:
self.context_embeddings: FlairEmbeddings = contextual_embeddings
name=self.name,
layers=self.layers,
pooling_operation=self.pooling_operation,
use_scalar_mix=self.use_scalar_mix,
)
return sentences
def extra_repr(self):
return "model={}".format(self.name)
def __str__(self):
return self.name
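# Usage sketch for PooledFlairEmbeddings: it keeps a memory of every contextual instance of a
# word and pools them (here with 'min'), so embeddings evolve as more text is embedded.
# Assumes the standard flair Sentence/embed API:
from flair.data import Sentence
from flair.embeddings import PooledFlairEmbeddings

pooled_embeddings = PooledFlairEmbeddings("news-forward", pooling="min")
for text in ["Berlin is a city .", "Berlin was divided ."]:
    sentence = Sentence(text)
    pooled_embeddings.embed(sentence)  # the memory for "Berlin" is updated on each call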
class OpenAIGPT2Embeddings(TokenEmbeddings):
def __init__(
self,
pretrained_model_name_or_path: str = "gpt2-medium",
layers: str = "1",
pooling_operation: str = "first_last",
use_scalar_mix: bool = False,
):
"""OpenAI GPT-2 embeddings, as proposed in Radford et al. 2019.
:param pretrained_model_name_or_path: name or path of OpenAI GPT-2 model
:param layers: comma-separated list of layers
:param pooling_operation: defines pooling operation for subwords
:param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
"""
super().__init__()
self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path)
token_idx += feature.token_subtoken_count[token.idx] - 1
return sentences
@property
@abstractmethod
def embedding_length(self) -> int:
"""Returns the length of the embedding vector."""
return (
len(self.layer_indexes) * self.model.config.hidden_size
if not self.use_scalar_mix
else self.model.config.hidden_size
)
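# The property above concatenates the selected layers unless scalar mix is used, in which case
# the layers are averaged into a single hidden_size-wide vector. A small worked example with
# hypothetical numbers (GPT-2 medium, for instance, uses hidden_size = 1024):
hidden_size = 1024
layer_indexes = [1, 2, 3]

length_without_scalar_mix = len(layer_indexes) * hidden_size  # 3072: layers concatenated
length_with_scalar_mix = hidden_size                          # 1024: layers mixed, width unchanged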
class CharLMEmbeddings(TokenEmbeddings):
"""Contextual string embeddings of words, as proposed in Akbik et al., 2018. """
@deprecated(version="0.4", reason="Use 'FlairEmbeddings' instead.")
def __init__(
self,
model: str,
detach: bool = True,
use_cache: bool = False,
cache_directory: Path = None,
):
"""
initializes contextual string embeddings using a character-level language model.
:param model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast',
'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward'
depending on which character language model is desired.
:param detach: if set to False, the gradient will propagate into the language model. This dramatically slows down
training and often leads to worse results, so not recommended.
elif not Path(embeddings).exists():
raise ValueError(
f'The given embeddings "{embeddings}" is not available or is not a valid path.'
)
self.name: str = str(embeddings)
self.static_embeddings = True
log.info("Reading embeddings from %s" % embeddings)
self.precomputed_word_embeddings = gensim.models.KeyedVectors.load_word2vec_format(
open_inside_zip(str(embeddings), cache_dir=cache_dir)
)
self.__embedding_length: int = self.precomputed_word_embeddings.vector_size
super(TokenEmbeddings, self).__init__()
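# Usage sketch for classic (non-contextual) word embeddings loaded as above; 'glove' is one of
# the identifiers flair resolves to a downloadable vector file, and a local path also works:
from flair.data import Sentence
from flair.embeddings import WordEmbeddings

glove = WordEmbeddings("glove")
sentence = Sentence("The grass is green .")
glove.embed(sentence)

for token in sentence:
    # vector_size of the loaded KeyedVectors, e.g. 100 for flair's GloVe vectors
    print(token.text, token.embedding.shape)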
self.word_embeddings[token.text] / self.word_count[token.text]
if self.pooling == "mean"
else self.word_embeddings[token.text]
)
else:
base = token._embeddings[self.context_embeddings.name]
token.set_embedding(self.name, base)
return sentences
@property
def embedding_length(self) -> int:
# the pooled embedding has the same width as the underlying contextual embedding
return self.context_embeddings.embedding_length
class BertEmbeddings(TokenEmbeddings):
def __init__(
self,
bert_model_or_path: str = "bert-base-uncased",
layers: str = "-1,-2,-3,-4",
pooling_operation: str = "first",
use_scalar_mix: bool = False,
):
"""
Bidirectional transformer embeddings of words, as proposed in Devlin et al., 2018.
:param bert_model_or_path: name of BERT model (e.g. 'bert-base-uncased') or directory path containing a custom model,
configuration file and vocab file (names of the three files should be config.json, pytorch_model.bin/model.chkpt and vocab.txt)
:param layers: string indicating which layers to take for embedding (e.g. '-1,-2,-3,-4')
:param pooling_operation: how to get from token piece embeddings to token embedding. Either pool them and take
the average ('mean') or use the first word piece embedding as token embedding ('first')
:param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
"""
super().__init__()
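# Usage sketch for BertEmbeddings: the last four layers are concatenated by default and the
# first word piece represents each token ('first' pooling). Assumes the flair Sentence API:
from flair.data import Sentence
from flair.embeddings import BertEmbeddings

bert_embeddings = BertEmbeddings("bert-base-uncased", layers="-1,-2,-3,-4", pooling_operation="first")
sentence = Sentence("The grass is green .")
bert_embeddings.embed(sentence)

for token in sentence:
    print(token.text, token.get_embedding().shape)  # 4 * 768 = 3072 without scalar mix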
columns = {0: 'text', 1: 'ner'}
corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_column_corpus("data1", columns,
train_file="train.txt",
test_file="test.txt",
dev_file="dev.txt")
print(corpus)
# 2. what tag do we want to predict?
tag_type = 'ner'
# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)
# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [
# WordEmbeddings('glove'),
# comment in this line to use character embeddings
# CharacterEmbeddings(),
# comment in these lines to use contextual string embeddings
# CharLMEmbeddings('news-forward'),
# CharLMEmbeddings('news-backward'),
]
embeddings = WordEmbeddings("glove.bin")
# 5. initialize sequence tagger
from flair.models import SequenceTagger
tagger: SequenceTagger = SequenceTagger(hidden_size=512,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)
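# 6. a minimal training sketch for the tagger built above, assuming flair's ModelTrainer API;
# the output directory is an arbitrary example path and the hyperparameters are illustrative:
from flair.trainers import ModelTrainer

trainer = ModelTrainer(tagger, corpus)
trainer.train("resources/taggers/example-ner",
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=10)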
if "spm_model_binary" in self.__dict__:
# if the model was saved as binary and it is not found on disk, write to appropriate path
if not os.path.exists(self.cache_dir / state["lang"]):
os.makedirs(self.cache_dir / state["lang"])
self.model_file = self.cache_dir / model_file
with open(self.model_file, "wb") as out:
out.write(self.__dict__["spm_model_binary"])
else:
# otherwise, use normal process and potentially trigger another download
self.model_file = self._load_file(model_file)
# once the model is there, load it with SentencePiece
state["spm"] = sentencepiece_load(self.model_file)
class MuseCrosslingualEmbeddings(TokenEmbeddings):
def __init__(self):
self.name: str = "muse-crosslingual"
self.static_embeddings = True
self.__embedding_length: int = 300
self.language_embeddings = {}
super().__init__()
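# Usage sketch for MuseCrosslingualEmbeddings: the 300-dimensional aligned MUSE vectors for a
# sentence's detected language are loaded lazily on first use. Assumes the flair Sentence API:
from flair.data import Sentence
from flair.embeddings import MuseCrosslingualEmbeddings

muse = MuseCrosslingualEmbeddings()
sentence_en = Sentence("The grass is green .")
sentence_de = Sentence("Das Gras ist grün .")
muse.embed(sentence_en)
muse.embed(sentence_de)  # English and German vectors share one embedding space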
def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
for i, sentence in enumerate(sentences):
language_code = sentence.get_language_code()
print(language_code)
supported = [
"en",
"de",
def train(
self, training_data: TrainingData, cfg: RasaNLUModelConfig, **kwargs: Any
) -> None:
corpus = self.convert_to_flair_format(training_data)
tag_type = "ner"
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
embedding_types: List[TokenEmbeddings] = []
if self.component_config["use_glove_embeddings"]:
embedding_types.append(WordEmbeddings("glove"))
if self.component_config["use_flair_embeddings"]:
embedding_types.append(FlairEmbeddings("news-forward"))
embedding_types.append(FlairEmbeddings("news-backward"))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
tagger: SequenceTagger = SequenceTagger(
hidden_size=self.component_config["hidden_size"],
embeddings=embeddings,
tag_dictionary=tag_dictionary,
tag_type=tag_type,
use_crf=True,
pooling_operation=self.pooling_operation,
use_scalar_mix=self.use_scalar_mix,
bos_token="<s>",
eos_token="</s>",
)
return sentences
def extra_repr(self):
return "model={}".format(self.name)
def __str__(self):
return self.name
class XLMEmbeddings(TokenEmbeddings):
def __init__(
self,
pretrained_model_name_or_path: str = "xlm-mlm-en-2048",
layers: str = "1",
pooling_operation: str = "first_last",
use_scalar_mix: bool = False,
):
"""
XLM embeddings, as proposed in Lample and Conneau, 2019.
:param pretrained_model_name_or_path: name or path of XLM model
:param layers: comma-separated list of layers
:param pooling_operation: defines pooling operation for subwords
:param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
"""
super().__init__()
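# Usage sketch for XLMEmbeddings, analogous to the GPT-2 and Transformer-XL classes above;
# assumes the flair Sentence/embed API:
from flair.data import Sentence
from flair.embeddings import XLMEmbeddings

xlm_embeddings = XLMEmbeddings("xlm-mlm-en-2048", layers="1", pooling_operation="first_last")
sentence = Sentence("The grass is green .")
xlm_embeddings.embed(sentence)

for token in sentence:
    # 'first_last' concatenates the first and last subword vectors, doubling the width
    print(token.text, token.get_embedding().shape)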