Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_flair_tagger(self):
    """Run the DaNLP flair NER tagger on one Danish sentence.

    Downloads the 'flair.ner' model into DEFAULT_CACHE_DIR, loads the tagger
    through load_flair_ner_model(), predicts on a single sentence and
    compares the tagged string with the expected output.
    """
    # Download model beforehand
    download_model('flair.ner', DEFAULT_CACHE_DIR, process_func=_unzip_process_func, verbose=True)
    print("Downloaded the flair model")

    # Load the NER tagger using the DaNLP wrapper
    flair_model = load_flair_ner_model()

    # Using the flair NER tagger
    # The Danish characters were mis-encoded ("pΓ₯", "rΓΈd"); restored to "på" / "rød".
    sentence = Sentence('jeg hopper på en bil som er rød sammen med Jens-Peter E. Hansen')
    flair_model.predict(sentence)

    # NOTE(review): this expected string contains no tag markers and ends with
    # a trailing space - presumably the tags were lost; confirm against the
    # real output of to_tagged_string() before relying on this assertion.
    expected_string = "jeg hopper på en bil som er rød sammen med Jens-Peter E. Hansen "

    self.assertEqual(sentence.to_tagged_string(), expected_string)
def test_flair_tagger(self):
    """Run the DaNLP flair POS tagger on one Danish sentence.

    Downloads the 'flair.pos' model into DEFAULT_CACHE_DIR, loads the tagger
    through load_flair_pos_model(), predicts on a single sentence and
    compares the tagged string with the expected output.
    """
    # Download model beforehand
    download_model('flair.pos', DEFAULT_CACHE_DIR, process_func=_unzip_process_func, verbose=True)
    print("Downloaded the flair model")

    # Load the POS tagger using the DaNLP wrapper
    flair_model = load_flair_pos_model()

    # Using the flair POS tagger
    # The Danish characters were mis-encoded ("pΓ₯", "rΓΈd"); restored to "på" / "rød".
    sentence = Sentence('jeg hopper på en bil som er rød sammen med Jens-Peter E. Hansen')
    flair_model.predict(sentence)

    # NOTE(review): this expected string contains no tag markers (note the
    # doubled space at the join and the trailing space) - presumably the tags
    # were lost; confirm against the real output of to_tagged_string().
    expected_string = (
        "jeg hopper på en bil som er "
        " rød sammen med Jens-Peter E. Hansen "
    )

    self.assertEqual(sentence.to_tagged_string(), expected_string)
def test_download(self):
    """Download the 'spacy' model and check the metadata spacy.info reports for it."""
    # Fetch the model first; the value returned by download_model is handed
    # straight to spacy.info below.
    spacy_model_location = download_model(
        'spacy',
        DEFAULT_CACHE_DIR,
        process_func=_unzip_process_func,
        verbose=True,
    )
    model_meta = spacy.info(spacy_model_location)

    # The pipeline components and the language code must match exactly.
    self.assertListEqual(model_meta['pipeline'], ['tagger', 'parser', 'ner'])
    self.assertEqual(model_meta['lang'], 'da')
def load_wv_with_gensim(pretrained_embedding: str, cache_dir=DEFAULT_CACHE_DIR,
verbose: bool = False):
"""
Available wordembeddings:
- wiki.da.wv
- cc.da.wv
- conll17.da.wv
- news.da.wv
- sketchengine.da.wv
Available subwordembeddings:
- wiki.da.swv
- cc.da.swv
- sketchengine.da.swv
:param pretrained_embedding:
def _twitter_data_process_func(tmp_file_path: str, meta_info: dict,
                               cache_dir: str = DEFAULT_CACHE_DIR,
                               clean_up_raw_data: bool = True,
                               verbose: bool = True):
    """
    Extract the first member of a downloaded zip archive into cache_dir and
    read the tweet ids from the twitter sentiment CSV file.

    NOTE(review): only the beginning of this function appears to be visible
    here - ``twitter_api`` and ``twitter_ids`` are created but not used in the
    lines shown, so the remainder presumably consumes them.

    :param str tmp_file_path: path of the zip archive that is opened for reading
    :param dict meta_info: the keys 'name' and 'file_extension' are read from it
    :param str cache_dir: directory used for the extracted file and the CSV lookup
    :param bool clean_up_raw_data: not referenced in the visible part
    :param bool verbose: not referenced in the visible part
    """
    from zipfile import ZipFile
    # Connection object for the Twitter API; not used in the visible part
    twitter_api = construct_twitter_api_connection()
    model_name = meta_info['name']
    # Target path: <cache_dir>/<name><file_extension>
    full_path = os.path.join(cache_dir, model_name) + meta_info['file_extension']
    # NOTE(review): the original indentation is lost; the with-block is assumed
    # to cover only the two extraction lines - confirm against the upstream file.
    with ZipFile(tmp_file_path, 'r') as zip_file: # Extract files to cache_dir
        file_list = zip_file.namelist()
        # Only the first archive member is extracted
        extract_single_file_from_zip(cache_dir, file_list[0], full_path, zip_file)
    # The CSV is looked up under the fixed name 'twitter.sentiment.csv' in cache_dir
    file_path = os.path.join(cache_dir, 'twitter.sentiment' + '.csv')
    df = pd.read_csv(file_path)
    # Values of the 'twitterid' column as a plain list
    twitter_ids = list(df['twitterid'])
def load_pytorch_embedding_layer(pretrained_embedding: str,
                                 cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """
    Build a PyTorch embedding layer from pretrained word vectors.

    :param pretrained_embedding: name of the pretrained word embedding to load
    :param cache_dir: the directory for storing cached models
    :param verbose: forwarded to load_wv_with_gensim
    :return: a pytorch Embedding module and a list id2word
    """
    # Check the requested embedding before anything is loaded; subword
    # embeddings are not allowed here (can_use_subword=False).
    word_embeddings_available(pretrained_embedding, can_use_subword=False)

    # torch is imported lazily, only when this function is actually called.
    import torch
    from torch.nn import Embedding

    keyed_vectors = load_wv_with_gensim(
        pretrained_embedding,
        cache_dir=cache_dir,
        verbose=verbose,
    )
    embedding_matrix = torch.FloatTensor(keyed_vectors.vectors)
    embedding_layer = Embedding.from_pretrained(embedding_matrix)
    id2word = keyed_vectors.index2word
    return embedding_layer, id2word
def _process_embeddings_for_spacy(tmp_file_path: str, meta_info: dict,
                                  cache_dir: str = DEFAULT_CACHE_DIR,
                                  clean_up_raw_data: bool = True,
                                  verbose: bool = False):
    """
    Convert pretrained embeddings into a spaCy model.

    To use pretrained embeddings with spaCy, the embeddings need to be stored
    in a specific format. This function converts embeddings saved in the
    binary word2vec format to a spaCy model with the init_model() function
    from spaCy. The generated files will be saved in the cache_dir under a
    folder called .spacy

    More information on converting pretrained word embeddings to spaCy models here:
    https://spacy.io/usage/vectors-similarity#custom

    NOTE(review): only the signature and docstring are visible here; the
    parameters below marked "confirm" are not described by the original text.

    :param str tmp_file_path: the file name of the embedding binary file
    :param dict meta_info: presumably metadata of the downloaded embedding - confirm
    :param str cache_dir: the directory for storing cached data
    :param bool clean_up_raw_data: presumably whether raw files are removed afterwards - confirm
    :param bool verbose:
    """
def load_flair_pos_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """
    Download the 'flair.pos' model and load it as a flair SequenceTagger.

    :param cache_dir: directory passed to download_model for the model files
    :param verbose: passed on to download_model
    :return: the SequenceTagger loaded from the downloaded weights
    """
    # flair is imported lazily, only when this loader is actually called.
    from flair.models import SequenceTagger

    weights_location = download_model(
        'flair.pos',
        cache_dir,
        process_func=_unzip_process_func,
        verbose=verbose,
    )
    # Hand the path returned by download_model to flair.
    return SequenceTagger.load(weights_location)
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
    """
    Download the two LCC sentiment datasets and record where their files live.

    For each of 'lcc1.sentiment' and 'lcc2.sentiment' the instance stores the
    dataset name, its file extension (from DATASETS), the directory returned
    by download_dataset and the resulting file path.

    :param str cache_dir: directory passed to download_dataset
    """
    # First dataset
    first_name = 'lcc1.sentiment'
    self.dataset_name1 = first_name
    self.file_extension1 = DATASETS[first_name]['file_extension']
    self.dataset_dir1 = download_dataset(first_name, cache_dir=cache_dir)
    first_file = first_name + self.file_extension1
    self.file_path1 = os.path.join(self.dataset_dir1, first_file)

    # Second dataset, handled exactly like the first
    second_name = 'lcc2.sentiment'
    self.dataset_name2 = second_name
    self.file_extension2 = DATASETS[second_name]['file_extension']
    self.dataset_dir2 = download_dataset(second_name, cache_dir=cache_dir)
    second_file = second_name + self.file_extension2
    self.file_path2 = os.path.join(self.dataset_dir2, second_file)
def load_bert_ner_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """
    Create a BertNer instance.

    Thin wrapper so that every model in danlp is loaded through a
    ``load_*`` function with the same calling convention.

    :param cache_dir: passed to BertNer as its first positional argument
    :param verbose: passed to BertNer as its second positional argument
    :return: the constructed BertNer object
    """
    bert_ner = BertNer(cache_dir, verbose)
    return bert_ner