Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _download_wassa_if_not_there(emotion, data_folder, dataset_name):
for split in ["train", "dev", "test"]:
data_file = data_folder / f"{emotion}-{split}.txt"
if not data_file.is_file():
if split == "train":
url = f"http://saifmohammad.com/WebDocs/EmoInt%20Train%20Data/{emotion}-ratings-0to1.train.txt"
if split == "dev":
url = f"http://saifmohammad.com/WebDocs/EmoInt%20Dev%20Data%20With%20Gold/{emotion}-ratings-0to1.dev.gold.txt"
if split == "test":
url = f"http://saifmohammad.com/WebDocs/EmoInt%20Test%20Gold%20Data/{emotion}-ratings-0to1.test.gold.txt"
path = cached_path(url, Path("datasets") / dataset_name)
with open(path, "r") as f:
with open(data_file, "w") as out:
next(f)
for line in f:
fields = line.split("\t")
out.write(f"__label__{fields[3].rstrip()} {fields[1]}\n")
os.remove(path)
model = cached_path(base_path, cache_dir=cache_dir)
# mix-german-forward
elif model.lower() == "german-forward" or model.lower() == "de-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-forward-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# mix-german-backward
elif model.lower() == "german-backward" or model.lower() == "de-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-backward-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# common crawl Polish forward
elif model.lower() == "polish-forward" or model.lower() == "pl-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-forward-v0.2.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# common crawl Polish backward
elif model.lower() == "polish-backward" or model.lower() == "pl-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-backward-v0.2.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Slovenian forward
elif model.lower() == "slovenian-forward" or model.lower() == "sl-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-sl-large-forward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Slovenian backward
elif model.lower() == "slovenian-backward" or model.lower() == "sl-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-sl-large-backward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Bulgarian forward
model = cached_path(base_path, cache_dir=cache_dir)
# mix-english-forward
elif model.lower() == "mix-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-forward-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# mix-english-backward
elif model.lower() == "mix-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-backward-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# mix-german-forward
elif model.lower() == "german-forward" or model.lower() == "de-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-forward-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# mix-german-backward
elif model.lower() == "german-backward" or model.lower() == "de-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-backward-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# common crawl Polish forward
elif model.lower() == "polish-forward" or model.lower() == "pl-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-forward-v0.2.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# common crawl Polish backward
elif model.lower() == "polish-backward" or model.lower() == "pl-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-backward-v0.2.pt"
model = cached_path(base_path, cache_dir=cache_dir)
model = cached_path(base_path, cache_dir=cache_dir)
# mix-german-backward
elif model.lower() == "german-backward" or model.lower() == "de-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-backward-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# common crawl Polish forward
elif model.lower() == "polish-forward" or model.lower() == "pl-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-forward-v0.2.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# common crawl Polish backward
elif model.lower() == "polish-backward" or model.lower() == "pl-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-backward-v0.2.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Slovenian forward
elif model.lower() == "slovenian-forward" or model.lower() == "sl-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-sl-large-forward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Slovenian backward
elif model.lower() == "slovenian-backward" or model.lower() == "sl-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-sl-large-backward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Bulgarian forward
elif model.lower() == "bulgarian-forward" or model.lower() == "bg-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-bg-small-forward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Bulgarian backward
elif model.lower() == "bulgarian-backward" or model.lower() == "bg-backward":
not allow re-use of once computed embeddings that do not fit into memory
:param cache_directory: if cache_directory is not set, the cache will be written to ~/.flair/embeddings. otherwise the cache
is written to the provided directory.
"""
super().__init__()
cache_dir = Path("embeddings")
# multilingual forward (English, German, French, Italian, Dutch, Polish)
if model.lower() == "multi-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-multi-forward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# multilingual backward (English, German, French, Italian, Dutch, Polish)
elif model.lower() == "multi-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-multi-backward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# news-english-forward
elif model.lower() == "news-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# news-english-backward
elif model.lower() == "news-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# news-english-forward
elif model.lower() == "news-forward-fast":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
old_base_path = (
"https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/"
)
base_path = (
"https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/"
)
embeddings_path_v4 = (
"https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/"
)
embeddings_path_v4_1 = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4.1/"
cache_dir = Path("embeddings")
# GLOVE embeddings
if embeddings.lower() == "glove" or embeddings.lower() == "en-glove":
cached_path(f"{old_base_path}glove.gensim.vectors.npy", cache_dir=cache_dir)
embeddings = cached_path(
f"{old_base_path}glove.gensim", cache_dir=cache_dir
)
# TURIAN embeddings
elif embeddings.lower() == "turian" or embeddings.lower() == "en-turian":
cached_path(
f"{embeddings_path_v4_1}turian.vectors.npy", cache_dir=cache_dir
)
embeddings = cached_path(
f"{embeddings_path_v4_1}turian", cache_dir=cache_dir
)
# KOMNINOS embeddings
elif embeddings.lower() == "extvec" or embeddings.lower() == "en-extvec":
cached_path(
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-fr-charlm-backward.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Czech forward
elif model.lower() == "czech-forward" or model.lower() == "cs-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-cs-large-forward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Czech backward
elif model.lower() == "czech-backward" or model.lower() == "cs-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-cs-large-backward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Portuguese forward
elif model.lower() == "portuguese-forward" or model.lower() == "pt-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-pt-forward.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Portuguese backward
elif model.lower() == "portuguese-backward" or model.lower() == "pt-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-pt-backward.pt"
model = cached_path(base_path, cache_dir=cache_dir)
elif not Path(model).exists():
raise ValueError(
f'The given model "{model}" is not available or is not a valid path.'
)
self.name = str(model)
self.static_embeddings = detach
from flair.models import LanguageModel
self.lm = LanguageModel.load_language_model(model)
if type(base_path) == str:
base_path: Path = Path(base_path)
# this dataset name
dataset_name = self.__class__.__name__.lower()
# default dataset folder is the cache root
if not base_path:
base_path = Path(flair.cache_root) / "datasets"
data_folder = base_path / dataset_name
# download data if necessary
ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Slovak-SNK/master"
cached_path(f"{ud_path}/sk_snk-ud-dev.conllu", Path("datasets") / dataset_name)
cached_path(f"{ud_path}/sk_snk-ud-test.conllu", Path("datasets") / dataset_name)
cached_path(
f"{ud_path}/sk_snk-ud-train.conllu", Path("datasets") / dataset_name
)
super(UD_SLOVAK, self).__init__(data_folder, in_memory=in_memory)
model = cached_path(base_path, cache_dir=cache_dir)
# news-english-forward
elif model.lower() == "news-forward-fast":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# news-english-backward
elif model.lower() == "news-backward-fast":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# mix-english-forward
elif model.lower() == "mix-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-forward-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# mix-english-backward
elif model.lower() == "mix-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-backward-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# mix-german-forward
elif model.lower() == "german-forward" or model.lower() == "de-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-forward-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# mix-german-backward
elif model.lower() == "german-backward" or model.lower() == "de-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-backward-v0.2rc.pt"
model = cached_path(base_path, cache_dir=cache_dir)
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-forward-v0.2.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# common crawl Polish backward
elif model.lower() == "polish-backward" or model.lower() == "pl-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-backward-v0.2.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Slovenian forward
elif model.lower() == "slovenian-forward" or model.lower() == "sl-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-sl-large-forward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Slovenian backward
elif model.lower() == "slovenian-backward" or model.lower() == "sl-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-sl-large-backward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Bulgarian forward
elif model.lower() == "bulgarian-forward" or model.lower() == "bg-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-bg-small-forward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Bulgarian backward
elif model.lower() == "bulgarian-backward" or model.lower() == "bg-backward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-bg-small-backward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Dutch forward
elif model.lower() == "dutch-forward" or model.lower() == "nl-forward":
base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-nl-large-forward-v0.1.pt"
model = cached_path(base_path, cache_dir=cache_dir)
# Dutch backward
elif model.lower() == "dutch-backward" or model.lower() == "nl-backward":