Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
padding embedding can be looked up with "bpemb['<pad>']", or
directly accessed with "bpemb.vectors[-1]".
vs_fallback: ``bool'', optional (default = True)
Vocabulary size fallback. Not all vocabulary sizes are available
for all languages. For example, vs=1000 is not available for
Chinese due to the large number of characters.
When set to True, this option enables an automatic fallback to
the closest available vocabulary size. For example,
when selecting BPEmb("Chinese", vs=1000, vs_fallback=True),
the actual vocabulary size would be 10000.
"""
# Root URL under https://nlp.h-its.org/bpemb/.
# NOTE(review): presumably the relative templates below are appended to this
# when downloading files — confirm at the use site.
base_url = "https://nlp.h-its.org/bpemb/"
# Relative path template filled in with `lang`, `vs` and `dim`
# (a ".w2v.bin" file; presumably the embedding file, judging by the name).
emb_tpl = "{lang}/{lang}.wiki.bpe.vs{vs}.d{dim}.w2v.bin"
# Relative path template filled in with `lang` and `vs`
# (a ".model" file; presumably the segmentation model — confirm).
model_tpl = "{lang}/{lang}.wiki.bpe.vs{vs}.model"
# File-name suffix for archives; how it is combined with the templates is
# not visible in this excerpt.
archive_suffix = ".tar.gz"
# Alias for the module-level `wikicode` collection, which `_get_lang` also
# uses for its membership test.
available_languages = wikicode
# Constructor. The bare `*` makes every argument keyword-only, so callers
# must write e.g. BPEmb(lang="en", vs=10000).
# NOTE(review): this excerpt looks truncated — only the language resolution
# and the start of the multilingual special case are visible here.
def __init__(
self,
*,
lang: str,
vs: int = 10000,
dim: int = 100,
cache_dir: Path = Path.home() / Path(".cache/bpemb"),
preprocess: bool = True,
encode_extra_options: str = None,
add_pad_emb: bool = False,
vs_fallback: bool = True):
# Pass the identifier through BPEmb._get_lang, then store the result on the
# instance and rebind the local `lang` to it.
self.lang = lang = BPEmb._get_lang(lang)
# 'multi' is the value _get_lang returns for 'multi' / 'multilingual'.
if self.lang == 'multi':
if dim != 300:
# NOTE(review): the message announces dim=300, but no assignment to `dim`
# is visible in this excerpt — confirm the override actually happens.
print('Setting dim=300 for multilingual BPEmb')
def _get_lang(lang):
    """Resolve a language identifier to the code used internally.

    Resolution order:

    1. ``'multi'`` or ``'multilingual'`` resolve to ``'multi'``.
    2. An identifier already contained in ``wikicode`` is returned unchanged.
    3. Anything else is looked up in the ``to_wikicode`` mapping.

    Parameters
    ----------
    lang:
        The language identifier to resolve.

    Returns
    -------
    The resolved identifier: ``'multi'``, ``lang`` itself, or the value
    stored for ``lang`` in ``to_wikicode``.

    Raises
    ------
    ValueError
        If ``lang`` matches none of the cases above.
    """
    if lang in {'multi', 'multilingual'}:
        return 'multi'
    if lang in wikicode:
        return lang
    try:
        return to_wikicode[lang]
    except KeyError:
        # Only a missing key means "unknown language". A bare ``except``
        # would also swallow KeyboardInterrupt, SystemExit and genuine bugs.
        # ``str(lang)`` keeps the error a ValueError even for non-string
        # identifiers, and ``from None`` hides the uninformative KeyError.
        raise ValueError(
            "Unknown language identifier: " + str(lang)) from None