Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
*,
lang: str,
vs: int = 10000,
dim: int = 100,
cache_dir: Path = Path.home() / Path(".cache/bpemb"),
preprocess: bool = True,
encode_extra_options: str = None,
add_pad_emb: bool = False,
vs_fallback: bool = True):
self.lang = lang = BPEmb._get_lang(lang)
if self.lang == 'multi':
if dim != 300:
print('Setting dim=300 for multilingual BPEmb')
dim = 300
if vs_fallback:
available = BPEmb.available_vocab_sizes(lang)
if not available:
raise ValueError("No BPEmb models for language " + lang)
if vs not in available:
available = sorted(available)
_vs = vs
if vs < available[0]:
vs = available[0]
else:
vs = available[-1]
print("BPEmb fallback: {} from vocab size {} to {}".format(lang, _vs, vs))
self.vocab_size = self.vs = vs
self.dim = dim
self.cache_dir = Path(cache_dir)
model_file = self.model_tpl.format(lang=lang, vs=vs)
self.model_file = self._load_file(model_file)
self.spm = sentencepiece_load(self.model_file)