# NOTE(review): non-code banner text from a scraped snippet page, preserved verbatim:
# "Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately."
# It is not part of this module and should be removed once the file is reconstructed.
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
segments = segment(text, custom_dict)
elif engine == "deepcut": # deepcut can optionally use dictionary
from .deepcut import segment
if custom_dict:
custom_dict = list(custom_dict)
segments = segment(text, custom_dict)
else:
segments = segment(text)
elif engine == "icu":
from .pyicu import segment
segments = segment(text)
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
It might be a typo; if not, please consult our document."""
)
if not keep_whitespace:
segments = [token.strip(" ") for token in segments if token.strip(" ")]
return segments
if engine == "newmm" or engine == "onecut":
from .newmm import segment
segments = segment(text, custom_dict)
elif engine == "newmm-safe":
from .newmm import segment
segments = segment(text, custom_dict, safe_mode=True)
elif engine == "attacut":
from .attacut import segment
segments = segment(text)
elif engine == "longest":
from .longest import segment
segments = segment(text, custom_dict)
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
segments = segment(text, custom_dict)
elif engine == "deepcut": # deepcut can optionally use dictionary
from .deepcut import segment
if custom_dict:
custom_dict = list(custom_dict)
segments = segment(text, custom_dict)
else:
segments = segment(text)
elif engine == "icu":
from .pyicu import segment
segments = segment(text)
# ['ชินโซ', ' ', 'อาเบะ',
# ' ', 'เกิด', ' ', '21', ' ', 'กันยายน']
"""
if not text or not isinstance(text, str):
return []
segments = []
if engine == "newmm" or engine == "onecut":
from .newmm import segment
segments = segment(text, custom_dict)
elif engine == "newmm-safe":
from .newmm import segment
segments = segment(text, custom_dict, safe_mode=True)
elif engine == "attacut":
from .attacut import segment
segments = segment(text)
elif engine == "longest":
from .longest import segment
segments = segment(text, custom_dict)
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
segments = segment(text, custom_dict)
elif engine == "deepcut": # deepcut can optionally use dictionary
from .deepcut import segment
if custom_dict:
return []
segments = []
if engine == "newmm" or engine == "onecut":
from .newmm import segment
segments = segment(text, custom_dict)
elif engine == "newmm-safe":
from .newmm import segment
segments = segment(text, custom_dict, safe_mode=True)
elif engine == "attacut":
from .attacut import segment
segments = segment(text)
elif engine == "longest":
from .longest import segment
segments = segment(text, custom_dict)
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
segments = segment(text, custom_dict)
elif engine == "deepcut": # deepcut can optionally use dictionary
from .deepcut import segment
if custom_dict:
custom_dict = list(custom_dict)
segments = segment(text, custom_dict)
else:
segments = segment(text)
segments = segment(text)
elif engine == "longest":
from .longest import segment
segments = segment(text, custom_dict)
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
segments = segment(text, custom_dict)
elif engine == "deepcut": # deepcut can optionally use dictionary
from .deepcut import segment
if custom_dict:
custom_dict = list(custom_dict)
segments = segment(text, custom_dict)
else:
segments = segment(text)
elif engine == "icu":
from .pyicu import segment
segments = segment(text)
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
It might be a typo; if not, please consult our document."""
)
if not keep_whitespace:
segments = [token.strip(" ") for token in segments if token.strip(" ")]
return segments
elif engine == "longest":
from .longest import segment
segments = segment(text, custom_dict)
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
segments = segment(text, custom_dict)
elif engine == "deepcut": # deepcut can optionally use dictionary
from .deepcut import segment
if custom_dict:
custom_dict = list(custom_dict)
segments = segment(text, custom_dict)
else:
segments = segment(text)
elif engine == "icu":
from .pyicu import segment
segments = segment(text)
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
It might be a typo; if not, please consult our document."""
)
if not keep_whitespace:
segments = [token.strip(" ") for token in segments if token.strip(" ")]
return segments