Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
self.grammar = None
self.lexicon = None
self.input_text_plugins = []
self.oov_provider_plugins = []
self.path_rewrite_plugins = []
self.buffers = []
self.header = None
self.read_system_dictionary(os.path.join(config.RESOURCEDIR, settings["systemDict"]))
"""
for p in settings["editConnectionPlugin"]:
p.set_up(self.grammar)
p.edit(self.grammar)
"""
self.read_character_definition(os.path.join(config.RESOURCEDIR, settings["characterDefinitionFile"]))
default_input_text_plugin = plugin.input_text.default_input_text_plugin.DefaultInputTextPlugin()
self.input_text_plugins = [default_input_text_plugin]
for p in self.input_text_plugins:
p.set_up()
simple_oov_plugin = plugin.oov.simple_oov_plugin.SimpleOovPlugin()
mecab_oov_plugin = plugin.oov.mecab_oov_plugin.MeCabOovPlugin()
self.oov_provider_plugins = [mecab_oov_plugin, simple_oov_plugin]
if not self.oov_provider_plugins:
raise AttributeError("no OOV provider")
for p in self.oov_provider_plugins:
p.set_up(self.grammar)
join_numeric_plugin = plugin.path_rewrite.join_numeric_plugin.JoinNumericPlugin()
join_katakana_oov_plugin = plugin.path_rewrite.join_katakana_oov_plugin.JoinKatakanaOovPlugin()
def __init__(self, settings):
    """Load the dictionary resources named in ``settings`` and create the default plugins.

    Args:
        settings: Mapping indexed with the keys ``"systemDict"`` and
            ``"characterDefinitionFile"``; each value is joined onto
            ``config.RESOURCEDIR`` to build the path that gets loaded.
    """
    # Initial, empty state.
    self.grammar = None
    self.lexicon = None
    self.input_text_plugins = []
    self.oov_provider_plugins = []
    self.path_rewrite_plugins = []
    self.buffers = []
    self.header = None

    self.read_system_dictionary(os.path.join(config.RESOURCEDIR, settings["systemDict"]))

    # Connection-editing plugins are disabled. The code is kept as real
    # comments rather than as a bare string literal, which would be
    # evaluated as a no-op expression statement:
    # for p in settings["editConnectionPlugin"]:
    #     p.set_up(self.grammar)
    #     p.edit(self.grammar)

    self.read_character_definition(os.path.join(config.RESOURCEDIR, settings["characterDefinitionFile"]))

    # Input-text plugins: only the default one is registered, then set up.
    default_input_text_plugin = plugin.input_text.default_input_text_plugin.DefaultInputTextPlugin()
    self.input_text_plugins = [default_input_text_plugin]
    for p in self.input_text_plugins:
        p.set_up()

    # OOV providers: the list places the MeCab provider ahead of the simple one.
    simple_oov_plugin = plugin.oov.simple_oov_plugin.SimpleOovPlugin()
    mecab_oov_plugin = plugin.oov.mecab_oov_plugin.MeCabOovPlugin()
    self.oov_provider_plugins = [mecab_oov_plugin, simple_oov_plugin]
def __init__(self, config_path=None, resource_dir=None):
config.settings.set_up(config_path, resource_dir)
self.grammar = None
self.lexicon = None
self.input_text_plugins = []
self.edit_connection_plugin = []
self.oov_provider_plugins = []
self.path_rewrite_plugins = []
self.dictionaries = []
self.header = None
self._read_system_dictionary(config.settings.system_dict_path())
# self.edit_connection_plugin = [InhibitConnectionPlugin()]
# for p in self.edit_connection_plugin:
# p.set_up(self.grammar)
# p.edit(self.grammar)
self._read_character_definition(config.settings.char_def_path())
self.input_text_plugins = get_input_text_plugins()
for p in self.input_text_plugins:
p.set_up()
self.oov_provider_plugins = get_oov_plugins()
if not self.oov_provider_plugins:
raise AttributeError("no OOV provider")
for p in self.oov_provider_plugins:
def main():
    """Build the command-line parser with its ``tokenize`` and ``build`` subcommands.

    The root parser only carries ``-v/--version``. The ``tokenize``
    subcommand registers ``_command_tokenize`` as its ``handler`` default
    together with its own ``print_usage``. The ``build`` subcommand gets its
    output/description options and a separate argument group for the
    mandatory named options.
    """
    root = argparse.ArgumentParser(description="Japanese Morphological Analyzer")
    commands = root.add_subparsers()

    # Options of the root parser.
    root.add_argument(
        "-v", "--version",
        action="version",
        version="%(prog)s v0.1.1",
    )

    # `tokenize` subcommand.
    tokenize_cmd = commands.add_parser(
        "tokenize",
        help="see `tokenize -h`",
        description="Japanese Morphological Analyze",
    )
    tokenize_cmd.add_argument(
        "-r",
        dest="fpath_setting",
        metavar="file",
        default=config.SETTINGFILE,
        help="the setting file in JSON format",
    )
    tokenize_cmd.add_argument(
        "-m",
        dest="mode",
        choices=["A", "B", "C"],
        default="C",
        help="the mode of splitting",
    )
    tokenize_cmd.add_argument("-o", dest="fpath_out", metavar="file", help="the output file")
    tokenize_cmd.add_argument("-a", action="store_true", help="print all of the fields")
    tokenize_cmd.add_argument("-d", action="store_true", help="print the debug information")
    # Everything left on the command line is collected as input files.
    tokenize_cmd.add_argument(
        "input_files",
        metavar="input file(s)",
        nargs=argparse.REMAINDER,
    )
    tokenize_cmd.set_defaults(
        handler=_command_tokenize,
        print_usage=tokenize_cmd.print_usage,
    )

    # `build` subcommand (dictionary builder).
    build_cmd = commands.add_parser(
        "build",
        help="see `build -h`",
        description="Build Sudachi Dictionary",
    )
    build_cmd.add_argument(
        "-o",
        dest="out_file",
        metavar="file",
        default="system.dic",
        help="output file (default: system.dic)",
    )
    build_cmd.add_argument(
        "-d",
        dest="description",
        default="",
        metavar="string",
        required=False,
        help="description comment to be embedded on dictionary",
    )
    # Mandatory options live in their own group so the help output lists them separately.
    build_required = build_cmd.add_argument_group("required named arguments")
    build_required.add_argument(
        "-m",
        dest="matrix_file",
        metavar="file",
        required=True,
        help="connection matrix file with MeCab's matrix.def format",
    )
def set_up(self, grammar):
    """Load ``char.def`` and ``unk.def`` from ``config.RESOURCEDIR``.

    ``char.def`` is handed to ``read_character_property`` first, then
    ``unk.def`` is handed to ``read_oov``.

    Args:
        grammar: Passed through unchanged to ``read_oov``.
    """
    # Both file names are non-empty literals, so os.path.join can never
    # return an empty path here. The former "... is not defined" checks on
    # the joined paths were unreachable and have been dropped.
    char_def = os.path.join(config.RESOURCEDIR, "char.def")
    self.read_character_property(char_def)

    unk_def = os.path.join(config.RESOURCEDIR, "unk.def")
    self.read_oov(unk_def, grammar)
def get_input_text_plugins() -> List[InputTextPlugin]:
    """Create the input-text plugins listed under ``inputTextPlugin`` in the settings.

    Returns:
        One plugin per settings entry, in settings order, each produced by
        ``get_input_text_plugin``; an empty list when the key is absent.
    """
    settings_key = 'inputTextPlugin'
    if settings_key not in config.settings:
        return []
    return [get_input_text_plugin(entry) for entry in config.settings[settings_key]]
def set_up(self, grammar: Grammar) -> None:
    """Copy the ``inhibitedPair`` setting, when present, into ``self._inhibit_pairs``.

    Args:
        grammar: Not used by this method.
    """
    if 'inhibitedPair' not in config.settings:
        # Nothing configured: leave self._inhibit_pairs untouched.
        return
    self._inhibit_pairs = config.settings['inhibitedPair']
def set_up(self, grammar: Grammar) -> None:
    """Copy the ``inhibitedPair`` setting, when present, into ``self._inhibit_pairs``.

    Args:
        grammar: Not used by this method.
    """
    if 'inhibitedPair' not in config.settings:
        # Nothing configured: leave self._inhibit_pairs untouched.
        return
    self._inhibit_pairs = config.settings['inhibitedPair']
def set_up(self, grammar):
    """Load the character-definition and unk-definition files from the resource directory.

    The character-definition file is read with ``read_character_property``
    first; the unk-definition file is then read with ``read_oov``.

    Args:
        grammar: Passed through unchanged to ``read_oov``.

    Raises:
        AttributeError: If the character-definition or unk-definition file
            name is empty or unset.
    """
    # Validate the configured file name itself, not the joined path:
    # os.path.join returns a falsy value only when every component is empty,
    # so a check on the joined path cannot detect a missing file name.
    if not self.__chardef_filename:
        raise AttributeError("charDef is not defined")
    char_def = os.path.join(config.settings.resource_dir, self.__chardef_filename)
    self.read_character_property(char_def)

    if not self.__unkdef_filename:
        raise AttributeError("unkDef is not defined")
    unk_def = os.path.join(config.settings.resource_dir, self.__unkdef_filename)
    self.read_oov(unk_def, grammar)
def get_path_rewrite_plugins() -> List[PathRewritePlugin]:
    """Create the path-rewrite plugins listed under ``pathRewritePlugin`` in the settings.

    Returns:
        One plugin per settings entry, in settings order, each produced by
        ``get_path_rewrite_plugin``; an empty list when the key is absent.
    """
    settings_key = 'pathRewritePlugin'
    if settings_key not in config.settings:
        return []
    return [get_path_rewrite_plugin(entry) for entry in config.settings[settings_key]]