def tag_sentence(self, sentence):
    """Tag a sentence with the Lapos model.

    TODO: Figure out how to pre-load the model (loading is really slow),
    or force users to bulk-convert files or strings.
    """
    fp_lapos = os.path.expanduser('~/cltk_data/multilingual/software/lapos')
    fp_model = os.path.expanduser('~/cltk_data/{0}/model/{1}_models_cltk/taggers/pos'.format(self.language, self.language))  # relative to the Lapos dir
    try:
        lapos_command = 'cd {0} && echo "{1}" | ./lapos -t -m {2}'.format(fp_lapos, sentence, fp_model)
        p_out = subprocess.check_output(lapos_command,
                                        shell=True,
                                        stderr=subprocess.STDOUT,
                                        universal_newlines=True)
    except subprocess.CalledProcessError as cp_err:
        logger.error('Lapos call failed. Check installation.')
        logger.error(sentence)
        print(cp_err)
        raise
    # Parse output from Lapos, dropping its status lines.
    # TODO: Make this cleaner/faster
    output_list = p_out.split('\n')
    output_list_filtered = [l for l in output_list if not l.startswith('loading the models')]
    output_list_filtered = [l for l in output_list_filtered if l != 'done']
    output_list_filtered = [l for l in output_list_filtered if l]
    tagged_sentence = []
    for line in output_list_filtered:
        word_tags = line.split(' ')
        for word_tag in word_tags:
            word, tag = word_tag.split('/')
            tagged_sentence.append((word, tag))  # collect (word, tag) pairs
    return tagged_sentence
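Since the tagger shells out to the Lapos binary, the output-parsing step is easiest to check in isolation. The sketch below reproduces the filtering and splitting logic above on a canned output string; the sample words, tags, and status lines are invented for illustration.

# Runnable sketch of the Lapos output parsing above, using a canned string
# in place of the subprocess call; the words and tags are invented.
sample_out = 'loading the models\ndone\narma/NOUN virumque/NOUN cano/VERB\n'
lines = sample_out.split('\n')
lines = [l for l in lines if l and not l.startswith('loading the models') and l != 'done']
tagged = []
for line in lines:
    for token in line.split(' '):
        word, tag = token.split('/')
        tagged.append((word, tag))
print(tagged)  # [('arma', 'NOUN'), ('virumque', 'NOUN'), ('cano', 'VERB')]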
def load_replacement_patterns(self):
    """Load the specified replacement dictionary (lemmata or synonyms), if available."""
    filename = self.dictionary + '.py'
    models = self.language + '_models_cltk'
    rel_path = os.path.join(get_cltk_data_dir(),
                            self.language,
                            'model',
                            models,
                            'semantics',
                            filename)
    path = os.path.expanduser(rel_path)
    logger.info('Loading lemmata or synonyms. This may take a minute.')
    # Execute the dictionary file as a throwaway module and pull out its DICTIONARY constant.
    loader = importlib.machinery.SourceFileLoader(filename, path)
    module = types.ModuleType(loader.name)
    loader.exec_module(module)
    return module.DICTIONARY
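The method above executes a plain .py file shipped with the language's models repo and returns its DICTIONARY constant. A minimal standalone sketch of the same SourceFileLoader pattern, with a hypothetical helper name and path, might look like this:

import importlib.machinery
import types

def load_dictionary_module(path):
    # Hypothetical helper mirroring the pattern above: execute the file at
    # `path` as a throwaway module and return its DICTIONARY constant.
    loader = importlib.machinery.SourceFileLoader('semantics_dict', path)
    module = types.ModuleType(loader.name)
    loader.exec_module(module)
    return module.DICTIONARY

# e.g. lemmata = load_dictionary_module('/path/to/lemmata.py')  # path is illustrative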
# Fragment of the TLGU conversion routine: assemble the converter's
# single-letter flags, then shell out to the `tlgu` binary.
# (`tlgu_options` is accumulated earlier in the routine.)
full_args = ['v', 'w', 'x', 'y', 'z']
tlgu_options.extend(full_args)
if break_lines:
    tlgu_options.append('N')
if divide_works:
    tlgu_options.append('W')
if latin:
    tlgu_options.append('r')
# setup extra args
if extra_args is None:
    extra_args = []
else:
    try:
        extra_args = list(extra_args)
    except Exception as exc:
        logger.error("Argument 'extra_args' must be a list: %s.", exc)
        raise
tlgu_options = tlgu_options + extra_args
# assemble all tlgu flags, deduplicating repeats
tlgu_options = list(set(tlgu_options))
if tlgu_options:
    tlgu_flags = '-' + ' -'.join(tlgu_options)
else:
    tlgu_flags = ''
# make tlgu call
tlgu_call = 'tlgu {0} {1} {2}'.format(tlgu_flags,
                                      input_path,
                                      output_path)
logger.info(tlgu_call)
try:
    p_out = subprocess.call(tlgu_call, shell=True)
    if p_out == 1:
        # tlgu signals failure with exit status 1
        logger.error('Failed to convert %s to %s.', input_path, output_path)
except Exception as exc:
    logger.error('TLGU call failed: %s', exc)
    raise
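The flag assembly above deduplicates the accumulated single-letter options and joins them with leading hyphens. A self-contained sketch of just that step, with a sample option list:

# Sketch of the flag assembly above with a sample (invented) option list.
# set() removes duplicates; sorted() is added here only to make the output stable.
tlgu_options = ['v', 'w', 'x', 'y', 'z', 'N', 'W', 'W']
tlgu_options = sorted(set(tlgu_options))
tlgu_flags = '-' + ' -'.join(tlgu_options) if tlgu_options else ''
print(tlgu_flags)  # -N -W -v -w -x -y -z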
"""
from cltk.utils.cltk_logger import logger
from nltk.tokenize import wordpunct_tokenize
import re
import unicodedata
try:
# James Tauber's greek_accentuation package
from greek_accentuation import characters as chars
except ImportError as import_error:
message = 'Missing "greek_accentuation" package. Install with ' \
'`pip install greek-accentuation`.'
logger.error(message)
logger.error(import_error)
raise
__author__ = ['Jack Duff ']
__license__ = 'MIT License. See LICENSE.'
# Dictionaries of phonological reconstructions for use in transcribing.
# Probert, Philomen. 2010. Phonology, in E. Bakker, A Companion to the \
# Ancient Greek Language.
# (Entries which are commented out are realized through diacritic analysis.)
GREEK = {
'Attic': {
'Probert': {
'correspondence': {
from cltk.utils.cltk_logger import logger
from nltk.tokenize import wordpunct_tokenize

import re
import unicodedata

try:
    # James Tauber's greek_accentuation package
    from greek_accentuation import characters as chars
except ImportError as import_error:
    message = 'Missing "greek_accentuation" package. Install with ' \
              '`pip install greek-accentuation`.'
    logger.error(message)
    logger.error(import_error)
    raise

__author__ = ['Jack Duff ']
__license__ = 'MIT License. See LICENSE.'

# Dictionaries of phonological reconstructions for use in transcribing.
# Probert, Philomen. 2010. Phonology, in E. Bakker, A Companion to the
# Ancient Greek Language.
# (Entries which are commented out are realized through diacritic analysis.)

GREEK = {
    'Attic': {
        'Probert': {
            'correspondence': {
                'α': 'ɑ',
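The nested dictionary keys reconstructions by dialect, then scholarly source, then mapping type. A self-contained sketch of the lookup path, keeping only the single correspondence visible in the excerpt above:

# Self-contained excerpt showing the lookup path dialect -> source -> mapping.
# Only the 'α' -> 'ɑ' correspondence is visible above; the rest of the table
# is omitted here.
GREEK_EXCERPT = {'Attic': {'Probert': {'correspondence': {'α': 'ɑ'}}}}
print(GREEK_EXCERPT['Attic']['Probert']['correspondence']['α'])  # ɑ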
def list_corpora(self):
    """Show corpora available for the CLTK to download."""
    try:
        corpora = LANGUAGE_CORPORA[self.language]
    except KeyError as key_error:
        logger.error('Corpora not available for language %s: %s', self.language, key_error)
        return []
    corpus_list = []
    for corpus in corpora:
        corpus_list.append(corpus['name'])
    return corpus_list
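The method assumes LANGUAGE_CORPORA maps a language name to a list of corpus records, each carrying at least a 'name' key. A sketch of that shape with illustrative entries:

# Illustrative shape of LANGUAGE_CORPORA; the entries are examples, not the
# real registry.
LANGUAGE_CORPORA = {
    'latin': [{'name': 'latin_text_perseus'}, {'name': 'latin_models_cltk'}],
}
print([corpus['name'] for corpus in LANGUAGE_CORPORA['latin']])
# ['latin_text_perseus', 'latin_models_cltk']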
def __init__(self, place=None, manner=None, voiced=None, ipar=None, geminate=None):
    if isinstance(place, Place) or place is None:
        self.place = place
    else:
        logger.error("Incorrect argument: 'place' must be a Place or None")
        raise ValueError
    if isinstance(manner, Manner) or manner is None:
        self.manner = manner
    else:
        logger.error("Incorrect argument: 'manner' must be a Manner or None")
        raise ValueError
    if isinstance(voiced, bool) or voiced is None:
        self.voiced = voiced
    else:
        logger.error("Incorrect argument: 'voiced' must be a bool or None")
        raise TypeError
    if isinstance(geminate, bool) or geminate is None:
        self.geminate = geminate
    else:
        logger.error("Incorrect argument: 'geminate' must be a bool or None")
        raise TypeError
    self.ipar = ipar
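Place and Manner are enums defined elsewhere in the phonology module, so a construction sketch is shown only as comments; the class name used here is an assumption, not taken from the snippet.

# Hypothetical usage, assuming the enclosing class is a consonant type and
# that Place and Manner enums (e.g. Place.bilabial, Manner.stop) exist in
# the same module:
# b = Consonant(place=Place.bilabial, manner=Manner.stop, voiced=True,
#               ipar='b', geminate=False)
# Passing, say, a plain string for `manner` logs an error and raises ValueError.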
import errno
import os
import shutil

from cltk.utils.cltk_logger import logger


def _copy_dir_recursive(src_rel, dst_rel):
    """Copy contents of one directory to another. The `dst_rel` dir cannot
    already exist. Source: http://stackoverflow.com/a/1994840

    TODO: Move this to the file_operations.py module.

    :type src_rel: str
    :param src_rel: Directory to be copied.
    :type dst_rel: str
    :param dst_rel: Directory to be created with the contents of ``src_rel``.
    """
    src = os.path.expanduser(src_rel)
    dst = os.path.expanduser(dst_rel)
    try:
        shutil.copytree(src, dst)
        logger.info('Files copied from %s to %s', src, dst)
    except OSError as exc:
        # If `src` is a single file rather than a directory, fall back to a plain copy.
        if exc.errno == errno.ENOTDIR:
            shutil.copy(src, dst)
            logger.info('Files copied from %s to %s', src, dst)
        else:
            raise
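A usage sketch follows; both paths are illustrative and the destination directory must not already exist.

# Illustrative call; the paths are made up for the example.
# _copy_dir_recursive('~/cltk_data/originals/example_corpus',
#                     '~/cltk_data/latin/text/example_corpus')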
# Fragment of the method that reads user-defined corpora from
# `distributed_corpora.yaml`.
# TODO: write check or try if `cltk_data` dir is not present
if self.testing:
    distributed_corpora_fp = os.path.normpath(get_cltk_data_dir() + '/test_distributed_corpora.yaml')
else:
    distributed_corpora_fp = os.path.normpath(get_cltk_data_dir() + '/distributed_corpora.yaml')
try:
    with open(distributed_corpora_fp) as file_open:
        corpora_dict = yaml.safe_load(file_open)
except FileNotFoundError:
    logger.info('`~/cltk_data/distributed_corpora.yaml` file not found.')
    return []
except yaml.parser.ParserError as parse_err:
    logger.debug('Yaml parsing error: %s', parse_err)
    return []
user_defined_corpora = []
for corpus_name in corpora_dict:
    about = corpora_dict[corpus_name]
    if about['language'].lower() == self.language:
        user_defined_corpus = dict()
        # user_defined_corpus['git_remote'] = about['git_remote']
        user_defined_corpus['origin'] = about['origin']
        user_defined_corpus['type'] = about['type']
        user_defined_corpus['name'] = corpus_name
        user_defined_corpora.append(user_defined_corpus)
return user_defined_corpora
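The loop above expects each top-level key of distributed_corpora.yaml to name a corpus, with 'language', 'origin', and 'type' fields. A parsing sketch with an invented document:

import yaml

# Invented distributed_corpora.yaml content showing the fields the loop
# above reads; the corpus name and origin URL are illustrative.
sample_yaml = """
example_corpus:
  origin: https://github.com/example/example_corpus.git
  language: latin
  type: text
"""
corpora_dict = yaml.safe_load(sample_yaml)
for corpus_name, about in corpora_dict.items():
    if about['language'].lower() == 'latin':
        print({'name': corpus_name,
               'origin': about['origin'],
               'type': about['type']})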