# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def matches(self, other):
    """Delegate feature matching to the phoneme built from this feature."""
    phoneme = make_phoneme(self)
    return phoneme.matches(other)
def __eq__(self, other):
    """Cross-type feature comparisons are always unequal; otherwise fall
    back to integer-enum equality."""
    if type(self) != type(other):
        return False
    return IntEnum.__eq__(self, other)
def __floordiv__(self, other):
    """Implement ``//`` by first promoting this feature to a phoneme."""
    promoted = make_phoneme(self)
    return promoted // other
class Consonantal(PhonologicalFeature):
    """Binary [consonantal] feature; member order fixes auto() values."""
    neg = auto()  # [-consonantal]
    pos = auto()  # [+consonantal]
class Voiced(PhonologicalFeature):
    """Binary [voiced] feature; member order fixes auto() values."""
    neg = auto()  # [-voiced]
    pos = auto()  # [+voiced]
class Aspirated(PhonologicalFeature):
    """Binary [aspirated] feature; member order fixes auto() values."""
    neg = auto()  # [-aspirated]
    pos = auto()  # [+aspirated]
class Geminate(PhonologicalFeature):
    """Binary [geminate] feature; member order fixes auto() values."""
    neg = auto()  # [-geminate]
    pos = auto()  # [+geminate]
class Roundedness(PhonologicalFeature):
neg = auto()
return make_phoneme(self) <= other
def __ge__(self, other):
    """Order features through their phoneme representation."""
    phoneme = make_phoneme(self)
    return phoneme >= other
def matches(self, other):
    """True when the phoneme derived from this feature matches *other*."""
    as_phoneme = make_phoneme(self)
    return as_phoneme.matches(other)
def __eq__(self, other):
    """Equal only to a member of the same feature class with equal value."""
    same_type = type(self) == type(other)
    return IntEnum.__eq__(self, other) if same_type else False
def __floordiv__(self, other):
    """Implement ``//`` by lifting this feature into a phoneme first."""
    lifted = make_phoneme(self)
    return lifted // other
class Consonantal(PhonologicalFeature):
    """Binary [consonantal] feature; member order fixes auto() values.

    NOTE(review): duplicate of an earlier definition in this view — likely a
    paste/splice artifact; confirm against the canonical source.
    """
    neg = auto()  # [-consonantal]
    pos = auto()  # [+consonantal]
class Voiced(PhonologicalFeature):
    """Binary [voiced] feature; member order fixes auto() values.

    NOTE(review): duplicate of an earlier definition in this view.
    """
    neg = auto()  # [-voiced]
    pos = auto()  # [+voiced]
class Aspirated(PhonologicalFeature):
    """Binary [aspirated] feature; member order fixes auto() values.

    NOTE(review): duplicate of an earlier definition in this view.
    """
    neg = auto()  # [-aspirated]
    pos = auto()  # [+aspirated]
class Geminate(PhonologicalFeature):
neg = auto()
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext.

    Scans the installed First1KGreek corpus for TEI XML documents and
    prepares plaintext copies in a sibling ``..._plaintext`` directory.
    NOTE(review): this block is truncated in this view — the body ends
    mid-``with`` right after constructing the CapitainsCtsText resource.
    """
    # NOTE(review): `file` is assigned but never used in the visible code —
    # presumably a leftover example path; verify against the full source.
    file = os.path.expanduser(
        get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek/data/tlg0627/tlg021/tlg0627.tlg021.1st1K-grc1.xml')
    xml_dir = os.path.normpath(get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not len(xml_paths):
        logger.error('1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.')
        raise FileNotFoundError
    # Skip CTS metadata files; only real text documents are converted.
    xml_paths = [path for path in xml_paths if '__cts__' not in path]
    # new dir
    new_dir = os.path.normpath(get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek_plaintext/')
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)
    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        # NOTE(review): rstrip('.xml') strips any trailing run of '.', 'x',
        # 'm', 'l' characters, not the literal '.xml' suffix — e.g.
        # 'xml.xml' over-strips; os.path.splitext would be the safe form.
        xml_name = xml_name.rstrip('.xml')
        xml_name += '.txt'
        plain_text = ''
        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
# NOTE(review): detached fragment of a corpus-import method — its `def`
# line is not visible here; it relies on self, corpus_type, corpus_name,
# uri, branch and local_path from the enclosing (unseen) scope.
# git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
# self._download_corpus(corpus_type, corpus_name, path)
type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
type_dir = os.path.expanduser(type_dir_rel)
repo_name = uri.split('/')[-1]  # eg, 'latin_corpus_newton_example.git'
# NOTE(review): rstrip('.git') strips a trailing run of '.', 'g', 'i', 't'
# characters, not the literal '.git' suffix — a repo named e.g. 'tagit.git'
# would lose too much; removesuffix would be the safe form.
repo_name = repo_name.rstrip('.git')
target_dir = os.path.join(type_dir, repo_name)
# A README.md inside the repo serves as the "already cloned" marker.
target_file = os.path.join(type_dir, repo_name, 'README.md')
# check if corpus already present
# if not, clone
if not os.path.isfile(target_file):
    if not os.path.isdir(type_dir):
        os.makedirs(type_dir)
    try:
        msg = "Cloning '{}' from '{}'".format(corpus_name, uri)
        logger.info(msg)
        # Shallow clone (depth=1) keeps the download small.
        Repo.clone_from(uri, target_dir, branch=branch, depth=1,
                        progress=ProgressPrinter())
    except CorpusImportError as corpus_imp_err:
        # NOTE(review): GitPython raises GitCommandError on clone failure —
        # confirm this except clause can actually fire.
        msg = "Git clone of '{}' failed: '{}'".format(uri, corpus_imp_err)
        logger.error(msg)
# if corpus is present, pull latest
else:
    try:
        repo = Repo(target_dir)
        assert not repo.bare  # or: assert repo.exists()
        git_origin = repo.remotes.origin
        msg = "Pulling latest '{}' from '{}'.".format(corpus_name, uri)
        logger.info(msg)
        git_origin.pull()
    except CorpusImportError as corpus_imp_err:
        msg = "Git pull of '{}' failed: '{}'".format(uri, corpus_imp_err)
        logger.info(msg)
# Local (non-git) corpora: sanity-check the user-supplied directory name.
if corpus_name in ('phi5', 'phi7', 'tlg'):
    if corpus_name == 'phi5':
        # normalize path for checking dir
        if local_path.endswith('/'):
            local_path = local_path[:-1]
        # check for right corpus dir
        if os.path.split(local_path)[1] != 'PHI5':
            logger.info("Directory must be named 'PHI5'.")
    if corpus_name == 'phi7':
        # normalize local_path for checking dir
        if local_path.endswith('/'):
            local_path = local_path[:-1]
        # check for right corpus dir
        if os.path.split(local_path)[1] != 'PHI7':
            logger.info("Directory must be named 'PHI7'.")
    if corpus_name == 'tlg':
        # normalize path for checking dir
        if local_path.endswith('/'):
            local_path = local_path[:-1]
        # check for right corpus dir
        if os.path.split(local_path)[1] != 'TLG_E':
            logger.info("Directory must be named 'TLG_E'.")
    # move the dir-checking commands into a function
    data_dir = os.path.expanduser(CLTK_DATA_DIR)
    originals_dir = os.path.join(data_dir, 'originals')
    # check for `originals` dir; if not present mkdir
    if not os.path.isdir(originals_dir):
        os.makedirs(originals_dir)
        msg = "Wrote directory at '{}'.".format(originals_dir)
        logger.info(msg)
    # NOTE(review): truncated in this view — the os.path.join call below is
    # unterminated.
    tlg_originals_dir = os.path.join(data_dir,
def tag_sentence(self, sentence):
    """Tag using Lapos model.
    TODO: Figure out how to pre-load model (loading is really slow). Or force users to bulk-convert files or strings.

    NOTE(review): this block is truncated in this view — the parsing loop
    ends mid-iteration and no return statement is visible.
    """
    fp_lapos = os.path.expanduser('~/cltk_data/multilingual/software/lapos')
    fp_model = os.path.expanduser('~/cltk_data/{0}/model/{1}_models_cltk/taggers/pos'.format(self.language, self.language))  # rel from Lapos dir
    try:
        # NOTE(review): *sentence* is interpolated into a shell=True command
        # string — shell-injection risk if the input is untrusted; a list
        # argv with shell=False would be safer.
        lapos_command = 'cd {0} && echo "{1}" | ./lapos -t -m {2}'.format(fp_lapos, sentence, fp_model)
        p_out = subprocess.check_output(lapos_command,
                                        shell=True,
                                        stderr=subprocess.STDOUT,
                                        universal_newlines=True)
    except subprocess.CalledProcessError as cp_err:
        logger.error('Lapos call failed. Check installation.')
        logger.error(sentence)
        print(cp_err)
        raise
    # Parse output from Lapos
    # TODO: Make this cleaner/faster
    output_list = p_out.split('\n')
    # Drop Lapos's own status chatter, then empty lines.
    output_list_filtered = [l for l in output_list if not l.startswith('loading the models')]
    output_list_filtered = [l for l in output_list_filtered if not l == 'done']
    output_list_filtered = [l for l in output_list_filtered if l]
    for line in output_list_filtered:
        word_tags = line.split(' ')
        # NOTE(review): tagged_sentence is re-created on every line — confirm
        # against the full source whether this is intended.
        tagged_sentence = []
        for word_tag in word_tags:
            word, tag = word_tag.split('/')
            word_tag_tuple = (word, tag)
# NOTE(review): detached fragment — appears to be the body of a
# transform_i_to_j-style method; its `def` line is not visible here, and it
# relies on `line`, `self.constants`, `self.syllabifier` and `StringUtils`
# from the enclosing (unseen) scope.
words = line.split(" ")
space_list = StringUtils.space_list(line)
corrected_words = []
for word in words:
    found = False
    # Split a recognized prefix off so the letter after the prefix boundary
    # is treated as word-initial by convert_consonantal_i.
    for prefix in self.constants.PREFIXES:
        if word.startswith(prefix) and word != prefix:
            corrected_words.append(self.syllabifier.convert_consonantal_i(prefix))
            corrected_words.append(
                self.syllabifier.convert_consonantal_i(word[len(prefix):]))
            found = True
            break
    if not found:
        corrected_words.append(self.syllabifier.convert_consonantal_i(word))
new_line = StringUtils.join_syllables_spaces(corrected_words, space_list)
# Word-initial i/ī before a vowel -> j.
char_list = StringUtils.overwrite(list(new_line),
                                  r"\b[iī][{}]".format(
                                      self.constants.VOWELS + self.constants.ACCENTED_VOWELS),
                                  "j")
# Capital I before a non-i vowel -> J.
char_list = StringUtils.overwrite(char_list,
                                  r"\b[I][{}]".format(self.constants.VOWELS_WO_I),
                                  "J")
# Intervocalic i -> j; trailing 1 is presumably an offset into the match
# selecting the i itself — verify against StringUtils.overwrite.
char_list = StringUtils.overwrite(char_list, r"[{}][i][{}]".format(
    self.constants.VOWELS_WO_I, self.constants.VOWELS),
    "j", 1)
return "".join(char_list)
def transform_i_to_j(self, line: str) -> str:
    """Transform instances of consonantal i to j

    :param line: a line of Latin text
    :return: the line with consonantal i/I rewritten as j/J

    >>> print(VerseScanner().transform_i_to_j("iactātus"))
    jactātus
    >>> print(VerseScanner().transform_i_to_j("bracchia"))
    bracchia
    """
    # Fix: the original text contained a spliced duplicate of this body
    # embedded inside the first StringUtils.overwrite call, leaving that
    # call unterminated (a syntax error). This is the single, de-duplicated
    # implementation, reconstructed from the two visible copies.
    words = line.split(" ")
    space_list = StringUtils.space_list(line)
    corrected_words = []
    for word in words:
        found = False
        # Split a recognized prefix off so the letter after the prefix
        # boundary is treated as word-initial (e.g. ad+iuvo).
        for prefix in self.constants.PREFIXES:
            if word.startswith(prefix) and word != prefix:
                corrected_words.append(self.syllabifier.convert_consonantal_i(prefix))
                corrected_words.append(
                    self.syllabifier.convert_consonantal_i(word[len(prefix):]))
                found = True
                break
        if not found:
            corrected_words.append(self.syllabifier.convert_consonantal_i(word))
    new_line = StringUtils.join_syllables_spaces(corrected_words, space_list)
    # Word-initial i/ī before a vowel -> j.
    char_list = StringUtils.overwrite(list(new_line),
                                      r"\b[iī][{}]".format(
                                          self.constants.VOWELS + self.constants.ACCENTED_VOWELS),
                                      "j")
    # Capital I before a non-i vowel -> J.
    char_list = StringUtils.overwrite(char_list,
                                      r"\b[I][{}]".format(self.constants.VOWELS_WO_I),
                                      "J")
    # Intervocalic i -> j; trailing 1 is presumably an offset into the match
    # selecting the i itself — verify against StringUtils.overwrite.
    char_list = StringUtils.overwrite(char_list, r"[{}][i][{}]".format(
        self.constants.VOWELS_WO_I, self.constants.VOWELS),
        "j", 1)
    return "".join(char_list)
def _get_corpus_properties(self, corpus_name):
    """Check whether a corpus is available for import.

    :type corpus_name: str
    :param corpus_name: Name of available corpus.
    :rtype : str
    :raises CorpusImportError: if the language's corpus list is unavailable
        or the named corpus is not among the available corpora.
    """
    try:
        # corpora = LANGUAGE_CORPORA[self.language]
        corpora = self.all_corpora
    except (NameError, AttributeError) as name_error:
        # Fix: accessing a missing `self.all_corpora` raises AttributeError,
        # not NameError — the NameError arm matched the old (commented-out)
        # dict lookup and was unreachable. NameError is kept for backward
        # compatibility.
        msg = 'Corpus not available for language ' \
              '"%s": %s' % (self.language, name_error)
        logger.error(msg)
        raise CorpusImportError(msg)
    for corpus_properties in corpora:
        if corpus_properties['name'] == corpus_name:
            return corpus_properties
    # No match found: report and fail loudly rather than returning None.
    msg = 'Corpus "%s" not available for the ' \
          '"%s" language.' % (corpus_name, self.language)
    logger.error(msg)
    raise CorpusImportError(msg)