def tag_ngram_12_backoff(self, untagged_string: str):
    """Tag POS with 1-, 2-gram backoff tagger.
    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype: list of (token, tag) tuples
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['ngram_12_backoff']
    tagger = open_pickle(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
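# Usage sketch (hedged): assumes the tag_* methods above live on CLTK's
# POSTag class and that the latin_models_cltk corpus is already under
# ~/cltk_data; the sample sentence is illustrative only.
from cltk.tag.pos import POSTag

tagger = POSTag('latin')
tagged = tagger.tag_ngram_12_backoff('Gallia est omnis divisa in partes tres.')
print(tagged)  # list of (token, tag) tuples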
def __init__(self, language: str = 'latin', strict: bool = False):
    """
    :param language: language for sentence tokenization
    :type language: str
    :param strict: allow stricter punctuation for sentence tokenization
    :type strict: bool
    """
    self.lang_vars = LatinLanguageVars()
    self.strict = strict
    super().__init__(language='latin', lang_vars=self.lang_vars)
    self.models_path = LatinPunktSentenceTokenizer.models_path
    try:
        self.model = open_pickle(os.path.join(self.models_path, 'latin_punkt.pickle'))
    except FileNotFoundError as err:
        raise type(err)(LatinPunktSentenceTokenizer.missing_models_message)
    if self.strict:
        PunktLanguageVars.sent_end_chars = STRICT_PUNCTUATION
    else:
        PunktLanguageVars.sent_end_chars = PUNCTUATION
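# Usage sketch (hedged): assumes this __init__ belongs to CLTK's Latin
# SentenceTokenizer and that tokenize() is inherited from the Punkt base
# class; requires latin_models_cltk in ~/cltk_data.
from cltk.tokenize.latin.sentence import SentenceTokenizer

sent_tokenizer = SentenceTokenizer(strict=True)
print(sent_tokenizer.tokenize('Vale. Quid agis? Bene.'))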
def __init__(self, train, seed=3):
    self.train = train
    self.seed = seed
    rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
    path = os.path.expanduser(rel_path)
    # Check for presence of LATIN_OLD_MODEL
    file = 'latin_lemmata_cltk.pickle'
    old_model_path = os.path.join(path, file)
    if os.path.isfile(old_model_path):
        self.LATIN_OLD_MODEL = open_pickle(old_model_path)
    else:
        self.LATIN_OLD_MODEL = {}
        print('The file %s is not available in cltk_data' % file)
    # Check for presence of LATIN_MODEL
    file = 'latin_model.pickle'
    model_path = os.path.join(path, file)
    if os.path.isfile(model_path):
        self.LATIN_MODEL = open_pickle(model_path)
    else:
        self.LATIN_MODEL = {}
        print('The file %s is not available in cltk_data' % file)
    # Check for presence of misc_patterns
    self.latin_sub_patterns = latin_sub_patterns
    # Check for presence of verb_patterns
    self.latin_verb_patterns = latin_verb_patterns
    # Check for presence of latin_pps
    self.latin_pps = latin_pps
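# Usage sketch (hedged): assumes this __init__ belongs to CLTK's
# BackoffLatinLemmatizer, which exposes a lemmatize(tokens) method, and
# that latin_models_cltk is present under ~/cltk_data.
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

lemmatizer = BackoffLatinLemmatizer()
print(lemmatizer.lemmatize(['arma', 'virumque', 'cano']))  # (token, lemma) pairs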
def tag_tnt(self, untagged_string: str):
    """Tag POS with TnT tagger.
    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype: list of (token, tag) tuples
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['tnt']
    tagger = open_pickle(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
def __init__(self, language: str = None, lang_vars: object = None):
    """
    :param language: language for sentence tokenization
    :type language: str
    :param lang_vars: Punkt language variables for the target language
    :type lang_vars: object
    """
    self.language = language
    self.lang_vars = lang_vars
    super().__init__(language=self.language)
    if self.language:
        self.models_path = self._get_models_path(self.language)
        try:
            self.model = open_pickle(os.path.join(os.path.expanduser(self.models_path),
                                                  f'{self.language}_punkt.pickle'))
        except FileNotFoundError as err:
            raise type(err)(BasePunktSentenceTokenizer.missing_models_message)
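# Subclassing sketch (hedged): the base class above resolves its model as
# '<language>_punkt.pickle' under the per-language models path. The
# GreekPunktSentenceTokenizer name is hypothetical; PunktLanguageVars is
# NLTK's generic variables class, standing in for a language-specific one.
from nltk.tokenize.punkt import PunktLanguageVars

class GreekPunktSentenceTokenizer(BasePunktSentenceTokenizer):
    def __init__(self):
        super().__init__(language='greek', lang_vars=PunktLanguageVars())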
def tag_ngram_123_backoff(self, untagged_string: str):
    """Tag POS with 1-, 2-, 3-gram backoff tagger.
    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype: list of (token, tag) tuples
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['ngram_123_backoff']
    tagger = open_pickle(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
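# Training sketch (hedged): the pickled 'ngram_123_backoff' tagger chains
# NLTK n-gram taggers so that a trigram tagger backs off to a bigram,
# then a unigram tagger, for contexts it has not seen. The treebank data
# below is only illustrative, not the corpus CLTK actually trains on.
from nltk.corpus import treebank  # requires nltk.download('treebank')
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

train_sents = treebank.tagged_sents()[:3000]
unigram = UnigramTagger(train_sents)
bigram = BigramTagger(train_sents, backoff=unigram)
trigram = TrigramTagger(train_sents, backoff=bigram)
print(trigram.tag(['The', 'dog', 'barked']))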
def _randomize_data(train, seed):
    import random
    random.seed(seed)
def __init__(self, train: List[list] = None, seed: int = 3, verbose: bool = False):
    self.models_path = BackoffGreekLemmatizer.models_path
    missing_models_message = ("BackoffGreekLemmatizer requires the greek_models_cltk "
                              "corpus to be in cltk_data. Please load this corpus.")
    try:
        self.train = open_pickle(os.path.join(self.models_path, 'greek_lemmatized_sents.pickle'))
        self.GREEK_OLD_MODEL = open_pickle(os.path.join(self.models_path, 'greek_lemmata_cltk.pickle'))
        self.GREEK_MODEL = open_pickle(os.path.join(self.models_path, 'greek_model.pickle'))
    except FileNotFoundError as err:
        raise type(err)(missing_models_message)
    self.greek_sub_patterns = greek_sub_patterns  # Move to greek_models_cltk
    self.seed = seed
    self.VERBOSE = verbose

def _randomize_data(train: List[list], seed: int):
    import random
    random.seed(seed)
    random.shuffle(train)
    pos_train_sents = train[:4000]
    lem_train_sents = [[(item[0], item[1]) for item in sent] for sent in train]
    train_sents = lem_train_sents[:4000]
    test_sents = lem_train_sents[4000:5000]
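# Usage sketch (hedged): assumes BackoffGreekLemmatizer exposes a
# lemmatize(tokens) method like its Latin counterpart and that
# greek_models_cltk is present under ~/cltk_data.
from cltk.lemmatize.greek.backoff import BackoffGreekLemmatizer

lemmatizer = BackoffGreekLemmatizer(seed=3)
print(lemmatizer.lemmatize(['πόλεμος', 'πολέμου']))  # (token, lemma) pairs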
def tag_trigram(self, untagged_string: str):
    """Tag POS with trigram tagger.
    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype: list of (token, tag) tuples
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['trigram']
    tagger = open_pickle(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text