# -*- coding: utf-8 -*-
import regex
import os
import codecs
from contractions import Contractions
class ItalianContractions(Contractions):
    def __init__(self):
        # Load Italian verbs and their inflections from a lexicon.
        filepath = os.sep.join([os.path.dirname(os.path.realpath(__file__)),
                                'italian_verbs.txt'])
        self.verbs = set()
        with codecs.open(filepath, encoding='utf8') as f:
            for line in f:
                # Each lexicon line has three whitespace-separated fields;
                # the first one is the inflected verb form.
                fields = line.rstrip('\n').split()
                assert len(fields) == 3
                self.verbs.add(fields[0])
    def split_if_contraction(self, word):
        original_word = word
        # Handle preposition+determiner contractions.
        word = regex.sub(ur'^([Aa])l$', ur'\1 il', word)
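# Usage sketch for the class above (illustration only, not part of the
# original file; it assumes the truncated method eventually returns the
# rewritten word):
#
#     splitter = ItalianContractions()
#     splitter.split_if_contraction(u'al')    # -> u'a il'  ("a" + "il")
#     splitter.split_if_contraction(u'cane')  # -> u'cane'  (not a contraction)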
# -*- coding: utf-8 -*-
import regex
from contractions import Contractions
class EnglishContractions(Contractions):
    def __init__(self):
        # List of contractions adapted from Robert MacIntyre's tokenizer.
        # These were in turn collected from the TreebankWordTokenizer in NLTK.
        self.CONTRACTIONS = [regex.compile(r"([^' ])('[sS]|'[mM]|'[dD]|')\b"),
                             regex.compile(
                                 r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T)\b")]
        self.CONTRACTIONS2 = [regex.compile(r"(?i)\b(can)(not)\b"),
                              regex.compile(r"(?i)\b(d)('ye)\b"),
                              regex.compile(r"(?i)\b(gim)(me)\b"),
                              regex.compile(r"(?i)\b(gon)(na)\b"),
                              regex.compile(r"(?i)\b(got)(ta)\b"),
                              regex.compile(r"(?i)\b(lem)(me)\b"),
                              regex.compile(r"(?i)\b(mor)('n)\b"),
                              regex.compile(r"(?i)\b(wan)(na) ")]
        self.CONTRACTIONS3 = [regex.compile(r"(?i) ('t)(is)\b"),
                              regex.compile(r"(?i) ('t)(was)\b")]
# -*- coding: utf-8 -*-
import regex
from contractions import Contractions
class PortugueseContractions(Contractions):
    def __init__(self):
        # If True, mark consonants removed due to enclitics with symbols # and
        # -CL- for mesoclitics.
        self.mark_enclitics = False
        # A blacklist of words that should not be confused with contractions.
        self.non_contractions = {}
        self.contractions = self._generate_contractions()
        self.clitics, self.clitic_suffixes = self._generate_clitics()
    def _generate_contractions(self):
        """
        Generate contractions for Portuguese, along with the words and lemmas
        that are contracted (e.g. the contraction "das" is composed of the
        words "de" + "as", with corresponding lemmas "de" + "o").
        Return a dictionary of contractions, each entry containing a list of
        words and a list of lemmas (typically lists of length two).
        """
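        # Illustration of the return shape the docstring describes (the exact
        # container types are an assumption; only the "das" example comes from
        # the docstring):
        #
        #     {u'das': ([u'de', u'as'], [u'de', u'o']), ...}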
# -*- coding: utf-8 -*-
import codecs
import os
import regex
from contractions import Contractions
class SpanishContractions(Contractions):
    def __init__(self):
        # A blacklist of words that should not be confused with contractions.
        self.non_contractions = {}  # {u'perla', u'perlas', u'arte', u'parte', \
                                    #  u'aparte'}
        # A whitelist of frequent contractions that the regexes do not catch.
        self.contractions = {}
        verbs = []  # [u'convencer', u'haber', u'hacer', u'meter', u'vender', \
                    #  u'poner', u'tener', u'comer', u'mover', u'atender', \
                    #  u'responder', u'devolver', u'dar']
        for verb in verbs:
            for suffix in [u'me', u'te', u'nos', u'os']:
                self.contractions[verb + suffix] = [verb, suffix]
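        # For example, had the commented-out verb list above been enabled, the
        # loop would register verb+clitic entries such as (illustration only):
        #
        #     self.contractions[u'darme'] = [u'dar', u'me']
        #     self.contractions[u'darte'] = [u'dar', u'te']
        #     self.contractions[u'darnos'] = [u'dar', u'nos']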
        # Load Spanish verbs and their inflections from a lexicon.
        filepath = os.sep.join([os.path.dirname(os.path.realpath(__file__)),
# -*- coding: utf-8 -*-
import regex
from contractions import Contractions
class FrenchContractions(Contractions):
    def __init__(self):
        pass

    def split_if_contraction(self, word):
        # Handle preposition+determiner contractions.
        word = regex.sub(ur'^(A|a)u$', ur'à le', word)
        word = regex.sub(ur'^(A|a)uquel$', ur'à lequel', word)
        word = regex.sub(ur'^(A|a)ux$', ur'à les', word)
        word = regex.sub(ur'^(A|a)uxquels$', ur'à lesquels', word)
        word = regex.sub(ur'^(A|a)uxquelles$', ur'à lesquelles', word)
        word = regex.sub(ur'^(D|d)u$', ur'de le', word)
        word = regex.sub(ur'^(D|d)uquel$', ur'de lequel', word)
        word = regex.sub(ur'^(D|d)es$', ur'de les', word)
        word = regex.sub(ur'^(D|d)esquels$', ur'de lesquels', word)
        word = regex.sub(ur'^(D|d)esquelles$', ur'de lesquelles', word)
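# Usage sketch for the class above (illustration only; it assumes the
# truncated method eventually returns the rewritten word):
#
#     splitter = FrenchContractions()
#     splitter.split_if_contraction(u'au')   # -> u'à le'
#     splitter.split_if_contraction(u'des')  # -> u'de les'
#     splitter.split_if_contraction(u'vin')  # -> u'vin' (not a contraction)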