# -*- coding: utf-8 -*-
import regex
import os
import codecs

from contractions import Contractions


class ItalianContractions(Contractions):
    def __init__(self):
        # Load Italian verbs and their inflections from a lexicon.
        filepath = os.sep.join([os.path.dirname(os.path.realpath(__file__)),
                                'italian_verbs.txt'])
        self.verbs = set()
        with codecs.open(filepath, encoding='utf8') as f:
            for line in f:
                fields = line.rstrip('\n').split()
                assert len(fields) == 3
                self.verbs.add(fields[0])
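        # Note on the assumed lexicon layout: each line is expected to hold
        # three whitespace-separated fields (enforced by the assert above), and
        # only the first field, presumably the inflected verb form, is kept.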
    def split_if_contraction(self, word):
        original_word = word
        # Handle preposition+determiner contractions.
        word = regex.sub(r'^([Aa])l$', r'\1 il', word)
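        # For illustration, the substitution above maps "al" -> "a il" and
        # "Al" -> "A il", preserving the original capitalisation via the
        # captured group.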


# The lines below are a separate snippet: the body of a word tokenizer method
# that normalizes quotes and splits contractions and clitics. The enclosing
# `def` is reconstructed here so the snippet parses; `re`, `contr` and `clit`
# are helper modules referenced by the original code.
def tokenize(self, text):
    # Add an extra space on both ends to make things easier.
    text = " " + text + " "
    # Ending quotes.
    text = re.sub(r'"', " '' ", text)
    text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
    # Split on contractions and clitics.
    if self.tokenization_style == 'cintil':
        text = contr.split_contractions(text, self.contractions,
                                        use_cintil_format=True)
        text = clit.split_clitics(text, self.clitics, self.suffixes,
                                  use_cintil_format=True)
    else:
        text = contr.split_contractions(text, self.contractions,
                                        use_cintil_format=False)
        text = clit.split_clitics(text, self.clitics, self.suffixes,
                                  use_cintil_format=False)
    # Collapse repeated spaces.
    text = re.sub(" +", " ", text)
    text = text.strip()
    # Add a space at the end to match up with MacIntyre's output (for debugging).
    if text != "":
        text += " "
    return text.split()


def normalize_corpus(corpus, lemmatize=True,
                     only_text_chars=False,
                     tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if only_text_chars:
            text = keep_text_characters(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
    return normalized_corpus
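

# Hedged usage sketch: normalize_corpus() relies on helpers defined elsewhere
# in the surrounding project (html_parser, CONTRACTION_MAP, expand_contractions,
# lemmatize_text, remove_special_characters, remove_stopwords,
# keep_text_characters, tokenize_text). Assuming those are in scope, a call
# might look like this:
# docs = ["The brown fox wasn't that quick and he couldn't win the race"]
# print(normalize_corpus(docs, lemmatize=True, tokenize=False))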


def normalize_corpus(corpus, lemmatize=True,
                     only_text_chars=False,
                     tokenize=False):
    normalized_corpus = []
    for index, text in enumerate(corpus):
        text = normalize_accented_characters(text)
        text = html_parser.unescape(text)
        text = strip_html(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if only_text_chars:
            text = keep_text_characters(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
    return normalized_corpus


def normalize_corpus(corpus, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = expand_contractions(text, CONTRACTION_MAP)
        text = lemmatize_text(text)
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        # Append each document exactly once, tokenized if requested.
        if tokenize:
            text = tokenize_text(text)
        normalized_corpus.append(text)
    return normalized_corpus


import re


def expand_contractions(sentence, contraction_mapping):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) \
            if contraction_mapping.get(match) \
            else contraction_mapping.get(match.lower())
        # Keep the original first character so capitalisation is preserved.
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_sentence = contractions_pattern.sub(expand_match, sentence)
    return expanded_sentence


expanded_corpus = [expand_contractions(sentence, CONTRACTION_MAP)
                   for sentence in cleaned_corpus]
print(expanded_corpus)
print()
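
# Self-contained sketch of expand_contractions() with a tiny, hypothetical
# contraction map; the CONTRACTION_MAP used above is a much larger dictionary.
SMALL_CONTRACTION_MAP = {"can't": "cannot", "isn't": "is not"}
print(expand_contractions("I can't say it isn't fun", SMALL_CONTRACTION_MAP))
# -> I cannot say it is not fun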

# case conversion
print(corpus[0].lower())
print(corpus[0].upper())

# removing stopwords
import nltk


def remove_stopwords(tokens):
    stopword_list = nltk.corpus.stopwords.words('english')
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens
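

# Small usage sketch (assumes the NLTK stopwords corpus has been downloaded,
# e.g. via nltk.download('stopwords')):
print(remove_stopwords(['the', 'brown', 'fox', 'is', 'quick']))
# -> ['brown', 'fox', 'quick']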


def normalize_corpus(corpus, lemmatize=True, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
    return normalized_corpus


# -*- coding: utf-8 -*-
import regex

from contractions import Contractions


class EnglishContractions(Contractions):
    def __init__(self):
        # List of contractions adapted from Robert MacIntyre's tokenizer.
        # These were in turn collected from the TreebankWordTokenizer in NLTK.
        self.CONTRACTIONS = [regex.compile(r"([^' ])('[sS]|'[mM]|'[dD]|')\b"),
                             regex.compile(
                                 r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T)\b")]
        self.CONTRACTIONS2 = [regex.compile(r"(?i)\b(can)(not)\b"),
                              regex.compile(r"(?i)\b(d)('ye)\b"),
                              regex.compile(r"(?i)\b(gim)(me)\b"),
                              regex.compile(r"(?i)\b(gon)(na)\b"),
                              regex.compile(r"(?i)\b(got)(ta)\b"),
                              regex.compile(r"(?i)\b(lem)(me)\b"),
                              regex.compile(r"(?i)\b(mor)('n)\b"),
                              regex.compile(r"(?i)\b(wan)(na) ")]
        self.CONTRACTIONS3 = [regex.compile(r"(?i) ('t)(is)\b"),
                              regex.compile(r"(?i) ('t)(was)\b")]
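

# A minimal sketch of how Treebank-style contraction patterns like the ones
# above are typically applied: substitute a space between the captured groups
# so clitics become separate tokens. This usage example is an assumption, not
# part of the class above.
_example_pattern = regex.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T)\b")
print(_example_pattern.sub(r"\1 \2", "I don't think they'll mind"))
# -> I do n't think they 'll mind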


# -*- coding: utf-8 -*-
import regex

from contractions import Contractions


class PortugueseContractions(Contractions):
    def __init__(self):
        # If True, mark consonants removed due to enclitics with the symbol #
        # and use -CL- for mesoclitics.
        self.mark_enclitics = False
        # A blacklist of words that should not be confused with contractions.
        self.non_contractions = {}
        self.contractions = self._generate_contractions()
        self.clitics, self.clitic_suffixes = self._generate_clitics()

    def _generate_contractions(self):
        """
        Generate contractions for Portuguese, along with the words and lemmas
        that are contracted (e.g. the contraction "das" is composed of the
        words "de" + "as", with corresponding lemmas "de" + "o").
        Return a dictionary of contractions, each entry containing a list of
        words and a list of lemmas (typically lists of length two).
        """
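

# A hedged illustration of the entry shape the docstring above describes; the
# exact layout returned by the real _generate_contractions() is an assumption.
example_contraction_entry = {u'das': {'words': [u'de', u'as'],
                                      'lemmas': [u'de', u'o']}}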


# -*- coding: utf-8 -*-
import codecs
import os
import regex

from contractions import Contractions


class SpanishContractions(Contractions):
    def __init__(self):
        # A blacklist of words that should not be confused with contractions.
        self.non_contractions = {}  # {u'perla', u'perlas', u'arte', u'parte',
                                    #  u'aparte'}
        # A whitelist of frequent words that the regexes do not catch but that
        # are nevertheless contractions.
        self.contractions = {}
        verbs = []  # [u'convencer', u'haber', u'hacer', u'meter', u'vender',
                    #  u'poner', u'tener', u'comer', u'mover', u'atender',
                    #  u'responder', u'devolver', u'dar']
        for verb in verbs:
            for suffix in [u'me', u'te', u'nos', u'os']:
                self.contractions[verb + suffix] = [verb, suffix]
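        # For example, with verb u'tener' and suffix u'me', the loop above would
        # add: self.contractions[u'tenerme'] = [u'tener', u'me']. (With the verb
        # list left empty as above, no entries are added.)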
        # Load Spanish verbs and their inflections from a lexicon.
        filepath = os.sep.join([os.path.dirname(os.path.realpath(__file__)),