How to use contractions - 10 common examples

To help you get started, we've selected a few examples showing how contractions is used in popular public projects.

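Before the project snippets below, here is a minimal usage sketch of the contractions package itself, assuming it is installed via pip install contractions; contractions.fix() expands contractions in a string, and contractions.add() registers an extra mapping (the "idk" entry below is purely illustrative, not something the package ships with).

import contractions

# Expand common English contractions in a string.
print(contractions.fix("you're happy and I'm not"))  # you are happy and I am not

# Register an extra mapping that the default list does not cover (illustrative only).
contractions.add("idk", "I do not know")
print(contractions.fix("idk, she'll be fine"))  # expected: I do not know, she will be fine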

github andre-martins / TurboParser / python / tokenizer / italian_contractions.py
# -*- coding: utf-8 -*-

import regex
import os
import codecs
from contractions import Contractions


class ItalianContractions(Contractions):
    def __init__(self):
        # Load Italian verbs and their inflections from a lexicon.
        filepath = os.sep.join([os.path.dirname(os.path.realpath(__file__)),
                                'italian_verbs.txt'])
        self.verbs = set()
        with codecs.open(filepath, encoding='utf8') as f:
            for line in f:
                fields = line.rstrip('\n').split()
                assert len(fields) == 3
                self.verbs.add(fields[0])

    def split_if_contraction(self, word):
        original_word = word

        # Handle preposition+determiner contractions (e.g. "al" -> "a il").
        word = regex.sub(ur'^([Aa])l$', ur'\1 il', word)
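For illustration, the substitution above splits the Italian contraction "al" into "a" + "il" while preserving capitalization; a quick stand-alone check, using the standard-library re module instead of regex:

import re

for w in ['al', 'Al', 'altro']:
    print(w, '->', re.sub(r'^([Aa])l$', r'\1 il', w))
# al -> a il
# Al -> A il
# altro -> altro  (unchanged: the pattern is anchored to the whole token)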
github andre-martins / TurboParser / python / tokenizers / portuguese / word_tokenizer.py
        # Add extra space to make things easier.
        text = " " + text + " "

        # Ending quotes.
        text = re.sub(r'"', " '' ", text)
        text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)

        # Split on contractions and clitics.
        if self.tokenization_style == 'cintil':
            text = contr.split_contractions(text, self.contractions,
                                            use_cintil_format=True)
            text = clit.split_clitics(text, self.clitics, self.suffixes,
                                      use_cintil_format=True)
        else:
            text = contr.split_contractions(text, self.contractions,
                                            use_cintil_format=False)
            text = clit.split_clitics(text, self.clitics, self.suffixes,
                                      use_cintil_format=False)

        text = re.sub(" +", " ", text)
        text = text.strip()

        # Add a space at the end to match up with MacIntyre's output (for debugging).
        if text != "":
            text += " "

        return text.split()
github dipanjanS / text-analytics-with-python / Old-First-Edition / source_code / Ch06_Text_Similarity_and_Clustering / normalization.py
def normalize_corpus(corpus, lemmatize=True, 
                     only_text_chars=False,
                     tokenize=False):
    
    normalized_corpus = []    
    for text in corpus:
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if only_text_chars:
            text = keep_text_characters(text)
        
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
            
    return normalized_corpus
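The helpers used above (html_parser, expand_contractions, lemmatize_text, remove_special_characters, remove_stopwords, keep_text_characters, tokenize_text) are defined elsewhere in that repository. As a rough, self-contained sketch of the same pipeline idea, the version below substitutes the contractions package for the expansion step and uses a toy stopword list and regex (both illustrative, not the book's actual choices):

import re
import contractions

STOPWORDS = {'a', 'an', 'am', 'and', 'are', 'i', 'is', 'not', 'the'}  # toy list

def normalize_corpus(corpus, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = contractions.fix(text)          # expand contractions ("I'm" -> "I am")
        text = text.lower()
        text = re.sub(r"[^a-z\s]", " ", text)  # drop special characters
        tokens = [t for t in text.split() if t not in STOPWORDS]
        normalized_corpus.append(tokens if tokenize else " ".join(tokens))
    return normalized_corpus

print(normalize_corpus(["I'm not sure they're ready."]))  # ['sure they ready']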
github dipanjanS / text-analytics-with-python / Old-First-Edition / source_code / Ch07_Semantic_and_Sentiment_Analysis / normalization.py
def normalize_corpus(corpus, lemmatize=True, 
                     only_text_chars=False,
                     tokenize=False):
    
    normalized_corpus = []    
    for index, text in enumerate(corpus):
        text = normalize_accented_characters(text)
        text = html_parser.unescape(text)
        text = strip_html(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if only_text_chars:
            text = keep_text_characters(text)
        
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
            
    return normalized_corpus
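This variant additionally strips HTML and normalizes accented characters before expanding contractions. The book's normalize_accented_characters helper is not shown here; a common way to implement that step (only a sketch, not necessarily the book's implementation) is to decompose the text with unicodedata and drop the combining marks:

import unicodedata

def normalize_accented_characters(text):
    # Decompose accented characters and drop the combining diacritical marks.
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

print(normalize_accented_characters("résumé café"))  # resume cafe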
github dipanjanS / text-analytics-with-python / Old-First-Edition / source_code / Ch04_Text_Classification / normalization.py
def normalize_corpus(corpus, tokenize=False):
    
    normalized_corpus = []    
    for text in corpus:
        text = expand_contractions(text, CONTRACTION_MAP)
        text = lemmatize_text(text)
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
            
    return normalized_corpus
github dipanjanS / text-analytics-with-python / Old-First-Edition / source_code / Ch03_Processing_and_Understanding_Text / normalizer.py
def expand_contractions(sentence, contraction_mapping):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_sentence = contractions_pattern.sub(expand_match, sentence)
    return expanded_sentence
    
expanded_corpus = [expand_contractions(sentence, CONTRACTION_MAP) 
                    for sentence in cleaned_corpus]    
print expanded_corpus
print 

    
# case conversion    
print corpus[0].lower()
print corpus[0].upper()
 
       
# removing stopwords
def remove_stopwords(tokens):
    stopword_list = nltk.corpus.stopwords.words('english')
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens
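A quick, self-contained check of the expand-with-callback pattern used in expand_contractions above, with a toy two-entry map standing in for the book's much larger CONTRACTION_MAP:

import re

CONTRACTION_MAP = {"isn't": "is not", "you're": "you are"}  # toy map for illustration

def expand_contractions(sentence, contraction_mapping):
    pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                         flags=re.IGNORECASE | re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        expanded = contraction_mapping.get(match) or contraction_mapping.get(match.lower())
        return match[0] + expanded[1:]  # keep the case of the original first character
    return pattern.sub(expand_match, sentence)

print(expand_contractions("You're sure this isn't broken?", CONTRACTION_MAP))
# You are sure this is not broken?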
github dipanjanS / text-analytics-with-python / Old-First-Edition / source_code / Ch05_Text_Summarization / normalization.py
def normalize_corpus(corpus, lemmatize=True, tokenize=False):
    
    normalized_corpus = []  
    for text in corpus:
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
            
    return normalized_corpus
github andre-martins / TurboParser / python / tokenizer / english_contractions.py
# -*- coding: utf-8 -*-

import regex
from contractions import Contractions


class EnglishContractions(Contractions):
    def __init__(self):
        # List of contractions adapted from Robert MacIntyre's tokenizer.
        # These were in turn collected from the TreebankWordTokenizer in NLTK.
        self.CONTRACTIONS = [regex.compile(r"([^' ])('[sS]|'[mM]|'[dD]|')\b"),
                             regex.compile(
                                 r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T)\b")]
        self.CONTRACTIONS2 = [regex.compile(r"(?i)\b(can)(not)\b"),
                              regex.compile(r"(?i)\b(d)('ye)\b"),
                              regex.compile(r"(?i)\b(gim)(me)\b"),
                              regex.compile(r"(?i)\b(gon)(na)\b"),
                              regex.compile(r"(?i)\b(got)(ta)\b"),
                              regex.compile(r"(?i)\b(lem)(me)\b"),
                              regex.compile(r"(?i)\b(mor)('n)\b"),
                              regex.compile(r"(?i)\b(wan)(na) ")]
        self.CONTRACTIONS3 = [regex.compile(r"(?i) ('t)(is)\b"),
                              regex.compile(r"(?i) ('t)(was)\b")]
github andre-martins / TurboParser / python / tokenizer / portuguese_contractions.py
# -*- coding: utf-8 -*-

import regex
from contractions import Contractions


class PortugueseContractions(Contractions):
    def __init__(self):
        # If True, mark consonants removed due to enclitics with symbols # and
        # -CL- for mesoclitics.
        self.mark_enclitics = False
        # A blacklist of words that should not be confused with contractions.
        self.non_contractions = {}
        self.contractions = self._generate_contractions()
        self.clitics, self.clitic_suffixes = self._generate_clitics()

    def _generate_contractions(self):
        """
        Generate contractions for Portuguese, along with the words
        and lemmas that are contracted (e.g. the contraction "das" is composed
        of the words "de" + "as", with corresponding lemmas "de" + "o").
        Return a dictionary of contractions, each entry containing a list of
        words and a list of lemmas (typically lists of length two).
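Based on that docstring, an entry in the returned dictionary pairs each contraction with the words it contracts and their lemmas, roughly like the following (the exact structure in TurboParser may differ; this is only an illustration):

contractions = {
    u'das': [[u'de', u'as'],  # the words that were contracted
             [u'de', u'o']],  # their corresponding lemmas
}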
github andre-martins / TurboParser / python / tokenizer / spanish_contractions.py
# -*- coding: utf-8 -*-

import codecs
import os

import regex

from contractions import Contractions


class SpanishContractions(Contractions):
    def __init__(self):
        # A blacklist of words that should not be confused with contractions.
        self.non_contractions = {}  # {u'perla', u'perlas', u'arte', u'parte', \
                                    # u'aparte'}
        # A whitelist of frequent words that regexes are not getting but are
        # contractions.
        self.contractions = {}
        verbs = []  # [u'convencer', u'haber', u'hacer', u'meter', u'vender', \
                    # u'poner', u'tener', u'comer', u'mover', u'atender', \
                    # u'responder', u'devolver', u'dar']
        for verb in verbs:
            for suffix in [u'me', u'te', u'nos', u'os']:
                self.contractions[verb + suffix] = [verb, suffix]

        # Load Spanish verbs and their inflections from a lexicon.
        filepath = os.sep.join([os.path.dirname(os.path.realpath(__file__)),

contractions

Fixes contractions such as `you're` to `you are`.

License: MIT
Latest version published 2 years ago
Package Health Score: 55 / 100