def tag(self, corpus, tokenize=True):
    '''Tags a string `corpus`.'''
    # Assume untokenized corpus has \n between sentences and ' ' between words
    s_split = nltk.sent_tokenize if tokenize else lambda t: t.split('\n')
    w_split = nltk.word_tokenize if tokenize else lambda s: s.split()

    def split_sents(corpus):
        for s in s_split(corpus):
            yield w_split(s)

    prev, prev2 = self.START
    tokens = []
    for words in split_sents(corpus):
        context = self.START + [self._normalize(w) for w in words] + self.END
        for i, word in enumerate(words):
            # Unambiguous words come straight from the tag dictionary;
            # everything else goes through the perceptron model.
            tag = self.tagdict.get(word)
            if not tag:
                features = self._get_features(i, word, context, prev, prev2)
                tag = self.model.predict(features)
            tokens.append((word, tag))
            prev2 = prev
            prev = tag
    return tokens
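This `tag` method closely mirrors NLTK's averaged perceptron tagger; NLTK's own class takes a pre-tokenized list rather than a raw string. A minimal usage sketch against NLTK's pretrained model (requires the 'averaged_perceptron_tagger' download):

from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger()  # loads NLTK's pretrained English model
tokens = ['Simple', 'is', 'better', 'than', 'complex', '.']
print(tagger.tag(tokens))
# e.g. [('Simple', 'JJ'), ('is', 'VBZ'), ('better', 'JJR'), ('than', 'IN'), ('complex', 'JJ'), ('.', '.')]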
def main():
    for each in REQUIRED_CORPORA:
        print('Downloading "{0}"'.format(each))
        nltk.download(each)
    print("Finished.")
def extract(self, text):
    '''Return a list of noun phrases (strings) for a body of text.'''
    sentences = nltk.tokenize.sent_tokenize(text)
    noun_phrases = []
    for sentence in sentences:
        parsed = self._parse_sentence(sentence)
        # Keep each parsed subtree that is a noun-phrase tree, has at least
        # one significant token, and matches the extractor's grammar.
        phrases = [_normalize_tags(filter_insignificant(each, self.INSIGNIFICANT_SUFFIXES))
                   for each in parsed
                   if isinstance(each, nltk.tree.Tree)
                   and each.label() == 'NP'
                   and len(filter_insignificant(each)) >= 1
                   and _is_match(each, cfg=self.CFG)]
        nps = [tree2str(phrase) for phrase in phrases]
        noun_phrases.extend(nps)
    return noun_phrases
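Assuming this `extract` is TextBlob's `ConllExtractor` (which has exactly this shape), usage is a one-liner; the printed output below is illustrative, not exact:

from textblob.np_extractors import ConllExtractor

extractor = ConllExtractor()
print(extractor.extract('Python is a high-level programming language.'))
# e.g. ['python', 'high-level programming language']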
@requires_nltk_corpus
def train(self):
    '''Train the Chunker on the ConLL-2000 corpus.'''
    # Train on (POS tag, chunk tag) pairs only; the words themselves are ignored.
    train_data = [[(t, c) for _, t, c in nltk.chunk.tree2conlltags(sent)]
                  for sent in nltk.corpus.conll2000.chunked_sents(
                      'train.txt', chunk_types=['NP'])]
    unigram_tagger = nltk.UnigramTagger(train_data)
    self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
    self._trained = True

def parse(self, sentence):
    '''Return the parse tree for the sentence.'''
    if not self._trained:
        self.train()
    # Chunk-tag the POS sequence, then rebuild (word, pos, chunk) triples
    # and convert them back into a tree.
    pos_tags = [pos for (word, pos) in sentence]
    tagged_pos_tags = self.tagger.tag(pos_tags)
    chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
    conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in
                 zip(sentence, chunktags)]
    return nltk.chunk.util.conlltags2tree(conlltags)
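A usage sketch: `parse` expects an already POS-tagged sentence. `ChunkParser` here stands for the class owning the two methods above (TextBlob's class of that name has this shape), so treat it as a hypothetical:

chunker = ChunkParser()  # hypothetical: the class defining train()/parse() above
tagged = [('The', 'DT'), ('little', 'JJ'), ('dog', 'NN'), ('barked', 'VBD')]
tree = chunker.parse(tagged)  # first call triggers train()
print(tree)
# e.g. (S (NP The/DT little/JJ dog/NN) barked/VBD)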
@requires_nltk_corpus
def train(self):
    '''Train the tagger on the Brown news corpus, with a regexp fallback.'''
    train_data = nltk.corpus.brown.tagged_sents(categories='news')
    regexp_tagger = nltk.RegexpTagger([
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers (decimal point escaped)
        (r'(-|:|;)$', ':'),
        (r'\'*$', 'MD'),
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'^[A-Z].*$', 'NNP'),             # capitalized words as proper nouns
        (r'.*ness$', 'NN'),
        (r'.*ly$', 'RB'),                  # adverbs
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # past-tense verbs
        (r'.*', 'NN'),                     # default: tag everything else as a noun
    ])
    unigram_tagger = nltk.UnigramTagger(train_data, backoff=regexp_tagger)
    self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
    self._trained = True
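The backoff chain resolves bigram → unigram → regexp, so any context none of the trained taggers has seen falls through to the final `.*` → 'NN' rule. The same idea, self-contained (the small training slice is just to keep the sketch fast):

import nltk

train_data = nltk.corpus.brown.tagged_sents(categories='news')[:500]
regexp = nltk.RegexpTagger([(r'.*ing$', 'VBG'), (r'.*', 'NN')])
unigram = nltk.UnigramTagger(train_data, backoff=regexp)
bigram = nltk.BigramTagger(train_data, backoff=unigram)
print(bigram.tag(['the', 'dog', 'is', 'running']))
# e.g. [('the', 'AT'), ('dog', 'NN'), ('is', 'BEZ'), ('running', 'VBG')]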