import nltk

def keywordize(text):
    """
    Splits a string into words, removes common stopwords, stems, and removes
    duplicates.
    """
    sents = nltk.tokenize.sent_tokenize(text)
    words = []
    for sent in sents:
        words.extend(nltk.tokenize.word_tokenize(sent))
    stemmer = nltk.stem.porter.PorterStemmer()
    stop_words = set(nltk.corpus.stopwords.words())
    words = [stemmer.stem(word.lower()) for word in words
             if (word.isalpha() or word.isdigit())
             and word.lower() not in stop_words]
    return set(words)
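
# Illustrative usage (assumes the 'punkt' and 'stopwords' NLTK data packages
# have already been downloaded):
print(keywordize("The cats were chasing the dogs."))   # {'cat', 'chase', 'dog'}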
from esnlm.readouts import *
import random
print "... loading text"
#with open('./../datasets/t5_train') as f:
# text_train =(' '.join(pickle.load(f))).split(' . ')
# random.shuffle(text_train)
# text_train = (' . '.join(text_train)).split(' ')
#
#with open('./../datasets/t5_test') as f:
# text_test =(' '.join(pickle.load(f))).split(' . ')
# random.shuffle(text_test)
# text_test = (' . '.join(text_test)).split(' ')
import nltk
text_train = list(nltk.corpus.gutenberg.words('austen-emma.txt'))
text_test = text_train
vocabulary = list(set(text_train))
### Transform text into labels
utrain = [vocabulary.index(w) for w in text_train[:-1]]
ytrain = [vocabulary.index(w) for w in text_train[1:]]
utest = [vocabulary.index(w) for w in text_test[:-1]]
ytest = [vocabulary.index(w) for w in text_test[1:]]
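
# Aside (sketch): list.index() is O(|V|) per lookup, so building a dict once
# produces the same integer labels much faster on larger corpora (and the same
# applies to utest/ytest).
word_to_idx = {w: i for i, w in enumerate(vocabulary)}
utrain = [word_to_idx[w] for w in text_train[:-1]]
ytrain = [word_to_idx[w] for w in text_train[1:]]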
print "... building model"
### Hyperparameters
input_dim = output_dim = len(vocabulary)
features_dim, reservoir_dim = 2, 25
spectral_radius = 0.97
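
# Aside (sketch, not part of esnlm): the spectral radius is the usual knob for
# echo state network stability; a random reservoir matrix is typically rescaled
# to it like this, using plain numpy.
import numpy as np
W = np.random.uniform(-1, 1, (reservoir_dim, reservoir_dim))
W *= spectral_radius / max(abs(np.linalg.eigvals(W)))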
# This module has been extended by sbadecker to support lemmatization using
# WordNetLemmatizer from NLTK.
from __future__ import absolute_import
from __future__ import print_function
import re
import operator
import six
from six.moves import range
from nltk.stem import WordNetLemmatizer
import nltk
try:
    _ = nltk.corpus.wordnet
except Exception:
    nltk.download('wordnet')

def is_number(s):
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False
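
# Quick sanity check of the helper (scientific notation is rejected because it
# is routed through int()):
assert is_number("3.14") and is_number("42") and not is_number("1e5")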
def load_stopwords(stopword_file):
    '''
    Utility function to load stop words from a file and return as a list of
    words.
    '''
    # Minimal implementation sketch: assumes one stop word per line.
    with open(stopword_file) as f:
        return [line.strip() for line in f if line.strip()]
def score_keyphrases_by_textrank(text, n_keywords=0.05):
    from itertools import takewhile, tee
    import networkx, nltk
    stop_words = set(nltk.corpus.stopwords.words('turkish'))
    # tokenize for all words, and extract *candidate* words
    words = [word.lower()
             for sent in nltk.sent_tokenize(text)
             for word in nltk.word_tokenize(removePunc(sent, ' ').strip())
             if len(word) > 2 and word.lower() not in stop_words]
    candidates = extract_candidate_words(text)
    # build graph, each node is a unique candidate
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))
    # iterate over word-pairs, add unweighted edges into graph
    def pairwise(iterable):
        """s -> (s0, s1), (s1, s2), (s2, s3), ..."""
        a, b = tee(iterable)
        next(b, None)
        return zip(a, b)
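
    # Possible continuation (sketch, not the original code): connect adjacent
    # candidate words and rank nodes with PageRank; n_keywords < 1 is treated
    # as a fraction of the candidates.
    for w1, w2 in pairwise(words):
        if w1 in graph and w2 in graph:
            graph.add_edge(w1, w2)
    ranks = networkx.pagerank(graph)
    if 0 < n_keywords < 1:
        n_keywords = int(round(len(candidates) * n_keywords))
    return sorted(ranks.items(), key=lambda kv: kv[1], reverse=True)[:n_keywords]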
def compare_stemming_to_lemmatization():
    # load each of the corpora
    abc_words = nltk.corpus.abc.words()
    genesis_words = nltk.corpus.genesis.words()
    gutenberg_words = nltk.corpus.gutenberg.words()
    inaugural_words = nltk.corpus.inaugural.words()
    state_union_words = nltk.corpus.state_union.words()
    webtext_words = nltk.corpus.webtext.words()
    all_words = [abc_words, genesis_words, gutenberg_words, inaugural_words,
                 state_union_words, webtext_words]
    corpora_names = ["ABC", "Genesis", "Gutenberg", "Inaugural",
                     "Union", "Web"]
    word_counts = []
    lemmatized_counts = []
    stemmed_counts = []
    # iterate through each corpus and generate counts of the unique tokens
    # in each
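    # Assumed continuation (sketch): count unique surface forms, lemmas and
    # stems per corpus with NLTK's standard WordNetLemmatizer / PorterStemmer.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()
    for corpus_words in all_words:
        unique_words = set(w.lower() for w in corpus_words if w.isalpha())
        word_counts.append(len(unique_words))
        lemmatized_counts.append(len({lemmatizer.lemmatize(w) for w in unique_words}))
        stemmed_counts.append(len({stemmer.stem(w) for w in unique_words}))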
# This script creates an overrides file that allows the system to overcome issues with
# the way Spacy lemmatizes words and invalid data in the AGID.
# The created file is a mapping from lemma/tag to the "best" inflection. Note that
# this only overrides methods where the treebank tag is used, not ones where the
# simplified AGID tag (V, N or A) is used.
# Note that if the AGID version is changed, this script should be re-run. Additionally,
# if Spacy changes their lemmatizer, or if a different lemmatizer is used, consider
# re-running this script.
import logging
import nltk
import spacy
import lemminflect

if __name__ == '__main__':
    level = logging.WARNING
    format = '[%(levelname)s %(filename)s ln=%(lineno)s] %(message)s'
    logging.basicConfig(level=level, format=format)

    # Configuration
    #corp_fns = ['austen-emma.txt']              # 7,491 sentences
    corp_fns = nltk.corpus.gutenberg.fileids()   # 18 files with 94K sentences
    max_chars = int(1e9)
    req_count = 4    # require at least this many instances in the corpus for an override
    lemminflect.setUseInternalLemmatizer(True)   # use lemminflect or spaCy's lemmatizer
    inflect_oov = True                           # test/inflect out-of-vocab words
    multiples_fn = 'CorpMultiInfls.txt'

    # Load Spacy
    print('Loading Spacy model')
    nlp = spacy.load('en_core_web_sm')
    print('Using spaCy version ', spacy.__version__)

    # Load the corpus to test with (loadNLTKCorpus is a helper from the
    # original project, not shown here)
    print('Loading corpus')
    sents = []
    for corp_fn in corp_fns:
        sents += loadNLTKCorpus(corp_fn, max_chars)
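
    # Aside (illustrative, not part of the original script): lemminflect's
    # public helpers can be used to spot-check individual forms.
    from lemminflect import getLemma, getInflection
    print(getLemma('watches', upos='VERB'))    # ('watch',)
    print(getInflection('watch', tag='VBD'))   # ('watched',)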
import codecs
import nltk

def a_or_an_words(paragraphs):
    # `filename` and `header` are assumed to be defined at module level in the
    # original script.
    with codecs.open(filename + '_a.html', 'w', 'latin1') as f:
        f.write(header % dict(title='a or an'))
        f.write("""<h1>a or an?</h1>
The rule is that <em>a</em> is used before a word starting with a consonant
sound (a house, a unicorn), and <em>an</em> before a vowel sound (an ox, an
hour). Here we check whether the word that follows starts with a vowel or a
consonant sound.
<hr>
<style type="text/css">
.evaluation{font-family: monospace; color: gray;}
</style>
""")
        from collections import defaultdict
        firstsyll = defaultdict(list)
        for word, syl in nltk.corpus.cmudict.entries():
            firstsyll[word].append(syl[0])
        nfound = 0
        nwrong = 0
        f.write("<ul>\n")
        for para in paragraphs:
            for txt, tags, entities in para:
                for i, (word, _wordtype) in enumerate(tags):
                    if word not in ('a', 'an'):
                        continue
                    expect_vowel = word == 'an'
                    if i + 1 == len(tags):
                        # no word after a/an.
                        continue
                    nextword, _wordtype2 = tags[i + 1]
                    if nextword.isupper():
                        # assumed: skip all-caps acronyms, which are read
                        # letter by letter
                        continue
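
# Sketch of the check this function builds toward: in the CMU pronouncing
# dictionary, vowel phones end in a stress digit (e.g. 'AH0'), so the first
# phone of a pronunciation tells us whether "an" is expected.
def starts_with_vowel_sound(word, firstsyll):
    phones = firstsyll.get(word.lower())
    if not phones:
        return None   # unknown word; a spelling heuristic would be needed
    return any(phone[-1].isdigit() for phone in phones)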
import nltk
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def process(words):
    word_dict = {}
    result = ""
    real_words = set(nltk.corpus.words.words())
    for w in words:
        if w not in ENGLISH_STOP_WORDS and w in real_words and len(w) > 1:
            # print(w)
            result = result + w + " "
            if w in word_dict:
                word_dict[w] += 1
            else:
                word_dict[w] = 1
    sorted_word_dict = sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True)
    res = {}
    with open("t.out", "w") as f:
        tmp = 0
        for items in sorted_word_dict:
            if int(items[1]) < 50:
                # if tmp>200:
                break
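            # Assumed continuation (sketch): persist the remaining frequent
            # words and their counts.
            f.write("%s %d\n" % (items[0], items[1]))
            tmp += 1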
import pickle
import nltk

def trained_tagger(existing=False):
    """Returns a trained trigram tagger

    existing : set to True if already trained tagger has been pickled
    """
    if existing:
        trigram_tagger = pickle.load(open('trained_tagger.pkl', 'rb'))
        return trigram_tagger

    # Aggregate trained sentences for N-Gram Taggers
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()

    # Create instance of SubjectTrigramTagger and persist instance of it
    trigram_tagger = SubjectTrigramTagger(train_sents)
    pickle.dump(trigram_tagger, open('trained_tagger.pkl', 'wb'))
    return trigram_tagger
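
# Illustrative usage (assumes the Brown, CoNLL-2000 and Treebank corpora are
# installed and that SubjectTrigramTagger exposes NLTK's standard tag() API):
#   tagger = trained_tagger()
#   tagger.tag(nltk.word_tokenize("The quick brown fox jumps over the lazy dog."))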
# Tail of a quoted-out alternative that iterated over a.links instead of
# a.posts (the start of that block is not shown in this snippet):
#   synopses.append(a.links[k].raw_text(include_content=True))
for k in a.posts:
    titles.append(a.posts[k].message[0:80])
    links.append(k)
    synopses.append(a.posts[k].raw_text())

print(str(len(titles)) + ' titles')
print(str(len(links)) + ' links')
print(str(len(synopses)) + ' synopses')

ranks = []
for i in range(0, len(titles)):
    ranks.append(i)

# load nltk's English stopwords as a variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')

# load nltk's SnowballStemmer as a variable called 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

totalvocab_stemmed = []
totalvocab_tokenized = []
for i in synopses:
    allwords_stemmed = tokenize_and_stem(i)
    totalvocab_stemmed.extend(allwords_stemmed)
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)

from sklearn.feature_extraction.text import TfidfVectorizer
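
# Likely next step (sketch, following the usual TF-IDF clustering recipe this
# snippet resembles): vectorize the synopses with the tokenizer/stemmer defined
# above. Parameter values here are illustrative.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=2,
                                   stop_words='english',
                                   tokenizer=tokenize_and_stem)
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
print(tfidf_matrix.shape)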