def get_sentences(page_title):
    # Fetch the rendered wiki page, strip the markup, and filter sentences by their POS tags.
    all_sents = []
    txt = wikipydia.query_text_rendered(page_title)
    parse = BeautifulSoup(txt['html'])
    justtext = parse.get_text()
    #justtext = justtext.encode('utf-8')
    tok = nltk.tokenize.PunktSentenceTokenizer()
    sents0 = tok.tokenize(justtext)
    chunker = TagChunker(treebank_chunker())
    i = 0
    for s0 in sents0:
        i += 1
        sents = s0.split('\n')
        for s in sents:
            verbfound = False
            nounfound = False
            ss = s.split()
            if len(ss) > 0:
                tree = chunker.parse(nltk.pos_tag(ss))
                for tag in [p[1] for p in tree.leaves()]:
                    if tag[0] == 'V':
                        verbfound = True
                        break
                # assumed continuation: also require a noun, then keep the sentence
                for tag in [p[1] for p in tree.leaves()]:
                    if tag[0] == 'N':
                        nounfound = True
                        break
                if verbfound and nounfound:
                    all_sents.append(s)
    return all_sents
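wikipydia, TagChunker and treebank_chunker above come from outside the standard NLTK distribution. The same verb-filtering idea can be sketched with stock NLTK alone; the function below is a minimal, assumed-equivalent version that uses nltk.pos_tag directly instead of a chunker.

import nltk
# requires the 'punkt' and 'averaged_perceptron_tagger' NLTK data packages

def sentences_with_verbs(text):
    """Keep only sentences whose POS tags include at least one verb."""
    kept = []
    for sent in nltk.tokenize.sent_tokenize(text):
        tags = [tag for _, tag in nltk.pos_tag(sent.split())]
        if any(tag.startswith('VB') for tag in tags):
            kept.append(sent)
    return kept

# e.g. sentences_with_verbs("A title without much grammar. The dog barked loudly.")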
def candidate_words(self, stripped_input):
    # Tokenize the input and stem each token with NLTK's ISRI (Arabic) stemmer.
    import nltk
    s = nltk.stem.isri.ISRIStemmer()
    words = []
    for word in nltk.tokenize.wordpunct_tokenize(stripped_input):
        words.append(s.stem(word))
    return words
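For reference, ISRIStemmer is NLTK's Arabic stemmer. A minimal standalone use of the same tokenize-then-stem pattern, with a made-up sample string, might look like this:

import nltk
from nltk.stem.isri import ISRIStemmer

stemmer = ISRIStemmer()
text = u'يكتبون الدروس'   # sample Arabic input for the sketch
stems = [stemmer.stem(w) for w in nltk.tokenize.wordpunct_tokenize(text)]
print(stems)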
def raw(files='english-kjv'):
    """
    @param files: One or more genesis corpus files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{string} (whitespace-delimited tokens)
    """
    # Just one file to process?  If so, convert to a tuple so we can iterate.
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "genesis", file + ".txt")
        f = open_corpus(path)
        for t in tokenize.whitespace(f.read()):
            yield t
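This snippet uses an older NLTK corpus-reading style (get_basedir, open_corpus, tokenize.whitespace). In current NLTK releases the same King James Genesis text is exposed through nltk.corpus; a rough modern equivalent (tokenization may differ slightly):

import nltk
# nltk.download('genesis')                 # fetch the corpus once, if needed
from nltk.corpus import genesis

tokens = genesis.words('english-kjv.txt')  # lazily loaded word tokens
print(len(tokens), tokens[:10])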
def supa_bot_fire(text, screen_name):  # I parse that
    message = ""
    if any(banned_string in text for banned_string in banned_strings):
        return ""
    else:
        try:
            # Work on the first sentence only, then POS-tag it.
            text = sentence_detector.tokenize(text.strip())[0]
            tag_list = nltk.pos_tag(nltk.tokenize.word_tokenize(text))
            # Keep tweets that start with a personal pronoun followed by an allowed verb form.
            if tag_list[0][1] == 'PRP' \
                    and tag_list[1][1] in verb_forms \
                    and not tag_list[0][0].lower() in banned_pronouns \
                    and not tag_list[1][0].lower() in banned_verbs \
                    and not tag_list[2][0] in ["n't", "ta"] \
                    and not any("CC" == tag[1] for tag in tag_list):
                # Rebuild the message from the remaining tokens, respecting spacing rules.
                for tag in tag_list[2:-1]:
                    if any(string == tag[0] for string in no_pre_space):
                        message = message.strip() + tag[0] + " "
                    elif any(string == tag[0] for string in no_post_space):
                        message += tag[0]
                    else:
                        message += tag[0] + " "
                if tag_list[-1][0] not in [',', '.', '!', '?']:
                    message += tag_list[-1][0]
                message = message.strip()
        except IndexError:
            # assumed handler: very short tweets make the index lookups above fail
            return ""
    return message
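The bot's filter hinges on the first two POS tags: a personal pronoun followed by a verb form. A minimal sketch of just that check, with a placeholder verb_forms set (the original defines its own), using nltk.pos_tag directly:

import nltk

verb_forms = {'VB', 'VBD', 'VBP', 'VBZ'}   # assumed set for this sketch

def starts_with_pronoun_verb(sentence):
    tags = nltk.pos_tag(nltk.tokenize.word_tokenize(sentence))
    return len(tags) >= 2 and tags[0][1] == 'PRP' and tags[1][1] in verb_forms

# e.g. starts_with_pronoun_verb("I love tokenizers")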
"""
output_file = os.path.join(output_dir, "Output Anew Sentiment " + os.path.basename(input_file).rstrip('.txt') + ".csv")
# read file into string
with open(input_file, 'r') as myfile:
fulltext = myfile.read()
# end method if file is empty
if len(fulltext) < 1:
print('Empty file.')
return
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
# otherwise, split into sentences
sentences = tokenize.sent_tokenize(fulltext)
i = 1 # to store sentence index
# check each word in sentence for sentiment and write to output_file
with open(output_file, 'w', newline='') as csvfile:
fieldnames = ['Sentence ID', 'Sentence', 'Sentiment', 'Sentiment Label', 'Arousal', 'Dominance',
'# Words Found', 'Found Words', 'All Words']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
# analyze each sentence for sentiment
for s in sentences:
# print("S" + str(i) +": " + s)
all_words = []
found_words = []
total_words = 0
v_list = [] # holds valence scores
a_list = [] # holds arousal scores
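The lemmatizer above is instantiated presumably to match sentence words against the ANEW lexicon's base forms. As a small aside, WordNetLemmatizer defaults to noun lemmatization and needs a part-of-speech hint for verbs:

from nltk.stem.wordnet import WordNetLemmatizer
# requires the 'wordnet' NLTK data package

lmtzr = WordNetLemmatizer()
print(lmtzr.lemmatize('dogs'))               # noun lemmatization is the default
print(lmtzr.lemmatize('running', pos='v'))   # pos='v' reduces verbs to their base form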
def tokenize_norwegian_article(text, first_sentences=12, max_words_length=1000):
    # Replace pipes with full stops so sentence tokenization does not break on them.
    text = text.replace('|', '.')
    words_tokenized = []
    sent_count = 0
    for sentence in nltk.tokenize.sent_tokenize(text, language='norwegian'):
        sent_tokenized = nltk.tokenize.word_tokenize(sentence, language='norwegian')
        # keep only well-formed sentences, skipping the "Saken oppdateres." boilerplate
        # ("The story is being updated.")
        if len(sent_tokenized) >= 3 and sent_tokenized[-1] in ['.', '!', '?', ';'] and \
                sent_tokenized != ['Saken', 'oppdateres', '.']:
            sent_count += 1
            words_tokenized.extend(sent_tokenized)
            if sent_count == first_sentences:
                break
    return words_tokenized[:max_words_length]
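sent_tokenize and word_tokenize both accept a language argument backed by the pretrained Punkt models shipped with NLTK, which include a Norwegian model. A minimal standalone check:

import nltk
# nltk.download('punkt')

text = "Dette er første setning. Dette er andre setning!"
for sentence in nltk.tokenize.sent_tokenize(text, language='norwegian'):
    print(nltk.tokenize.word_tokenize(sentence, language='norwegian'))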
if not os.path.exists(PUNKT_PICKLEFN):
    # assumed condition: train a new Punkt tokenizer only when no pickled one exists yet
    punkt_trainer = nltk.tokenize.punkt.PunktTrainer()
    punkt_count = 0
    # crawl the corpus, feeding each text chunk to the trainer via the train_punkt callback
    parole_crawl(parole, train_punkt)

    print()
    print("Finalizing training...")
    punkt_trainer.finalize_training(verbose=True)
    print("Training done. %d text segments." % punkt_count)
    print()

    params = punkt_trainer.get_params()
    # print("Params: %s" % repr(params))
    tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(params)
    with open(PUNKT_PICKLEFN, mode='wb') as f:
        pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)
    print('%s written.' % PUNKT_PICKLEFN)
else:
    print("Loading %s ..." % PUNKT_PICKLEFN)
    with open(PUNKT_PICKLEFN, mode='rb') as f:
        tokenizer = pickle.load(f)
    print("Loading %s ... done." % PUNKT_PICKLEFN)

with codecs.open(SENTENCEFN, 'w', 'utf8') as outf:
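Stripped of the project-specific crawling (parole_crawl, train_punkt) and file names, the train-then-pickle pattern above can be sketched with PunktTrainer alone; the text and output file name here are only for the sketch:

import pickle
import nltk.tokenize.punkt

trainer = nltk.tokenize.punkt.PunktTrainer()
trainer.train(u"Dr. Smith arrived at 5 p.m. He was late.", finalize=False)  # call once per text chunk
trainer.finalize_training(verbose=True)

tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(trainer.get_params())
with open('punkt_tokenizer.pickle', 'wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)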
    Returns
    -------
    :class:`.StructuredFeature`
        A :class:`.StructuredFeature` that contains sentence context.
    """
    with codecs.open(fpath, 'rb') as f:   # read raw bytes so chardet can sniff the encoding
        document = f.read()
    encoding = chardet.detect(document)['encoding']
    document = document.decode(encoding)

    tokens = []
    sentences = []
    i = 0
    for sentence in nltk.tokenize.sent_tokenize(document):
        sentences.append(i)   # token index at which this sentence starts
        for word in nltk.tokenize.word_tokenize(sentence):
            tokens.append(word)
            i += 1
    contexts = [('sentence', sentences)]
    return StructuredFeature(tokens, contexts)
def raw(files=items):
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "inaugural", file + ".txt")
        f = open_corpus(path)
        text = f.read()
        for t in tokenize.wordpunct(text):
            yield t
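As with the genesis reader above, this is the legacy corpus API. In current NLTK the inaugural addresses are available directly from nltk.corpus:

import nltk
# nltk.download('inaugural')
from nltk.corpus import inaugural

print(inaugural.fileids()[:3])   # earliest addresses, e.g. '1789-Washington.txt'
print(inaugural.words()[:10])    # word tokens across the whole corpus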
if (i + 1) % 5000 == 0:
    # every 5000 samples, write the current batch to disk and start a new one
    print(i)
    pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb'))
    j += 1
    samples = []
# write out the final, partially filled batch
pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb'))
# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb'))
# print(samples[0])

for fn in os.listdir(in_dirname):
    print(fn)
    precessed = []
    for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')):
        # split each review into sentences, then tokenize every sentence into words
        tokens = []
        sents = nltk.tokenize.sent_tokenize(text)
        for s in sents:
            tokens.append(tokenizer.tokenize(s))
        precessed.append((stars, tokens))
        # print(tokens)
        if len(precessed) % 100 == 0:
            print(len(precessed))
    pickle.dump(precessed, open(os.path.join(out_dirname, fn), 'wb'))
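The per-review loop assumes a word-level tokenizer bound earlier in the script. A self-contained version of the sentence-then-word tokenization it performs might look like this (TreebankWordTokenizer is only one plausible choice for that tokenizer):

import nltk
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

def tokenize_review(text):
    """Return a list of sentences, each a list of word tokens."""
    return [tokenizer.tokenize(s) for s in nltk.tokenize.sent_tokenize(text)]

# e.g. tokenize_review("Great food. Terrible service!")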