import csv
import logging
import sys
import time

import nltk.data
import pandas as pd

from gensim.models import Word2Vec
from KaggleWord2VecUtility import KaggleWord2VecUtility

if __name__ == '__main__':
    start = time.time()

    # The csv file might contain very large fields, so raise the field size limit to the maximum.
    csv.field_size_limit(sys.maxsize)

    # Read the training data.
    train_word_vector = pd.read_pickle('all.pkl')

    # Use the NLTK punkt tokenizer to split each paragraph into sentences.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = []

    print("Parsing sentences from training set...")

    # Loop over each news article.
    for review in train_word_vector["text"]:
        try:
            # Split a review into parsed sentences.
            sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=True)
        except Exception:
            # Skip articles that cannot be parsed.
            continue

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    num_features = int(sys.argv[1])  # Word vector dimensionality
    min_word_count = 20              # Minimum word count
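# The snippet above collects sentences and hyper-parameters but stops before training.
# A minimal sketch of the step that would typically follow, assuming gensim 4.x keyword
# names (older gensim used size= instead of vector_size=); the function name and output
# filename are illustrative, not from the original script.
from gensim.models import Word2Vec


def train_word2vec(sentences, num_features, min_word_count):
    model = Word2Vec(sentences,
                     vector_size=num_features,  # word vector dimensionality
                     min_count=min_word_count,  # ignore words seen fewer times than this
                     workers=4)                 # parallel worker threads
    model.save('%d_features_%d_minwords.w2v' % (num_features, min_word_count))
    return model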
# nltk.download('stopwords')
from nltk.data import load as LPickle
import sys, os.path as path
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

sys.path.append(path.dirname(path.dirname(path.dirname(path.abspath(__file__)))))

from summarizer.utils.data_helpers import extract_ngrams2, prune_ngrams, untokenize
from summarizer.algorithms.base import Sentence
from _summarizer import Summarizer

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

sent_detector = LPickle('tokenizers/punkt/english.pickle')


class ExtractiveUpperbound(Summarizer):
    def __init__(self, language):
        self.sentences = []
        self.docs = []
        self.models = []
        self.doc_sent_dict = {}
        self.ref_ngrams = []
        self.LANGUAGE = language
        self.stemmer = SnowballStemmer(self.LANGUAGE)
        self.stoplist = set(stopwords.words(self.LANGUAGE))

    def __call__(self, docs, models, length, ngram_type=2):
        self.sum_length = int(length)
        self.load_data(docs, models)
        self.get_ref_ngrams(ngram_type)
import re
import nltk.data


# Method fragment: expects to live on a class that exposes a self.text attribute.
def divide_into_senteces(self, cache=True):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(self.text)
    # Remove the trailing period from each sentence.
    return [re.sub(r'\.$', '', sentence) for sentence in sentences]
import re
import nltk.data


# Note: the re_* patterns used below are precompiled module-level regexes in the
# original source (quote, punctuation, hyphen and whitespace normalisation rules)
# and are not shown in this fragment.
def clean_str(string):
    string = re_quotes_3.sub('"', string)
    string = re.sub('"', '', string)
    string = re_dots.sub('.', string)
    string = re_punctuation.sub(r'\1', string)
    string = re_hiphen.sub(' - ', string)
    string = re_punkts.sub(r'\1 \2 \3', string)
    string = re_punkts_b.sub(r'\1 \2 \3', string)
    string = re_punkts_c.sub(r'\1 \2', string)
    string = re_doublequotes_1.sub('"', string)
    string = re_doublequotes_2.sub("'", string)
    string = re_trim.sub(' ', string)
    return string.strip()


sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')


def clean_and_filter_first_sentences(string, first_sentences=8):
    # Tokenize sentences and drop short or malformed ones.
    sentences = []
    for sent in sent_tokenizer.tokenize(string):
        if sent.count(' ') >= 3 and sent[-1] in ['.', '!', '?', ';']:
            sentences.append(clean_str(sent))
            if len(sentences) == first_sentences:
                break
    return ' '.join(sentences)
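# A quick, self-contained check of the Portuguese punkt model loaded above; the sample
# text is invented and assumes the punkt resource has already been downloaded.
exemplo = "O modelo punkt foi treinado para o português. Ele separa as sentenças corretamente."
print(sent_tokenizer.tokenize(exemplo))
# Expected: a list with two sentence strings.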
#############################################################################################
import pandas as pd


def load_input_csv(path):
    news_df = pd.read_csv(path, encoding='utf-8')
    # Concatenate all available text.
from nltk.data import load  # nltk.data.load, used to unpickle the stored analyzer


def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp

    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        # demo_subjectivity is defined alongside this function in nltk.sentiment.util.
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case.
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text))
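# The function above mirrors nltk.sentiment.util.demo_sent_subjectivity; a quick call to
# the packaged version as a usage sketch (the sentence is invented, and the first run may
# try to train a classifier on the subjectivity corpus, which must be available locally):
from nltk.sentiment.util import demo_sent_subjectivity

demo_sent_subjectivity("The plot was gripping and the acting felt completely genuine.")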
def _set_up_sent_tok(self):
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    # nltk-specific setup
    st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
    try:
        self.sent_tok = nltk.data.load(st_path)
    except LookupError:
        nltk.download('punkt')
        self.sent_tok = nltk.data.load(st_path)
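# The same load-or-download fallback as a self-contained sketch, assuming an NLTK version
# that still ships the pickled punkt model; the sample text is invented.
import nltk

try:
    sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')
except LookupError:
    nltk.download('punkt')
    sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')

print(sent_tok.tokenize("This is one sentence. Here is another."))
# -> ['This is one sentence.', 'Here is another.']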
import nltk.data
from nltk import pos_tag
from nltk.tokenize import RegexpTokenizer


def data_cleaning(data):
    print('---data_cleaning start...')
    # Word tokenization: keep alphabetic tokens only.
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    data["words"] = data["essay"].apply(tokenizer.tokenize)
    # Sentence segmentation.
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    data['sents'] = data["essay"].apply(sent_detector.tokenize)
    # Character count: total number of letters across all tokens.
    data['character_count'] = data['words'].apply(lambda x: len(''.join(x)))
    # Part-of-speech tags for the tokens.
    data['tags'] = data['words'].apply(pos_tag)
    print('---data_cleaning end...')
    return data
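# Illustrative usage with a tiny made-up DataFrame; pos_tag additionally needs the
# averaged_perceptron_tagger resource to be downloaded.
import pandas as pd

essays = pd.DataFrame({"essay": ["NLTK makes tokenization easy. It also ships a POS tagger."]})
cleaned = data_cleaning(essays)
print(cleaned[['character_count', 'sents']])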
import nltk.data
from nltk.tokenize import word_tokenize


def preprocess(text):
    """
    Preprocess text for the encoder: sentence-split each document, word-tokenize
    every sentence, and rejoin the tokens with single spaces.
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
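# Quick check with an invented two-document batch: each document comes back as a single
# string of space-separated tokens.
docs = ["Dr. Smith isn't here. Call back later!", "One sentence only."]
print(preprocess(docs))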
import nltk.data


# Classmethod fragment (note the cls parameter). batch_tag/batch_parse are the NLTK 2.x
# names; NLTK 3 renamed them to tag_sents/parse_sents.
def tag_nes(cls, tokenized_sents):
    tagger_url = 'nltk:taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = nltk.data.load(tagger_url)
    tagged = tagger.batch_tag(tokenized_sents)

    ne_chunker_url = 'nltk:chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
    ne_chunker = nltk.data.load(ne_chunker_url)
    nes = ne_chunker.batch_parse(tagged)
    return nes
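# A rough modern equivalent under NLTK 3, using the bundled default tagger and NE chunker
# instead of loading the pickles by URL; the function name is illustrative, and the
# required resources (averaged_perceptron_tagger, maxent_ne_chunker, words) may vary by
# NLTK version.
import nltk


def tag_nes_nltk3(tokenized_sents):
    tagged = nltk.pos_tag_sents(tokenized_sents)
    return list(nltk.ne_chunk_sents(tagged))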
import re
import nltk.data

FINNISH = re.compile(r'\b(ja|joka|oli|kuin|jossa|jotka|jonka)\b')
SWEDISH = re.compile(r'\b(och|med|som|att|den|det|eller|av)\b')
ENGLISH = re.compile(r'\b(and|of|for|at|the)\b')


def is_in_language(targetlang, text):
    # Quick and dirty regex shortcuts for detecting the most common languages.
    if FINNISH.search(text) is not None:
        return targetlang == 'fi'
    if SWEDISH.search(text) is not None:
        return targetlang == 'sv'
    if ENGLISH.search(text) is not None:
        return targetlang == 'en'
    # No cue matched: assume it's the right language.
    return True


sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def split_to_sentences(text, targetlang):
    sentences = []
    for sentence in sentence_tokenizer.tokenize(text):
        if not is_in_language(targetlang, sentence):
            continue
        sentences.append(sentence)
    return sentences
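# Illustrative run with invented text: the Swedish sentence is dropped when the target
# language is English.
mixed = "The cat sat on the mat. Och det var allt som hände."
print(split_to_sentences(mixed, 'en'))
# -> ['The cat sat on the mat.']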
import functools

from elasticsearch import Elasticsearch


@functools.lru_cache(maxsize=100000)
def search(text, proj, cutoff_frequency):
    es = Elasticsearch()
    query = {
        'query': {
            'function_score': {
                'query': {