import re

from nltk.corpus import stopwords


def preprocess(text):
    # punctuations, hoax_stopwords, lemmatizer and dictionary are expected to be
    # defined at module level (see the setup sketch below).
    # Normalise encoding artefacts and strip quote/apostrophe noise.
    text = (text.encode('utf-8').decode("ascii", "replace")
            .replace(u"\ufffd", "_").replace("___", "'").replace("'s", " ")
            .replace("``", " ").replace("''", " ").replace("_", " ")
            .replace("'", " ").replace("`", " "))
    text = re.sub("[^0-9a-zA-Z !\"/:;<=>?.,!@#$%^&-_|()']+", " ", text)
    tokens = text.split(" ")
    result = ""
    for token in tokens:
        word = token.split(" ")[0]
        if word not in stopwords.words('english') and token not in punctuations and token not in hoax_stopwords:
            if len(word) > 0:
                if word.isupper() and dictionary.check(word.lower()):
                    # All-caps word found in the dictionary: lowercase and lemmatise
                    # (noun form first, then verb form if nothing changed).
                    new_token = lemmatizer.lemmatize(token.lower())
                    if new_token == token.lower():
                        new_token = lemmatizer.lemmatize(token.lower(), pos='v')
                    result += new_token + " "
                elif word.isupper():
                    # All-caps word not in the dictionary (acronym, name): keep it title-cased.
                    result += token.title() + " "
                elif dictionary.check(word.lower()):
                    new_token = lemmatizer.lemmatize(token.lower())
                    if new_token == token.lower():
                        new_token = lemmatizer.lemmatize(token.lower(), pos='v')
                    result += new_token + " "
                else:
                    result += token + " "
        else:
            # Stopwords, punctuation and domain-specific stopwords are skipped.
            continue
    return result
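# The function above leans on module-level objects that the snippet does not
# define (lemmatizer, dictionary, punctuations, hoax_stopwords). A minimal
# setup sketch, assuming NLTK's WordNetLemmatizer and a pyenchant dictionary;
# the original project may wire these up differently, and hoax_stopwords is
# project-specific (left empty here).
import string

import enchant
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
dictionary = enchant.Dict("en_US")      # spell-check dictionary used by dictionary.check()
punctuations = set(string.punctuation)  # single-character punctuation tokens to drop
hoax_stopwords = set()                  # placeholder for the domain-specific stopword list

print(preprocess("The CATS were running quickly towards NASA headquarters"))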
def create_dic(self, documents):
    # Tokenise, lowercase and drop English stopwords.
    texts = [[word for word in document.lower().split() if word not in stopwords.words('english')]
             for document in documents]
    # Count token frequencies across the whole corpus.
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    # Keep only tokens that occur more than once.
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    # Build a gensim dictionary and the matching bag-of-words corpus.
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return [dictionary, corpus]
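# create_dic assumes gensim's corpora is imported at module level and only
# takes self for its signature. A small usage sketch (self is unused, so None
# can stand in for an instance):
from gensim import corpora
from nltk.corpus import stopwords

docs = [
    "human machine interface for lab computer applications",
    "a survey of user opinion of computer system response time",
    "the generation of random binary unordered trees",
]
dic, bow_corpus = create_dic(None, docs)
print(dic.token2id)  # token -> integer id (only tokens seen more than once survive)
print(bow_corpus)    # bag-of-words: list of (token id, count) pairs per document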
def create_word_features(self, words):
    # Flatten the input lines into individual words.
    w = []
    for line in words:
        for wrd in line.split():
            w.append(wrd)
    # Drop English stopwords and re-join the remainder into one string.
    useful_words = [word for word in w if word not in stopwords.words('english')]
    my_dict = ' '.join(useful_words)
    return my_dict
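# Despite its name, create_word_features returns a single whitespace-joined
# string of non-stopword tokens. Usage sketch (self is unused, so None works):
from nltk.corpus import stopwords

lines = ["This is a short example line", "and another line with more words"]
print(create_word_features(None, lines))  # the surviving tokens joined by spaces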
def cluster_texts(texts, clusters=3):
    """Transform texts to Tf-Idf coordinates and cluster texts using K-Means."""
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=1.0,
                                 min_df=1,
                                 lowercase=True)
    # Fit the vectorizer and map each document to a sparse Tf-Idf vector.
    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters, n_init=100, verbose=0, tol=1e-10)
    km_model.fit(tfidf_model)
    # print('inertia:', km_model.inertia_)
    # Group document indices by their assigned cluster label.
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
    return clustering
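# cluster_texts expects a process_text tokenizer plus the scikit-learn and
# stdlib names below to exist at module level. A minimal sketch using a simple
# regex tokenizer as a stand-in for the project's own process_text:
import collections
import re

from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


def process_text(text):
    # Stand-in tokenizer: lowercase word tokens only.
    return re.findall(r"[a-z']+", text.lower())


texts = [
    "the stock market fell sharply today",
    "shares and stocks dropped in early trading",
    "the football team won the championship game",
    "a late goal decided the championship match",
]
print(dict(cluster_texts(texts, clusters=2)))  # cluster label -> list of document indices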
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them.
    text = text.lower().split()
    # Optionally, remove stop words.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]
    # Re-join outside the if-block so the regex cleanup always gets a string.
    text = " ".join(text)
    # Clean the text: strip unwanted characters and expand common contractions.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    return text
def isStopword(text):
    # Return 'Y' if any token in the text is an English stopword, otherwise 'N'.
    stops = set(stopwords.words('english'))
    splitsent = text.split(' ')
    for w in splitsent:
        if w in stops:
            return 'Y'
    return 'N'
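# With the loop fixed above, the helper reports whether any token in the text
# is an English stopword:
from nltk.corpus import stopwords

print(isStopword("the quick brown fox"))  # 'Y' - 'the' is a stopword
print(isStopword("quick brown fox"))      # 'N' - no stopwords present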
def process(content, env, **settings):
    stops = stopwords.words('english')
    for doc in content:
        # Tokenise each document, drop stopwords, and re-join before yielding.
        words = word_tokenize(doc['text'])
        text = [w for w in words if w not in stops]
        text = " ".join(text)
        yield set_text(doc, text)
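# process is a generator over documents shaped like {'text': ...} and relies on
# a project-specific set_text helper; the placeholder below is an assumption,
# not the pipeline's real implementation:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def set_text(doc, text):
    # Placeholder: return a copy of the document with the cleaned text attached.
    return {**doc, 'text': text}


docs = [{'text': "This is a sentence with quite a few stopwords in it."}]
for cleaned in process(docs, env=None):
    print(cleaned['text'])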
import string

import graphlab as gl
from graphlab.toolkits.distances import cosine
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

try:
    import cPickle as pickle
except ImportError:
    import pickle

punct = string.punctuation
sts = gl.SFrame('sts_all.gl/')
stoplist = set(stopwords.words('english'))


def get_embeddings(embedding_gzip, size):
    # Read a whitespace-delimited embedding file; unnamed columns come in as X1..X{size+1}.
    headers = ['word'] + ['d' + str(i) for i in range(1, size + 1)]
    coltypes = [str] + [float] * size
    sf = gl.SFrame.read_csv(embedding_gzip, delimiter=' ', column_type_hints=coltypes,
                            header=False, quote_char='\0')
    # Pack the component columns into a single list column and index by the word column.
    sf = sf.pack_columns(['X' + str(i) for i in range(2, size + 1)])
    df = sf.to_dataframe().set_index('X1')
    column_names = list(df)
    return df.to_dict(orient='dict')[column_names[1]]


# content_vocab = set(i.strip() for i in open('sts_vocab.txt', 'r').readlines())


def get_vector(word, embeddings):
    return np.array(embeddings[word])
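# get_embeddings returns a plain dict keyed by word, so get_vector is just a
# lookup wrapped in a NumPy array. A hand-made toy table stands in for a real
# embedding file here:
import numpy as np

toy_embeddings = {
    'cat': [0.1, 0.3, 0.5],
    'dog': [0.2, 0.3, 0.4],
}
v = get_vector('cat', toy_embeddings)
print(v.shape, v.dtype)  # (3,) float64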
import os

import nltk


def __stopwords_by_language(language):
    try:
        from nltk.corpus import stopwords
        return stopwords.words(language)
    except LookupError:  # pragma: no cover
        # The stopwords corpus is not installed: create the target directory,
        # download it, then retry the import.
        dpath = os.path.expanduser('~/nltk_data/corpora/stopwords')
        os.makedirs(dpath, exist_ok=True)
        nltk.download('stopwords')
        from nltk.corpus import stopwords
        return stopwords.words(language)
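# Usage is straightforward; the double-underscore prefix only triggers name
# mangling inside class bodies, so a module-level call works as-is:
stops = __stopwords_by_language('english')
print(len(stops), stops[:5])  # size of the list and its first few entries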
def preprocess(self, reviews_filename):
    """
    Transforms reviews (comments and ratings) into numerical representations (vectors).

    Comments are vectorized into a bag-of-words representation.
    Ratings are transformed into 0's (negative) and 1's (positive).
    Neutral reviews are discarded.

    :param reviews_filename: CSV file with comments and ratings
    :return:
        data: list of sparse matrices
            vectorized comments
        target: list of integers
            vectorized ratings
    """
    stop_words = set(stopwords.words('english'))
    sp = spacy.load('en_core_web_sm')
    df = pd.read_csv(reviews_filename)
    raw_data, raw_target = [], []
    for review in df.itertuples():
        # Missing comments are read in as NaN (a float); skip them.
        if type(review.comment) == float:
            continue
        # Bag-of-words: map every non-stopword token in the comment to True.
        comment = {token.text: True for token in sp.tokenizer(review.comment.lower())
                   if token.text not in stop_words}
        if self.numclasses == 2:
            rating = 'pos'
            # Neutral (3-star) reviews are discarded in the binary setting.
            if review.rating == 3:
                continue