from nltk.corpus import wordnet as wn

def convert_to_wordnet_pos(senseval_pos):
    # Map a coarse Senseval/universal POS tag to the corresponding WordNet constant.
    if senseval_pos == 'VERB':
        return wn.VERB
    elif senseval_pos == 'NOUN':
        return wn.NOUN
    elif senseval_pos == 'ADV':
        return wn.ADV
    elif senseval_pos == 'ADJ':
        return wn.ADJ
    else:
        return None
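A short usage sketch, not part of the original project: the constant returned by convert_to_wordnet_pos plugs straight into NLTK's WordNetLemmatizer, which is the usual reason for mapping coarse tags to WordNet constants. It assumes the WordNet corpus has already been downloaded.

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()  # requires nltk.download('wordnet') once

print(lemmatizer.lemmatize('running', pos=convert_to_wordnet_pos('VERB')))  # -> run
print(lemmatizer.lemmatize('better', pos=convert_to_wordnet_pos('ADJ')))    # -> good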
def test_subtrees_for_phrase(self):
    # Grab the first NP subtree of the parsed sentence and check its last leaf.
    t = self._sentence.subtrees_for_phrase("NP")[0]
    self.assertIsInstance(t, Tree)
    self.assertEqual("property", t[-1].leaves()[0])
def preprocess(text):
    # Normalize encoding artefacts and strip quote/apostrophe noise.
    text = (text.encode('utf-8').decode("ascii", "replace")
            .replace(u"\ufffd", "_").replace("___", "'").replace("'s", " ")
            .replace("``", " ").replace("''", " ").replace("_", " ")
            .replace("'", " ").replace("`", " "))
    # Whitelist of allowed characters; '-' is escaped so it is not read as a range.
    text = re.sub(r"[^0-9a-zA-Z !\"/:;<=>?.,!@#$%^&\-_|()']+", " ", text)
    tokens = text.split(" ")
    result = ""
    for token in tokens:
        word = token.split(" ")[0]
        if word not in stopwords.words('english') and token not in punctuations and token not in hoax_stopwords:
            if len(word) > 0:
                if word.isupper() and dictionary.check(word.lower()):
                    new_token = lemmatizer.lemmatize(token.lower())
                    if new_token == token.lower():
                        new_token = lemmatizer.lemmatize(token.lower(), pos='v')
                    result += new_token + " "
                elif word.isupper():
                    result += token.title() + " "
                elif dictionary.check(word.lower()):
                    new_token = lemmatizer.lemmatize(token.lower())
                    if new_token == token.lower():
                        new_token = lemmatizer.lemmatize(token.lower(), pos='v')
                    result += new_token + " "
                else:
                    result += token + " "
        else:
            # The body of this branch is truncated in the original snippet.
            pass
    return result
def create_dic(self, documents):
    texts = [[word for word in document.lower().split() if word not in stopwords.words('english')]
             for document in documents]
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return [dictionary, corpus]
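A rough, hypothetical usage sketch (the `preproc` instance and the toy documents are assumptions, not part of the original): the dictionary/corpus pair returned by create_dic feeds directly into gensim models such as TfidfModel.

from gensim import models

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
]

# `preproc` stands in for an instance of whichever class defines create_dic.
dictionary, corpus = preproc.create_dic(documents)

tfidf = models.TfidfModel(corpus)   # bag-of-words corpus -> TF-IDF weights
print(dictionary.token2id)          # token -> integer id mapping
print(tfidf[corpus[0]])             # weighted representation of the first document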
def create_word_features(self, words):
    # print words
    w = []
    for line in words:
        for wrd in line.split():
            w.append(wrd)
    useful_words = [word for word in w if word not in stopwords.words('english')]
    my_dict = ' '.join(useful_words)
    # print my_dict
    return my_dict
def cluster_texts(texts, clusters=3):
    """Transform texts to TF-IDF coordinates and cluster texts using K-Means."""
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=1.0,
                                 min_df=1,
                                 lowercase=True)
    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters, n_init=100, verbose=0, tol=1e-10)
    km_model.fit(tfidf_model)
    # print 'inertia: ', km_model.inertia_
    # pdb.set_trace()
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
    return clustering
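For context, a self-contained sketch of the same TF-IDF plus K-Means pattern. The original process_text tokenizer is not shown above, so a simple word_tokenize-based stand-in is assumed here; the toy texts and cluster count are likewise illustrative only.

import collections

from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

def simple_tokenizer(text):
    # Stand-in for the project's process_text: lowercase alphabetic tokens only.
    return [t.lower() for t in word_tokenize(text) if t.isalpha()]

texts = [
    "The cat sat on the mat.",
    "Dogs and cats make good pets.",
    "Stock markets fell sharply on Monday.",
    "Investors worry about rising interest rates.",
]

vectorizer = TfidfVectorizer(tokenizer=simple_tokenizer,
                             stop_words=stopwords.words('english'),
                             lowercase=True)
tfidf = vectorizer.fit_transform(texts)
labels = KMeans(n_clusters=2, n_init=10).fit_predict(tfidf)

# Group document indices by assigned cluster, as in cluster_texts above.
clustering = collections.defaultdict(list)
for idx, label in enumerate(labels):
    clustering[label].append(idx)
print(dict(clustering))  # e.g. {0: [0, 1], 1: [2, 3]} depending on initialisation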
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]
    text = " ".join(text)
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
def isStopword(text):
    # Return 'Y' if any whitespace-separated token is an English stopword, else 'N'.
    # (The original returned after the first token, which made the loop pointless.)
    stops = set(stopwords.words('english'))
    for w in text.split(' '):
        if w in stops:
            return 'Y'
    return 'N'
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    print("Loaded NLTK data")
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
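A minimal, self-contained sketch of the same sentence-split-then-retokenize step, using nltk.sent_tokenize as the higher-level equivalent of loading the Punkt pickle directly; the download call and the sample document are assumptions about a fresh environment.

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')  # Punkt models; very recent NLTK releases may need 'punkt_tab' instead

docs = ["Dr. Smith arrived at 9 a.m. He left an hour later."]
X = []
for t in docs:
    # Join the word tokens of each detected sentence, mirroring the loop above.
    X.append(' ' + ' '.join(' '.join(word_tokenize(s)) for s in sent_tokenize(t)))
print(X)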
def _build_para_dict(self):
    path = "data/ppdb-2.0-s-all"
    lines = read_lines(path)
    relations = [line.split(" ||| ")[-1] for line in lines]
    equivalent_pairs = []
    print("Preprocessing raw data...")
    for line in tqdm(lines):
        split = line.split(" ||| ")
        if split[-1] == "Equivalence":
            equivalent_pairs.append(tuple(split[1:3]))
    paraphrase_pairs = [line.split(" ||| ")[1:3] for line in lines]
    equivalent_pairs_ubuntu = []
    print("Extracting paraphrase pairs...")
    for pair in tqdm(equivalent_pairs):
        tokens_0 = word_tokenize(pair[0])
        tokens_1 = word_tokenize(pair[1])
        if not (self._contains_unknown(tokens_0) or self._contains_unknown(tokens_1)):
            equivalent_pairs_ubuntu.append((tokens_0, tokens_1))
    # Insert paraphrases in both directions
    print("Building dictionary...")
    self.paraphrase_dict = {}
    for (p0, p1) in tqdm(equivalent_pairs_ubuntu):
        p0 = tuple(p0)
        p1 = tuple(p1)
        try:
            self.paraphrase_dict[p0] = self.paraphrase_dict[p0] + [p1]
        except KeyError:
            self.paraphrase_dict[p0] = [p1]
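The try/except insertion above can also be expressed with dict.setdefault, which sidesteps catching KeyError; a minimal sketch on plain token tuples (the sample pairs are invented for illustration).

paraphrase_dict = {}
pairs = [
    (("how", "are", "you"), ("how", "do", "you", "do")),
    (("how", "are", "you"), ("how", "is", "it", "going")),
]

for p0, p1 in pairs:
    # setdefault returns the existing list (or inserts a new one), so append works either way.
    paraphrase_dict.setdefault(p0, []).append(p1)

print(paraphrase_dict[("how", "are", "you")])  # -> both paraphrases of the key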