import os
import numpy as np
import tensorflow as tf
import text_helpers
from nltk.corpus import stopwords
from tensorflow.python.framework import ops
ops.reset_default_graph()
os.chdir(os.path.dirname(os.path.realpath(__file__)))
# Start a graph session
sess = tf.Session()
# Declare model parameters
embedding_size = 200
vocabulary_size = 2000
batch_size = 100
max_words = 100
# Declare stop words
stops = stopwords.words('english')
# Load Data
print('Loading Data')
texts, target = text_helpers.load_movie_data()
# Normalize text
print('Normalizing Text Data')
texts = text_helpers.normalize_text(texts, stops)
# Texts must contain at least 3 words
target = [target[ix] for ix, x in enumerate(texts) if len(x.split()) > 2]
texts = [x for x in texts if len(x.split()) > 2]
# Split up data set into train/test
train_indices = np.random.choice(len(target), round(0.8*len(target)), replace=False)
test_indices = np.array(list(set(range(len(target))) - set(train_indices)))
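# A minimal sketch of how the index arrays above might be used to materialize the actual
# splits; the *_train / *_test names are illustrative and not part of the original code.
texts_train = [texts[i] for i in train_indices]
target_train = np.array([target[i] for i in train_indices])
texts_test = [texts[i] for i in test_indices]
target_test = np.array([target[i] for i in test_indices])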
import nltk, os
from collections import Counter
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))
porter = nltk.PorterStemmer()
en = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digit, space = '0123456789', ' \t\n'
good_symbol = en + en.lower() + digit + space
def filter_word(word):
    return u''.join(filter(lambda char: char in good_symbol, list(word)))
def filter_bw(text):
    # Tokenize, then drop short tokens, pure digits and stop words
    text = text.split()
    text = filter(lambda x: len(x) > 2, text)
    text = filter(lambda x: not x.isdigit(), text)
    text = filter(lambda x: x not in sw, text)
    return ' '.join(text)
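# Illustrative usage of the two helpers above (the sample sentence is made up):
sample = "The plot twists in 2019 were absolutely incredible!!!"
print(filter_word(sample))  # keeps only letters, digits and whitespace
print(filter_bw(filter_word(sample).lower()))  # also drops short tokens, digits and stop words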
from nltk.corpus import stopwords
import csv
import gensim
import logging
import Cython
import os
from gensim import models
from gensim.models import word2vec
import re
import xml.etree.ElementTree as ET
import pickle
punctuations = [")", "(", "''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", ".", "?", "!", ",", ":", "-", "--",
"...", ";"]
stops = stopwords.words('english')
encoding_ = 'utf-8'
def getClear_full(sentence):
    r = re.findall(r'\b\w+\b', sentence.lower())
    r = " ".join(r)
    return r
def getClear(sentence, bigram):
    r = re.findall(r'\b\w+\b', sentence.lower())
    # bigram is assumed to be a trained gensim Phrases/Phraser model; applying it
    # merges frequent word pairs into single '_'-joined tokens.
    r = bigram[r]
    return " ".join(r)
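# A minimal sketch of how a `bigram` phrase model could be trained with gensim and handed
# to getClear(); the tiny corpus and the Phrases parameters here are illustrative only.
from gensim.models.phrases import Phrases
tokenized = [getClear_full(s).split() for s in ["new york is huge", "i love new york"]]
bigram = Phrases(tokenized, min_count=1, threshold=1)
print(getClear("She moved to New York last year", bigram))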
import numpy as np
from os import listdir
from os.path import isfile, join
import re
import hashlib
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords
from utils import DataUtil, LogUtil
class Feature(object):
    '''
    Feature engineering utilities
    '''
    # Stop words
    stops = set(stopwords.words("english"))
    # IDF dictionary built from train.csv
    train_idf = {}
    def __init__(self):
        return
    @staticmethod
    def load_npz(ft_fp):
        loader = np.load('%s.npz' % ft_fp)
        features = csr_matrix((loader['data'],
                               loader['indices'],
                               loader['indptr']),
                              shape=loader['shape'])
        LogUtil.log("INFO", "load npz feature file done (%s)" % ft_fp)
        return features
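    # A minimal counterpart sketch, not part of the original class: it writes a scipy CSR
    # matrix in the same data/indices/indptr/shape layout that load_npz reads back.
    @staticmethod
    def save_npz(features, ft_fp):
        np.savez(ft_fp, data=features.data, indices=features.indices,
                 indptr=features.indptr, shape=features.shape)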
import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
# q1, q2 (lists of tokenized questions) and word2vec_model (a loaded gensim
# word2vec/KeyedVectors model) are assumed to be defined earlier.
vocabulary = dict()
inverse_vocabulary = [''] # '' acts as a placeholder for the zero vector embedding
qs = pd.DataFrame({'q1': q1, 'q2': q2})
questions_cols = ['q1', 'q2']
stop_words = set(stopwords.words('english'))
# Iterate through the text of both questions of each pair
for index, row in tqdm(qs.iterrows()):
    for question in questions_cols:
        q2n = [] # q2n -> numerical vector representation of each question
        for word in row[question]:
            # Ignore stop words that do not have a word2vec mapping
            if word in stop_words and word not in word2vec_model.vocab:
                continue
            if word not in vocabulary:
                vocabulary[word] = len(inverse_vocabulary)
                q2n.append(len(inverse_vocabulary))
                inverse_vocabulary.append(word)
            else:
                q2n.append(vocabulary[word])
        # Replace questions with their equivalent word-index sequences
        qs.at[index, question] = q2n
# Prepare embedding layer
embedding_dim = 300
embeddings = np.random.randn(len(vocabulary) + 1, embedding_dim) # Embedding matrix
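# A minimal sketch (assuming word2vec_model is a loaded gensim word2vec model with
# 300-dimensional vectors) of how the embedding matrix is typically filled next:
embeddings[0] = 0  # index 0 stays all-zero for the '' padding placeholder
for word, index in vocabulary.items():
    if word in word2vec_model.vocab:
        embeddings[index] = word2vec_model[word]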
# Preprocessing the data.
import json # For the json data.
import gzip # Using gzip because the file is in .gz format.
import os # For listing all the files from the directory.
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
path = 'Add the path to the data here' # The path containing the data.
files = os.listdir(path) # List of all the file names in the data directory.
Questions = [] # For storing the questions.
Answers = [] # For storing the answers for a particular question.
All_Answers = [] # For storing the answers of all the questions.
max_question_word_length = 50 # Maximum number of words allowed in a question.
# Yields one parsed record at a time from a gzipped data file.
def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield eval(l)
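# A hedged sketch of how parse() might be used to fill the lists declared above; the
# 'question' and 'answer' field names are assumptions about the dataset schema.
for file_name in files:
    for entry in parse(os.path.join(path, file_name)):
        question_tokens = [w for w in tokenizer.tokenize(str(entry.get('question', '')).lower())
                           if w not in stop_words]
        if 0 < len(question_tokens) <= max_question_word_length:
            Questions.append(question_tokens)
            All_Answers.append(entry.get('answer', ''))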
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
def __init__(self, language):
    self.sentences = []
    self.docs = []
    self.models = []
    self.doc_sent_dict = {}
    self.ref_ngrams = []
    self.LANGUAGE = language
    self.stemmer = SnowballStemmer(self.LANGUAGE)
    self.stoplist = set(stopwords.words(self.LANGUAGE))
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
def clean_vertex(line):
    """ Take a (review_id, sentence) tuple and clean it to (review_id, word list) """
    rev_id, sent = line[0], line[1]
    lmtz = WordNetLemmatizer()
    sw = set(stopwords.words('english'))
    words = re.findall(r'[a-zA-Z]+', sent)
    words = [lmtz.lemmatize(w.lower()) for w in words if w.lower() not in sw]
    words = [w for w in words if len(w) > 3]
    return rev_id, words
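# Illustrative call on a made-up (review_id, sentence) tuple:
rev_id, words = clean_vertex(("r123", "The waiters were friendly and the pasta tasted amazing"))
print(rev_id, words)  # e.g. r123 ['waiter', 'friendly', 'pasta', 'tasted', 'amazing']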
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def clean_text_and_remove_stopwords(self, text):
    # Remove punctuation.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # Remove unnecessary white space.
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert every word to lowercase.
    text = text.lower()
    # Tokenize each word and remove common English stop words.
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_tokens = [w for w in word_tokens if w not in stop_words]
    text = " ".join(filtered_tokens)
    return text
import string
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
# The signature below is reconstructed from the flags used in the body; the function
# name and the default values are assumptions, not taken from the original code.
def transform_corpus(corpus, lower=True, punctuation=True, numbers=True,
                     stopwords=True, stemmer=True):
    transformed_corpus = ""
    if lower:
        corpus = corpus.lower()
    for token in word_tokenize(corpus):
        if punctuation:
            if token in string.punctuation:
                continue
            token = token.translate(str.maketrans("", "", string.punctuation))
        if numbers:
            token = token.translate(str.maketrans("", "", "0123456789"))
        if stopwords:
            stop_words = nltk.corpus.stopwords.words("english")
            if token in stop_words:
                continue
        if stemmer:
            stem = SnowballStemmer("english")
            token = stem.stem(token)
        transformed_corpus += token + " "
    return transformed_corpus.strip()
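# Illustrative call of the reconstructed helper above on made-up input text:
print(transform_corpus("The 3 runners were running quickly through the park."))
# prints something like: runner run quick park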