# Helper names (find, load_ace_data, postag_tree, NEChunkParser, ChunkScore)
# come from nltk.data and nltk.chunk, where this function is defined in
# classic NLTK releases.
from nltk.chunk.named_entity import NEChunkParser, load_ace_data, postag_tree
from nltk.chunk.util import ChunkScore
from nltk.data import find


def build_model(fmt='binary'):
    print('Loading training data...')
    train_paths = [find('corpora/ace_data/ace.dev'),
                   find('corpora/ace_data/ace.heldout'),
                   find('corpora/ace_data/bbn.dev'),
                   find('corpora/ace_data/muc.dev')]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
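A minimal driver sketch. Note that find() raises LookupError unless the ACE data files already sit under an NLTK data path; the ace_data corpus is licensed and not in the public downloader index, so a local installation of those files is assumed here.

if __name__ == '__main__':
    # Requires corpora/ace_data/* under one of the nltk.data.path entries.
    build_model(fmt='binary')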
import os

import nltk


def init_nltk():
    # Keep NLTK data in a local ./nltk directory next to the script.
    if not os.path.exists('nltk'):
        os.makedirs('nltk')
    nltk.data.path.append(os.getcwd() + '/nltk')
    dependencies = ['corpora/stopwords']
    for package in dependencies:
        try:
            nltk.data.find(package)
        except LookupError:
            # The downloader wants the bare package id ('stopwords'),
            # not the 'corpora/...' lookup path.
            nltk.download(package.split('/')[-1], os.getcwd() + '/nltk')
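A usage sketch, assuming init_nltk() runs once at startup before the stopwords corpus is first touched (nltk.corpus.stopwords loads lazily, so importing it earlier is safe):

from nltk.corpus import stopwords

init_nltk()  # creates ./nltk and fetches corpora/stopwords on first run
print(stopwords.words('english')[:5])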
import os
import zipfile

import nltk
import paddle.dataset.common
from nltk.corpus import movie_reviews

# URL and MD5 are module-level constants elided in this excerpt; the
# snippet began mid-call, so the download() call head is reconstructed.
try:
    # Download and extract movie_reviews.zip into DATA_HOME/corpora.
    paddle.dataset.common.download(
        URL, 'corpora', md5sum=MD5, save_name='movie_reviews.zip')
    path = os.path.join(paddle.dataset.common.DATA_HOME, 'corpora')
    filename = os.path.join(path, 'movie_reviews.zip')
    zip_file = zipfile.ZipFile(filename)
    zip_file.extractall(path)
    zip_file.close()
    # Make sure that nltk can find the data.
    if paddle.dataset.common.DATA_HOME not in nltk.data.path:
        nltk.data.path.append(paddle.dataset.common.DATA_HOME)
    movie_reviews.categories()
except LookupError:
    print("Downloading movie_reviews data set, please wait.....")
    nltk.download(
        'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
    print("Download data set success.....")
print("Path is " + nltk.data.find('corpora/movie_reviews').path)
# Fragment of a service __init__: optional topic-model loading, a SQLite
# connection, and on-demand NLTK downloads.
if load_topic_modeling == 1:
    self.logger.info(':: loading TM')
    self.text_tm = TopicModeling(self.config)
else:
    self.text_tm = None

self.logger.info(':: database connecting ...')
self.conn = sqlite3.connect(self.config.database_db)

if bool(int(self.config.models_force_download)):
    self.logger.info(':: downloading NLTK data...')
    try:
        # nltk.data.find needs the subdirectory prefix ('taggers/...');
        # a bare 'averaged_perceptron_tagger.zip' lookup never succeeds,
        # which made the original guard download on every run.
        nltk.data.find('taggers/averaged_perceptron_tagger')
    except LookupError:
        nltk.download('averaged_perceptron_tagger')
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    try:
        nltk.data.find('chunkers/maxent_ne_chunker')
    except LookupError:
        nltk.download('maxent_ne_chunker')
    try:
        nltk.data.find('taggers/universal_tagset')
    except LookupError:
        nltk.download('universal_tagset')
    try:
        nltk.data.find('corpora/words')
    except LookupError:
        nltk.download('words')
# Find the corpus root directory.
zip_name = re.sub(r'(([^/]+)(/.*)?)', r'\2.zip/\1/', self.__name)
if TRY_ZIPFILE_FIRST:
    try:
        root = nltk.data.find('{}/{}'.format(self.subdir, zip_name))
    except LookupError as e:
        try:
            root = nltk.data.find('{}/{}'.format(self.subdir, self.__name))
        except LookupError:
            raise e
else:
    try:
        root = nltk.data.find('{}/{}'.format(self.subdir, self.__name))
    except LookupError as e:
        try:
            root = nltk.data.find('{}/{}'.format(self.subdir, zip_name))
        except LookupError:
            raise e

# Load the corpus.
corpus = self.__reader_cls(root, *self.__args, **self.__kwargs)

# This is where the magic happens! Transform ourselves into
# the corpus by modifying our own __dict__ and __class__ to
# match that of the corpus.
args, kwargs = self.__args, self.__kwargs
name, reader_cls = self.__name, self.__reader_cls
self.__dict__ = corpus.__dict__
self.__class__ = corpus.__class__
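The self-mutation at the end is easier to see in isolation. A stripped-down sketch of the same pattern, with hypothetical Lazy and Real classes standing in for the loader and the corpus reader:

class Real:
    def __init__(self):
        self.value = 42

class Lazy:
    def __getattr__(self, attr):
        # First attribute access: build the real object, then become it
        # by adopting its __dict__ and __class__.
        real = Real()
        self.__dict__ = real.__dict__
        self.__class__ = real.__class__
        return getattr(self, attr)

obj = Lazy()
print(obj.value)  # triggers the swap, prints 42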
import nltk

# Fetch each NLTK resource only if it is missing. As above, the lookups
# need the subdirectory prefix; the original bare '*.zip' names always
# raised LookupError and re-triggered the downloads.
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('chunkers/maxent_ne_chunker')
except LookupError:
    nltk.download('maxent_ne_chunker')
try:
    nltk.data.find('taggers/universal_tagset')
except LookupError:
    nltk.download('universal_tagset')
try:
    nltk.data.find('corpora/words')
except LookupError:
    nltk.download('words')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
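The repeated try/except blocks collapse naturally into a data-driven loop; a sketch under the same assumptions (the resource list and helper name are illustrative, not from the original):

import nltk

# (lookup path, downloader package id) pairs
NLTK_RESOURCES = [
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    ('tokenizers/punkt', 'punkt'),
    ('chunkers/maxent_ne_chunker', 'maxent_ne_chunker'),
    ('taggers/universal_tagset', 'universal_tagset'),
    ('corpora/words', 'words'),
    ('corpora/stopwords', 'stopwords'),
]

def ensure_nltk_resources():
    # Download each resource only if nltk.data.find cannot locate it.
    for path, package in NLTK_RESOURCES:
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(package)

ensure_nltk_resources()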