import utils
import transforms
from manalib import Manacost, Manatext
# Some text prettification stuff that people may not have installed
try:
    from titlecase import titlecase
except ImportError:
    def titlecase(s):
        return s.title()
try:
    import textwrap
    import nltk.data
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # This could be made smarter - MSE will capitalize for us after :,
    # but we still need to capitalize the first english component of an activation
    # cost that starts with symbols, such as {2U}, *R*emove a +1/+1 counter from @: etc.
    def cap(s):
        return s[:1].capitalize() + s[1:]

    # This crazy thing is actually invoked as an unpass, so newlines are still
    # encoded.
    def sentencecase(s):
        s = s.replace(utils.x_marker, utils.reserved_marker)
        lines = s.split(utils.newline)
        clines = []
        for line in lines:
            if line:
                sentences = sent_tokenizer.tokenize(line)
                clines += [' '.join([cap(sent) for sent in sentences])]
        return utils.newline.join(clines).replace(utils.reserved_marker, utils.x_marker)
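# Hedged usage sketch, not part of the original module: the exact encoded tokens are
# defined in the project's utils module, so the values shown here are only illustrative.
# Given encoded card text such as
#     'flying' + utils.newline + 'when @ dies, draw a card.'
# sentencecase() re-capitalizes the first word of each sentence on every encoded line:
#     'Flying' + utils.newline + 'When @ dies, draw a card.'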
import ConfigParser

import praw

config = ConfigParser.ConfigParser()
config.read("asoiafsearchbot.cfg")
# Database info
host = config.get("SQL", "host")
user = config.get("SQL", "user")
passwd = config.get("SQL", "passwd")
db = config.get("SQL", "db")
table = config.get("SQL", "table")
column1 = config.get("SQL", "column1")
column2 = config.get("SQL", "column2")
MAX_ROWS = 30
BOOK_CONTAINER = []
sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')
# Reddit Info
user_agent = (
    "ASOIAFSearchBot -Help you find that comment"
    "- by /u/RemindMeBotWrangler")
reddit = praw.Reddit(user_agent=user_agent)
reddit_user = config.get("Reddit", "username")
reddit_pass = config.get("Reddit", "password")
reddit.login(reddit_user, reddit_pass)
# =============================================================================
# CLASSES
# =============================================================================
class Connect(object):
"""
DB connection class
# -*- coding: utf-8 -*-
'''
https://github.com/kyubyong/g2pK
'''
import os, re
import nltk
from jamo import h2j
from konlpy.tag import Mecab
from nltk.corpus import cmudict
# For further info. about cmu dict, consult http://www.speech.cs.cmu.edu/cgi-bin/cmudict.
try:
    nltk.data.find('corpora/cmudict.zip')
except LookupError:
    nltk.download('cmudict')
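# Illustrative note, not part of the original source: once downloaded, cmudict.dict()
# maps lowercase words to lists of ARPAbet pronunciations, e.g.
#     cmudict.dict()['tomato']
#     -> [['T', 'AH0', 'M', 'EY1', 'T', 'OW2'], ['T', 'AH0', 'M', 'AA1', 'T', 'OW2']]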
from g2pk.special import jyeo, ye, consonant_ui, josa_ui, vowel_ui, jamo, rieulgiyeok, rieulbieub, verb_nieun, balb, palatalize, modifying_rieul
from g2pk.regular import link1, link2, link3, link4
from g2pk.utils import annotate, compose, group, gloss, parse_table, get_rule_id2text
from g2pk.english import convert_eng
from g2pk.numerals import convert_num
class G2p(object):
    def __init__(self):
        self.mecab = Mecab()  # for annotation
        self.table = parse_table()
        self.cmu = cmudict.dict()  # for English
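# Hedged usage sketch, not shown in this excerpt: g2pK's G2p object is typically used as
# a callable on Korean text (its __call__ is defined later in the module), roughly like
#     g2p = G2p()
#     g2p("어제는 날씨가 맑았는데, 오늘은 흐리다.")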
def __init__(self, pos, filenameroot):
    """
    @type pos: {string}
    @param pos: The part of speech of this index file e.g. 'noun'
    @type filenameroot: {string}
    @param filenameroot: The base filename of the index file.
    """
    self.pos = pos
    path = nltk.data.find('corpora/wordnet/index.%s' % filenameroot)
    self.file = open(path, FILE_OPEN_MODE)
    # Table of (pathname, offset) -> (line, nextOffset)
    self.offsetLineCache = {}
    self.rewind()
def launch(self):
    # Verifying nltk resources
    nltk.data.path[0] = self.settings.nltk_data
    # Determining action
    if self.opts.saijiki:
        self.controller.generateSaijikiHaikus(self.opts.number)
    else:
        self.controller.generateMultipleHaikus(self.opts.kigo, self.opts.number)
    tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
elif source_lang == 'ru':
    print("Using Russian dictionary to find sentence boundaries.")
    tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')
elif source_lang == 'it':
    print("Using Italian dictionary to find sentence boundaries.")
    tokenizer = nltk.data.load('tokenizers/punkt/italian.pickle')
elif source_lang == 'pt':
    print("Using Portuguese dictionary to find sentence boundaries.")
    tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
elif source_lang == 'es':
    print("Using Spanish dictionary to find sentence boundaries.")
    tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
else:
    print("Using English dictionary to find sentence boundaries.")
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
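# Hedged side note, not part of the original excerpt: each punkt pickle above is only
# loadable once the punkt models have been downloaded, e.g. with the same pattern used
# elsewhere on this page:
#     try:
#         nltk.data.find('tokenizers/punkt')
#     except LookupError:
#         nltk.download('punkt')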
# Split input text into a list of sentences
sentences = tokenizer.tokenize(transcript)
print("Input text length: " + str(len(transcript)))
print("Number of sentences: " + str(len(sentences)))
translated_text = ''
transcript_chunk = ''
for sentence in sentences:
    # Translate can handle 5000 unicode characters but we'll process no more than 4000
    # just to be on the safe side.
    if (len(sentence) + len(transcript_chunk) < 4000):
        transcript_chunk = transcript_chunk + ' ' + sentence
    else:
        try:
            print("Translation input text length: " + str(len(transcript_chunk)))
            translation_chunk = translate_client.translate_text(
                Text=transcript_chunk,
                SourceLanguageCode=source_lang,
                TargetLanguageCode=target_lang)
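# Hedged sketch, not shown in the truncated excerpt above: after the loop, any text still
# buffered in transcript_chunk would typically be sent through one final translate_text
# call and appended to translated_text; the boto3 response exposes the result under the
# 'TranslatedText' key.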
import operator
import enchant
import nltk
import requests
from gcm import *
from nltk.stem.wordnet import WordNetLemmatizer
from api_constants import WALMART_OPEN_PRODUCT_API_KEY, GCM_API_KEY
from constants import PROPER_NOUN_POS_TAGS
from settings import DEBUG
# append custom path for nltk corpus
nltk.data.path.append("nltk_data/")
lmtzr = WordNetLemmatizer()
enchant_dictionary = enchant.Dict("en_US")
def check_candidature(query_string):
    walmart_url = "http://api.walmartlabs.com/v1/search?apiKey={0}&query={1}".format(
        WALMART_OPEN_PRODUCT_API_KEY, query_string)
    response = requests.get(walmart_url)
    if response.ok:
        if response.json()['totalResults'] > 0:
            return True
    return False
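# Hedged usage example, not part of the original module: the query string below is made
# up, and a real call needs a valid WALMART_OPEN_PRODUCT_API_KEY plus network access.
if __name__ == '__main__':
    print(check_candidature("bluetooth speaker"))  # True if the API returns any products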
def __init__(self):
    try:
        nltk.data.find('taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')
    except LookupError:
        nltk.download('averaged_perceptron_tagger')
def install_nltk_corpora(*packages):
    nltk_packages = list(packages)
    try:
        installed = (set(os.listdir(nltk.data.find("corpora"))) |
                     set(os.listdir(nltk.data.find("taggers"))) |
                     set(os.listdir(nltk.data.find("tokenizers"))))
    except LookupError:
        installed = set()
    if not set(nltk_packages) <= set(installed):
        nltk.download(nltk_packages)
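# Hedged usage example, not part of the original module: the package names are common
# NLTK resources picked for illustration; only the ones missing from nltk's data path
# are actually downloaded.
if __name__ == '__main__':
    install_nltk_corpora('punkt', 'wordnet', 'averaged_perceptron_tagger')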
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.

    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn(
            "Make sure the database file %s is installed and uncompressed." % dbname
        )
        raise
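# Hedged usage example, not part of the original module: 'corpora/city_database/city.db'
# is the small Chat-80 city database distributed with nltk_data; any other sqlite file
# on nltk's data path works the same way.
if __name__ == '__main__':
    for row in sql_query('corpora/city_database/city.db',
                         "SELECT City, Country FROM city_table"):
        print(row)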