Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
Python 2 can still *use* wordfreq, by downloading the database that was
built on Python 3.
If you insist on building the Python 2 version, pass `do_it_anyway=True`.
"""
if sys.version_info.major == 2 and not do_it_anyway:
raise UnicodeError(
"Python 2.x has insufficient Unicode support, and will build "
"the wrong database. Pass `do_it_anyway=True` to do it anyway."
)
if source_dir is None:
source_dir = config.RAW_DATA_DIR
if filename is None:
filename = config.DB_FILENAME
def wordlist_path(*pieces):
return os.path.join(source_dir, *pieces)
logger.info("Creating database")
conn = create_db(filename)
for lang in LEEDS_LANGUAGES:
filename = wordlist_path('leeds', 'internet-%s-forms.num' % lang)
read_leeds_wordlist_into_db(conn, filename, 'leeds-internet', lang)
read_wordlist_into_db(conn, wordlist_path('google', 'google-books-english.csv'), 'google-books', 'en')
read_wordlist_into_db(conn, wordlist_path('luminoso', 'twitter-52M.csv'), 'twitter', 'xx')
read_wordlist_into_db(conn, wordlist_path('luminoso', 'twitter-stems-2014.csv'), 'twitter-stems', '*')
read_wordlist_into_db(conn, wordlist_path('luminoso', 'twitter-surfaces-2014.csv'), 'twitter-surfaces', '*')
def upload_data(upload_path=None):
    """
    Collect the raw data and the database file, and upload them to an
    appropriate directory on the server that hosts downloads.

    This requires that it's running in a reasonable Unix environment, on Python
    3, and more notably, that it has the proper SSH keys to upload to that
    server.

    `upload_path` is the destination to upload to; when it is None, it
    defaults to `config.UPLOAD_PATH`.

    Raises `subprocess.CalledProcessError` if copying the database file into
    the staging directory fails.
    """
    # Imported locally rather than at module level -- presumably so the module
    # can still be imported on Python 2, where TemporaryDirectory is missing.
    from tempfile import TemporaryDirectory

    if upload_path is None:
        upload_path = config.UPLOAD_PATH
    # NOTE(review): `upload_path` is resolved above but never used in the code
    # visible here, and nothing below transfers `version_dir` anywhere before
    # the temporary directory is removed. The upload step described in the
    # docstring appears to be missing -- confirm against the full file.

    # '.wordfreq' is passed positionally, i.e. as the directory name's suffix.
    with TemporaryDirectory('.wordfreq') as build_dir:
        # Stage everything under a subdirectory named after the minor version.
        version_dir = os.path.join(build_dir, config.MINOR_VERSION)
        os.makedirs(version_dir)

        source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
        logger.info("Creating %s", source_filename)
        with tarfile.open(source_filename, 'w:gz') as tarf:
            tarf.add(config.RAW_DATA_DIR)

        logger.info("Copying database file %s", config.DB_FILENAME)
        # check_call (instead of call) so that a failed copy raises instead of
        # silently leaving the staging directory without the database file.
        subprocess.check_call([
            '/bin/cp',
            config.DB_FILENAME,
            version_dir
        ])
def download_and_extract_raw_data(url=None, root_dir=None):
    """
    Download the .tar.gz of raw data that can be used to build the database.

    `url` is where the archive is fetched from; when it is None, it defaults
    to `config.RAW_DATA_URL`. `root_dir` is the directory the archive is saved
    to (as 'wordfreq-data.tar.gz') and extracted into; when it is None, it
    defaults to the parent directory of `config.RAW_DATA_DIR`.

    On Python versions whose `tarfile` module supports extraction filters,
    the archive is extracted with the 'data' filter, so an archive containing
    unsafe members (absolute paths, paths or links escaping `root_dir`, and
    so on) is rejected with a `tarfile.TarError` subclass instead of being
    written outside `root_dir`.
    """
    if url is None:
        url = config.RAW_DATA_URL
    if root_dir is None:
        root_dir = os.path.dirname(config.RAW_DATA_DIR)

    dest_filename = os.path.join(root_dir, 'wordfreq-data.tar.gz')
    # Presumably creates the directory that will contain `dest_filename` --
    # verify against the helper's definition.
    ensure_dir_exists(dest_filename)
    download(url, dest_filename)

    logger.info("Extracting %s", dest_filename)
    with tarfile.open(dest_filename, 'r') as tarf:
        if hasattr(tarfile, 'data_filter'):
            # The archive was just downloaded, so treat it as untrusted and
            # refuse members that would land outside `root_dir`.
            tarf.extractall(root_dir, filter='data')
        else:
            # Older Pythons have no extraction filters; keep the original
            # unfiltered behavior there.
            tarf.extractall(root_dir)