How to use the wordfreq.config module in wordfreq

To help you get started, we’ve selected a few wordfreq examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github LuminosoInsight / wordfreq / wordfreq / build.py View on Github external
Python 2 can still *use* wordfreq, by downloading the database that was
    built on Python 3.

    If you insist on building the Python 2 version, pass `do_it_anyway=True`.
    """
    if sys.version_info.major == 2 and not do_it_anyway:
        raise UnicodeError(
            "Python 2.x has insufficient Unicode support, and will build "
            "the wrong database. Pass `do_it_anyway=True` to do it anyway."
        )

    if source_dir is None:
        source_dir = config.RAW_DATA_DIR

    if filename is None:
        filename = config.DB_FILENAME

    def wordlist_path(*pieces):
        return os.path.join(source_dir, *pieces)

    logger.info("Creating database")
    conn = create_db(filename)

    for lang in LEEDS_LANGUAGES:
        filename = wordlist_path('leeds', 'internet-%s-forms.num' % lang)
        read_leeds_wordlist_into_db(conn, filename, 'leeds-internet', lang)

    read_wordlist_into_db(conn, wordlist_path('google', 'google-books-english.csv'), 'google-books', 'en')
    read_wordlist_into_db(conn, wordlist_path('luminoso', 'twitter-52M.csv'), 'twitter', 'xx')
    read_wordlist_into_db(conn, wordlist_path('luminoso', 'twitter-stems-2014.csv'), 'twitter-stems', '*')
    read_wordlist_into_db(conn, wordlist_path('luminoso', 'twitter-surfaces-2014.csv'), 'twitter-surfaces', '*')
github LuminosoInsight / wordfreq / wordfreq / transfer.py View on Github external
def upload_data(upload_path=None):
    """
    Gather the raw data and the database file so that they can be uploaded
    to the appropriate directory on the server that hosts downloads.

    `upload_path` falls back to `config.UPLOAD_PATH` when it is not given.

    As the original author notes, this needs a reasonable Unix environment
    running Python 3 and, more notably, SSH keys that are allowed to upload
    to that server.
    """
    from tempfile import TemporaryDirectory

    upload_path = config.UPLOAD_PATH if upload_path is None else upload_path

    # Everything is staged in a throwaway directory that is removed when the
    # `with` block exits.
    with TemporaryDirectory('.wordfreq') as staging_root:
        # Files are grouped under a subdirectory named after the minor version.
        version_dir = os.path.join(staging_root, config.MINOR_VERSION)
        os.makedirs(version_dir)

        # Pack the whole raw data directory into a gzipped tarball.
        tarball_path = os.path.join(version_dir, 'wordfreq-data.tar.gz')
        creating_message = "Creating %s" % tarball_path
        logger.info(creating_message)
        with tarfile.open(tarball_path, 'w:gz') as archive:
            archive.add(config.RAW_DATA_DIR)

        # Put a copy of the built database next to the tarball.
        copying_message = "Copying database file %s" % config.DB_FILENAME
        logger.info(copying_message)
        copy_command = ['/bin/cp', config.DB_FILENAME, version_dir]
        subprocess.call(copy_command)
github LuminosoInsight / wordfreq / wordfreq / transfer.py View on Github external
def download_and_extract_raw_data(url=None, root_dir=None):
    """
    Download the .tar.gz of raw data that can be used to build the database,
    and unpack it.

    `url` is where the archive is fetched from; it defaults to
    `config.RAW_DATA_URL`.

    `root_dir` is the directory in which the archive is saved (as
    'wordfreq-data.tar.gz') and into which it is extracted; it defaults to
    the parent directory of `config.RAW_DATA_DIR`.

    Raises ValueError if the archive contains a member whose name would
    place it outside of `root_dir`; nothing is extracted in that case.
    """
    if url is None:
        url = config.RAW_DATA_URL

    if root_dir is None:
        root_dir = os.path.dirname(config.RAW_DATA_DIR)

    dest_filename = os.path.join(root_dir, 'wordfreq-data.tar.gz')
    # NOTE(review): presumably creates the directory that will contain
    # dest_filename -- confirm against ensure_dir_exists.
    ensure_dir_exists(dest_filename)
    download(url, dest_filename)

    logger.info("Extracting %s" % dest_filename)

    # The archive was just downloaded, so its member names cannot be trusted:
    # an entry such as '../../x' or an absolute path would otherwise be
    # written outside of root_dir by extractall().
    extract_root = os.path.realpath(root_dir)
    # Joining with '' appends exactly one trailing separator, which also
    # handles the filesystem root correctly.
    allowed_prefix = os.path.join(extract_root, '')

    with tarfile.open(dest_filename, 'r') as tarf:
        # Check every member before extracting anything, so that a bad
        # archive leaves no partially extracted files behind.
        for member in tarf.getmembers():
            target = os.path.realpath(os.path.join(extract_root, member.name))
            if target != extract_root and not target.startswith(allowed_prefix):
                raise ValueError(
                    "Refusing to extract %r from %s: it would be written "
                    "outside of %s" % (member.name, dest_filename, extract_root)
                )
        tarf.extractall(root_dir)