How to use the nltk.download function in nltk

To help you get started, we’ve selected a few nltk.download examples based on popular ways the function is used in public projects.

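For quick reference, here is a minimal sketch of the simplest call: nltk.download fetches a named resource (the Punkt tokenizer models in this case, as in most of the examples below) into your NLTK data directory and returns True on success.

import nltk

# Fetch the Punkt sentence tokenizer models into the default NLTK
# data directory (e.g. ~/nltk_data); the call returns True on success.
if nltk.download("punkt"):
    print("punkt is ready to use")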

github microsoft / botbuilder-python / samples / experimental / 101.corebot-bert-bidaf / model / model_corebot101 / bidaf / model_runtime / bidaf_model_runtime.py View on GitHub
def init_bidaf(bidaf_model_dir: str, download_ntlk_punkt: bool = False) -> bool:
        if os.path.isdir(bidaf_model_dir):
            print("bidaf model directory already present..", file=sys.stderr)
        else:
            print("Creating bidaf model directory..", file=sys.stderr)
            os.makedirs(bidaf_model_dir, exist_ok=True)

        # Download Punkt Sentence Tokenizer
        if download_ntlk_punkt:
            nltk.download("punkt", download_dir=bidaf_model_dir)
            nltk.download("punkt")

        # Download bidaf onnx model
        onnx_model_file = os.path.abspath(os.path.join(bidaf_model_dir, "bidaf.onnx"))

        print(f"Checking file {onnx_model_file}..", file=sys.stderr)
        if os.path.isfile(onnx_model_file):
            print("bidaf.onnx downloaded already!", file=sys.stderr)
        else:
            print("Downloading bidaf.onnx...", file=sys.stderr)
            response = requests.get(
                "https://onnxzoo.blob.core.windows.net/models/opset_9/bidaf/bidaf.onnx",
                stream=True,
            )
            with open(onnx_model_file, "wb") as f:
                response.raw.decode_content = True
                shutil.copyfileobj(response.raw, f)
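A note on the download_dir argument used above: NLTK only searches the directories listed in nltk.data.path, so a resource downloaded into a custom location also has to be registered there before tokenizers can find it. A minimal sketch, with a hypothetical directory name:

import nltk

MODEL_DIR = "/tmp/bidaf_model"  # hypothetical custom location

# Fetch punkt into the custom directory...
nltk.download("punkt", download_dir=MODEL_DIR)

# ...and make sure NLTK searches that directory, or lookups will still fail.
if MODEL_DIR not in nltk.data.path:
    nltk.data.path.append(MODEL_DIR)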
github persephone-tools / persephone / persephone / datasets / bkw.py View on GitHub
""" Interface to Alex/Steven's Kunwinjku data. """

from pathlib import Path
import subprocess
from typing import List, Set

import nltk # type: ignore
# TODO This download should be conditional, since a complaint is raised if
# there is no net connection
nltk.download("punkt") # type: ignore
from pympi.Elan import Eaf

from .. import corpus
from .. import config
from ..preprocess.labels import segment_into_tokens
from ..utterance import Utterance
from ..preprocess.labels import LabelSegmenter
from ..corpus import Corpus

BASIC_PHONEMES = set(["a", "b", "d", "dj", "rd", "e", "h", "i", "k", "l",
            "rl", "m", "n", "ng", "nj", "rn", "o", "r", "rr", "u",
            "w", "y",])
DOUBLE_STOPS = set(["bb", "dd", "djdj", "rdd", "kk"])
DIPHTHONGS = set(["ay", "aw", "ey", "ew", "iw", "oy", "ow", "uy"])
PHONEMES = BASIC_PHONEMES | DOUBLE_STOPS | DIPHTHONGS
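The TODO above asks for the download to be made conditional so the module can still be imported without a network connection once the data is in place. A sketch of one way to do that, using nltk.data.find, which raises LookupError when a resource is missing:

import nltk

try:
    # punkt lives under the "tokenizers" section of the NLTK data tree
    nltk.data.find("tokenizers/punkt")
except LookupError:
    # Only touch the network when the tokenizer is not installed yet
    nltk.download("punkt")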
github fhamborg / Giveme5W / extractor / preprocessor.py View on GitHub
def __init__(self, tagger, model):
        """
        Performs all necessary preprocessing

        :param tagger: Path to the Stanford NER Tagger
        :param model: Path to the model for the NER Tagger
        """

        # check if model for tokenizer exists
        try:
            nltk.data.find('punkt.zip')
        except:
            nltk.download('punkt')

        # init NER parser
        self.nerParser = StanfordNERTagger(tagger, model)

        # init Charniak parser
        self.rerankingParser = RerankingParser.fetch_and_load('WSJ+Gigaword-v2')
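The try/except pattern above is the usual guard; note that nltk.data.find raises LookupError, so catching that is narrower than a bare except. nltk.download also takes quiet=True to suppress its console output and returns False on failure, which is handy inside a preprocessing pipeline. A small sketch:

import nltk

# quiet=True silences the downloader's progress messages;
# the boolean return value still reports failures.
if not nltk.download("punkt", quiet=True):
    raise RuntimeError("Could not download the NLTK punkt tokenizer")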
github nlpaueb / SumQE / src / LM_experiments / BERT_NS.py View on GitHub
import json
import math
import numpy as np
import nltk
import os
import torch

from nltk.tokenize import sent_tokenize
from pytorch_pretrained_bert import BertForNextSentencePrediction
from pytorch_pretrained_bert import BertTokenizer
from torch.nn import Softmax

from configuration import CONFIG_DIR
from experiments_output import OUTPUT_DIR

nltk.download('punkt')

CONFIG_PATH = os.path.join(CONFIG_DIR, 'config.json')


def run_bert_ns(data, year, predictions_dict):
    """
    Train the BERT LM_experiments for the Next sentence prediction
    :param data: The actual data of the year stored on dictionary
    :param year: The corresponding year of the data. It is used when we save the predictions
    :param predictions_dict: A dict where we save the predictions from our experiments
    :return:
    """

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    vocab_size = len(tokenizer.vocab)
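The module above imports sent_tokenize and calls nltk.download('punkt') at import time; once the punkt models are in place, sentence splitting needs no further setup. For example:

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt", quiet=True)

text = "BERT scores sentence pairs. Each pair gets a probability."
print(sent_tokenize(text))
# ['BERT scores sentence pairs.', 'Each pair gets a probability.']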
github Pinafore / qb / setup.py View on GitHub
def run(self):
        import nltk
        nltk.download('stopwords')
        nltk.download('punkt')
        nltk.download('wordnet')
        nltk.download('averaged_perceptron_tagger')
        path = 'data/external/nltk_download_SUCCESS'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w') as f:
            f.write('Downloaded nltk: stopwords, punkt, wordnet')
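This setup hook combines several downloads with a marker file so the step is skipped on later builds. A sketch of the same idea, with illustrative paths and resource names:

import os
import nltk

RESOURCES = ["stopwords", "punkt", "wordnet", "averaged_perceptron_tagger"]
SENTINEL = "data/external/nltk_download_SUCCESS"  # marker file, as in the example

if not os.path.exists(SENTINEL):
    for name in RESOURCES:
        nltk.download(name)
    os.makedirs(os.path.dirname(SENTINEL), exist_ok=True)
    with open(SENTINEL, "w") as f:
        f.write("Downloaded nltk: " + ", ".join(RESOURCES))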
github ClimbsRocks / empythy / empythy / utils.py View on GitHub
def load_movie_reviews():

    # movie_reviews is a sizeable corpus to import, so only load it if we have to
    from nltk.corpus import movie_reviews
    try:
        movie_reviews.categories()
    except:
        import nltk
        print('This appears to be your first time using the NLTK Movie Reviews corpus. We will first download the necessary corpus (this is a one-time download that might take a little while)')
        nltk.download('movie_reviews')
        from nltk.corpus import movie_reviews

    raw_data = []

    # NLTK's corpus is structured in an interesting way
    # first iterate through the two categories (pos and neg)
    for category in movie_reviews.categories():

        if category == 'pos':
            pretty_category_name = 'positive'
        elif category == 'neg':
            pretty_category_name = 'negative'

        # each of these categories is just fileids, so grab those
        for fileid in movie_reviews.fileids(category):
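Once the movie_reviews corpus has been downloaded, the loop above works against the standard NLTK corpus reader API. A short usage sketch:

import nltk
from nltk.corpus import movie_reviews

nltk.download("movie_reviews", quiet=True)

for category in movie_reviews.categories():   # 'neg' and 'pos'
    fileids = movie_reviews.fileids(category)
    print(category, len(fileids), "reviews")
    print(movie_reviews.raw(fileids[0])[:60])  # first 60 characters of one review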
github drewgillson / alphabot / generate_words.py View on GitHub
def getWords(min_len):
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    nltk.download('brown')
    nltk.download('punkt')

    source = FreqDist(i.lower() for i in brown.words())
    source = np.array(source.most_common())[:, :1]

    # the Brown corpus contains duplicates and contains
    # words with weird punctuation and digits
    word_list = np.unique(np.char.lower(source))
    p = np.random.permutation(word_list.shape[0])
    word_list = word_list[p]

    words = [word for word in word_list if len(word) == len(set(word)) and re.search("[^A-Za-z\ ]", word) == None]

    output = [word for word in words if len(word) >= min_len and len(word) <= 26 and word[-1:] != 's']
    return output
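The ssl juggling at the top of getWords works around certificate-verification failures that some Python installations hit when nltk.download fetches data over HTTPS. If you need the same workaround, keeping it in one place makes the trade-off explicit; a sketch of a small, hypothetical helper (it disables certificate checks for the whole process, so treat it as a last resort):

import ssl
import nltk

def download_without_ssl_verification(*resources):
    """Last-resort helper: disable certificate verification, then download."""
    try:
        unverified = ssl._create_unverified_context
    except AttributeError:
        pass  # older Python builds: nothing to patch
    else:
        ssl._create_default_https_context = unverified
    return all(nltk.download(name) for name in resources)

download_without_ssl_verification("brown", "punkt")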
github hds-lab / textvisdrg / setup / fabutils / factories.py View on GitHub
def nltk_download():
        """Download required nltk corpora"""
        try:
            import nltk

            if not nltk.download(required_nltk_corpora):
                abort(red('Unable to download nltk corpora: %s' % required_nltk_corpora))
        except ImportError:
            abort(red("Failed to import nltk"))
github Daikon-Sun / MLDS2017 / hw3 / download_datasets.py View on GitHub
# following https://github.com/ryankiros/skip-thoughts#getting-started
        skipthoughts_files = [
            'dictionary.txt', 'utable.npy', 'btable.npy', 'uni_skip.npz', 'uni_skip.npz.pkl', 'bi_skip.npz',
            'bi_skip.npz.pkl',
        ]
        for filename in skipthoughts_files:
            src_url = SKIPTHOUGHTS_BASE_URL + filename
            print(('Downloading ' + src_url))
            urlretrieve(src_url, os.path.join(SKIPTHOUGHTS_DIR, filename),
                        reporthook=dl_progress_hook)

    elif data_name == 'nltk_punkt':
        import nltk
        print('== NLTK pre-trained Punkt tokenizer for English ==')
        nltk.download('punkt')

    elif data_name == 'pretrained_model':
        print('== Pretrained model ==')
        MODEL_DIR = os.path.join(DATA_DIR, 'Models')
        pretrained_model_filename = 'latest_model_flowers_temp.ckpt'
        src_url = 'https://bitbucket.org/paarth_neekhara/texttomimagemodel/raw/74a4bbaeee26fe31e148a54c4f495694680e2c31/' + pretrained_model_filename
        print(('Downloading ' + src_url))
        urlretrieve(
            src_url,
            os.path.join(MODEL_DIR, pretrained_model_filename),
            reporthook=dl_progress_hook,
        )

    else:
        raise ValueError('Unknown dataset name: ' + data_name)
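Finally, nltk.download can also be called with no arguments, which opens the interactive downloader (a GUI window where a display is available, a text menu otherwise) for browsing and installing resources by hand, and nltk.download('all') fetches every available package in one large download. The same downloader is exposed on the command line as python -m nltk.downloader. A sketch:

import nltk

# Browse and install resources interactively (GUI or text menu):
# nltk.download()

# Or fetch everything non-interactively; note that "all" is a large download.
nltk.download("all", quiet=True)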