How to use the lineflow.download.get_cache_directory function in lineflow

To help you get started, we’ve selected a few lineflow examples based on popular ways the library is used in public projects.


github tofunlp / lineflow / tests / test_download.py
def test_get_cache_directory(self):
        root = download.get_cache_root()
        path = download.get_cache_directory('test', False)
        self.assertEqual(path, os.path.join(root, 'test'))
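
The test pins down the behavior this page is about: get_cache_directory joins a name onto the cache root returned by get_cache_root, and the second argument (False here) skips creating the directory on disk. Below is a minimal sketch of the same calls outside the test suite; the dataset name is illustrative, and the default directory-creation behavior is an assumption based on how the dataset helpers further down use the function:

import os

from lineflow import download

root = download.get_cache_root()                           # cache root; exact location depends on your setup
path = download.get_cache_directory('my_dataset', False)   # compute the path only, as in the test
assert path == os.path.join(root, 'my_dataset')

cache_dir = download.get_cache_directory('my_dataset')     # default call, assumed to create the directory
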
github tofunlp / lineflow / lineflow / datasets / imdb.py
def get_imdb() -> Dict[str, List[str]]:

    url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    root = download.get_cache_directory(os.path.join('datasets'))

    def creator(path):
        archive_path = download.cached_download(url)
        with tarfile.open(archive_path, 'r') as archive:
            print(f'Extracting to {root}...')
            archive.extractall(root)

        extracted_path = os.path.join(root, 'aclImdb')

        dataset = {}
        for split in ('train', 'test'):
            pos_path = os.path.join(extracted_path, split, 'pos')
            neg_path = os.path.join(extracted_path, split, 'neg')
            dataset[split] = [x.path for x in os.scandir(pos_path)
                              if x.is_file() and x.name.endswith('.txt')] + \
                             [x.path for x in os.scandir(neg_path)
                              if x.is_file() and x.name.endswith('.txt')]
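
Every dataset helper on this page follows the same recipe: pick a directory with get_cache_directory, fetch files once with cached_download, and define a creator that parses and pickles the data plus a loader that reads the pickle back. Below is a rough sketch of how such a pair is typically wired together; cache_or_load_file and the pickle filename are assumptions here (the API is modeled on Chainer's download module), not something shown in the snippet above:

import os
import pickle

from lineflow import download

def get_my_dataset():
    # Hypothetical dataset; the directory lives under the lineflow cache root.
    root = download.get_cache_directory(os.path.join('datasets', 'my_dataset'))

    def creator(path):
        # Download and parse here, then pickle the result to `path`.
        dataset = {'train': [], 'test': []}
        with open(path, 'wb') as f:
            pickle.dump(dataset, f)
        return dataset

    def loader(path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    # Assumed helper: run creator on the first call, loader on later calls.
    pkl_path = os.path.join(root, 'my_dataset.pkl')
    return download.cache_or_load_file(pkl_path, creator, loader)
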
github tofunlp / lineflow / lineflow / datasets / small_parallel_enja.py
def get_small_parallel_enja() -> Dict[str, Tuple[List[str]]]:

    en_url = 'https://raw.githubusercontent.com/odashi/small_parallel_enja/master/{}.en'
    ja_url = 'https://raw.githubusercontent.com/odashi/small_parallel_enja/master/{}.ja'
    root = download.get_cache_directory(os.path.join('datasets', 'small_parallel_enja'))

    def creator(path):
        dataset = {}
        for split in ('train', 'dev', 'test'):
            en_path = download.cached_download(en_url.format(split))
            ja_path = download.cached_download(ja_url.format(split))
            with io.open(en_path, 'rt') as en, io.open(ja_path, 'rt') as ja:
                dataset[split] = [(x.rstrip(os.linesep), y.rstrip(os.linesep))
                                  for x, y in zip(en, ja)]

        with io.open(path, 'wb') as f:
            pickle.dump(dataset, f)
        return dataset

    def loader(path):
        with io.open(path, 'rb') as f:
            return pickle.load(f)
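
After the first run, the pickled dictionary maps each split name to a list of (English, Japanese) sentence pairs. A hypothetical usage sketch:

raw = get_small_parallel_enja()        # downloads on the first call, cached afterwards
en_sentence, ja_sentence = raw['train'][0]
print(en_sentence, '|||', ja_sentence)
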
github tofunlp / lineflow / lineflow / datasets / wikitext.py
def get_wikitext(name: str) -> Dict[str, Union[easyfile.TextFile, List]]:

    url = f'https://s3.amazonaws.com/research.metamind.io/wikitext/{name}-v1.zip'
    root = download.get_cache_directory(os.path.join('datasets', 'wikitext'))

    def list_creator(path):
        archive_path = download.cached_download(url)
        with zipfile.ZipFile(archive_path, 'r') as archive:
            dataset = {}
            path2key = {f'{name}/wiki.train.tokens': 'train',
                        f'{name}/wiki.valid.tokens': 'dev',
                        f'{name}/wiki.test.tokens': 'test'}
            for p, key in path2key.items():
                print(f'Extracting {p}...')
                with archive.open(p) as f:
                    lines = [line.decode('utf-8').rstrip(os.linesep) for line in f]
                dataset[key] = lines

        with io.open(path, 'wb') as f:
            pickle.dump(dataset, f)
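
The name argument is substituted into both the download URL and the archive member paths, so it has to match one of the WikiText archive names behind that URL scheme. A hedged example call; 'wikitext-2' is inferred from the {name}-v1.zip pattern rather than shown in the snippet:

data = get_wikitext('wikitext-2')      # 'wikitext-103' would follow the same pattern
print(len(data['train']), len(data['dev']), len(data['test']))
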
github tofunlp / lineflow / lineflow / datasets / squad.py
def get_squad(version: int) -> Dict[str, List]:
    version_str = 'v1.1' if version == 1 else 'v2.0'

    train_url = f'https://raw.githubusercontent.com/rajpurkar/SQuAD-explorer/master/dataset/train-{version_str}.json'
    dev_url = f'https://raw.githubusercontent.com/rajpurkar/SQuAD-explorer/master/dataset/dev-{version_str}.json'
    root = download.get_cache_directory(os.path.join('datasets', 'squad'))

    def creator(path):
        train_path = download.cached_download(train_url)
        dev_path = download.cached_download(dev_url)

        dataset = {}
        for split in ('train', 'dev'):
            data_path = train_path if split == 'train' else dev_path
            with io.open(data_path, 'rt', encoding='utf-8') as f:
                data = json.load(f)['data']
            temp = []
            for x in data:
                title = x['title']
                for paragraph in x['paragraphs']:
                    context = paragraph['context']
                    for qa in paragraph['qas']:
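
get_squad only switches URLs: version 1 selects the v1.1 JSON files and any other value falls through to v2.0; both splits are then flattened from the nested data/paragraphs/qas structure. A hypothetical usage sketch:

squad_v1 = get_squad(1)                # SQuAD v1.1
squad_v2 = get_squad(2)                # SQuAD v2.0
print(len(squad_v1['train']), len(squad_v1['dev']))
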
github tofunlp / lineflow / lineflow / datasets / commonsenseqa.py
def get_commonsenseqa() -> Dict[str, List[str]]:
    train_url = "https://s3.amazonaws.com/commensenseqa/train_rand_split.jsonl"
    dev_url = "https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl"
    test_url = "https://s3.amazonaws.com/commensenseqa/test_rand_split_no_answers.jsonl"
    root = download.get_cache_directory(os.path.join("datasets", "commonsenseqa"))

    def creator(path):
        train_path = download.cached_download(train_url)
        dev_path = download.cached_download(dev_url)
        test_path = download.cached_download(test_url)

        dataset = {}
        for split in ("train", "dev", "test"):
            data_path = {"train": train_path, "dev": dev_path, "test": test_path}[split]
            with io.open(data_path, "rt", encoding="utf-8") as f:
                data = [json.loads(line) for line in f.readlines()]
            temp = []
            for x in data:
                answer_key = x["answerKey"] if split != "test" else ""
                options = {choice["label"]: choice["text"] for choice in x["question"]["choices"]}
                stem = x["question"]["stem"]
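
Each line of the downloaded files is a CommonsenseQA record with an answer key, a question stem, and labelled choices; the test split ships without answers, which is why answer_key falls back to an empty string. A small parsing illustration on an invented record (the values exist only to show the shape of one decoded jsonl line):

record = {
    "answerKey": "A",
    "question": {
        "stem": "Where would you keep a book?",
        "choices": [{"label": "A", "text": "shelf"},
                    {"label": "B", "text": "oven"}],
    },
}
options = {choice["label"]: choice["text"] for choice in record["question"]["choices"]}
stem = record["question"]["stem"]
print(stem, '->', options[record["answerKey"]])
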
github tofunlp / lineflow / lineflow / datasets / msr_paraphrase.py
def get_msr_paraphrase() -> Dict[str, List[Dict[str, str]]]:

    url = 'https://raw.githubusercontent.com/wasiahmad/paraphrase_identification/master/dataset/msr-paraphrase-corpus/msr_paraphrase_{}.txt'  # NOQA
    root = download.get_cache_directory(os.path.join('datasets', 'msr_paraphrase'))

    def creator(path):
        dataset = {}
        fieldnames = ('quality', 'id1', 'id2', 'string1', 'string2')
        for split in ('train', 'test'):
            data_path = download.cached_download(url.format(split))
            with io.open(data_path, 'r', encoding='utf-8') as f:
                f.readline()  # skip header
                reader = csv.DictReader(f, delimiter='\t', fieldnames=fieldnames)
                dataset[split] = [dict(row) for row in reader]

        with io.open(path, 'wb') as f:
            pickle.dump(dataset, f)
        return dataset

    def loader(path):
        with io.open(path, 'rb') as f:
            return pickle.load(f)
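
Each tab-separated row becomes a dict keyed by the fieldnames tuple, so the paraphrase pair and its label can be addressed by name. A hypothetical usage sketch:

data = get_msr_paraphrase()
row = data['train'][0]
print(row['quality'], row['string1'], row['string2'])
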
github tofunlp / lineflow / lineflow / datasets / penn_treebank.py
def get_penn_treebank() -> Dict[str, List[str]]:

    url = 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.{}.txt'
    root = download.get_cache_directory(os.path.join('datasets', 'ptb'))

    def creator(path):
        dataset = {}
        for split in ('train', 'dev', 'test'):
            data_path = download.cached_download(url.format(split if split != 'dev' else 'valid'))
            with io.open(data_path, 'rt') as f:
                dataset[split] = [line.rstrip(os.linesep) for line in f]

        with io.open(path, 'wb') as f:
            pickle.dump(dataset, f)
        return dataset

    def loader(path):
        with io.open(path, 'rb') as f:
            return pickle.load(f)
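
Note the split naming: the upstream PTB files are train/valid/test, so the helper requests 'valid' when it builds the 'dev' split and keys the result as train/dev/test. A hedged end-to-end sketch:

ptb = get_penn_treebank()              # downloads the three files once, then loads the pickle
for split in ('train', 'dev', 'test'):
    print(split, len(ptb[split]))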