How to use the allennlp.data.dataset_readers.dataset_reader.DatasetReader class in allennlp

To help you get started, we’ve selected a few allennlp examples, based on popular ways it is used in public projects.


github serrano-s / attn-tests / textcat / textcat_reader.py
import logging

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field, ListField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.common.checks import ConfigurationError
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from .sentence_tokenizer import SentenceTokenizer
from allennlp.data.tokenizers.word_filter import StopwordFilter, PassThroughWordFilter

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


@DatasetReader.register("textcat")
class TextCatReader(DatasetReader):
    """
    Reads tokens and their topic labels.

    Assumes that the data in the file_path provided to _read is tab-separated, containing (at least) the two
    fields 'tokens' and 'category', in no particular order, with each document/label pair on a single line.
    (This means that documents must not contain newlines or tabs.)

    Example:

    category    tokens
    sample_label_1  This is a document. It contains a couple of sentences.
    sample_label_1  This is another document. It also contains two sentences.
    sample_label_2  This document has a different label.

    and so on.
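
The tab-separated layout described in that docstring maps directly onto a small DatasetReader. The sketch below is not the repository's implementation, just a minimal illustration of the same pattern against the pre-1.0 AllenNLP API used in the snippet; the registered name "simple_textcat" and the field names are illustrative.

from typing import Dict

from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import Field, LabelField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer


@DatasetReader.register("simple_textcat")
class SimpleTextCatReader(DatasetReader):
    """Minimal reader for a tab-separated file with 'category' and 'tokens' columns."""

    def __init__(self, lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = WordTokenizer()
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}

    def _read(self, file_path: str):
        with open(file_path, "r") as data_file:
            # The first line is the header; columns may appear in any order.
            header = data_file.readline().rstrip("\n").split("\t")
            for line in data_file:
                row = dict(zip(header, line.rstrip("\n").split("\t")))
                yield self.text_to_instance(text=row["tokens"], category=row["category"])

    def text_to_instance(self, text: str, category: str = None) -> Instance:
        tokens = self._tokenizer.tokenize(text)
        fields: Dict[str, Field] = {"tokens": TextField(tokens, self._token_indexers)}
        if category is not None:
            fields["label"] = LabelField(category)
        return Instance(fields)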
github eitanhaimashiah / multibidaf / multibidaf / dataset_readers / multirc_reader.py
import logging

from allennlp.common import Params
from overrides import overrides

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer

from multibidaf.dataset_readers import util

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


@DatasetReader.register("multirc")
class MultiRCDatasetReader(DatasetReader):
    """
    Reads a JSON-formatted MultiRC file and returns a ``Dataset`` where the ``Instances`` have four
    fields: ``question``, a ``TextField``, ``passage``, another ``TextField``, and ``spans``, a
    ``ListField`` of ``IndexField``s representing the start token indices of the spans in ``passage`` required to
    answer the ``question``. We also add a ``MetadataField`` that stores the instance's ID, the question
    ID, the passage ID, the original passage text, the question tokens, the passage tokens, the gold
    answer strings, the gold answer labels, the token offsets into the original passage, and the start
    token indices of each sentence in the paragraph, accessible as ``metadata['qid']``,
    ``metadata['pid']``, ``metadata['original_passage']``, ``metadata['question_tokens']``,
    ``metadata['passage_tokens']``, ``metadata['answer_texts']``, ``metadata['answer_labels']``,
    ``metadata['token_offsets']``, and ``metadata['sentence_start_list']`` respectively. This is so that
    we can more easily use the official MultiRC evaluation script to get metrics.

    Parameters
    ----------
    tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
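
Since the gold answers, labels, and offsets live in the metadata, evaluation code typically pulls them back out of each Instance. A hedged usage sketch follows, assuming the class above is importable from the module shown in the header, that it can be constructed with its defaults, that the MetadataField is stored under the key "metadata", and that the data path points at a hypothetical MultiRC-format file:

from multibidaf.dataset_readers.multirc_reader import MultiRCDatasetReader

# Default tokenizer and token indexers, per the Parameters section above.
reader = MultiRCDatasetReader()
instances = reader.read("data/multirc/train.json")  # hypothetical path

for instance in instances:
    metadata = instance.fields["metadata"].metadata  # field key assumed to be "metadata"
    print(metadata["qid"], metadata["answer_texts"], metadata["answer_labels"])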
github vered1986 / NC_embeddings / source / evaluation / compute_any_vector.py
import logging

logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()])
logger = logging.getLogger(__name__)

from typing import Dict
from overrides import overrides

from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.fields import LabelField, TextField, IndexField, SpanField


@DatasetReader.register("nc_paraphrases_data_reader_single_words")
class NCParaphraseDatasetReaderForWords(DatasetReader):
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None) -> None:
        super().__init__(lazy)
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._tokenizer = tokenizer or WordTokenizer()

    @overrides
    def _read(self, file_path):
        pass

    @overrides
    def text_to_instance(self, nc: str) -> Instance:
        nc = nc.replace('_', ' ')
        tokenized_nc = self._tokenizer.tokenize(nc)
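
The snippet is cut off inside text_to_instance; the usual pattern for a reader like this is to wrap the tokenized compound in a TextField and return a one-field Instance. Below is a standalone sketch of that pattern, not the repository's code (the field name "nc" is illustrative):

from typing import Dict

from allennlp.data.fields import Field, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Tokenizer, WordTokenizer


def nc_to_instance(nc: str,
                   tokenizer: Tokenizer = None,
                   token_indexers: Dict[str, TokenIndexer] = None) -> Instance:
    """Turn a noun compound such as 'olive_oil' into a single-TextField Instance."""
    tokenizer = tokenizer or WordTokenizer()
    token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    nc = nc.replace('_', ' ')
    tokenized_nc = tokenizer.tokenize(nc)
    fields: Dict[str, Field] = {"nc": TextField(tokenized_nc, token_indexers)}
    return Instance(fields)


print(nc_to_instance("olive_oil"))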
github SparkJiao / SLQA / coca-qa / coca_reader.py
import json
import logging
from typing import Dict

from overrides import overrides

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.dataset_readers.reading_comprehension import util
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
from allennlp.data.fields import Field, TextField, IndexField, \
    MetadataField, LabelField, ListField

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


@DatasetReader.register("cocaqa")
class CocaQAReader(DatasetReader):

    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
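
Because _read routes file_path through cached_path, the same reader accepts either a local path or a URL; remote files are downloaded once and cached. A hedged usage sketch (the module import mirrors the snippet's location, and both data paths are illustrative):

from coca_reader import CocaQAReader  # assumes coca-qa/coca_reader.py is on the path

reader = CocaQAReader()  # default WordTokenizer and single-id indexers, per __init__ above

# cached_path() passes local paths through unchanged ...
local_instances = reader.read("data/cocaqa/train.json")

# ... and downloads and caches URLs before reading them.
remote_instances = reader.read("https://example.com/cocaqa/train.json")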
github dangitstam / topic-rnn / library / dataset_readers / imdb_review_reader.py
import logging

from allennlp.common.file_utils import cached_path
from allennlp.common.util import END_SYMBOL, START_SYMBOL
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from overrides import overrides

import ujson

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


@DatasetReader.register("imdb_review_language_modeling_reader")
class IMDBReviewLanguageModelingReader(DatasetReader):
    """
    Reads the 100K IMDB dataset in the format in which it appears at
    http://ai.stanford.edu/~amaas/data/sentiment/
    (i.e. this reader expects the full path to the directory produced by
     extracting the tar).

    The paper uses strict partitions instead of a sliding window when evaluating TopicRNN as a
    language model, to allow fair comparison against other LMs. The variational distribution then
    receives only the previous BPTT-length batch of words when recomputing the Gaussian parameters.

    This dataset reader should not be used for training; it should only be used for evaluation.

    Each ``read`` yields a data instance of
        text: A backpropagation-through-time length portion of the review text as a ``TextField``
        stopless_word_frequencies: A ``torch.FloatTensor`` representing the normalized frequencies
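
The "strict partitions" mentioned above amount to cutting each review into consecutive, non-overlapping BPTT-length chunks rather than sliding a window over the text. A minimal sketch of that chunking, independent of the reader (the BPTT limit of 35 is illustrative):

from typing import List


def strict_partitions(tokens: List[str], bptt_limit: int = 35) -> List[List[str]]:
    """Split a tokenized review into consecutive, non-overlapping BPTT-length chunks."""
    return [tokens[i:i + bptt_limit] for i in range(0, len(tokens), bptt_limit)]


review = "this film starts slowly but the final act is genuinely moving".split()
for chunk in strict_partitions(review, bptt_limit=4):
    print(chunk)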
github allenai / allennlp-bert-qa-wrapper / pretrained_bert / dataset_reader.py
import collections
import logging
from typing import List

import torch
from overrides import overrides
from pytorch_pretrained_bert import BertTokenizer

from allennlp.common.file_utils import cached_path
from allennlp.data.fields import MetadataField
from allennlp.data.instance import Instance
from allennlp.data.dataset_readers.dataset_reader import DatasetReader

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


@DatasetReader.register("squad_for_pretrained_bert")
class SquadReaderForPretrainedBert(DatasetReader):
    def __init__(self,
                 pretrained_bert_model_file: str,
                 lazy: bool = False,
                 max_query_length: int = 64,
                 max_sequence_length: int = 384,
                 document_stride: int = 128) -> None:
        super().__init__(lazy)
        self._tokenizer = BertTokenizer.from_pretrained(pretrained_bert_model_file)
        self._max_query_length = max_query_length
        self._max_sequence_length = max_sequence_length
        self._document_stride = document_stride

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
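
In an AllenNLP experiment, the constructor arguments shown above would normally arrive through the dataset_reader block of the training configuration. A hedged sketch of the equivalent programmatic construction (the values mirror the defaults above; "bert-base-uncased" is an illustrative model identifier, and the registering module must already have been imported for the type name to resolve):

from allennlp.common import Params
from allennlp.data.dataset_readers.dataset_reader import DatasetReader

# Equivalent to a `"dataset_reader": {...}` block in a training config.
reader = DatasetReader.from_params(Params({
    "type": "squad_for_pretrained_bert",
    "pretrained_bert_model_file": "bert-base-uncased",
    "max_query_length": 64,
    "max_sequence_length": 384,
    "document_stride": 128,
}))

instances = reader.read("data/squad/dev-v1.1.json")  # hypothetical path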
github allenai / allennlp / allennlp / data / dataset_readers / next_token_lm.py
import logging

from allennlp.data.instance import Instance
from allennlp.data.tokenizers.tokenizer import Tokenizer
from allennlp.data.tokenizers import Token
from allennlp.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.token_indexers.token_indexer import TokenIndexer
from allennlp.data.fields import Field, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

logger = logging.getLogger(__name__)


@DatasetReader.register("next_token_lm")
class NextTokenLmReader(DatasetReader):
    """
    Creates ``Instances`` suitable for use in predicting a single next token using a language
    model.  The :class:`Field` s that we create are the following: an input ``TextField`` and a
    target token ``TextField`` (we only ever have a single token, but we use a ``TextField`` so we
    can index it the same way as our input, typically with a single
    ``PretrainedTransformerIndexer``).

    NOTE: This is not fully functional!  It was written to put together a demo for interpreting and
    attacking language models, not for actually training anything.  It would be a really bad idea
    to use this setup for training language models, as it would be incredibly inefficient.  The
    only purpose of this class is for a demo.

    Parameters
    ----------
    tokenizer : ``Tokenizer``, optional (default=``WhitespaceTokenizer()``)
        We use this ``Tokenizer`` for the text.  See :class:`Tokenizer`.
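
A hedged sketch of the field layout that docstring describes: one TextField for the input text and a single-token TextField for the target, sharing the same token indexers so both are indexed identically. The field names used here are illustrative rather than taken from the reader:

from typing import Dict

from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
token_indexers: Dict[str, TokenIndexer] = {"tokens": SingleIdTokenIndexer()}

sentence = "the keys to the cabinet"
next_token = "are"

input_field = TextField(tokenizer.tokenize(sentence), token_indexers)
# The target is a single token, but wrapping it in a TextField means it can be
# indexed with exactly the same indexers as the input.
target_field = TextField([Token(next_token)], token_indexers)

instance = Instance({"tokens": input_field, "target_ids": target_field})
print(instance)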
github allenai / allennlp / allennlp / data / dataset_readers / multiprocess_dataset_reader.py
    def __del__(self) -> None:
        """
        Terminate processes if the user hasn't joined. This is necessary as
        leaving stray processes running can corrupt shared state. In brief,
        we've observed shared memory counters being reused (when the memory was
        free from the perspective of the parent process) while the stray
        workers still held a reference to them.

        For a discussion of using destructors in Python in this manner, see
        https://eli.thegreenplace.net/2009/06/12/safely-using-destructors-in-python/.
        """
        for process in self.processes:
            process.terminate()


@DatasetReader.register("multiprocess")
class MultiprocessDatasetReader(DatasetReader):
    """
    Wraps another dataset reader and uses it to read from multiple input files
    using multiple processes. Note that in this case the ``file_path`` passed to ``read()``
    should be a glob, and that the dataset reader will return instances from all files
    matching the glob.

    The order in which the files are processed is a function of NumPy's random state, up to the
    non-determinism caused by using multiple worker processes. This can be avoided by setting
    ``num_workers`` to 1.

    Parameters
    ----------
    base_reader : ``DatasetReader``
        Each process will use this dataset reader to read zero or more files.
    num_workers : ``int``
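
A hedged sketch of how the wrapper is typically configured: base_reader is any registered reader (the built-in "sequence_tagging" reader is an illustrative choice here), and the path handed to read() is a glob over the input shards rather than a single file:

from allennlp.common import Params
from allennlp.data.dataset_readers.dataset_reader import DatasetReader

reader = DatasetReader.from_params(Params({
    "type": "multiprocess",
    "base_reader": {"type": "sequence_tagging"},  # illustrative wrapped reader
    "num_workers": 4,
}))

# The glob is hypothetical; instances are produced lazily by the worker processes
# from every file that matches it.
instances = reader.read("data/shards/train-*.tsv")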
github allenai / allennlp / allennlp / data / dataset_readers / reading_comprehension / triviaqa.py
import logging
import tarfile
from typing import Dict, List, Tuple

from overrides import overrides

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.dataset_readers.reading_comprehension import util
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer

logger = logging.getLogger(__name__)


@DatasetReader.register("triviaqa")
class TriviaQaReader(DatasetReader):
    """
    Reads the TriviaQA dataset into a ``Dataset`` containing ``Instances`` with four fields:
    ``question`` (a ``TextField``), ``passage`` (another ``TextField``), ``span_start``, and
    ``span_end`` (both ``IndexFields``).

    TriviaQA is split up into several JSON files defining the questions, and a large number of text
    files containing crawled web documents.  We read these from a gzipped tarball to avoid having
    millions of individual files on a filesystem.

    Because we need to read both train and validation files from the same tarball, we take the
    tarball itself as a constructor parameter, and take the question file as the argument to
    ``read``.  This means that you should give the path to the tarball in the ``dataset_reader``
    parameters in your experiment configuration file, and something like ``"wikipedia-train.json"``
    for the ``train_data_path`` and ``validation_data_path``.
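
A hedged sketch of the configuration split that docstring describes, written as a Python Params fragment: the tarball path lives in the dataset_reader block, while the question files are the data paths. The constructor parameter name base_tarball_path is an assumption based on released versions of this reader, and the tarball path itself is illustrative:

from allennlp.common import Params

config = Params({
    "dataset_reader": {
        "type": "triviaqa",
        # Parameter name assumed; the tarball path is illustrative.
        "base_tarball_path": "/data/triviaqa/triviaqa-rc.tar.gz",
    },
    # Question files inside the tarball, passed to read() for each split.
    "train_data_path": "wikipedia-train.json",
    "validation_data_path": "wikipedia-dev.json",
})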
github plkmo / NLP_Toolkit / nlptoolkit / gec / models / gector / datareader.py
import logging

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import TextField, SequenceLabelField, MetadataField, Field
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from overrides import overrides

from .utils.helpers import SEQ_DELIMETERS, START_TOKEN

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


@DatasetReader.register("seq2labels_datareader")
class Seq2LabelsDatasetReader(DatasetReader):
    """
    Reads instances from a pretokenised file where each line is in the following format:

    WORD###TAG [TAB] WORD###TAG [TAB] ..... \n

    and converts it into a ``Dataset`` suitable for sequence tagging. You can also specify
    alternative delimiters in the constructor.

    Parameters
    ----------
    delimiters: ``dict``
        The dictionary with all delimiters.
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text.  See :class:`TokenIndexer`.
        Note that the `output` tags will always correspond to single token IDs based on how they
        are pre-tokenised in the data file.
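
A hedged sketch of how a single line in that WORD###TAG format can be split into parallel token and tag sequences and wrapped in AllenNLP fields. The '###' and tab delimiters are hard-coded here for illustration (the real reader takes them from SEQ_DELIMETERS and the constructor), and the field names and example tags are illustrative:

from typing import Dict

from allennlp.data.fields import SequenceLabelField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token


def line_to_instance(line: str,
                     token_indexers: Dict[str, TokenIndexer] = None) -> Instance:
    """Split 'WORD###TAG<TAB>WORD###TAG...' into aligned token and tag sequences."""
    token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    pairs = [pair.rsplit("###", 1) for pair in line.strip().split("\t")]
    tokens = [Token(word) for word, _ in pairs]
    tags = [tag for _, tag in pairs]

    text_field = TextField(tokens, token_indexers)
    return Instance({
        "tokens": text_field,
        "labels": SequenceLabelField(tags, text_field),
    })


print(line_to_instance("The###DT\tquick###JJ\tfoxes###NNS\tjump###VBP"))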