import logging

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field, ListField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.common.checks import ConfigurationError
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from .sentence_tokenizer import SentenceTokenizer
from allennlp.data.tokenizers.word_filter import StopwordFilter, PassThroughWordFilter
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
@DatasetReader.register("textcat")
class TextCatReader(DatasetReader):
"""
Reads tokens and their topic labels.
Assumes that the data in the ``file_path`` provided to ``_read`` is tab-separated, containing
(at least) the two fields 'tokens' and 'category', in no particular order, with one
document/label pair per line. (This means documents must not contain newlines or tabs.)
Example:
category tokens
sample_label_1 This is a document. It contains a couple of sentences.
sample_label_1 This is another document. It also contains two sentences.
sample_label_2 This document has a different label.
and so on.
"""
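# --- Example: building an Instance from one tab-separated line (illustrative sketch) ---
# A minimal sketch of how a single line in the format described above could be
# turned into an ``Instance``. The column order, field names, and use of
# ``WordTokenizer`` are assumptions for illustration, not the reader's actual code.
from allennlp.data.fields import LabelField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer

def example_line_to_instance(line: str) -> Instance:
    tokenizer = WordTokenizer()
    indexers = {"tokens": SingleIdTokenIndexer()}
    # Assume the 'category' column comes first and the 'tokens' column second.
    category, text = line.rstrip("\n").split("\t", 1)
    tokens = tokenizer.tokenize(text)
    return Instance({"tokens": TextField(tokens, indexers),
                     "category": LabelField(category)})

# example_line_to_instance("sample_label_1\tThis is a document. It contains a couple of sentences.")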
import logging

from allennlp.common import Params
from overrides import overrides
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
from multibidaf.dataset_readers import util
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
@DatasetReader.register("multirc")
class MultiRCDatasetReader(DatasetReader):
"""
Reads a JSON-formatted MultiRC file and returns a ``Dataset`` where the ``Instances`` have four
fields: ``question``, a ``TextField``, ``passage``, another ``TextField``, and ``spans``, a
``ListField`` of ``IndexField`` representing start token indices of spans in ``passage`` required to
answer the ``question``. We also add a ``MetadataField`` that stores the instance's ID, the question
ID, the passage ID, the original passage text, the question tokens, the passage tokens, the gold
answer strings, the gold answer labels, the token offsets into the original passage, and the start
token indices of each sentence in the paragraph, accessible as ``metadata['qid']``,
``metadata['pid']``, ``metadata['original_passage']``, ``metadata['question_tokens']``,
``metadata['passage_tokens']``, ``metadata['answer_texts']``, ``metadata['answer_labels']``,
``metadata['token_offsets']``, and ``metadata['sentence_start_list']`` respectively. This is so that
we can more easily use the official MultiRC evaluation script to get metrics.
Parameters
----------
tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
"""
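# --- Example: the field layout described above (illustrative sketch) ---
# A minimal sketch of an Instance with the documented fields: two ``TextField``s,
# a ``ListField`` of ``IndexField``s pointing into the passage, and a
# ``MetadataField`` with the documented keys. All values are made up for
# illustration, and the tokenizer/indexer choices are assumptions.
from allennlp.data.fields import IndexField, ListField, MetadataField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer

tokenizer = WordTokenizer()
indexers = {"tokens": SingleIdTokenIndexer()}

passage_text = "The author claimed nothing. She only asked questions."
question_field = TextField(tokenizer.tokenize("What did the author do?"), indexers)
passage_field = TextField(tokenizer.tokenize(passage_text), indexers)
sentence_starts = [0, 5]  # start token index of each sentence in the passage
spans_field = ListField([IndexField(start, passage_field) for start in sentence_starts])
metadata_field = MetadataField({
    "qid": "q-0",
    "pid": "p-0",
    "original_passage": passage_text,
    "question_tokens": [t.text for t in question_field.tokens],
    "passage_tokens": [t.text for t in passage_field.tokens],
    "answer_texts": ["She only asked questions."],
    "answer_labels": [1],
    "token_offsets": [],
    "sentence_start_list": sentence_starts,
})
instance = Instance({"question": question_field, "passage": passage_field,
                     "spans": spans_field, "metadata": metadata_field})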
import logging
from typing import Dict

from overrides import overrides

from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.fields import LabelField, TextField, IndexField, SpanField

logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()])
logger = logging.getLogger(__name__)
@DatasetReader.register("nc_paraphrases_data_reader_single_words")
class NCParaphraseDatasetReaderForWords(DatasetReader):
def __init__(self,
token_indexers: Dict[str, TokenIndexer] = None,
lazy: bool = False,
tokenizer: Tokenizer = None) -> None:
super().__init__(lazy)
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
self._tokenizer = tokenizer or WordTokenizer()
@overrides
def _read(self, file_path):
pass
@overrides
def text_to_instance(self, nc: str) -> Instance:
nc = nc.replace('_', ' ')
tokenized_nc = self._tokenizer.tokenize(nc)
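# Assumed completion -- the original snippet is truncated here. A typical reader
# would wrap the tokens in a TextField and return an Instance; the field name
# "nc" is a guess for illustration, not the project's actual code.
nc_field = TextField(tokenized_nc, self._token_indexers)
return Instance({"nc": nc_field})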
import json
import logging
from typing import Dict

from overrides import overrides
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.dataset_readers.reading_comprehension import util
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
from allennlp.data.fields import Field, TextField, IndexField, \
MetadataField, LabelField, ListField
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
@DatasetReader.register("cocaqa")
class CocaQAReader(DatasetReader):
def __init__(self,
tokenizer: Tokenizer = None,
token_indexers: Dict[str, TokenIndexer] = None,
lazy: bool = False) -> None:
super().__init__(lazy)
self._tokenizer = tokenizer or WordTokenizer()
self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
@overrides
def _read(self, file_path: str):
# if `file_path` is a URL, redirect to the cache
file_path = cached_path(file_path)
logger.info("Reading file at %s", file_path)
with open(file_path) as dataset_file:
dataset_json = json.load(dataset_file)
import logging

from allennlp.common.file_utils import cached_path
from allennlp.common.util import END_SYMBOL, START_SYMBOL
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from overrides import overrides
import ujson
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
@DatasetReader.register("imdb_review_language_modeling_reader")
class IMDBReviewLanguageModelingReader(DatasetReader):
"""
Reads the 100K IMDB dataset in the format in which it appears at
http://ai.stanford.edu/~amaas/data/sentiment/
(i.e. this reader expects the full path to the directory produced by
extracting the tar).
The paper uses strict partitions instead of a sliding window when evaluating TopicRNN as a
language model to allow fair comparison against other LMs. The variational distribution will
then only receive the previous BPTT-limit batch of words when recomputing the Gaussian parameters.
This dataset reader should not be used for training; it should only be used for evaluation.
Each ``read`` yields a data instance of
text: A backpropagation-through-time length portion of the review text as a ``TextField``
stopless_word_frequencies: A ``torch.FloatTensor`` representing the normalized frequencies
of the non-stopword tokens in the text.
"""
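# --- Example: normalized stopless word frequencies (illustrative sketch) ---
# A minimal sketch of the quantity described above: count the non-stopword
# tokens of a text and normalize the counts into a frequency vector over an
# assumed "stopless" vocabulary. The vocabulary and stopword list here are
# placeholders, not the reader's actual configuration.
from collections import Counter
from typing import Dict, List

import torch

def stopless_word_frequencies(tokens: List[str],
                              stopless_vocab: Dict[str, int],
                              stopwords: set) -> torch.Tensor:
    counts = Counter(token.lower() for token in tokens
                     if token.lower() not in stopwords and token.lower() in stopless_vocab)
    total = sum(counts.values())
    frequencies = torch.zeros(len(stopless_vocab))
    for word, count in counts.items():
        frequencies[stopless_vocab[word]] = count / total
    return frequencies

# stopless_word_frequencies(["a", "great", "great", "film"],
#                           {"great": 0, "film": 1}, {"a", "the"})
# -> tensor([0.6667, 0.3333])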
import collections
import logging
from typing import List
import torch
from overrides import overrides
from pytorch_pretrained_bert import BertTokenizer
from allennlp.common.file_utils import cached_path
from allennlp.data.fields import MetadataField
from allennlp.data.instance import Instance
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
@DatasetReader.register("squad_for_pretrained_bert")
class SquadReaderForPretrainedBert(DatasetReader):
def __init__(self,
pretrained_bert_model_file: str,
lazy: bool = False,
max_query_length: int = 64,
max_sequence_length: int = 384,
document_stride: int = 128) -> None:
super().__init__(lazy)
self._tokenizer = BertTokenizer.from_pretrained(pretrained_bert_model_file)
self._max_query_length = max_query_length
self._max_sequence_length = max_sequence_length
self._document_stride = document_stride
@overrides
def _read(self, file_path: str):
# if `file_path` is a URL, redirect to the cache
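# Assumed continuation -- the snippet is cut off here. Mirroring the comment on
# the line above and the pattern used by the other readers in this collection:
file_path = cached_path(file_path)
logger.info("Reading file at %s", file_path)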
import logging

from allennlp.data.instance import Instance
from allennlp.data.tokenizers.tokenizer import Tokenizer
from allennlp.data.tokenizers import Token
from allennlp.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.token_indexers.token_indexer import TokenIndexer
from allennlp.data.fields import Field, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
logger = logging.getLogger(__name__)
@DatasetReader.register("next_token_lm")
class NextTokenLmReader(DatasetReader):
"""
Creates ``Instances`` suitable for use in predicting a single next token using a language
model. The :class:`Field` s that we create are the following: an input ``TextField`` and a
target token ``TextField`` (we only ever have a single token, but we use a ``TextField`` so we
can index it the same way as our input, typically with a single
``PretrainedTransformerIndexer``).
NOTE: This is not fully functional! It was written to put together a demo for interpreting and
attacking language models, not for actually training anything. It would be a really bad idea
to use this setup for training language models, as it would be incredibly inefficient. The
only purpose of this class is for a demo.
Parameters
----------
tokenizer : ``Tokenizer``, optional (default=``WhitespaceTokenizer()``)
We use this ``Tokenizer`` for the text. See :class:`Tokenizer`.
"""
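# --- Example: the two fields described above (illustrative sketch) ---
# A minimal sketch of an input ``TextField`` plus a single-token target
# ``TextField`` that shares the same token indexers. The field names and the
# whitespace tokenizer are assumptions for illustration.
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
indexers = {"tokens": SingleIdTokenIndexer()}

sentence = "The quick brown fox"
next_token = "jumps"
instance = Instance({"tokens": TextField(tokenizer.tokenize(sentence), indexers),
                     "target_ids": TextField([Token(next_token)], indexers)})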
def __del__(self) -> None:
"""
Terminate processes if the user hasn't joined. This is necessary as
leaving stray processes running can corrupt shared state. In brief,
we've observed shared memory counters being reused (when the memory was
free from the perspective of the parent process) while the stray
workers still held a reference to them.
For a discussion of using destructors in Python in this manner, see
https://eli.thegreenplace.net/2009/06/12/safely-using-destructors-in-python/.
"""
for process in self.processes:
process.terminate()
@DatasetReader.register("multiprocess")
class MultiprocessDatasetReader(DatasetReader):
"""
Wraps another dataset reader and uses it to read from multiple input files
using multiple processes. Note that in this case the ``file_path`` passed to ``read()``
should be a glob, and that the dataset reader will return instances from all files
matching the glob.
The order the files are processed in is a function of Numpy's random state
up to non-determinism caused by using multiple worker processes. This can
be avoided by setting ``num_workers`` to 1.
Parameters
----------
base_reader : ``DatasetReader``
Each process will use this dataset reader to read zero or more files.
num_workers : ``int``
"""
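# --- Example: wrapping a base reader (illustrative sketch) ---
# A minimal sketch of the usage described above: wrap a base reader, then pass a
# glob to ``read`` so instances come from every matching file. ``MyBaseReader``
# and ``process`` are hypothetical stand-ins for illustration.
base_reader = MyBaseReader(lazy=True)                       # hypothetical base reader
reader = MultiprocessDatasetReader(base_reader=base_reader, num_workers=4)
for instance in reader.read("/data/shards/train-*.jsonl"):  # a glob, not a single file
    process(instance)                                       # hypothetical downstream use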
import logging
import tarfile
from typing import Dict, List, Tuple
from overrides import overrides
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.dataset_readers.reading_comprehension import util
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer
logger = logging.getLogger(__name__)
@DatasetReader.register("triviaqa")
class TriviaQaReader(DatasetReader):
"""
Reads the TriviaQA dataset into a ``Dataset`` containing ``Instances`` with four fields:
``question`` (a ``TextField``), ``passage`` (another ``TextField``), ``span_start``, and
``span_end`` (both ``IndexFields``).
TriviaQA is split up into several JSON files defining the questions, and a lot of text files
containing crawled web documents. We read these from a gzipped tarball, to avoid having to
have millions of individual files on a filesystem.
Because we need to read both train and validation files from the same tarball, we take the
tarball itself as a constructor parameter, and take the question file as the argument to
``read``. This means that you should give the path to the tarball in the ``dataset_reader``
parameters in your experiment configuration file, and something like ``"wikipedia-train.json"``
for the ``train_data_path`` and ``validation_data_path``.
"""
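# --- Example: tarball in the constructor, question file passed to read() (illustrative sketch) ---
# A minimal sketch of the split described above. The keyword name
# ``base_tarball_path`` is an assumption for illustration.
reader = TriviaQaReader(base_tarball_path="/data/triviaqa-rc.tgz")
train_instances = reader.read("wikipedia-train.json")
validation_instances = reader.read("wikipedia-dev.json")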
import logging

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import TextField, SequenceLabelField, MetadataField, Field
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from overrides import overrides
from .utils.helpers import SEQ_DELIMETERS, START_TOKEN
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
@DatasetReader.register("seq2labels_datareader")
class Seq2LabelsDatasetReader(DatasetReader):
"""
Reads instances from a pretokenised file where each line is in the following format:
WORD###TAG [TAB] WORD###TAG [TAB] ..... \n
and converts it into a ``Dataset`` suitable for sequence tagging. You can also specify
alternative delimiters in the constructor.
Parameters
----------
delimiters: ``dict``
The dictionary with all delimiters.
token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
We use this to define the input representation for the text. See :class:`TokenIndexer`.
Note that the `output` tags will always correspond to single token IDs based on how they
are pre-tokenised in the data file.
"""
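# --- Example: parsing one pretokenised line (illustrative sketch) ---
# A minimal sketch of splitting a WORD###TAG [TAB] WORD###TAG line into parallel
# token and tag lists. The hard-coded delimiters mirror the format shown in the
# docstring; the reader itself takes them from SEQ_DELIMETERS or the constructor.
from typing import List, Tuple

def example_parse_line(line: str) -> Tuple[List[str], List[str]]:
    words, tags = [], []
    for pair in line.rstrip("\n").split("\t"):
        word, tag = pair.rsplit("###", 1)
        words.append(word)
        tags.append(tag)
    return words, tags

# example_parse_line("The###TAG_A\tdog###TAG_B\tbarks###TAG_C")
# -> (["The", "dog", "barks"], ["TAG_A", "TAG_B", "TAG_C"])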