import logging

from allennlp.common.checks import ConfigurationError
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field, ListField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
from allennlp.data.tokenizers.word_filter import StopwordFilter, PassThroughWordFilter

from .sentence_tokenizer import SentenceTokenizer

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
@DatasetReader.register("textcat")
class TextCatReader(DatasetReader):
"""
Reads tokens and their topic labels.
Assumes that data in file_path provided to _read is tab-separated, containing (at least) the two
fields 'tokens' and 'category', in no particular order, with each document/label on one line.
(So this means that documents must not contain either newlines or tabs.)
Example:
category tokens
sample_label_1 This is a document. It contains a couple of sentences.
sample_label_1 This is another document. It also contains two sentences.
sample_label_2 This document has a different label.
and so on.
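# A minimal sketch of consuming the tab-separated format described above,
# assuming a header row with "category" and "tokens" columns (the file name
# is invented; csv is standard library):
import csv

with open("train.tsv") as data_file:
    for row in csv.DictReader(data_file, delimiter="\t"):
        category, text = row["category"], row["tokens"]
        # each (text, category) pair would become one classification Instance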
    # (the "em" and "f1" metrics use the official script).
    token_spans = set(token_spans)
    if token_spans:
        span_fields = ListField([SpanField(start, end, passage_field)
                                 for start, end in token_spans])
    else:
        # no gold answer span: (-1, -1) is the conventional "no answer" sentinel
        span_fields = ListField([SpanField(-1, -1, passage_field)])
    fields['spans'] = span_fields
    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
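# A minimal sketch of the Counter-based span voting that SQuAD-style readers
# use when annotators disagree: tally (start, end) pairs and keep the most
# frequent one. The span values here are invented for illustration.
from collections import Counter

token_spans = [(12, 14), (12, 14), (11, 14)]
candidate_answers: Counter = Counter()
for span_start, span_end in token_spans:
    candidate_answers[(span_start, span_end)] += 1
span_start, span_end = candidate_answers.most_common(1)[0][0]  # -> (12, 14)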
@DatasetReader.register("squad2")
class Squad2Reader(DatasetReader):
"""
Reads a JSON-formatted SQuAD file and returns a ``Dataset`` where the ``Instances`` have four
fields: ``question``, a ``TextField``, ``passage``, another ``TextField``, and ``span_start``
and ``span_end``, both ``IndexFields`` into the ``passage`` ``TextField``. We also add a
``MetadataField`` that stores the instance's ID, the original passage text, gold answer strings,
and token offsets into the original passage, accessible as ``metadata['id']``,
``metadata['original_passage']``, ``metadata['answer_texts']`` and
``metadata['token_offsets']``. This is so that we can more easily use the official SQuAD
evaluation script to get metrics.
Parameters
----------
multiparagraph : ``bool``, optional (default=``False``)
If ``True``, uses ``util.make_multi_paragraph_reading_comprehension_instance`` to create
a "multi-paragraph" instance (but with only one paragraph) with ``"paragraphs"`` being a
import logging
from typing import Dict, List
from overrides import overrides
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.fields import Field, TextField, ListField, MetadataField, IndexField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
logger = logging.getLogger(__name__)
@DatasetReader.register("qangaroo")
class QangarooReader(DatasetReader):
"""
Reads a JSON-formatted Qangaroo file and returns a ``Dataset`` where the ``Instances`` have six
fields: ``candidates``, a ``ListField[TextField]``, ``query``, a ``TextField``, ``supports``, a
``ListField[TextField]``, ``answer``, a ``TextField``, and ``answer_index``, a ``IndexField``.
We also add a ``MetadataField`` that stores the instance's ID and annotations if they are present.
Parameters
----------
tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
We use this ``Tokenizer`` for both the question and the passage. See :class:`Tokenizer`.
Default is ```WordTokenizer()``.
token_indexers : ``Dict[str, TokenIndexer]``, optional
We similarly use this for both the question and the passage. See :class:`TokenIndexer`.
Default is ``{"tokens": SingleIdTokenIndexer()}``.
"""
    def __init__(self, field: str = None, mapping: Dict[str, str] = None) -> None:
        # constructor header reconstructed from ``from_params`` below, which
        # passes ``field`` and ``mapping``
        self._field = field
        self._mapping = mapping

    def transform(self, field, value) -> str:
        if field == self._field:
            # dict.get takes the default positionally, not as a keyword
            return self._mapping.get(value, value)
        else:
            return value

    @classmethod
    def from_params(cls, params: Params) -> 'FieldPreparator':
        field = params.pop('field', None)
        mapping = params.pop('mapping', {}).as_dict()
        return FieldPreparator(field=field, mapping=mapping)
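# A minimal usage sketch for the FieldPreparator above: map raw label strings
# onto canonical ones for a single field. The field and label names are
# invented for illustration.
preparator = FieldPreparator(field="gold_label",
                             mapping={"pos": "positive", "neg": "negative"})
preparator.transform("gold_label", "pos")    # -> "positive"
preparator.transform("gold_label", "mixed")  # -> "mixed" (not in mapping)
preparator.transform("input", "pos")         # -> "pos" (different field)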
@DatasetReader.register("jsonl_classification_reader")
class JsonlClassificationReader(DatasetReader):
"""
Reads a file from a classification dataset. This data is
formatted as jsonl, one json-formatted instance per line. The keys in the data are
"gold_label", "input", which are configurable in the JSON definition.
Parameters
----------
tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
See :class:`Tokenizer`.
token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
See :class:`TokenIndexer`.
"""
def __init__(self,
input: str,
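# A minimal sketch of one such jsonl line, using the default key names from
# the docstring above (the label and text values are invented):
import json

line = '{"gold_label": "positive", "input": "A quietly moving film."}'
record = json.loads(line)
label, text = record["gold_label"], record["input"]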
fields[u"words"] = tokens
fields[u"pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace=u"pos")
if dependencies is not None:
# We don't want to expand the label namespace with an additional dummy token, so we'll
# always give the 'ROOT_HEAD' token a label of 'root'.
fields[u"head_tags"] = SequenceLabelField([x[0] for x in dependencies],
tokens,
label_namespace=u"head_tags")
fields[u"head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
tokens,
label_namespace=u"head_index_tags")
fields[u"metadata"] = MetadataField({u"words": words, u"pos": upos_tags})
return Instance(fields)
UniversalDependenciesDatasetReader = DatasetReader.register(u"universal_dependencies")(UniversalDependenciesDatasetReader)
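# A minimal sketch of the ``dependencies`` structure the code above expects:
# one (head_tag, head_index) pair per token, with index 0 reserved for the
# ROOT head. The sentence and tags are invented for illustration.
words = ["The", "dog", "barks"]
upos_tags = ["DET", "NOUN", "VERB"]
dependencies = [("det", 2), ("nsubj", 3), ("root", 0)]
head_tags = [x[0] for x in dependencies]          # ["det", "nsubj", "root"]
head_indices = [int(x[1]) for x in dependencies]  # [2, 3, 0]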
        Predicate-argument categories (only if supplied).
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(x) for x in tokens], token_indexers=self._token_indexers)
        fields = {"tokens": text_field}
        for field_name, labels in (('ccg_categories', ccg_categories),
                                   ('original_pos_tags', original_pos_tags),
                                   ('modified_pos_tags', modified_pos_tags),
                                   ('predicate_arg_categories', predicate_arg_categories)):
            if labels is not None:
                fields[field_name] = SequenceLabelField(labels, text_field)
        return Instance(fields)

CcgBankDatasetReader = DatasetReader.register("ccgbank")(CcgBankDatasetReader)
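# A minimal usage sketch, assuming the reader's constructor takes no required
# arguments and that ``text_to_instance`` uses the keyword names from the
# loop above; the tokens and CCG supertags are invented:
reader = CcgBankDatasetReader()
instance = reader.text_to_instance(tokens=["John", "runs"],
                                   ccg_categories=["NP", "S\\NP"])
# -> Instance with a "tokens" TextField and a "ccg_categories" SequenceLabelField;
#    the other tag sequences are None and are simply skipped.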
import logging
from typing import Dict

from overrides import overrides

from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.dataset_readers.dataset_utils import enumerate_spans
from dygie.data.fields.adjacency_field_assym import AdjacencyFieldAssym
from dygie.data.dataset_readers.document import Document, Sentence

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
class DyGIEDataException(Exception):
    pass
@DatasetReader.register("dygie")
class DyGIEReader(DatasetReader):
"""
Reads a single JSON-formatted file. This is the same file format as used in the
scierc, but is preprocessed
"""
    def __init__(self,
                 max_span_width: int,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._max_span_width = max_span_width
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
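# A minimal sketch of what ``enumerate_spans`` (imported above) produces and
# how ``max_span_width`` bounds the candidate spans a DyGIE-style reader
# enumerates. The sentence is invented; spans use inclusive end indices.
sentence = ["Barack", "Obama", "was", "born"]
spans = enumerate_spans(sentence, max_span_width=2)
# -> [(0, 0), (0, 1), (1, 1), (1, 2), (2, 2), (2, 3), (3, 3)]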
import logging
from typing import Dict

from allennlp.data.dataset import Dataset
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.fields import TextField, IndexField, ListField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.common.checks import ConfigurationError
from allennlp.common import Params
from allen_hcn.actions import HCNActionTracker
from allen_hcn.entities import HCNEntityTracker
import allen_hcn.util as util

logger = logging.getLogger(__name__)
@DatasetReader.register("babi")
class BabiDatasetReader(DatasetReader):
"""
Read a tsv file containing paired sequences, and create a dataset suitable for a
``HybridCodeNetwork`` model.
Expected format for each input line is
The output of ``read`` is a list of ``Instance``s with the fields:
source_tokens: ``TextField`` and
target_tokens: ``TextField``
"""
def __init__(self,
token_indexers: Dict[str, TokenIndexer] = None):
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
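# A minimal sketch of the source/target pairing the docstring describes,
# with invented utterances; the field construction mirrors typical
# sequence-to-sequence readers:
from allennlp.data.tokenizers import Token

source = TextField([Token(t) for t in "hi how can i help".split()],
                   {"tokens": SingleIdTokenIndexer()})
target = TextField([Token(t) for t in "api_call greet".split()],
                   {"tokens": SingleIdTokenIndexer()})
instance = Instance({"source_tokens": source, "target_tokens": target})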
import logging

from allennlp.data.tokenizers.word_stemmer import PorterStemmer
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.fields import ArrayField, Field, TextField, KnowledgeGraphField, LabelField
from allennlp.data.fields import IndexField, ListField, MetadataField, ProductionRuleField
from allennlp.data.fields import SequenceLabelField
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.semparse.contexts.knowledge_graph import KnowledgeGraph
from allennlp.semparse.contexts.quarel_utils import WorldTaggerExtractor, words_from_entity_string
from allennlp.semparse.contexts.quarel_utils import LEXICAL_CUES, align_entities
from allennlp.semparse.worlds.quarel_world import QuarelWorld

logger = logging.getLogger(__name__)
@DatasetReader.register("quarel")
class QuarelDatasetReader(DatasetReader):
"""
Parameters
----------
lazy : ``bool`` (optional, default=False)
Passed to ``DatasetReader``. If this is ``True``, training will start sooner, but will
take longer per batch.
replace_world_entities : ``bool`` (optional, default=False)
Replace world entities (w/stemming) with "worldone" and "worldtwo" directly in the question
world_extraction_model: ``str`` (optional, default=None)
Reference (file or URL) to world tagger model used to extract worlds.
align_world_extractions : ``bool`` (optional, default=False)
Use alignment of extracted worlds with gold worlds, to pick the appropriate gold LF.
gold_world_extractions : ``bool`` (optional, default=False)
Use gold worlds rather than world extractor
tagger_only : ``bool`` (optional default=False)
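# A minimal sketch of how these options might appear in an AllenNLP
# JSON/Jsonnet config. The key names follow the parameters above and the
# "quarel" type matches the register() call; the model URL is invented:
#
# "dataset_reader": {
#     "type": "quarel",
#     "replace_world_entities": true,
#     "world_extraction_model": "https://example.org/world_tagger.tar.gz",
#     "align_world_extractions": false,
#     "tagger_only": false
# }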