import logging
from abc import ABC
from typing import Iterator

from forte.data.io_utils import dataset_path_iterator
from forte.data.readers.base_reader import PackReader, MultiPackReader
from forte.data.data_pack import DataPack
from forte.data.multi_pack import MultiPack
from forte.data.base_pack import PackType
logger = logging.getLogger(__name__)
__all__ = [
"MonoFileReader",
"PackReader"
]
class MonoFileReader(PackReader, ABC):
"""Data reader that reads one data pack from each single text files.
To be inherited by all mono file data readers.
"""
# pylint: disable=no-self-use
def _cache_key_function(self, file_directory: str):
return file_directory.split('/')[-1]
# pylint: disable=no-self-use
def _collect(self, file_directory: str) -> Iterator[str]: # type: ignore
"""
:param file_directory: the path to a single directory containing the
files.
:return: Iterator[Any] collections to iterate over
"""
return dataset_path_iterator(file_directory, "")
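# A minimal sketch of a concrete reader built on MonoFileReader; the subclass
# name and file handling are illustrative, not part of the source. Only
# _parse_pack is left to implement: it turns one collected file into a
# DataPack.
class PlainTextReader(MonoFileReader):
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = DataPack()
        with open(file_path, encoding="utf-8") as f:
            pack.set_text(f.read())
        yield pack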
import logging
from typing import Any, Optional, Set

import mwxml
from mwlinks.libs.common import Span
from mwlinks.libs.wikilink import Wikilink
from forte.data import DataPack
from forte.data.ontology import wiki_ontology
from forte.data.readers.base_reader import PackReader
from forte.data.datasets.wikipedia import page_parser
__all__ = [
"WikiDumpReader",
]
logger = logging.getLogger(__name__)
class WikiDumpReader(PackReader):
def __init__(self, links_to_ignore: Optional[Set[str]] = None):
super().__init__()
self._ontology = wiki_ontology
if links_to_ignore is None:
# Default ignoring link types.
self.links_to_ignore = {"File", "Category", "wikt"}
else:
self.links_to_ignore = links_to_ignore
@property
def pack_type(self):
return DataPack
    def _cache_key_function(self, collection: Any) -> str:
        # No cache key is produced here, so caching is effectively
        # disabled for this reader.
        pass
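# Usage sketch: unless an explicit set is passed, the reader ignores the
# "File", "Category" and "wikt" link namespaces; a user-supplied set
# replaces the default entirely.
reader = WikiDumpReader(links_to_ignore={"File", "Category"})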
import json
import os
from typing import Any, Iterator
from forte.data.data_pack import DataPack
from forte.data.data_utils_io import dataset_path_iterator
from forte.data.readers.base_reader import PackReader
from ft.onto.race_multi_choice_qa_ontology import (
RaceDocument, Passage, Question, Option)
__all__ = [
"RACEMultiChoiceQAReader",
]
class RACEMultiChoiceQAReader(PackReader):
r""":class:`RACEMultiChoiceQAReader` is designed to read in RACE multi
choice qa dataset.
"""
def _collect(self, json_directory) -> Iterator[Any]: # type: ignore
r"""Should be called with param ``json_directory`` which is a path to a
folder containing json files.
Args:
json_directory: directory containing the json files.
Returns: Iterator over paths to .json files
"""
return dataset_path_iterator(json_directory, "")
    def _cache_key_function(self, json_file: str) -> str:
        return os.path.basename(json_file)
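# Sketch of how the two hooks above interact (the directory path is
# hypothetical): every path yielded by _collect becomes one pack, cached
# under its base file name.
reader = RACEMultiChoiceQAReader()
for json_path in reader._collect("data/RACE/train"):
    print(reader._cache_key_function(json_path))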
# Tail of an HTML scanning loop (it matches the end of ``HTMLParser.goahead``
# in the standard library's ``html.parser``): once the loop exits, any
# remaining raw data is flushed through handle_data.
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # pylint: disable=attribute-defined-outside-init
        self.rawdata = rawdata[i:]
class HTMLReader(PackReader):
r""":class:`HTMLReader` is designed to read in list of html strings.
It takes in list of html strings, cleans the HTML tags and stores the
cleaned text in pack.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.init_with_fileloc = False
self.init_with_html = False
def _collect(self, content) -> Iterator[str]: # type: ignore
r"""Could be called with a directory, a particular file location or a
list of strings. If the string is an HTML string, it will be cleaned.
Args:
"""
from typing import Iterator, Dict, Tuple, Any
from ft.onto.base_ontology import (
Document, Sentence, Token, Dependency, EnhancedDependency)
from forte.data.data_utils_io import dataset_path_iterator
from forte.data.data_pack import DataPack
from forte.data.readers.base_reader import PackReader
__all__ = [
"ConllUDReader"
]
class ConllUDReader(PackReader):
r""":class:`conllUReader` is designed to read in the Universal Dependencies
2.4 dataset.
"""
def _cache_key_function(self, data_pack: Any) -> str:
if data_pack.meta.pack_name is None:
raise ValueError("data_pack does not have a document id")
return data_pack.meta.pack_name
def _collect(self, *args, **kwargs) -> Iterator[Any]:
# pylint: disable = unused-argument
r"""Iterator over conll files in the data_source.
Args:
args: args[0] is the directory to the conllu files.
            kwargs: not used by this reader.

        Returns: Iterator over the conllu files in the directory.
        """
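# For reference, a token line in a CoNLL-U file carries ten tab-separated
# fields (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC):
#
#   1   The   the   DET   DT   Definite=Def|PronType=Art   2   det   2:det   _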
import os
from typing import Iterator, Tuple
from texar.torch import HParams
from forte.data.data_pack import DataPack
from forte.data.readers.base_reader import PackReader
from forte.common.resources import Resources
from ft.onto.base_ontology import Document
__all__ = [
"CorpusReader"
]
class CorpusReader(PackReader):
def __init__(self):
super(CorpusReader, self).__init__()
self.configs = None
def initialize(self, resources: Resources, configs: HParams):
# pylint: disable = unused-argument
self.configs = configs
def _collect(self, *args, **kwargs) -> Iterator[Tuple[str, str]]:
# pylint: disable = unused-argument, undefined-variable
dir_path: str = args[0]
corpus_file_path = os.path.join(dir_path, 'collection.tsv')
        with open(corpus_file_path, 'r') as file:
            for line in file:
                # Each line of collection.tsv is a tab-separated
                # (document id, document text) pair.
                doc_id, doc_text = line.strip().split('\t', 1)
                yield doc_id, doc_text
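# Usage sketch (the directory path is hypothetical): each yielded
# (doc_id, doc_text) tuple is later turned into one Document pack by the
# reader's _parse_pack.
reader = CorpusReader()
for doc_id, doc_text in reader._collect("data/msmarco"):
    print(doc_id, doc_text[:40])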
import csv
import logging
from typing import Dict

from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.readers.base_reader import PackReader

# Fragment of a helper that fills info-box entries from RDF triples
# (predicate and object converted to key/value strings):
            info_box.key = v.toPython()
            info_box.value = get_resource_name(o)
def read_index(pack_index_path: str) -> Dict[str, str]:
page_idx: Dict[str, str] = {}
logging.info("Reading pack index from %s", pack_index_path)
with open(pack_index_path) as idx:
for page_name, page_path in csv.reader(idx, delimiter='\t'):
page_idx[page_name] = page_path
return page_idx
class DBpediaInfoBoxReader(PackReader):
def __init__(self):
super().__init__()
self.pack_index: Dict[str, str]
self.pack_dir: str
self.redirects: Dict[str, str]
self.logger = logging.getLogger(__name__)
def initialize(self, resources: Resources, configs: Config):
# pylint: disable=attribute-defined-outside-init
self.pack_index = read_index(configs.pack_index)
self.pack_dir = configs.pack_dir
self.redirects = resources.get('redirects')
self.literal_info_reader = NIFBufferedContextReader(
configs.mapping_literals)
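# Configuration sketch: the keys mirror those read in initialize above; the
# values are hypothetical, and `resources` is assumed to be populated
# elsewhere with the 'redirects' mapping.
reader = DBpediaInfoBoxReader()
reader.initialize(
    resources,
    Config({
        "pack_index": "dbpedia/article.idx",
        "pack_dir": "dbpedia/packs",
        "mapping_literals": "dbpedia/literal_mappings.nt",
    }, default_hparams=None),
)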
import codecs
import logging
import os
from typing import Iterator, Any
from forte.data.data_pack import DataPack
from forte.data.data_utils_io import dataset_path_iterator
from forte.data.readers.base_reader import PackReader
from ft.onto.base_ontology import Token, Sentence, Document
__all__ = [
"CoNLL03Reader"
]
class CoNLL03Reader(PackReader):
r""":class:`CoNLL03Reader` is designed to read in the CoNLL03-ner dataset.
"""
def _collect(self, conll_directory) -> Iterator[Any]: # type: ignore
r"""Iterator over conll files in the data_source.
Args:
conll_directory: directory to the conll files.
Returns: Iterator over files in the path with conll extensions.
"""
logging.info("Reading .conll from %s", conll_directory)
return dataset_path_iterator(conll_directory, "conll")
def _cache_key_function(self, conll_file: str) -> str:
return os.path.basename(conll_file)
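# Usage sketch with the usual forte Pipeline API (the directory path is
# hypothetical): read a directory of .conll files and count sentences.
from forte.pipeline import Pipeline

pl = Pipeline[DataPack]()
pl.set_reader(CoNLL03Reader())
pl.initialize()
for pack in pl.process_dataset("data/conll03"):
    print(pack.pack_name, len(list(pack.get(Sentence))))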
from typing import Iterator, List, Tuple, Optional, Union
import pandas as pd
from texar.torch import HParams
from ft.onto.base_ontology import Query, Document, Passage
from forte.data import DataPack
from forte.data.readers.base_reader import PackReader
from forte.common.resources import Resources
__all__ = [
"WikiPassageQAReader"
]
class WikiPassageQAReader(PackReader):
DocInfoType = Tuple[bool, str, List[str], Optional[List[str]]]
def __init__(self):
super(WikiPassageQAReader, self).__init__()
self.configs = None
def initialize(self, resources: Resources, configs: HParams):
# pylint: disable = unused-argument
self.configs = configs
def _collect(self, *args, **kwargs) -> Iterator[DocInfoType]:
# pylint: disable = unused-argument, undefined-variable
"""
Reads the contents of the input `dir_path` and returns a info to
populate query or document data packs. It reads the documents from the
from typing import (Any, DefaultDict, Iterator, List, NamedTuple, Optional,
Set, Tuple)
from forte.data.data_pack import DataPack
from forte.data.data_utils_io import dataset_path_iterator
from forte.data.readers.base_reader import PackReader
from ft.onto.base_ontology import (
CoreferenceGroup, Document, EntityMention, PredicateArgument, PredicateLink,
PredicateMention, Sentence, Token)
__all__ = [
"OntonotesReader",
]
class OntonotesReader(PackReader):
r""":class:`OntonotesReader` is designed to read in the English OntoNotes
v5.0 data in the datasets used by the CoNLL 2011/2012 shared tasks. To use
this Reader, you must follow the instructions provided `here (v12 release):
`_:, which will allow you to
download the CoNLL style annotations for the OntoNotes v5.0 release
– LDC2013T19.tgz obtained from LDC.
Args:
column_format: A list of strings indicating which field each column in a
line corresponds to. The length of the list should be equal to the
number of columns in the files to be read. Available field types
include:
- ``"document_id"``
- ``"part_number"``
- ``"word"``