Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
if not from_words and not index.isdigit():
continue
curr_word_sent.append(splitted[word_column])
if not read_only_words:
pos, tag = splitted[pos_column], splitted[tag_column]
tag = pos if tag == "_" else "{},{}".format(pos, tag)
curr_tag_sent.append(tag)
if len(curr_word_sent) > 0:
if read_only_words:
curr_tag_sent = None
answer.append((curr_word_sent, curr_tag_sent))
return answer
@register('morphotagger_dataset_reader')
class MorphotaggerDatasetReader(DatasetReader):
"""Class to read training datasets in UD format"""
URL = 'http://files.deeppavlov.ai/datasets/UD2.0_source/'
def read(self, data_path: Union[List, str],
language: Optional[None] = None,
data_types: Optional[List[str]] = None,
**kwargs) -> Dict[str, List]:
"""Reads UD dataset from data_path.
Args:
data_path: can be either
1. a directory containing files. The file for data_type 'mode'
is then data_path / {language}-ud-{mode}.conllu
2. a list of files, containing the same number of items as data_types
language: a language to detect filename when it is not given
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
from pathlib import Path
from typing import Dict, List, Tuple
from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
@register('siamese_reader')
class SiameseReader(DatasetReader):
"""The class to read dataset for ranking or paraphrase identification with Siamese networks."""
def read(self, data_path: str, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:
"""Read the dataset for ranking or paraphrase identification with Siamese networks.
Args:
data_path: A path to a folder with dataset files.
"""
dataset = {'train': None, 'valid': None, 'test': None}
data_path = expand_path(data_path)
train_fname = data_path / 'train.csv'
valid_fname = data_path / 'valid.csv'
test_fname = data_path / 'test.csv'
dataset["train"] = self._preprocess_data_train(train_fname)
dataset["valid"] = self._preprocess_data_valid_test(valid_fname)
import json
from logging import getLogger
from pathlib import Path
from typing import Dict, List
from overrides import overrides
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download_decompress, mark_done
log = getLogger(__name__)
@register('kvret_reader')
class KvretDatasetReader(DatasetReader):
"""
A New Multi-Turn, Multi-Domain, Task-Oriented Dialogue Dataset.
Stanford NLP released a corpus of 3,031 multi-turn dialogues in three distinct domains appropriate for an in-car assistant: calendar scheduling, weather information retrieval, and point-of-interest navigation. The dialogues are grounded through knowledge bases ensuring that they are versatile in their natural language without being completely free form.
For details see https://nlp.stanford.edu/blog/a-new-multi-turn-multi-domain-task-oriented-dialogue-dataset/.
"""
url = 'http://files.deeppavlov.ai/datasets/kvret_public.tar.gz'
@staticmethod
def _data_fname(datatype):
assert datatype in ('train', 'dev', 'test'), "wrong datatype name"
return 'kvret_{}_public.json'.format(datatype)
@classmethod
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
@register('line_reader')
class LineReader(DatasetReader):
"""Read txt file by lines"""
def read(self, data_path: str = None, *args, **kwargs) -> Dict:
"""Read lines from txt file
Args:
data_path: path to txt file
Returns:
A dictionary containing training, validation and test parts of the dataset obtainable via ``train``, ``valid`` and ``test`` keys.
"""
with open(data_path) as f:
content = f.readlines()
dataset = dict()
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
@register('sq_reader')
class OntonotesReader(DatasetReader):
    """Dataset reader that loads a training dataset (OntoNotes format) from a pickle file."""

    def read(self, data_path: str):
        """Unpickle and return the dataset stored at ``data_path``.

        Args:
            data_path: path to the pickle file; it is opened in binary mode.

        Returns:
            The object that ``pickle.load`` reconstructs from the file.
        """
        # NOTE(review): pickle.load can execute arbitrary code during
        # deserialization, so data_path must point to a trusted file.
        with open(data_path, 'rb') as pickled:
            return pickle.load(pickled)
import csv
from pathlib import Path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import is_done, download, mark_done
from deeppavlov.core.data.dataset_reader import DatasetReader
@register('typos_kartaslov_reader')
class TyposKartaslov(DatasetReader):
def __init__(self):
    """Create the reader; no instance state is set up."""
@staticmethod
def build(data_path: str):
data_path = Path(data_path) / 'kartaslov'
fname = data_path / 'orfo_and_typos.L1_5.csv'
if not is_done(data_path):
url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'
download(fname, url)
mark_done(data_path)
from pathlib import Path
from logging import getLogger
import pandas as pd
from overrides import overrides
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download, mark_done
log = getLogger(__name__)
@register('basic_dataset_reader')
class BasicDatasetReader(DatasetReader):
"""
Class provides reading dataset in .csv format and \
assigns columns with given names to `x` and `y` without any changes of data
"""
@overrides
def read(self, data_path: str, url: str = None,
format: str = "csv",
*args, **kwargs) -> dict:
"""
Read dataset from data_path directory.
Reading files are all data_types + extension
(i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" form
data_path will be read)
Args:
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict
from pandas import read_csv
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
@register('faq_reader')
class FaqDatasetReader(DatasetReader):
"""Reader for FAQ dataset"""
def read(self, data_path: str = None, data_url: str = None, x_col_name: str = 'x', y_col_name: str = 'y') -> Dict:
"""
Read FAQ dataset from specified csv file or remote url
Parameters:
data_path: path to csv file of FAQ
data_url: url to csv file of FAQ
x_col_name: name of Question column in csv file
y_col_name: name of Answer column in csv file
Returns:
A dictionary containing training, validation and test parts of the dataset obtainable via
``train``, ``valid`` and ``test`` keys.
"""
# limitations under the License.
import csv
import itertools
import random
import json
from pathlib import Path
from typing import Dict, List, Tuple
from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
@register('squad_ranking_reader')
class SquadRankingReader(DatasetReader):
"""The class to read dataset for ranking or paraphrase identification with Siamese networks."""
def read(self, data_path: str,
num_candidates=10,
positive_samples=False, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:
"""Read the dataset for ranking or paraphrase identification with Siamese networks.
Args:
data_path: A path to a folder with dataset files.
"""
self.num_candidates = num_candidates
self.positive_samples = positive_samples
dataset = {'train': None, 'valid': None, 'test': None}
data_path = expand_path(data_path)
train_fname = data_path / 'train.jsonl'
valid_fname = data_path / 'dev.jsonl'
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
from pathlib import Path
from typing import List, Dict, Tuple
import random
from collections import defaultdict
from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
@register('msmarco_reader')
class MSMARCOReader(DatasetReader):
"""The class to read the Ubuntu V2 dataset from csv files.
Please, see https://github.com/rkadlec/ubuntu-ranking-dataset-creator.
"""
def read(self, data_path: str,
positive_samples=False,
random_seed=243,
*args, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:
"""Read the Ubuntu V2 dataset from csv files.
Args:
data_path: A path to a folder with dataset csv files.
positive_samples: if `True`, only positive context-response pairs will be taken for train
"""