Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def fit_chainer(config: dict, iterator: BasicDatasetIterator) -> Chainer:
    """Build the pipeline described by ``config['chainer']``, fitting estimators on the way.

    Components are created one by one from ``config['chainer']['pipe']``. A component whose
    config contains ``fit_on`` is fitted on the outputs of the pipeline built so far, computed
    over the ``'train'`` data of ``iterator``. A component whose config contains ``in`` is then
    appended to the pipeline.

    Args:
        config: full configuration; ``config['chainer']`` must provide ``in``, ``out`` and
            ``pipe`` and may provide ``in_y``.
        iterator: dataset iterator; only ``iter_all('train')`` is used here.

    Returns:
        The assembled ``Chainer``.
    """
    chainer_config: dict = config['chainer']
    chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'))

    for component_config in chainer_config['pipe']:
        component = from_params(component_config, vocabs=[], mode='train')

        if 'fit_on' in component_config:
            component: Estimator

            fit_on = component_config['fit_on']
            # Run the partially built pipeline to obtain the variables this component is fitted on.
            preprocessed = chainer(*iterator.iter_all('train'), to_return=fit_on)
            if len(fit_on) == 1:
                # A single requested variable is wrapped so it can be unpacked as one argument.
                preprocessed = [preprocessed]
            else:
                preprocessed = zip(*preprocessed)
            # NOTE(review): the source line here was garbled ("zip(*preprocessed)*preprocessed)").
            # The trailing "*preprocessed)" suggests a fit call on the unpacked data -- confirm,
            # and confirm whether the fitted component must also be saved at this point.
            component.fit(*preprocessed)

        if 'in' in component_config:
            c_in = component_config['in']
            c_out = component_config['out']
            in_y = component_config.get('in_y')
            main = component_config.get('main', False)
            chainer.append(c_in, c_out, component, in_y, main)

    return chainer
chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'))
for component_config in chainer_config['pipe']:
component = from_params(component_config, mode='train')
if 'fit_on' in component_config:
component: Estimator
preprocessed = chainer(*iterator.get_instances('train'), to_return=component_config['fit_on'])
if len(component_config['fit_on']) == 1:
preprocessed = [preprocessed]
preprocessed = zip(*preprocessed)*preprocessed)
if 'fit_on_batch' in component_config:
component: Estimator
component.fit_batches(iterator, config['train']['batch_size'])
if 'in' in component_config:
c_in = component_config['in']
c_out = component_config['out']
in_y = component_config.get('in_y', None)
main = component_config.get('main', False)
chainer.append(component, c_in, c_out, in_y, main)
return chainer
for j, token in enumerate(utt[-self.token_limit:]):
for k, char in enumerate(token[:self.char_limit]):
utt_idxs[b, i, j, k] = char
elif len(get_shape(batch)) == 3:
utt_idxs = np.zeros([len(batch), self.token_limit, self.char_limit], dtype=np.int32)
for i, utt in enumerate(batch):
for j, token in enumerate(utt[-self.token_limit:]):
for k, char in enumerate(token[:self.char_limit]):
utt_idxs[i, j, k] = char
raise RuntimeError("Unsupported batch shape")
return utt_idxs
class PersonachatEmbedder(Estimator):
    """Embedder configured with pretrained-embedding locations and length limits.

    Args:
        emb_folder: embeddings folder; stored after ``expand_path``.
        emb_url: embeddings URL; its last path component is kept as ``emb_file_name``.
        save_path: stored after ``expand_path``.
        load_path: stored after ``expand_path``.
        x_len_limit: stored as given (presumably a maximum input length -- confirm against usage).
        persona_len_limit: stored as given (presumably a maximum persona length -- confirm).
        y_len_limit: stored as given (presumably a maximum target length -- confirm).
        char_limit: stored as given (presumably a per-token character limit -- confirm).
        level: stored as given; defaults to ``'token'``.
        *args: accepted; not referenced by the assignments below.
        **kwargs: accepted; not referenced by the assignments below.
    """
    # TODO: refactor to merge this code with SQuAD embedder

    def __init__(self, emb_folder, emb_url, save_path, load_path,
                 x_len_limit, persona_len_limit, y_len_limit, char_limit, level='token', *args, **kwargs):
        # Embedding source: folder on disk, URL, and the file name taken from the URL.
        self.emb_folder = expand_path(emb_folder)
        self.level, self.emb_url = level, emb_url
        self.emb_file_name = Path(emb_url).name

        # Save/load locations are normalised through ``expand_path``.
        self.save_path, self.load_path = expand_path(save_path), expand_path(load_path)

        # Limits are copied onto the instance unchanged.
        (self.x_len_limit, self.persona_len_limit,
         self.y_len_limit, self.char_limit) = (x_len_limit, persona_len_limit,
                                               y_len_limit, char_limit)

        # Initial state: nothing loaded yet; ``NULL`` is the empty string.
        self.loaded, self.NULL = False, ""
# See the License for the specific language governing permissions and
# limitations under the License.
import sqlite3
from logging import getLogger
from typing import List, Dict
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.estimator import Estimator
log = getLogger(__name__)
class Sqlite3Database(Estimator):
Loads and trains sqlite table of any items (with name ``table_name``
and path ``save_path``).
Primary (unique) keys must be specified; all other keys are inferred from data.
Batch here is a list of dictionaries, where each dictionary corresponds to an item.
If an item doesn't contain values for all keys, then missing values will be stored
with ``unknown_value``.
save_path: sqlite database path.
primary_keys: list of table primary keys' names.
keys: all table keys' names.
table_name: name of the sqlite table.
unknown_value: value assigned to missing item values.
**kwargs: parameters passed to parent
from logging import getLogger
from typing import List, Union, Iterable, Optional
import numpy as np
from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from import zero_pad_truncate
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.estimator import Estimator
log = getLogger(__name__)
class SiamesePreprocessor(Estimator):
""" Preprocessing of data samples containing text strings to feed them in a siamese network.
The first ``num_context_turns`` strings in each data sample correspond to the dialogue ``context``
and the remaining string(s) in the sample are the ``response(s)``.
save_path: The parameter is only needed to initialize the base class
load_path: The parameter is only needed to initialize the base class
max_sequence_length: A maximum length of text sequences in tokens.
Longer sequences will be truncated and shorter ones will be padded.
dynamic_batch: Whether to use dynamic batching. If ``True``, the maximum length of a sequence for a batch
will be equal to the maximum of all sequences lengths from this batch,
but not higher than ``max_sequence_length``.
padding: Padding. Possible values are ``pre`` and ``post``.
import numpy as np
from typing import List, Union, Iterable
from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.log import get_logger
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.estimator import Estimator
from import zero_pad_truncate
log = get_logger(__name__)
class UbuntuPreprocessor(Estimator):
""" Preprocessing of data samples from Ubuntu Dialogue Corpus v1/v2 dataset
to feed them in SMN or DAM ranking neural models.
The first ``num_context_turns`` strings in each data sample correspond to the dialogue ``context``
and the remaining string(s) in the sample are the ``response(s)``.
save_path: The parameter is only needed to initialize the base class
load_path: The parameter is only needed to initialize the base class
max_sequence_length: A maximum length of text sequences in tokens.
Longer sequences will be truncated and shorter ones will be padded.
dynamic_batch: Whether to use dynamic batching. If ``True``, the maximum length of a sequence for a batch
will be equal to the maximum of all sequences lengths from this batch,
but not higher than ``max_sequence_length``.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from typing import List
from logging import getLogger
import numpy as np
from deeppavlov.core.models.estimator import Estimator
log = getLogger()
class StateFeaturizer(Estimator):
def __init__(self, dontcare_value: str = None, **kwargs) -> None:
self.dontcare_value = dontcare_value
self.dim = 2 if dontcare_value else 1
self.len = 0
self.keys = []
if self.load_path.exists():
def _get_depth(cls, d: dict) -> int:
    """Return the nesting depth of ``d`` along its first dict-valued entry.

    Only the first value that is itself a dict is followed (in the dict's iteration order);
    other nested dicts are not inspected.

    Args:
        d: dictionary to measure.

    Returns:
        1 for a dict with no dict values, otherwise 1 plus the depth of the first dict value.
    """
    # NOTE(review): takes ``cls`` and recurses through it, so this is presumably decorated
    # with ``@classmethod`` in the enclosing class -- the decorator is not visible here.
    for value in d.values():
        if isinstance(value, dict):
            return cls._get_depth(value) + 1
    # Base case: without it the function returned None and the ``+ 1`` above raised TypeError.
    return 1
for idx, sp in enumerate(span):
if not (ans_end <= sp[0] or ans_st >= sp[1]):
if len(answer_span) != 0:
y1, y2 = answer_span[0], answer_span[-1]
# answer not found in context
y1, y2 = -1, -1
return answers, start, end
class SquadVocabEmbedder(Estimator):
""" SquadVocabEmbedder is used to build tokens/chars vocabulary and embedding matrix.
It extracts tokens/chars from the dataset and looks for pretrained embeddings.
emb_folder: path to download pretrained embeddings
emb_url: link to pretrained embeddings
save_path: extracted embeddings save path
load_path: extracted embeddings load path
context_limit: max context length in tokens
question_limit: max question length in tokens
char_limit: max number of characters in token
level: token or char
def __init__(self, emb_folder: str, emb_url: str, save_path: str, load_path: str,
from collections import defaultdict
import ujson as json
from pathlib import Path
from typing import List, Tuple
import numpy as np
from deeppavlov.core.models.estimator import Estimator
from deeppavlov.core.common.registry import register
class UnimorphDictionaryVectorizer(Estimator):
def __init__(self, save_path, load_path, use_last_word=False,
use_suffixes=False, min_suffix_count=10, max_suffix_length=5, **kwargs):
load_path = Path(load_path).with_suffix(".json")
save_path = Path(save_path).with_suffix(".json")
super().__init__(save_path=save_path, load_path=load_path, **kwargs)
self.use_last_word = use_last_word
self.use_suffixes = use_suffixes
self.min_suffix_count = min_suffix_count
self.max_suffix_length = max_suffix_length
if self.load_path.exists():
def pos_number(self):
log.warning("initializing `{}` from scratch".format('LogisticRegression'))
self.model = LogisticRegression(self.penalty, self.dual, self.tol, self.C, self.fit_intercept,
self.intercept_scaling, self.class_weight, self.random_state,
self.solver, self.max_iter, self.multi_class, self.verbose,
self.warm_start, self.n_jobs)
log.warning("No `load_path` is provided for {0}. "
"Initializing `{0}` from scratch".format(self.__class__.__name__))
self.model = LogisticRegression(self.penalty, self.dual, self.tol, self.C, self.fit_intercept,
self.intercept_scaling, self.class_weight, self.random_state,
self.solver, self.max_iter, self.multi_class, self.verbose,
self.warm_start, self.n_jobs)
class Svm(Estimator):
This class implements the Support Vector Classifier from the scikit-learn library.
save_path (str): save path
load_path (str): load path
mode: train/infer trigger
**kwargs: additional arguments
model: Support Vector Classifier class from sklearn
def __init__(self, penalty='l2', loss='squared_hinge', dual=True, tol=1e-4, C=1.0, multi_class='ovr',
fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None,
max_iter=1000, **kwargs) -> None: