def fit_chainer(config: dict, iterator: BasicDatasetIterator) -> Chainer:
    chainer_config: dict = config['chainer']
    chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'))
    for component_config in chainer_config['pipe']:
        component = from_params(component_config, vocabs=[], mode='train')
        if 'fit_on' in component_config:
            component: Estimator
            preprocessed = chainer(*iterator.iter_all('train'), to_return=component_config['fit_on'])
            if len(component_config['fit_on']) == 1:
                preprocessed = [preprocessed]
            else:
                preprocessed = zip(*preprocessed)
            component.fit(*preprocessed)
            component.save()
        if 'in' in component_config:
            c_in = component_config['in']
            c_out = component_config['out']
            in_y = component_config.get('in_y', None)
            main = component_config.get('main', False)
            chainer.append(c_in, c_out, component, in_y, main)
    return chainer
    # Variant of the fit_chainer body from a later DeepPavlov revision: estimators may also
    # declare 'fit_on_batch' to be fitted batch-wise, and chainer.append takes the component first.
    chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'))
    for component_config in chainer_config['pipe']:
        component = from_params(component_config, mode='train')
        if 'fit_on' in component_config:
            component: Estimator
            preprocessed = chainer(*iterator.get_instances('train'), to_return=component_config['fit_on'])
            if len(component_config['fit_on']) == 1:
                preprocessed = [preprocessed]
            else:
                preprocessed = zip(*preprocessed)
            component.fit(*preprocessed)
            component.save()
        if 'fit_on_batch' in component_config:
            component: Estimator
            component.fit_batches(iterator, config['train']['batch_size'])
            component.save()
        if 'in' in component_config:
            c_in = component_config['in']
            c_out = component_config['out']
            in_y = component_config.get('in_y', None)
            main = component_config.get('main', False)
            chainer.append(component, c_in, c_out, in_y, main)
    return chainer
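
# Illustrative sketch (not from the source): the rough shape of the config dict that the
# fit_chainer code above consumes. Only keys the code actually reads are shown ('chainer',
# 'in', 'in_y', 'out', 'pipe', 'fit_on', 'main', 'train', 'batch_size'); the component
# names and wiring inside 'pipe' are hypothetical placeholders.
example_config = {
    'chainer': {
        'in': ['x'],
        'in_y': ['y'],
        'out': ['y_predicted'],
        'pipe': [
            {'name': 'some_tokenizer', 'in': ['x'], 'out': ['x_tokens']},   # plain Component
            {'name': 'some_vocab', 'fit_on': ['x_tokens'],                  # Estimator: fitted on data, then saved
             'in': ['x_tokens'], 'out': ['x_ids']},
            {'name': 'some_classifier', 'main': True,                       # main trainable component
             'in': ['x_ids'], 'in_y': ['y'], 'out': ['y_predicted']},
        ],
    },
    'train': {'batch_size': 64},
}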
                for j, token in enumerate(utt[-self.token_limit:]):
                    for k, char in enumerate(token[:self.char_limit]):
                        utt_idxs[b, i, j, k] = char
        elif len(get_shape(batch)) == 3:
            utt_idxs = np.zeros([len(batch), self.token_limit, self.char_limit], dtype=np.int32)
            for i, utt in enumerate(batch):
                for j, token in enumerate(utt[-self.token_limit:]):
                    for k, char in enumerate(token[:self.char_limit]):
                        utt_idxs[i, j, k] = char
        else:
            raise RuntimeError("Unsupported batch shape")
        return utt_idxs
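
# Standalone sketch (assumption: `batch` is a list of utterances, each a list of tokens,
# each token already converted to a list of character ids). It mirrors the 3-D branch
# above: keep only the last `token_limit` tokens and the first `char_limit` characters
# of each token, zero-padding everything else.
import numpy as np

def chars_to_idx_array(batch, token_limit=10, char_limit=5):
    utt_idxs = np.zeros([len(batch), token_limit, char_limit], dtype=np.int32)
    for i, utt in enumerate(batch):
        for j, token in enumerate(utt[-token_limit:]):
            for k, char in enumerate(token[:char_limit]):
                utt_idxs[i, j, k] = char
    return utt_idxs

# e.g. two utterances of already-indexed characters
chars_to_idx_array([[[3, 4], [5]], [[7, 8, 9]]]).shape  # -> (2, 10, 5)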
@register('personachat_vocab')
class PersonachatEmbedder(Estimator):
    # TODO: refactor to merge this code with SQuAD embedder
    def __init__(self, emb_folder, emb_url, save_path, load_path,
                 x_len_limit, persona_len_limit, y_len_limit, char_limit, level='token', *args, **kwargs):
        self.emb_folder = expand_path(emb_folder)
        self.level = level
        self.emb_url = emb_url
        self.emb_file_name = Path(emb_url).name
        self.save_path = expand_path(save_path)
        self.load_path = expand_path(load_path)
        self.x_len_limit = x_len_limit
        self.persona_len_limit = persona_len_limit
        self.y_len_limit = y_len_limit
        self.char_limit = char_limit
        self.loaded = False
        self.NULL = "<NULL>"
import sqlite3
from logging import getLogger
from typing import List, Dict
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.estimator import Estimator
log = getLogger(__name__)
@register('sqlite_database')
class Sqlite3Database(Estimator):
"""
Loads and trains sqlite table of any items (with name ``table_name``
and path ``save_path``).
Primary (unique) keys must be specified, all other keys are infered from data.
Batch here is a list of dictionaries, where each dictionary corresponds to an item.
If an item doesn't contain values for all keys, then missing values will be stored
with ``unknown_value``.
Parameters:
save_path: sqlite database path.
primary_keys: list of table primary keys' names.
keys: all table keys' names.
table_name: name of the sqlite table.
unknown_value: value assigned to missing item values.
**kwargs: parameters passed to parent
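
# Illustrative sketch (hypothetical field names): the kind of batch the docstring above
# describes -- a list of dicts, one per item. Here 'name' would be the primary key; the
# second item is missing 'price', so that value would be stored as ``unknown_value``.
restaurant_batch = [
    {'name': 'The Ivy', 'area': 'centre', 'price': 'expensive'},
    {'name': 'Nandos', 'area': 'south'},
]
# database = Sqlite3Database(save_path='db.sqlite', primary_keys=['name'], table_name='mytable')
# database.fit(restaurant_batch)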
from logging import getLogger
from typing import List, Union, Iterable, Optional
import numpy as np
from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import zero_pad_truncate
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.estimator import Estimator
log = getLogger(__name__)
@register('siamese_preprocessor')
class SiamesePreprocessor(Estimator):
""" Preprocessing of data samples containing text strings to feed them in a siamese network.
First ``num_context_turns`` strings in each data sample corresponds to the dialogue ``context``
and the rest string(s) in the sample is (are) ``response(s)``.
Args:
save_path: The parameter is only needed to initialize the base class
:class:`~deeppavlov.core.models.serializable.Serializable`.
load_path: The parameter is only needed to initialize the base class
:class:`~deeppavlov.core.models.serializable.Serializable`.
max_sequence_length: A maximum length of text sequences in tokens.
Longer sequences will be truncated and shorter ones will be padded.
dynamic_batch: Whether to use dynamic batching. If ``True``, the maximum length of a sequence for a batch
will be equal to the maximum of all sequences lengths from this batch,
but not higher than ``max_sequence_length``.
padding: Padding. Possible values are ``pre`` and ``post``.
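
# Illustrative sketch of the padding semantics described above (not the library's
# zero_pad_truncate call): sequences longer than max_len are truncated, shorter ones are
# zero-padded either before ('pre') or after ('post') the tokens.
import numpy as np

def pad_truncate(sequences, max_len, padding='post'):
    out = np.zeros((len(sequences), max_len), dtype=np.int32)
    for i, seq in enumerate(sequences):
        seq = seq[:max_len]
        if not seq:
            continue
        if padding == 'post':
            out[i, :len(seq)] = seq
        else:  # 'pre'
            out[i, -len(seq):] = seq
    return out

pad_truncate([[1, 2, 3], [4]], max_len=4, padding='pre')
# -> [[0, 1, 2, 3],
#     [0, 0, 0, 4]]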
import numpy as np
from typing import List, Union, Iterable
from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.log import get_logger
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.estimator import Estimator
from deeppavlov.core.data.utils import zero_pad_truncate
log = get_logger(__name__)
@register('ubuntu_preprocessor')
class UbuntuPreprocessor(Estimator):
""" Preprocessing of data samples from Ubuntu Dialogue Corpus v1/v2 dataset
to feed them in SMN or DAM ranking neural models.
First ``num_context_turns`` strings in each data sample corresponds to the dialogue ``context``
and the rest string(s) in the sample is (are) ``response(s)``.
Args:
save_path: The parameter is only needed to initialize the base class
:class:`~deeppavlov.core.models.serializable.Serializable`.
load_path: The parameter is only needed to initialize the base class
:class:`~deeppavlov.core.models.serializable.Serializable`.
max_sequence_length: A maximum length of text sequences in tokens.
Longer sequences will be truncated and shorter ones will be padded.
dynamic_batch: Whether to use dynamic batching. If ``True``, the maximum length of a sequence for a batch
will be equal to the maximum of all sequences lengths from this batch,
but not higher than ``max_sequence_length``.
import json
from typing import List
from logging import getLogger
import numpy as np
from deeppavlov.core.models.estimator import Estimator
log = getLogger()
class StateFeaturizer(Estimator):

    def __init__(self, dontcare_value: str = None, **kwargs) -> None:
        super().__init__(**kwargs)
        self.dontcare_value = dontcare_value
        self.dim = 2 if dontcare_value else 1
        self.len = 0
        self.keys = []
        if self.load_path.exists():
            self.load()

    @classmethod
    def _get_depth(cls, d: dict) -> int:
        for k in d.keys():
            if isinstance(d[k], dict):
                return cls._get_depth(d[k]) + 1
            break
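
# Sketch of what _get_depth computes, with an explicit base case (the snippet above is
# truncated, so returning 1 for a non-dict first value is an assumption here): the
# recursion inspects only the first key at each level and counts nesting along that branch.
def dict_depth(d: dict) -> int:
    for k in d.keys():
        if isinstance(d[k], dict):
            return dict_depth(d[k]) + 1
        break
    return 1

dict_depth({'food': {'value': 'italian'}})  # -> 2 (depth along the first key's branch)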
            for idx, sp in enumerate(span):
                if not (ans_end <= sp[0] or ans_st >= sp[1]):
                    answer_span.append(idx)
            if len(answer_span) != 0:
                y1, y2 = answer_span[0], answer_span[-1]
            else:
                # answer not found in context
                y1, y2 = -1, -1
            start[-1].append(y1)
            end[-1].append(y2)
            answers[-1].append(ans)
        return answers, start, end
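
# Worked example of the overlap test above: an answer occupying characters [5, 11) against
# hypothetical token character spans. A token span sp = (start, end) overlaps the answer
# unless it ends before the answer starts or begins after the answer ends.
span = [(0, 4), (5, 8), (9, 14), (15, 20)]
ans_st, ans_end = 5, 11
answer_span = [idx for idx, sp in enumerate(span)
               if not (ans_end <= sp[0] or ans_st >= sp[1])]
# answer_span == [1, 2]; y1, y2 = 1, 2 (first and last overlapping token indices)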
@register('squad_vocab_embedder')
class SquadVocabEmbedder(Estimator):
    """ SquadVocabEmbedder is used to build tokens/chars vocabulary and embedding matrix.

    It extracts tokens/chars from the dataset and looks for pretrained embeddings.

    Params:
        emb_folder: path to download pretrained embeddings
        emb_url: link to pretrained embeddings
        save_path: extracted embeddings save path
        load_path: extracted embeddings load path
        context_limit: max context length in tokens
        question_limit: max question length in tokens
        char_limit: max number of characters in token
        level: token or char
    """

    def __init__(self, emb_folder: str, emb_url: str, save_path: str, load_path: str,
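
# Hypothetical instantiation sketch based on the docstring above; the paths, URL and
# limit values are placeholders, not values from the source:
# vocab_embedder = SquadVocabEmbedder(
#     emb_folder='~/.deeppavlov/downloads/embeddings/',
#     emb_url='http://example.com/glove.840B.300d.txt',          # placeholder URL
#     save_path='squad_model/emb/vocab_embedder.pckl',
#     load_path='squad_model/emb/vocab_embedder.pckl',
#     context_limit=400, question_limit=150, char_limit=16, level='token')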
from collections import defaultdict
import ujson as json
from pathlib import Path
from typing import List, Tuple
import numpy as np
from deeppavlov.core.models.estimator import Estimator
from deeppavlov.core.common.registry import register
@register("unimorph_vectorizer")
class UnimorphDictionaryVectorizer(Estimator):
def __init__(self, save_path, load_path, use_last_word=False,
use_suffixes=False, min_suffix_count=10, max_suffix_length=5, **kwargs):
load_path = Path(load_path).with_suffix(".json")
save_path = Path(save_path).with_suffix(".json")
super().__init__(save_path=save_path, load_path=load_path, **kwargs)
self.use_last_word = use_last_word
self.use_suffixes = use_suffixes
self.min_suffix_count = min_suffix_count
self.max_suffix_length = max_suffix_length
if self.load_path.exists():
self.load()
@property
def pos_number(self):
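
# Hypothetical instantiation sketch (paths are placeholders): save_path and load_path are
# forced to a .json extension by __init__ above, and suffix features are controlled by the
# use_suffixes / min_suffix_count / max_suffix_length arguments.
# vectorizer = UnimorphDictionaryVectorizer(save_path='unimorph/vectorizer',
#                                           load_path='unimorph/vectorizer',
#                                           use_suffixes=True, max_suffix_length=4)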
log.warning("initializing `{}` from scratch".format('LogisticRegression'))
self.model = LogisticRegression(self.penalty, self.dual, self.tol, self.C, self.fit_intercept,
self.intercept_scaling, self.class_weight, self.random_state,
self.solver, self.max_iter, self.multi_class, self.verbose,
self.warm_start, self.n_jobs)
else:
log.warning("No `load_path` is provided for {0}. "
"Initializing `{0}` from scratch".format(self.__class__.__name__))
self.model = LogisticRegression(self.penalty, self.dual, self.tol, self.C, self.fit_intercept,
self.intercept_scaling, self.class_weight, self.random_state,
self.solver, self.max_iter, self.multi_class, self.verbose,
self.warm_start, self.n_jobs)
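
# Equivalent construction sketch using keyword arguments: the positional call above relies
# on scikit-learn's classic LogisticRegression parameter order, which the explicit form
# below makes visible (same attribute names as in the snippet):
# self.model = LogisticRegression(penalty=self.penalty, dual=self.dual, tol=self.tol, C=self.C,
#                                 fit_intercept=self.fit_intercept, intercept_scaling=self.intercept_scaling,
#                                 class_weight=self.class_weight, random_state=self.random_state,
#                                 solver=self.solver, max_iter=self.max_iter, multi_class=self.multi_class,
#                                 verbose=self.verbose, warm_start=self.warm_start, n_jobs=self.n_jobs)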
@register("support_vector_classifier")
class Svm(Estimator):
"""
The class implements the Support Vector Classifier from Sklearn library.
Args:
save_path (str): save path
load_path (str): load path
mode: train/infer trigger
**kwargs: additional arguments
Attributes:
model: Support Vector Classifier class from sklearn
"""
def __init__(self, penalty='l2', loss='squared_hinge', dual=True, tol=1e-4, C=1.0, multi_class='ovr',
fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None,
max_iter=1000, **kwargs) -> None: