# Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.
def test_count_vector_featurizer_oov_token(sentence, expected):
    """A featurizer configured with an OOV token yields the expected text features."""
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    featurizer = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "OOV_token": "__oov__"}
    )

    # A training example needs an intent to count as valid training data.
    training_example = Message(sentence)
    training_example.set("intent", "bla")
    featurizer.train(TrainingData([training_example]))

    processed_message = Message(sentence)
    featurizer.process(processed_message)

    assert np.all(processed_message.get("text_features") == expected)
def test_count_vector_featurizer_char(sentence, expected):
    """Char-level (1-2)-gram featurization produces the expected text features."""
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    featurizer = CountVectorsFeaturizer(
        {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    )

    # A training example needs an intent to count as valid training data.
    training_example = Message(sentence)
    training_example.set("intent", "bla")
    featurizer.train(TrainingData([training_example]))

    processed_message = Message(sentence)
    featurizer.process(processed_message)

    assert np.all(processed_message.get("text_features") == expected)
("ab ab ab", [[0, 0, 1, 1, 1, 0]]),
("abc", [[1, 1, 1, 1, 1]]),
],
)
def test_count_vector_featurizer_char(sentence, expected):
    """Char-level (1-2)-gram featurization with sequence output matches the expected sparse row."""
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    featurizer = CountVectorsFeaturizer(
        {"min_ngram": 1, "max_ngram": 2, "analyzer": "char", "return_sequence": True}
    )

    # A training example needs an intent to count as valid training data.
    training_example = Message(sentence)
    training_example.set("intent", "bla")
    featurizer.train(TrainingData([training_example]))

    processed_message = Message(sentence)
    featurizer.process(processed_message)

    # Only the first row of the sparse sequence features is compared.
    first_row = processed_message.get("text_sparse_features").toarray()[0]
    assert np.all(first_row == expected)
@pytest.mark.parametrize("language, pipeline", pipelines_for_tests())
def test_train_with_empty_data(language, pipeline, component_builder, tmpdir):
    """Training on empty data must still persist and reload a working interpreter."""
    nlu_config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})

    trainer = Trainer(nlu_config, component_builder)
    trainer.train(TrainingData())

    persistor = create_persistor(nlu_config)
    model_path = trainer.persist(
        tmpdir.strpath, persistor, project_name="my_project"
    )

    interpreter = Interpreter.load(model_path, component_builder)
    assert interpreter.pipeline
    # Parsing must not fail even though no examples were trained on.
    assert interpreter.parse("hello") is not None
    assert interpreter.parse("Hello today is Monday, again!") is not None
def test_spacy_training_sample_alignment(spacy_nlp_component):
    """Docs returned for training data stay aligned with the messages, including empty text."""
    from spacy.tokens import Doc

    messages = [
        Message.build(text="I have a feeling", intent="feeling"),
        Message.build(text="", intent="feeling"),
        Message.build(text="I am the last message", intent="feeling"),
    ]
    training_data = TrainingData(training_examples=messages)

    attribute_docs = spacy_nlp_component.docs_for_training_data(training_data)
    text_docs = attribute_docs["text"]

    # Every message, even the empty one, gets its own Doc in order.
    assert isinstance(text_docs[0], Doc)
    assert isinstance(text_docs[1], Doc)
    assert isinstance(text_docs[2], Doc)

    assert [token.text for token in text_docs[0]] == ["i", "have", "a", "feeling"]
    assert [token.text for token in text_docs[1]] == []
    assert [token.text for token in text_docs[2]] == [
        "i",
        "am",
        "the",
        "last",
        "message",
    ]
@pytest.mark.parametrize("language, pipeline", pipelines_for_tests())
def test_train_with_empty_data(language, pipeline, component_builder, tmpdir):
    """Training on empty data must still persist and reload a working interpreter."""
    nlu_config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})

    trainer = Trainer(nlu_config, component_builder)
    trainer.train(TrainingData())

    persistor = create_persistor(nlu_config)
    model_path = trainer.persist(tmpdir.strpath, persistor)

    interpreter = Interpreter.load(model_path, component_builder)
    assert interpreter.pipeline
    # Parsing must not fail even though no examples were trained on.
    assert interpreter.parse("hello") is not None
    assert interpreter.parse("Hello today is Monday, again!") is not None
def test_mitie_featurizer_train(mitie_feature_extractor):
    """Training the MITIE featurizer sets dense features for both TEXT and RESPONSE.

    Fix: the original ended with a dead assignment
    ``vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE])`` and no assertions on it.
    Since RESPONSE is set to the exact same sentence as TEXT, the RESPONSE
    features must satisfy the same checks as the TEXT features.
    """
    featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    # Tokens are required before the featurizer can run.
    MitieTokenizer().train(TrainingData([message]))
    featurizer.train(
        TrainingData([message]),
        RasaNLUModelConfig(),
        **{"mitie_feature_extractor": mitie_feature_extractor},
    )

    # Reference values for the first token and the trailing CLS vector.
    expected = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
    assert len(message.get(TOKENS_NAMES[TEXT])) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    # RESPONSE holds the identical sentence, so it gets identical features.
    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE])
    assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
def _read_intent(
    self, intent_js: Dict[Text, Any], examples_js: List[Dict[Text, Any]]
) -> "TrainingData":
    """Build a TrainingData object from a Dialogflow intent json and its example jsons."""
    from rasa.nlu.training_data import Message, TrainingData

    intent_name = intent_js.get("name")

    messages = []
    for example in examples_js:
        # Each example stores its text as a list of chunks that must be joined.
        text, entities = self._join_text_chunks(example["data"])
        messages.append(Message.build(text, intent_name, entities))

    return TrainingData(messages)
if fformat not in {DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES}:
raise ValueError(
"fformat must be either {}, or {}"
"".format(DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES)
)
root_js = rasa.utils.io.read_json_file(fn)
examples_js = self._read_examples_js(fn, language, fformat)
if not examples_js:
raise_warning(
f"No training examples found for dialogflow file {fn}!",
docs=DOCS_URL_MIGRATE_GOOGLE,
)
return TrainingData()
elif fformat == DIALOGFLOW_INTENT:
return self._read_intent(root_js, examples_js)
else: # path for DIALOGFLOW_ENTITIES
return self._read_entities(root_js, examples_js)