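# Standard-library imports used below; the decorator and resource-constant
# names are assumed to come from the package's models helpers module.
import math
from collections import Counter, defaultdict

from .helpers import (
    CHAR_NGRAM_FREQ_RSC,
    DEFAULT_SYS_ENTITIES,
    ENABLE_STEMMING,
    GAZETTEER_RSC,
    QUERY_FREQ_RSC,
    WORD_FREQ_RSC,
    WORD_NGRAM_FREQ_RSC,
    register_query_feature,
    requires,
)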


@register_query_feature(feature_name="word-shape")
def extract_word_shape(lengths=(1,), **args):
    """
    Extracts word shape for ngrams of specified lengths.

    Args:
        lengths (list of int): The ngram lengths to extract
    Returns:
        (function) A feature extraction function that takes a query and \
            returns ngrams of word shapes, for n of specified lengths.
    """
    del args

    def word_shape_basic(token):
        # example: option --> xxxxx+, 123 --> ddd, call --> xxxx
        shape = ["d" if character.isdigit() else "x" for character in token]
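        # Sketch of how the helper could finish, matching the example in the
        # comment above; the five-character cutoff is an assumption.
        if len(shape) > 5:
            return "".join(shape[:5]) + "+"
        return "".join(shape)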


@register_query_feature(feature_name="sys-candidates")
def extract_sys_candidates(entities=None, **args):
    """
    Return an extractor for features based on a heuristic guess of system entity \
        candidates in the current query.

    Args:
        entities (iterable, optional): The system entity types to consider;
            defaults to DEFAULT_SYS_ENTITIES.
    Returns:
        (function) The feature extractor.
    """
    del args
    entities = entities or DEFAULT_SYS_ENTITIES

    def _extractor(query, resources):
        del resources
        system_entities = query.get_system_entity_candidates(list(entities))
        sys_ent_counter = Counter()
        for entity in system_entities:
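            # Completion sketch for the truncated loop; the attribute access and
            # the feature-key format are assumptions.
            sys_ent_counter[entity.entity.type] += 1
        return {
            "sys_candidate|type:{}".format(entity_type): count
            for entity_type, count in sys_ent_counter.items()
        }

    return _extractor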


@register_query_feature(feature_name="exact")
@requires(QUERY_FREQ_RSC)
def extract_query_string(scaling=1000, **args):
    """
    Extract the whole query string as a feature.

    Args:
        scaling (int): The feature value assigned to an exact query match.
    Returns:
        (function) A feature extraction function that takes a query and \
            returns the whole query string for exact matching.
    """

    def _extractor(query, resources):
        query_key = "<{}>".format(query.normalized_text)
        if query_key in resources[QUERY_FREQ_RSC]:
            return {"exact|query:{}".format(query_key): scaling}
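        # Fallback sketch (the out-of-vocabulary key is an assumption): keep the
        # feature defined for queries not seen in the training data.
        return {"exact|query:<OOV>": scaling}

    return _extractor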


@register_query_feature(feature_name="bag-of-words-seq")
@requires(WORD_NGRAM_FREQ_RSC)
def extract_bag_of_words_features(
    ngram_lengths_to_start_positions, thresholds=(0,), **args
):
    """Returns a bag-of-words feature extractor.

    Args:
        ngram_lengths_to_start_positions (dict): A map from n-gram length to the
            start positions, relative to the current token, at which n-grams of
            that length are extracted
        thresholds (tuple of int): Frequency cut-off values for including a word
            in the n-gram vocab, one per n-gram length
    Returns:
        (function) The feature extractor.
    """
    threshold_list = list(thresholds)
    word_thresholds = threshold_list + [0] * (
        len(ngram_lengths_to_start_positions.keys()) - len(threshold_list)
    )
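
    # Illustrative configuration shape (example values, not from the source):
    # unigrams around the current token and bigrams starting at it.
    #   ngram_lengths_to_start_positions = {1: [-1, 0, 1], 2: [0]}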


@register_query_feature(feature_name="in-gaz")
@requires(GAZETTEER_RSC)
def extract_in_gaz_feature(scaling=1, **args):
    """Returns a feature extractor that generates a set of features indicating the
    presence of query n-grams in different entity gazetteers. Used by the domain
    and intent classifiers when the 'in-gaz' feature is specified in the config.

    Args:
        scaling (int): A multiplicative scale factor applied to the ``ratio_pop``
            and ``ratio`` features of the in-gaz feature set.
    Returns:
        (function) The feature extractor.
    """
    del args

    def _extractor(query, resources):
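        # Sketch of the kind of features the truncated body produces (key names
        # after the "in_gaz|type:" prefix are illustrative): for each gazetteer
        # containing a query n-gram, emit indicator and scaled ratio features,
        # e.g. {"in_gaz|type:city": 1, "in_gaz|type:city|ratio": 0.4 * scaling}.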


@register_query_feature(feature_name="length")
def extract_length(**args):
    """
    Extract length measures (tokens and chars; linear and log) on the whole query.

    Returns:
        (function) A feature extraction function that takes a query and \
            returns the number of tokens and characters on linear and log scales.
    """
    del args

    def _extractor(query, resources):
        del resources
        tokens = len(query.normalized_tokens)
        chars = len(query.normalized_text)
        return {
            "tokens": tokens,
            # the remaining measures follow the docstring; the log-feature key
            # names are assumptions
            "chars": chars,
            "tokens_log": math.log(tokens + 1),
            "chars_log": math.log(chars + 1),
        }

    return _extractor


@register_query_feature(feature_name="char-ngrams-seq")
@requires(CHAR_NGRAM_FREQ_RSC)
def extract_char_ngrams_features(
    ngram_lengths_to_start_positions, thresholds=(0,), **args
):
    """Returns a character n-gram feature extractor.

    Args:
        ngram_lengths_to_start_positions (dict):
            The window of tokens to be considered relative to the
            current token while extracting char n-grams
        thresholds (tuple of int): Frequency cut-off values for including a
            character n-gram in the n-gram vocab
    Returns:
        (function) The feature extractor.
    """
    del args
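
    # Assumed to mirror the bag-of-words extractor above: pad the thresholds so
    # every n-gram length has a cut-off (variable name is illustrative).
    threshold_list = list(thresholds)
    char_thresholds = threshold_list + [0] * (
        len(ngram_lengths_to_start_positions.keys()) - len(threshold_list)
    )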


@register_query_feature(feature_name="in-gaz-ngram-seq")
@requires(GAZETTEER_RSC)
def extract_in_gaz_ngram_features(**args):
    """Returns a feature extractor for surrounding ngrams in gazetteers."""
    del args

    def _extractor(query, resources):
        def get_ngram_gaz_features(query, gazes, entity_type):
            tokens = query.normalized_tokens
            feat_seq = [{} for _ in tokens]

            for i, _ in enumerate(feat_seq):
                feat_prefix = "in_gaz|type:{}|ngram".format(entity_type)

                # entity PMI and conditional prob
                p_total = (


@register_query_feature(feature_name="gaz-freq")
@requires(GAZETTEER_RSC)
@requires(WORD_FREQ_RSC)
def extract_gaz_freq(**args):
    """
    Extract frequency bin features for each gazetteer.

    Returns:
        (function) A feature extraction function that returns the log of the \
            count of query tokens within each gazetteer's frequency bins.
    """
    del args

    def _extractor(query, resources):
        tokens = query.normalized_tokens
        freq_features = defaultdict(int)
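        # Sketch of the truncated loop, based on the docstring: for each token
        # found in a gazetteer, increment a counter keyed by the gazetteer name
        # and the token's frequency bin, then log-scale the accumulated counts
        # before returning them.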


@register_query_feature(feature_name="enable-stemming")
@requires(ENABLE_STEMMING)
def enabled_stemming(**args):
    """Feature extractor for enabling stemming of the query.

    The returned extractor is a no-op; requiring the ENABLE_STEMMING resource is
    what switches stemming on.
    """
    del args

    def _extractor(query, resources):
        # no op
        del query
        del resources

    return _extractor