Source code for mindmeld.models.features.entity_features
# -*- coding: utf-8 -*-
#
# Copyright (c) 2015 Cisco Systems, Inc. and others. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module contains feature extractors for entities"""
from ..helpers import GAZETTEER_RSC, get_ngram, register_entity_feature, requires
from ...core import QueryEntity, Query, NestedEntity
@register_entity_feature(feature_name="in-gaz")
@requires(GAZETTEER_RSC)
def extract_in_gaz_features(**kwargs):
    """Returns a feature extractor that finds any gazetteer matches against the input query.

    Args:
        **kwargs: Accepted for interface compatibility and discarded.

    Returns:
        (function) The feature extractor.
    """
    del kwargs

    def _extractor(example, resources):
        """Flags every gazetteer whose ``pop_dict`` contains the current entity.

        Args:
            example (tuple): A ``(query, entities, entity_index)`` triple; only
                ``entities[entity_index]`` is used.
            resources (dict): Must hold the gazetteers under ``GAZETTEER_RSC``,
                as a mapping of gazetteer name to a mapping with a
                ``"pop_dict"`` entry.

        Returns:
            (dict) ``"in_gaz|type:<gazetteer name>"`` mapped to ``1`` for each
                gazetteer that matched.

        Raises:
            TypeError: If there is at least one gazetteer and the current
                entity is not a ``Query``, ``QueryEntity`` or ``NestedEntity``.
        """
        _, entities, entity_index = example
        current_entity = entities[entity_index]
        domain_gazes = resources[GAZETTEER_RSC]

        # With no gazetteers there is nothing to look up. Returning here also
        # preserves the historical behaviour of not type-checking the entity
        # in that case.
        if not domain_gazes:
            return {}

        # The lookup key depends only on the entity, so build (and validate)
        # it once instead of once per gazetteer.
        if isinstance(current_entity, (QueryEntity, NestedEntity)):
            normalized_tokens = tuple(current_entity.normalized_text.split())
        elif isinstance(current_entity, Query):
            normalized_tokens = current_entity.normalized_tokens
        else:
            raise TypeError(
                f"{current_entity} is of unknown type, expected Query, "
                f"NestedEntity or QueryEntity type")

        return {
            "in_gaz|type:{}".format(gaz_name): 1
            for gaz_name, gaz in domain_gazes.items()
            if normalized_tokens in gaz["pop_dict"]
        }

    return _extractor
@register_entity_feature(feature_name="bag-of-words-before")
def extract_bag_of_words_before_features(ngram_lengths_to_start_positions, **kwargs):
    """Returns a bag-of-words feature extractor anchored at the entity's start.

    Args:
        ngram_lengths_to_start_positions (dict): Maps each n-gram length to an
            iterable of start offsets. Every offset is added to the current
            entity's ``token_span.start`` to locate the n-gram.
        **kwargs: Accepted for interface compatibility and discarded.

    Returns:
        (function) The feature extractor.
    """
    del kwargs

    def _extractor(example, resources):
        """Builds one feature per configured (length, offset) pair.

        Each feature is named
        ``"bag_of_words|ngram_before|length:<length>|pos:<offset>"`` and its
        value is whatever ``get_ngram`` returns for the query's normalized
        tokens at that position and length.
        """
        del resources
        query, entities, entity_index = example
        query_tokens = query.normalized_tokens
        anchor = entities[entity_index].token_span.start
        name_template = "bag_of_words|ngram_before|length:{}|pos:{}"

        return {
            name_template.format(ngram_len, offset): get_ngram(
                query_tokens, anchor + offset, ngram_len
            )
            for ngram_len, offsets in ngram_lengths_to_start_positions.items()
            for offset in offsets
        }

    return _extractor
@register_entity_feature(feature_name="bag-of-words-after")
def extract_bag_of_words_after_features(ngram_lengths_to_start_positions, **kwargs):
    """Returns a bag-of-words feature extractor anchored at the entity's end.

    Args:
        ngram_lengths_to_start_positions (dict): Maps each n-gram length to an
            iterable of start offsets. Every offset is added to the current
            entity's ``token_span.end`` to locate the n-gram.
        **kwargs: Accepted for interface compatibility and discarded.

    Returns:
        (function) The feature extractor.
    """
    del kwargs

    def _extractor(example, resources):
        """Builds one feature per configured (length, offset) pair.

        Each feature is named
        ``"bag_of_words|ngram_after|length:<length>|pos:<offset>"`` and its
        value is whatever ``get_ngram`` returns for the query's normalized
        tokens at that position and length.
        """
        del resources
        query, entities, entity_index = example
        query_tokens = query.normalized_tokens
        anchor = entities[entity_index].token_span.end
        name_template = "bag_of_words|ngram_after|length:{}|pos:{}"

        return {
            name_template.format(ngram_len, offset): get_ngram(
                query_tokens, anchor + offset, ngram_len
            )
            for ngram_len, offsets in ngram_lengths_to_start_positions.items()
            for offset in offsets
        }

    return _extractor
@register_entity_feature(feature_name="numeric")
def extract_numeric_candidate_features(**kwargs):
    """Returns a feature extractor that marks ``sys_time`` and ``sys_interval`` candidates.

    These numeric entities are identified by duckling, the numerical entity
    recognition service, and boosted by training data containing the entity
    labels. Used by the role classifier when the ``'numeric'`` feature is
    specified in the config.

    Args:
        **kwargs: Accepted for interface compatibility and discarded.

    Returns:
        (function) The feature extractor.
    """
    del kwargs

    def _extractor(example, resources):
        """Emits one feature per token position covered by each candidate.

        Features are named ``"sys_candidate|type:<entity type>|pos:<position>"``
        and always have the value ``1``. Only the query of the example is used.
        """
        del resources
        query, _, _ = example
        candidates = query.get_system_entity_candidates(["sys_time", "sys_interval"])
        name_template = "sys_candidate|type:{}|pos:{}"

        return {
            name_template.format(candidate.entity.type, position): 1
            for candidate in candidates
            for position in candidate.token_span
        }

    return _extractor
@register_entity_feature(feature_name="other-entities")
def extract_other_entities_features(**kwargs):
    """Returns a feature extractor for all other entities apart from the current entity.

    Used by the role classifier when the ``'other-entities'`` feature is
    specified in the config.

    Args:
        **kwargs: Accepted for interface compatibility and discarded.

    Returns:
        (function) The feature extractor.
    """
    del kwargs

    def _extractor(example, resources):
        """Records the type of every entity except the one at ``entity_index``.

        Features are named ``"other_entities|type:<entity type>"`` and always
        have the value ``1``; entities sharing a type collapse into one feature.
        """
        del resources
        _, entities, entity_index = example
        name_template = "other_entities|type:{}"

        return {
            name_template.format(other.entity.type): 1
            for position, other in enumerate(entities)
            if position != entity_index
        }

    return _extractor