# -*- coding: utf-8 -*-
#
# Copyright (c) 2015 Cisco Systems, Inc. and others. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module contains the base class for all the machine-learned classifiers in MindMeld.
"""
import json
import logging
import os
from abc import ABC, abstractmethod
from ..constants import DEFAULT_TEST_SET_REGEX, DEFAULT_TRAIN_SET_REGEX
from ..core import Query
from ..exceptions import ClassifierLoadError
from ..models import ModelConfig, create_model, load_model
from ..resource_loader import ProcessedQueryList
# Module-level logger, named after this module so log records can be filtered per module.
logger = logging.getLogger(__name__)
class ClassifierConfig:
    """A value object representing a classifier configuration.

    Attributes:
        model_type (str): The name of the model type. Will be used to find the
            model class to instantiate.
        model_settings (dict): Settings specific to the model type specified.
        params (dict): Params to pass to the underlying classifier.
        param_selection (dict): Configuration for param selection (using cross
            validation). For example:
            {'type': 'shuffle',
             'n': 3,
             'k': 10,
             'n_jobs': 2,
             'scoring': '',
             'grid': {}
            }
        features (dict): The keys are the names of feature extractors and the
            values are either a kwargs dict which will be passed into the
            feature extractor function, or a callable which will be used to
            extract features.
    """

    __slots__ = [
        "model_type",
        "features",
        "model_settings",
        "params",
        "param_selection",
    ]

    # Keys produced by a model config's ``to_dict()`` that have no counterpart
    # in a classifier config; they are dropped by ``from_model_config``.
    _MODEL_ONLY_KEYS = (
        "example_type",
        "label_type",
        "train_label_set",
        "test_label_set",
    )

    def __init__(
        self,
        model_type=None,
        features=None,
        model_settings=None,
        params=None,
        param_selection=None,
    ):
        """Initializes a classifier configuration.

        Args:
            model_type (str): The name of the model type. Required even though it
                has a ``None`` default in the signature.
            features (dict, optional): Feature extractor configuration.
            model_settings (dict, optional): Settings specific to the model type.
            params (dict, optional): Params to pass to the underlying classifier.
            param_selection (dict, optional): Configuration for param selection.

        Raises:
            TypeError: If ``model_type`` is None.
        """
        for arg, val in {"model_type": model_type}.items():
            if val is None:
                raise TypeError("__init__() missing required argument {!r}".format(arg))
        self.model_type = model_type
        self.features = features
        self.model_settings = model_settings
        self.params = params
        self.param_selection = param_selection

    def to_dict(self):
        """Converts the model config object into a dict.

        Returns:
            (dict): A dict version of the config, with one entry per slot.
        """
        return {attr: getattr(self, attr) for attr in self.__slots__}

    def __repr__(self):
        args_str = ", ".join(
            "{}={!r}".format(key, getattr(self, key)) for key in self.__slots__
        )
        return "{}({})".format(self.__class__.__name__, args_str)

    @classmethod
    def from_model_config(cls, model_config):
        """Builds a classifier config from a model config object.

        Args:
            model_config: An object exposing ``to_dict()``. The returned dict is
                stripped of the model-only keys (``example_type``, ``label_type``,
                ``train_label_set``, ``test_label_set``) and the remainder is
                passed to the constructor as keyword arguments.

        Returns:
            ClassifierConfig: The corresponding classifier configuration.
        """
        config = model_config.to_dict()
        for key in cls._MODEL_ONLY_KEYS:
            # Tolerate configs that do not carry every model-only key instead of
            # failing with a KeyError.
            config.pop(key, None)
        return cls(**config)

    def to_json(self):
        """Converts the model config object to JSON.

        Returns:
            (str): JSON representation of the classifier config, with sorted keys.
        """
        return json.dumps(self.to_dict(), sort_keys=True)
class Classifier(ABC):
    """The base class for all the machine-learned classifiers in MindMeld.

    A classifier is a machine-learned model that categorizes input examples into one of the
    pre-determined class labels. Among other functionality, each classifier provides means by
    which to fit a statistical model on a given training dataset and then use the trained model
    to make predictions on new unseen data.

    Attributes:
        ready (bool): Whether the classifier is ready.
        dirty (bool): Whether the classifier has unsaved changes to its model.
        config (ClassifierConfig): The classifier configuration.
        hash (str): A hash representing the inputs into the model.
    """

    CLF_TYPE = None
    """Classifier type (`str`)."""

    def __init__(self, resource_loader):
        """Initializes a classifier.

        Args:
            resource_loader (ResourceLoader): An object which can load resources for the
                classifier.
        """
        self._resource_loader = resource_loader
        self._model = None  # will be set when model is fit or loaded
        self.ready = False
        self.dirty = False
        self.config = None
        self.hash = ""

    def fit(self,
            queries=None,
            label_set=None,
            incremental_timestamp=None,
            load_cached=True,
            **kwargs):
        """Trains a statistical model for classification using the provided training examples
        and model configuration.

        Args:
            queries (list(ProcessedQuery) or ProcessedQueryList, optional): A list of queries
                to train on. If not specified the queries will be loaded from the label_set.
            label_set (str): A label set to load. If not specified, the train label set of
                the model config is used, falling back to the default training set.
            incremental_timestamp (str, optional): The timestamp folder to cache models in.
                A cached model is only reused when this is set.
            load_cached (bool): If the model is cached on disk, load it into memory.
            **kwargs: Runtime configuration overrides forwarded to ``_get_model_config``.
                Commonly used keys:

                model_type (str): The type of machine learning model to use.
                model_settings (dict): Settings specific to the model type specified.
                params (dict): Params to pass to the underlying classifier.
                param_selection (dict): Configuration for param selection (using
                    cross-validation), for example:
                    {'type': 'shuffle',
                     'n': 3,
                     'k': 10,
                     'n_jobs': 2,
                     'scoring': '',
                     'grid': { 'C': [100, 10000, 1000000]}}
                features (dict): The keys are the names of feature extractors and the
                    values are either a kwargs dict which will be passed into the
                    feature extractor function, or a callable which will be used to
                    extract features.

        Returns:
            bool: False only if a valid cached model exists but was not loaded (controlled by
            the load_cached arg). True in every other case, including when fitting was
            skipped because no examples were found or there were fewer than two classes.

        Examples:
            Fit using the default configuration.

                >>> clf.fit()

            Fit using a 'special' label set.

                >>> clf.fit(label_set='special')

            Fit using given params, bypassing cross-validation. This is useful for speeding up
            train times if you are confident the params are optimized.

                >>> clf.fit(params={'C': 10000000})

            Fit using given parameter selection settings (also known as cross-validation
            settings).

                >>> clf.fit(param_selection={})

            Fit using a custom set of features, including a custom feature extractor.
            This is only for advanced users.

                >>> clf.fit(features={
                        'in-gaz': {},  # gazetteer features
                        'contrived': lambda exa, res: {'contrived': len(exa.text) == 26}
                    })
        """
        # create model with given params
        model_config = self._get_model_config(**kwargs)
        model = create_model(model_config)

        # resolve query set
        label_set = label_set or model_config.train_label_set or DEFAULT_TRAIN_SET_REGEX
        queries = self._resolve_queries(queries, label_set)

        # The hash covers queries, config and resources, so an existing cache entry for it
        # means an equivalent model was already trained.
        new_hash = self._get_model_hash(model_config, queries)
        cached_model_path = self._resource_loader.hash_to_model_path.get(new_hash)

        if incremental_timestamp and cached_model_path:
            logger.info("No need to fit. Previous model is cached.")
            if load_cached:
                self.load(cached_model_path)
                return True
            return False

        examples, labels = self._get_examples_and_labels(queries)

        if not examples:
            logger.warning(
                "Could not fit model since no relevant examples were found. "
                'Make sure the labeled queries for training are placed in "%s" '
                "files in your MindMeld project.",
                label_set,
            )
            return True

        num_labels = len(set(labels))
        if num_labels <= 1:
            # num_labels is 0 or 1 here, so it doubles as the index of the message.
            phrase = ["are no classes", "is only one class"][num_labels]
            logger.info("Not doing anything for fit since there %s.", phrase)
            return True

        model.initialize_resources(self._resource_loader, examples, labels)
        model.fit(examples, labels)

        self._model = model
        self.config = ClassifierConfig.from_model_config(self._model.config)
        self.hash = new_hash
        self.ready = True
        self.dirty = True
        return True

    def _resolve_queries(self, queries=None, label_set=None):
        """Resolve queries and/or label_set into a ProcessedQueryList.

        queries is preferred over label_set.

        Args:
            queries (ProcessedQueryList or list(ProcessedQuery)): A set of queries to use for
                the operation.
            label_set (str): The label set to load queries from; only used when ``queries``
                is empty or None.

        Returns:
            ProcessedQueryList: The set of queries
        """
        if not queries:
            queries = self._get_queries_from_label_set(label_set)
        elif not isinstance(queries, ProcessedQueryList):
            queries = ProcessedQueryList.from_in_memory_list(queries)
        return queries

    def predict(self, query, time_zone=None, timestamp=None, dynamic_resource=None):
        """Predicts a class label for the given query using the trained classification model.

        Args:
            query (Query or str): The input query
            time_zone (str, optional): The name of an IANA time zone, such as
                'America/Los_Angeles', or 'Asia/Kolkata'
                See the [tz database](https://www.iana.org/time-zones) for more information.
            timestamp (long, optional): A unix time stamp for the request (in seconds).
            dynamic_resource (dict, optional): A dynamic resource to aid NLP inference

        Returns:
            str: The predicted class label, or None (after logging an error) if no model
            has been fit or loaded.
        """
        if not self._model:
            logger.error("You must fit or load the model before running predict")
            return None
        if not isinstance(query, Query):
            query = self._resource_loader.query_factory.create_query(
                query, time_zone=time_zone, timestamp=timestamp
            )
        return self._model.predict([query], dynamic_resource=dynamic_resource)[0]

    def predict_proba(
        self, query, time_zone=None, timestamp=None, dynamic_resource=None
    ):
        """Runs prediction on a given query and generates multiple hypotheses with their
        associated probabilities using the trained classification model.

        Args:
            query (Query or str): The input query
            time_zone (str, optional): The name of an IANA time zone, such as
                'America/Los_Angeles', or 'Asia/Kolkata'
                See the [tz database](https://www.iana.org/time-zones) for more information.
            timestamp (long, optional): A unix time stamp for the request (in seconds).
            dynamic_resource (dict, optional): A dynamic resource to aid NLP inference

        Returns:
            list: a list of tuples of the form (str, float) grouping predicted class labels
            and their probabilities, sorted by descending probability. An empty list (after
            logging an error) if no model has been fit or loaded.
        """
        if not self._model:
            logger.error("You must fit or load the model before running predict_proba")
            return []
        if not isinstance(query, Query):
            query = self._resource_loader.query_factory.create_query(
                query, time_zone=time_zone, timestamp=timestamp
            )
        predict_proba_result = self._model.predict_proba(
            [query], dynamic_resource=dynamic_resource
        )
        # The second element of the first result is read as a label -> probability mapping.
        class_proba_tuples = list(predict_proba_result[0][1].items())
        return sorted(class_proba_tuples, key=lambda x: x[1], reverse=True)

    def evaluate(self, queries=None, label_set=None, fetch_distribution=False):
        """Evaluates the trained classification model on the given test data.

        Args:
            queries (Optional(list(ProcessedQuery))): optional list of queries to evaluate
            label_set (str): The label set to use for evaluation. If not specified, the test
                label set of the model config is used, falling back to the default test set.
            fetch_distribution (bool): Forwarded to the model's ``evaluate`` only when the
                classifier's model type is 'tagger'; ignored otherwise.

        Returns:
            ModelEvaluation: A ModelEvaluation object that contains evaluation results, or
            None if no model is available or no relevant examples were found.
        """
        if not self._model:
            logger.error("You must fit or load the model before running evaluate.")
            return None

        model_config = self._get_model_config()
        label_set = label_set or model_config.test_label_set or DEFAULT_TEST_SET_REGEX
        queries = self._resolve_queries(queries, label_set)
        examples, labels = self._get_examples_and_labels(queries)

        if not examples:
            logger.info(
                "Could not evaluate model since no relevant examples were found. Make sure "
                'the labeled queries for evaluation are placed in "%s" files '
                "in your MindMeld project.",
                label_set,
            )
            return None

        # enables fetching probability distribution for entity recognizer
        kwargs = {}
        if self.config.model_type == 'tagger':
            kwargs["fetch_distribution"] = fetch_distribution
        evaluation = self._model.evaluate(examples, labels, **kwargs)
        return evaluation

    def inspect(self, query, gold_label=None, dynamic_resource=None):
        """Not implemented in the base class; always raises NotImplementedError."""
        raise NotImplementedError

    @staticmethod
    def _get_model_config(loaded_config=None, **kwargs):
        """Updates the loaded configuration with runtime specified options, and creates a model
        configuration object with the final configuration dictionary. If an application config
        exists it should be passed in, if not the default config should be passed in.

        Args:
            loaded_config (dict, optional): The application or default configuration. It is
                not modified by this call.
            **kwargs: Runtime overrides. A ``params`` override is merged on top of the
                ``params`` of ``loaded_config`` rather than replacing them wholesale.

        Returns:
            ModelConfig: The model configuration corresponding to the provided config name
        """
        # loaded_config defaults to None, so it must be checked before the membership test.
        if loaded_config and 'params' in loaded_config and 'params' in kwargs:
            base_params = loaded_config['params']
            override_params = kwargs['params']
            if base_params is not None and override_params is not None:
                kwargs['params'] = {**base_params, **override_params}
        try:
            # If all params required for model config were passed in, use kwargs
            return ModelConfig(**kwargs)
        except (TypeError, ValueError):
            # Use application specified or default config, customizing with provided kwargs
            if not loaded_config:
                logger.warning("loaded_config is not passed in")
            # Work on a copy so the caller's dictionary is left untouched.
            model_config = dict(loaded_config or {})
            model_config.update(kwargs)
            return ModelConfig(**model_config)

    def dump(self, model_path, incremental_model_path=None):
        """Persists the trained classification model to disk.

        For every non-empty path this runs the classifier specific ``_dump``, dumps the
        model if there is one, and writes the model hash to ``<path>.hash``. The ``dirty``
        flag is cleared once the primary ``model_path`` has been written.

        Args:
            model_path (str): The location on disk where the model should be stored.
            incremental_model_path (str, optional): The timestamp folder where the cached
                models are stored.
        """
        for path in [model_path, incremental_model_path]:
            if not path:
                continue

            # classifier specific dump
            self._dump(path)

            # model specific dump
            if self._model:
                # sometimes a model might be NoneType, eg. in role classifiers, in which case,
                # no dumping is required. While loading such models, the model_path (.pkl)
                # will not be found and the helpers.load_model() will return None, which makes
                # it backwards compatible to loading a NoneType model
                self._model.dump(path)

            hash_path = path + ".hash"
            hash_dir = os.path.dirname(hash_path)
            if hash_dir:
                # A bare file name has an empty dirname, which os.makedirs rejects.
                os.makedirs(hash_dir, exist_ok=True)
            with open(hash_path, "w", encoding="utf-8") as hash_file:
                hash_file.write(self.hash)

            if path == model_path:
                self.dirty = False

    def _dump(self, path):
        """Hook for classifier specific persistence; a no-op in the base class."""

    @staticmethod
    def _get_classifier_resources_save_path(model_path):
        """Derives the classifier resources path from a model path.

        Inserts ``.classifier_resources`` before the extension of ``model_path`` and makes
        sure the containing directory exists.

        Args:
            model_path (str): The location on disk where the model is stored.

        Returns:
            str: The path at which classifier resources should be saved.
        """
        head, ext = os.path.splitext(model_path)
        classifier_resources_save_path = head + ".classifier_resources" + ext
        save_dir = os.path.dirname(classifier_resources_save_path)
        if save_dir:
            # A bare file name has an empty dirname, which os.makedirs rejects.
            os.makedirs(save_dir, exist_ok=True)
        return classifier_resources_save_path

    def unload(self):
        """
        Unloads the model from memory. This helps reduce memory requirements while
        training other models.
        """
        self._model = None
        self.config = None
        self.ready = False

    def load(self, model_path):
        """Loads the trained classification model from disk.

        Args:
            model_path (str): The location on disk where the model is stored

        Raises:
            ClassifierLoadError: If the loaded model has no ``mindmeld_version`` attribute.
        """
        self._model = load_model(model_path)

        # validate and initialize resources
        if self._model is not None:
            if not hasattr(self._model, "mindmeld_version"):
                msg = (
                    "Your trained models are incompatible with this version of MindMeld. "
                    "Please run a clean build to retrain models"
                )
                raise ClassifierLoadError(msg)

            try:
                self._model.config.to_dict()
            except AttributeError:
                # Loaded model config is incompatible with app config.
                self._model.config.resolve_config(self._get_model_config())

            self._model.initialize_resources(self._resource_loader)
            self.config = ClassifierConfig.from_model_config(self._model.config)

        # A missing model (None) still leaves the classifier marked as ready.
        self.hash = self._load_hash(model_path)
        self.ready = True
        self.dirty = False

    @staticmethod
    def _load_hash(model_path):
        """Reads the hash stored next to a model.

        Args:
            model_path (str): The location on disk where the model is stored.

        Returns:
            str: The contents of ``<model_path>.hash``, or an empty string if that file
            does not exist.
        """
        hash_path = model_path + ".hash"
        if not os.path.isfile(hash_path):
            return ""
        with open(hash_path, "r", encoding="utf-8") as hash_file:
            model_hash = hash_file.read()
        return model_hash

    @abstractmethod
    def _get_queries_from_label_set(self, label_set=DEFAULT_TRAIN_SET_REGEX):
        """Returns the set of queries loaded from the label_set

        Args:
            label_set (list, optional): A label set to load. If not specified,
                the default training set will be loaded.

        Returns:
            ProcessedQueryList
        """
        raise NotImplementedError("Subclasses must implement this method")

    @abstractmethod
    def _get_examples_and_labels_hash(self, queries):
        """Returns a hashed string representing the labeled queries

        Args:
            queries (ProcessedQueryList): The queries used to fit this model
        """
        raise NotImplementedError("Subclasses must implement this method")

    @abstractmethod
    def _get_examples_and_labels(self, queries):
        """Extracts examples and labels from the queries

        Args:
            queries (ProcessedQueryList): The queries to extract examples and labels from

        Returns:
            tuple(ProcessedQueryList.Iterator(Any),
                  ProcessedQueryList.Iterator(Any)): A tuple of iterators
                [0]: the examples, [1]: the labels
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _get_model_hash(self, model_config, queries):
        """Returns a hash representing the inputs into the model

        The hash combines the hash of the labeled queries, the hash of the JSON form of the
        model config, and the hashes of the resources the config requires.

        Args:
            model_config (ModelConfig): The model configuration
            queries (ProcessedQueryList): The queries used to fit this model

        Returns:
            str: The hash
        """
        # Hash queries
        queries_hash = self._get_examples_and_labels_hash(queries)

        # Hash config
        config_hash = self._resource_loader.hash_string(model_config.to_json())

        # Hash resources (sorted so the result does not depend on iteration order)
        rsc_strings = [
            self._resource_loader.hash_feature_resource(resource)
            for resource in sorted(model_config.required_resources())
        ]
        rsc_hash = self._resource_loader.hash_list(rsc_strings)

        return self._resource_loader.hash_list([queries_hash, config_hash, rsc_hash])

    def __repr__(self):
        msg = "<{} ready: {!r}, dirty: {!r}>"
        return msg.format(self.__class__.__name__, self.ready, self.dirty)