Source code for mindmeld.components.classifier

# -*- coding: utf-8 -*-
#
# Copyright (c) 2015 Cisco Systems, Inc. and others.  All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This module contains the base class for all the machine-learned classifiers in MindMeld.
"""
import json
import logging
import os
from abc import ABC, abstractmethod

from ..constants import DEFAULT_TEST_SET_REGEX, DEFAULT_TRAIN_SET_REGEX
from ..core import Query
from ..exceptions import ClassifierLoadError
from ..models import ModelConfig, create_model, load_model
from ..resource_loader import ProcessedQueryList

logger = logging.getLogger(__name__)


class ClassifierConfig:
    """A value object holding the configuration of a classifier.

    Attributes:
        model_type (str): The name of the model type. Will be used to find the
            model class to instantiate.
        features (dict): The keys are the names of feature extractors and the
            values are either a kwargs dict which will be passed into the
            feature extractor function, or a callable which will be used to
            extract features.
        model_settings (dict): Settings specific to the model type specified.
        params (dict): Params to pass to the underlying classifier.
        param_selection (dict): Configuration for param selection (using cross
            validation). For example::

                {'type': 'shuffle',
                 'n': 3,
                 'k': 10,
                 'n_jobs': 2,
                 'scoring': '',
                 'grid': {}}
    """

    # The slot order also defines the key order of to_dict() and __repr__().
    __slots__ = [
        "model_type",
        "features",
        "model_settings",
        "params",
        "param_selection",
    ]

    def __init__(
        self,
        model_type=None,
        features=None,
        model_settings=None,
        params=None,
        param_selection=None,
    ):
        """Initializes a classifier configuration.

        Args:
            model_type (str): The name of the model type. Required, even though
                it carries a ``None`` default in the signature.
            features (dict, optional): Feature extractor configuration.
            model_settings (dict, optional): Settings specific to the model type.
            params (dict, optional): Params to pass to the underlying classifier.
            param_selection (dict, optional): Configuration for param selection.

        Raises:
            TypeError: If ``model_type`` is None.
        """
        if model_type is None:
            raise TypeError(
                "__init__() missing required argument {!r}".format("model_type")
            )
        self.model_type = model_type
        self.features = features
        self.model_settings = model_settings
        self.params = params
        self.param_selection = param_selection

    def to_dict(self):
        """Converts the classifier config object into a dict.

        Returns:
            (dict): A dict with one entry per slot, in slot order.
        """
        return {name: getattr(self, name) for name in self.__slots__}

    def __repr__(self):
        pairs = []
        for name in self.__slots__:
            pairs.append("{}={!r}".format(name, getattr(self, name)))
        return "{}({})".format(self.__class__.__name__, ", ".join(pairs))

    @classmethod
    def from_model_config(cls, model_config):
        """Creates a classifier config from a model config.

        The dict form of ``model_config`` must contain the keys ``example_type``,
        ``label_type``, ``train_label_set`` and ``test_label_set``; they are
        removed and every remaining entry is passed to the constructor as a
        keyword argument.

        Args:
            model_config: An object whose ``to_dict()`` returns the model
                configuration as a dict.

        Returns:
            ClassifierConfig: The classifier configuration.

        Raises:
            KeyError: If one of the four keys listed above is missing.
        """
        settings = model_config.to_dict()
        for model_only_key in (
            "example_type",
            "label_type",
            "train_label_set",
            "test_label_set",
        ):
            settings.pop(model_only_key)
        return cls(**settings)

    def to_json(self):
        """Converts the classifier config object to JSON.

        Returns:
            (str): JSON representation of the config, with keys sorted.
        """
        as_dict = self.to_dict()
        return json.dumps(as_dict, sort_keys=True)


class Classifier(ABC):
    """The base class for all the machine-learned classifiers in MindMeld.

    A classifier is a machine-learned model that categorizes input examples into one of the
    pre-determined class labels. Among other functionality, each classifier provides means by
    which to fit a statistical model on a given training dataset and then use the trained
    model to make predictions on new unseen data.

    Attributes:
        ready (bool): Whether the classifier is ready.
        dirty (bool): Whether the classifier has unsaved changes to its model.
        config (ClassifierConfig): The classifier configuration.
        hash (str): A hash representing the inputs into the model.
    """

    CLF_TYPE = None
    """Classifier type (`str`)."""

    def __init__(self, resource_loader):
        """Initializes a classifier in its empty, not-ready state.

        Args:
            resource_loader (ResourceLoader): An object which can load resources for the classifier
        """
        # The model stays None until it is fit or loaded.
        self._model = None
        self._resource_loader = resource_loader
        self.config = None
        self.hash = ""
        self.ready = False
        self.dirty = False

[docs]    def fit(self,
            queries=None,
            label_set=None,
            incremental_timestamp=None,
            load_cached=True,
            **kwargs):
        """Trains a statistical model for classification using the provided training examples and
        model configuration.

        Args:
            queries (list(ProcessedQuery) or ProcessedQueryList, optional): A list of queries
                 to train on. If not specified the queries will be loaded from the label_set.
            label_set (str): A label set to load. If not specified, the default
                 training set will be loaded.
            incremental_timestamp (str, optional): The timestamp folder to cache models in
            model_type (str, optional): The type of machine learning model to use. If omitted, the
                 default model type will be used.
            model_settings (dict): Settings specific to the model type specified
            features (dict): Features to extract from each example instance to form the feature
                 vector used for model training. If omitted, the default feature set for the model
                 type will be used.
            params (dict): Params to pass to the underlying classifier
            params_selection (dict): The grid of hyper-parameters to search, for finding the optimal
                 hyper-parameter settings for the model. If omitted, the default hyper-parameter
                 search grid will be used.
            param_selection (dict): Configuration for param selection (using cross-validation)
                {'type': 'shuffle',
                'n': 3,
                'k': 10,
                'n_jobs': 2,
                'scoring': '',
                'grid': { 'C': [100, 10000, 1000000]}}
            features (dict): The keys are the names of feature extractors and the
                values are either a kwargs dict which will be passed into the
                feature extractor function, or a callable which will be used as to
                extract features.
            load_cached (bool): If the model is cached on disk, load it into memory.

        Returns:
            True if model was loaded and fit, False if a valid cached model exists but was not
            loaded (controlled by the load_cached arg).

        Examples:
            Fit using default the configuration.

                >>> clf.fit()

            Fit using a 'special' label set.

                >>> clf.fit(label_set='special')

            Fit using given params, bypassing cross-validation. This is useful for speeding up
            train times if you are confident the params are optimized.

                >>> clf.fit(params={'C': 10000000})

            Fit using given parameter selection settings (also known as cross-validation settings).

                >>> clf.fit(param_selection={})

            Fit using a custom set of features, including a custom feature extractor.
            This is only for advanced users.

                >>> clf.fit(features={
                        'in-gaz': {}, // gazetteer features
                        'contrived': lambda exa, res: {'contrived': len(exa.text) == 26}
                    })
        """

        # create model with given params
        model_config = self._get_model_config(**kwargs)
        model = create_model(model_config)

        # resolve query set
        label_set = label_set or model_config.train_label_set or DEFAULT_TRAIN_SET_REGEX
        queries = self._resolve_queries(queries, label_set)

        new_hash = self._get_model_hash(model_config, queries)
        cached_model_path = self._resource_loader.hash_to_model_path.get(new_hash)

        if incremental_timestamp and cached_model_path:
            logger.info("No need to fit.  Previous model is cached.")
            if load_cached:
                self.load(cached_model_path)
                return True
            return False

        examples, labels = self._get_examples_and_labels(queries)

        if not examples:
            logger.warning(
                "Could not fit model since no relevant examples were found. "
                'Make sure the labeled queries for training are placed in "%s" '
                "files in your MindMeld project.",
                label_set,
            )
            return True
        num_labels = len(set(labels))
        if num_labels <= 1:
            phrase = ["are no classes", "is only one class"][num_labels]
            logger.info("Not doing anything for fit since there %s.", phrase)
            return True

        model.initialize_resources(self._resource_loader, examples, labels)
        model.fit(examples, labels)
        self._model = model
        self.config = ClassifierConfig.from_model_config(self._model.config)
        self.hash = new_hash

        self.ready = True
        self.dirty = True
        return True

    def _resolve_queries(self, queries=None, label_set=None):
        """
        Resolve queries and/or label_set into a ProcessedQueryList.
        queries is preferred over label_set.

        Args:
            queries (ProcessedQueryList or list(ProcessedQuery): A set of queries to use for
                the operation.
            label_set (str): The label set to load queries from

        Returns:
            ProcessedQueryList: The set of queries
        """
        if not queries:
            queries = self._get_queries_from_label_set(label_set)
        elif not isinstance(queries, ProcessedQueryList):
            queries = ProcessedQueryList.from_in_memory_list(queries)
        return queries

[docs]    def predict(self, query, time_zone=None, timestamp=None, dynamic_resource=None):
        """Predicts a class label for the given query using the trained classification model

        Args:
            query (Query or str): The input query
            time_zone (str, optional): The name of an IANA time zone, such as
                'America/Los_Angeles', or 'Asia/Kolkata'
                See the [tz database](https://www.iana.org/time-zones) for more information.
            timestamp (long, optional): A unix time stamp for the request (in seconds).
            dynamic_resource (dict, optional): A dynamic resource to aid NLP inference

        Returns:
            str: The predicted class label
        """
        if not self._model:
            logger.error("You must fit or load the model before running predict")
            return None
        if not isinstance(query, Query):
            query = self._resource_loader.query_factory.create_query(
                query, time_zone=time_zone, timestamp=timestamp
            )
        return self._model.predict([query], dynamic_resource=dynamic_resource)[0]

[docs]    def predict_proba(
        self, query, time_zone=None, timestamp=None, dynamic_resource=None
    ):
        """Runs prediction on a given query and generates multiple hypotheses with their
        associated probabilities using the trained classification model

        Args:
            query (Query): The input query
            time_zone (str, optional): The name of an IANA time zone, such as
                'America/Los_Angeles', or 'Asia/Kolkata'
                See the [tz database](https://www.iana.org/time-zones) for more information.
            timestamp (long, optional): A unix time stamp for the request (in seconds).
            dynamic_resource (dict, optional):  A dynamic resource to aid NLP inference

        Returns:
            list: a list of tuples of the form (str, float) grouping predicted class labels and \
                their probabilities
        """
        if not self._model:
            logger.error("You must fit or load the model before running predict_proba")
            return []
        if not isinstance(query, Query):
            query = self._resource_loader.query_factory.create_query(
                query, time_zone=time_zone, timestamp=timestamp
            )

        predict_proba_result = self._model.predict_proba(
            [query], dynamic_resource=dynamic_resource
        )
        class_proba_tuples = list(predict_proba_result[0][1].items())
        return sorted(class_proba_tuples, key=lambda x: x[1], reverse=True)

[docs]    def evaluate(self, queries=None, label_set=None, fetch_distribution=False):
        """Evaluates the trained classification model on the given test data

        Args:
            queries (Optional(list(ProcessedQuery))): optional list of queries to evaluate
            label_set (str): The label set to use for evaluation.

        Returns:
            ModelEvaluation: A ModelEvaluation object that contains evaluation results
        """
        if not self._model:
            logger.error("You must fit or load the model before running evaluate.")
            return None

        model_config = self._get_model_config()
        label_set = label_set or model_config.test_label_set or DEFAULT_TEST_SET_REGEX
        queries = self._resolve_queries(queries, label_set)

        examples, labels = self._get_examples_and_labels(queries)

        if not examples:
            logger.info(
                "Could not evaluate model since no relevant examples were found. Make sure "
                'the labeled queries for evaluation are placed in "%s" files '
                "in your MindMeld project.",
                label_set,
            )
            return None

        # enables fetching probability distribution for entity recognizer
        kwargs = {}
        if self.config.model_type == 'tagger':
            kwargs["fetch_distribution"] = fetch_distribution

        evaluation = self._model.evaluate(examples, labels, **kwargs)
        return evaluation

[docs]    def inspect(self, query, gold_label=None, dynamic_resource=None):
        raise NotImplementedError

[docs]    def view_extracted_features(
        self, query, time_zone=None, timestamp=None, dynamic_resource=None
    ):
        """Extracts features for the given input based on the model config.

        Args:
            query (Query or str): The input query
            time_zone (str, optional): The name of an IANA time zone, such as \
                'America/Los_Angeles', or 'Asia/Kolkata' \
                See the [tz database](https://www.iana.org/time-zones) for more information.
            timestamp (long, optional): A unix time stamp for the request (in seconds).
            dynamic_resource (dict): Dynamic gazetteer to be included for feature extraction.

        Returns:
            dict: The extracted features from the given input
        """
        if not self._model:
            logger.error("You must fit or load the model to initialize resources")
            return None
        if not isinstance(query, Query):
            query = self._resource_loader.query_factory.create_query(
                query, time_zone=time_zone, timestamp=timestamp
            )
        return self._model.view_extracted_features(query, dynamic_resource)

    @staticmethod
    def _get_model_config(loaded_config=None, **kwargs):
        """Updates the loaded configuration with runtime specified options, and creates a model
        configuration object with the final configuration dictionary. If an application config
        exists it should be passed in, if not the default config should be passed in.

        The configuration is resolved in two steps. First ``ModelConfig`` is built from the
        runtime options alone. If that raises TypeError or ValueError, the loaded
        configuration is used as the base and the runtime options are applied on top of it.
        When both sides carry a ``params`` dict, the runtime params are merged over the
        loaded params instead of replacing them.

        Args:
            loaded_config (dict, optional): The application specified or default
                configuration. It is never modified; a warning is logged if it is missing
                or empty and the fallback step is needed.
            **kwargs: Runtime specified configuration options.

        Returns:
            ModelConfig: The model configuration corresponding to the provided config name
        """
        # Work on a shallow copy so the caller's dict is left untouched, and so that a
        # missing (None) loaded_config behaves like an empty one instead of crashing.
        base_config = dict(loaded_config) if loaded_config else {}

        # Only merge when both sides really are dicts; an explicit None on either side
        # cannot be unpacked and is left as-is.
        loaded_params = base_config.get("params")
        runtime_params = kwargs.get("params")
        if isinstance(loaded_params, dict) and isinstance(runtime_params, dict):
            kwargs["params"] = {**loaded_params, **runtime_params}

        try:
            # If all params required for model config were passed in, use kwargs
            return ModelConfig(**kwargs)
        except (TypeError, ValueError):
            # Use application specified or default config, customizing with provided kwargs
            if not loaded_config:
                logger.warning("loaded_config is not passed in")
            base_config.update(kwargs)

        return ModelConfig(**base_config)

[docs]    def dump(self, model_path, incremental_model_path=None):
        """Persists the trained classification model to disk.

        Args:
            model_path (str): The location on disk where the model should be stored.
            incremental_model_path (str, optional): The timestamp folder where the cached
                models are stored.
        """
        for path in [model_path, incremental_model_path]:
            if not path:
                continue

            # classifier specific dump
            self._dump(path)

            # model specific dump
            if self._model:
                # sometimes a model might be NoneType, eg. in role classifiers, in which case,
                # no dumping is required. While loading such models, the model_path (.pkl)
                # will not be found and the helpers.load_model() will return None, which makes it
                # backwards compatible to loading a NoneType model
                self._model.dump(path)

            hash_path = path + ".hash"
            os.makedirs(os.path.dirname(hash_path), exist_ok=True)
            with open(hash_path, "w") as hash_file:
                hash_file.write(self.hash)

            if path == model_path:
                self.dirty = False

    def _dump(self, path):
        pass

    @staticmethod
    def _get_classifier_resources_save_path(model_path):
        head, ext = os.path.splitext(model_path)
        classifier_resources_save_path = head + ".classifier_resources" + ext
        os.makedirs(os.path.dirname(classifier_resources_save_path), exist_ok=True)
        return classifier_resources_save_path

[docs]    def unload(self):
        """
        Unloads the model from memory. This helps reduce memory requirements while
        training other models.
        """
        self._model = None
        self.config = None
        self.ready = False

[docs]    def load(self, model_path):
        """Loads the trained classification model from disk

        Args:
            model_path (str): The location on disk where the model is stored
        """

        self._model = load_model(model_path)

        # validate and initialize resources
        if self._model is not None:
            if not hasattr(self._model, "mindmeld_version"):
                msg = (
                    "Your trained models are incompatible with this version of MindMeld. "
                    "Please run a clean build to retrain models"
                )
                raise ClassifierLoadError(msg)

            try:
                self._model.config.to_dict()
            except AttributeError:
                # Loaded model config is incompatible with app config.
                self._model.config.resolve_config(self._get_model_config())

            self._model.initialize_resources(self._resource_loader)
            self.config = ClassifierConfig.from_model_config(self._model.config)

        self.hash = self._load_hash(model_path)

        self.ready = True
        self.dirty = False

    @staticmethod
    def _load_hash(model_path):
        hash_path = model_path + ".hash"
        if not os.path.isfile(hash_path):
            return ""
        with open(hash_path, "r") as hash_file:
            model_hash = hash_file.read()
        return model_hash

    @abstractmethod
    def _get_queries_from_label_set(self, label_set=DEFAULT_TRAIN_SET_REGEX):
        """Loads the queries that belong to the given label set.

        Must be implemented by subclasses; the base implementation always raises.

        Args:
            label_set (str, optional): A label set to load. If not specified,
                the default training set will be loaded.

        Returns:
            ProcessedQueryList: The queries of the label set
        """
        raise NotImplementedError("Subclasses must implement this method")

    @abstractmethod
    def _get_examples_and_labels_hash(self, queries):
        """Returns a hashed string representing the labeled queries

        Args:
            queries (ProcessedQueryList): The queries used to fit this model
        """
        raise NotImplementedError("Subclasses must implement this method")

    @abstractmethod
    def _get_examples_and_labels(self, queries):
        """Extracts examples and lables extracted from the queries

        Args:
            queries (ProcessedQueryList): The queries to extract examples and lables from

        Returns:
            tuple(ProcessedQueryList.Iterator(Any),
                  ProcessedQueryList.Iterator(Any)): A tuple of iterators
                [0]: the examples, [1]: the labels
        """

    def _get_model_hash(self, model_config, queries):
        """Returns a hash representing the inputs into the model

        Args:
            model_config (ModelConfig): The model configuration
            queries (ProcessedQueryList): The queries used to fit this model

        Returns:
            str: The hash
        """

        # Hash queries
        queries_hash = self._get_examples_and_labels_hash(queries)

        # Hash config
        config_hash = self._resource_loader.hash_string(model_config.to_json())

        # Hash resources
        rsc_strings = []
        for resource in sorted(model_config.required_resources()):
            rsc_strings.append(self._resource_loader.hash_feature_resource(resource))
        rsc_hash = self._resource_loader.hash_list(rsc_strings)

        return self._resource_loader.hash_list([queries_hash, config_hash, rsc_hash])

    def __repr__(self):
        msg = "<{} ready: {!r}, dirty: {!r}>"
        return msg.format(self.__class__.__name__, self.ready, self.dirty)