# -*- coding: utf-8 -*-
#
# Copyright (c) 2015 Cisco Systems, Inc. and others. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module contains the MEMM (maximum-entropy Markov model) entity recognizer.
"""
import logging
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectFromModel, SelectPercentile
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder as SKLabelEncoder
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from .taggers import START_TAG, Tagger, extract_sequence_features
# Module-level logger, namespaced to this module per project convention.
logger = logging.getLogger(__name__)
class MemmModel(Tagger):
    """A maximum-entropy Markov model (MEMM) entity tagger.

    Tag sequences are decoded greedily left-to-right: each segment's feature
    dict is augmented with the previously predicted tag (``"prev_tag"``,
    starting at ``START_TAG``) so the underlying logistic-regression
    classifier conditions on the running tag history.
    """

    @staticmethod
    def _predict_proba(X):
        # Intentionally a no-op: sequence-aware probabilities are produced
        # by predict_proba() below, which threads prev_tag through segments.
        del X

    @staticmethod
    def load(model_path):
        # Intentionally a no-op: this tagger keeps no extra on-disk
        # artifacts beyond what the framework serializes.
        del model_path

    def fit(self, X, y):
        """Fit the underlying classifier.

        Args:
            X: Preprocessed (vectorized, optionally scaled/selected) feature
                matrix.
            y: Encoded tag labels.

        Returns:
            MemmModel: This instance, for call chaining.
        """
        self._clf.fit(X, y)
        return self

    def set_params(self, **parameters):
        """Replace the underlying classifier with a fresh LogisticRegression
        configured with ``parameters``.

        Returns:
            MemmModel: This instance, for call chaining.
        """
        self._clf = LogisticRegression()
        self._clf.set_params(**parameters)
        return self

    def get_params(self, deep=True):
        """Return the underlying classifier's parameters.

        ``deep`` is accepted for scikit-learn estimator API compatibility
        but is not forwarded to the classifier.
        """
        return self._clf.get_params()

    def predict(self, X, dynamic_resource=None):
        """Predict encoded tag ids for a preprocessed feature matrix.

        ``dynamic_resource`` is accepted for interface compatibility with
        other taggers and is ignored here.
        """
        return self._clf.predict(X)

    def extract_and_predict(self, examples, config, resources):
        """Extract features for each example and greedily decode its tags."""
        return [
            self._predict_example(example, config, resources) for example in examples
        ]

    def _predict_example(self, example, config, resources):
        """Greedily decode the tag sequence for a single example."""
        features_by_segment = self.extract_example_features(example, config, resources)
        if not features_by_segment:
            return []
        predicted_tags = []
        prev_tag = START_TAG
        for features in features_by_segment:
            # Condition each segment on the previously predicted tag.
            features["prev_tag"] = prev_tag
            X, _ = self._preprocess_data([features])
            prediction = self.predict(X)
            predicted_tag = self.class_encoder.inverse_transform(prediction)[0]
            predicted_tags.append(predicted_tag)
            prev_tag = predicted_tag
        return predicted_tags

    def predict_proba(self, examples, config, resources):
        """Return, per example, each greedily chosen tag paired with its
        classifier probability."""
        return [
            self._predict_proba_example(example, config, resources)
            for example in examples
        ]

    def _predict_proba_example(self, example, config, resources):
        """Decode one example, pairing each predicted tag with its
        probability under the classifier."""
        features_by_segment = self.extract_example_features(example, config, resources)
        if not features_by_segment:
            return []
        prev_tag = START_TAG
        seq_log_probs = []
        for features in features_by_segment:
            features["prev_tag"] = prev_tag
            X, _ = self._preprocess_data([features])
            prediction = self._clf.predict_proba(X)[0]
            predicted_tag = np.argmax(prediction)
            prev_tag = self.class_encoder.inverse_transform([predicted_tag])[0]
            seq_log_probs.append([prev_tag, prediction[predicted_tag]])
        return seq_log_probs

    def predict_proba_distribution(self, examples, config, resources):
        """Return, per example, the full per-segment tag probability
        distributions."""
        return [
            self._predict_proba_distribution_example(example, config, resources)
            for example in examples
        ]

    def _predict_proba_distribution_example(self, example, config, resources):
        """Decode one example, collecting each segment's full class
        probability vector and the tag label for every class index.

        Returns:
            list: ``[tag_maps, predictions]`` where ``tag_maps[i]`` maps the
            class indices of ``predictions[i]`` to tag labels.
        """
        features_by_segment = self.extract_example_features(example, config, resources)
        if not features_by_segment:
            return []
        prev_tag = START_TAG
        predictions = []
        tag_maps = []
        for features in features_by_segment:
            features["prev_tag"] = prev_tag
            X, _ = self._preprocess_data([features])
            prediction = self._clf.predict_proba(X)[0]
            predictions.append(list(prediction))
            tag_maps.append(
                [
                    self.class_encoder.inverse_transform([i])[0]
                    for i in range(len(prediction))
                ]
            )
            # Advance the tag history with the argmax tag, mirroring
            # _predict_example / _predict_proba_example. Previously prev_tag
            # stayed stuck at START_TAG for every segment.
            prev_tag = self.class_encoder.inverse_transform(
                [int(np.argmax(prediction))]
            )[0]
        return [tag_maps, predictions]

    @staticmethod
    def _get_feature_selector(selector_type):
        """Get a feature selector instance based on the feature_selector model
        parameter.

        Args:
            selector_type (str): ``"l1"`` for L1-regularized model-based
                selection, ``"f"`` for percentile-based univariate selection,
                or None/unknown for no selection.

        Returns:
            (Object): A feature selector which returns a reduced feature matrix, \
            given the full feature matrix, X and the class labels, y; None when \
            no selector is configured.
        """
        return {
            "l1": SelectFromModel(
                LogisticRegression(penalty="l1", C=1, solver="liblinear")
            ),
            "f": SelectPercentile(),
        }.get(selector_type)

    @staticmethod
    def _get_feature_scaler(scale_type):
        """Get a feature value scaler based on the model settings.

        ``with_mean=False`` keeps StandardScaler compatible with the sparse
        matrices produced by DictVectorizer.
        """
        return {
            "std-dev": StandardScaler(with_mean=False),
            "max-abs": MaxAbsScaler(),
        }.get(scale_type)

    def setup_model(self, config):
        """Initialize the label encoder, feature vectorizer, and the optional
        feature scaler/selector from ``config.model_settings``."""
        if config.model_settings is None:
            selector_type = None
            scale_type = None
        else:
            selector_type = config.model_settings.get("feature_selector")
            scale_type = config.model_settings.get("feature_scaler")
        self.class_encoder = SKLabelEncoder()
        self.feat_vectorizer = DictVectorizer()
        self._feat_selector = self._get_feature_selector(selector_type)
        self._feat_scaler = self._get_feature_scaler(scale_type)

    def _preprocess_data(self, X, y=None, fit=False):
        """Vectorize (and optionally scale/select) per-segment feature dicts.

        Args:
            X (list[dict]): Feature dicts, one per segment.
            y (list, optional): Tag labels; used (and required) only when
                ``fit`` is True.
            fit (bool): When True, fit the encoder/vectorizer/scaler/selector
                on this data; otherwise only transform with already-fitted
                components.

        Returns:
            tuple: ``(X, y)`` — the transformed feature matrix and the
            encoded labels (or the ``y`` passed in when ``fit`` is False).
        """
        if fit:
            y = self.class_encoder.fit_transform(y)
            X = self.feat_vectorizer.fit_transform(X)
            if self._feat_scaler is not None:
                X = self._feat_scaler.fit_transform(X)
            if self._feat_selector is not None:
                X = self._feat_selector.fit_transform(X, y)
        else:
            X = self.feat_vectorizer.transform(X)
            if self._feat_scaler is not None:
                X = self._feat_scaler.transform(X)
            if self._feat_selector is not None:
                X = self._feat_selector.transform(X)
        return X, y