# -*- coding: utf-8 -*-
#
# Copyright (c) 2015 Cisco Systems, Inc. and others. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module contains the CRF entity recognizer.
"""
from distutils.util import strtobool
import logging
import os
import numpy as np
from .taggers import Tagger, extract_sequence_features
from .pytorch_crf import CRFModel
logger = logging.getLogger(__name__)
ZERO = 1e-20
STORE_CRF_FEATURES_IN_MEMORY = bool(
strtobool(os.environ.get("MM_CRF_FEATURES_IN_MEMORY", "1").lower())
)
class CRFTagger(Tagger):
    """A Conditional Random Fields model."""

    def fit(self, X, y):
        """Train the underlying CRF model and return self for chaining."""
        self._clf.fit(X, y)
        return self

    # TODO: Refactor to move initialization into init() or setup_model()
    def set_params(self, **parameters):
        """Build a fresh CRF model configured with the given parameters."""
        self._clf = CRFModel()
        self._clf.set_params(**parameters)
        return self

    def get_params(self, deep=True):
        """Return the parameters of the underlying CRF model."""
        return self._clf.get_params()

    def predict(self, X, dynamic_resource=None):
        """Predict tag sequences for the given feature sequences.

        NOTE(review): ``dynamic_resource`` is accepted for interface parity
        with other taggers but is not used here.
        """
        return self._clf.predict(X)

    def predict_proba(self, examples, config, resources):
        """
        Args:
            examples (list of mindmeld.core.Query): a list of queries to predict on
            config (ModelConfig): The ModelConfig which may contain information used for feature
                extraction
            resources (dict): Resources which may be used for this model's feature extraction

        Returns:
            list of tuples of (mindmeld.core.QueryEntity): a list of predicted labels \
                with confidence scores
        """
        features, _, _ = self.extract_features(examples, config, resources, in_memory=True)
        tag_seqs = self._clf.predict(features)
        marginals = self._clf.predict_marginals(features)
        # Pair every predicted tag with its own marginal probability.
        return [
            [[tag, marginals[q][t][tag]] for t, tag in enumerate(tag_seq)]
            for q, tag_seq in enumerate(tag_seqs)
        ]

    def predict_proba_distribution(self, examples, config, resources):
        """
        Args:
            examples (list of mindmeld.core.Query): a list of queries to predict on
            config (ModelConfig): The ModelConfig which may contain information used for feature
                extraction
            resources (dict): Resources which may be used for this model's feature extraction

        Returns:
            list of list of ((list of str) and (list of float)): a list of predicted labels \
                with confidence scores
        """
        features, _, _ = self.extract_features(examples, config, resources, in_memory=True)
        tag_seqs = self._clf.predict(features)
        marginals = self._clf.predict_marginals(features)
        token_tags = []
        token_scores = []
        # Flatten the per-token marginal distributions across all queries.
        for q, tag_seq in enumerate(tag_seqs):
            for t in range(len(tag_seq)):
                distribution = marginals[q][t]
                token_tags.append(list(distribution.keys()))
                token_scores.append(list(distribution.values()))
        return [[token_tags, token_scores]]

    def _preprocess_data(self, X, fit=False):
        """Converts data into formats of CRF suite.

        Args:
            X (list of list of dict): features of an example
            fit (bool, optional): True if processing data at fit time, false for predict time.

        Returns:
            (list of list of str): features in CRF suite format
        """
        if fit:
            self._feat_binner.fit(X)
        # Mirror the input's container type: a plain list for in-memory data,
        # a FileBackedList (generator-backed) for disk-based data.
        transformed = X.__class__()
        for feature_seq in self._feat_binner.transform(X):
            transformed.append(feature_seq)
        return transformed

    def setup_model(self, config):
        """Prepare the feature binner used during preprocessing."""
        self._feat_binner = FeatureBinner()

    @property
    def is_serializable(self):
        return False

    def dump(self, path):
        """Persist the best model weights alongside ``path``."""
        weights_path = os.path.join(os.path.split(path)[0], "best_crf_wts.pt")
        self._clf.save_best_weights_path(weights_path)

    def load(self, path):
        """Rebuild model parameters from the encoder and load saved weights."""
        weights_path = os.path.join(os.path.split(path)[0], "best_crf_wts.pt")
        self._clf.build_params(*self.get_torch_encoder().get_feats_and_classes())
        self._clf.load_best_weights_path(weights_path)

    def get_torch_encoder(self):
        return self._clf.get_encoder()

    def set_torch_encoder(self, encoder):
        self._clf.set_encoder(encoder)
# Feature extraction for CRF
class FeatureMapper:
    """
    Mapper for one feature to map numerical values to corresponding bins which are generated
    by the mean and standard deviation of this feature.

    The size and number of bins are decided by num_std and size_std. For example, say
    num_std = 2 and size_std = 0.5, then the bins would look like:
        * bucket 0: (-INF, mean - std * 2)
        * bucket 1: [mean - std * 2, mean - std * 1.5)
        * bucket 2: [mean - std * 1.5, mean - std * 1)
        * ...
        * bucket 8: [mean + std * 1.5, mean + std * 2)
        * bucket 9: [mean + std * 2, INF)

    Attributes:
        _num_std (int): number of standard deviations to generate the bins
        _size_std (float): size of each bin in standard deviation
    """

    def __init__(self, num_std=2, size_std=0.5):
        self._num_std = num_std
        self._size_std = size_std
        self.values = []     # raw values observed before fit()
        self.std = None
        self.mean = None
        self._std_bins = []  # ascending bin edges built by fit()

    def add_value(self, value):
        """Collect values for this feature.

        Args:
            value (numeric): A numeric value
        """
        self.values.append(value)

    def fit(self):
        """Calculate statistics and then create the bins."""
        if not self.values:
            # No observations: leave the edge list empty so map_bucket()
            # sends every value to bucket 0, instead of computing nan
            # statistics (np.std/np.mean of [] warn and return nan).
            self.std = 0.0
            self.mean = 0.0
            self._std_bins = np.array([])
            return
        self.std = np.std(self.values)
        self.mean = np.mean(self.values)
        edge = self.mean - self.std * self._num_std
        num_bin = 2 * int(self._num_std / self._size_std)
        edges = [edge]
        # A (near-)zero std would produce duplicated, meaningless edges;
        # 1e-20 is the module-level ZERO epsilon, inlined here.
        while num_bin > 0 and self.std > 1e-20:
            edge += self.std * self._size_std
            edges.append(edge)
            num_bin -= 1
        self._std_bins = np.array(edges)

    def map_bucket(self, value):
        """
        Get corresponding bucket number for this value.

        side="right" makes every bucket the half-open interval
        [edge, next_edge): a value exactly equal to a bin edge falls into
        the bucket *above* the edge, matching the intervals documented on
        the class. (The previous searchsorted default, side="left", put
        edge values one bucket too low.)

        Args:
            value (float): numerical value of this feature

        Returns:
            int: bucket index between 0 and len(edges)
        """
        return int(np.searchsorted(self._std_bins, value, side="right"))
class FeatureBinner:
    """
    Class to convert features with numerical values to categorical values.
    """

    def __init__(self):
        # Maps feature name -> FeatureMapper fitted for that feature.
        self.features = {}

    def fit(self, X_train):
        """
        Create and fit FeatureMapper for numerical features.

        Args:
            X_train (list of list of dict): training data
        """
        for sentence in X_train:
            for word in sentence:
                for feat_name, feat_value in word.items():
                    self._collect_feature(feat_name, feat_value)
        for mapper in self.features.values():
            mapper.fit()

    def _collect_feature(self, feat_name, feat_value):
        """
        Collect numerical feature values to fit corresponding mapper.

        Args:
            feat_name (str): feature name
            feat_value (any): feature value
        """
        try:
            feat_value = float(feat_value)
        except (ValueError, TypeError):
            # Skip collection of non numerical features. TypeError covers
            # values such as None or containers, which float() rejects with
            # TypeError rather than ValueError.
            return
        mapper = self.features.setdefault(feat_name, FeatureMapper())
        mapper.feat_name = feat_name
        mapper.add_value(feat_value)

    def _map_feature(self, feat_name, feat_value):
        """
        Map numerical feature values to categorical values.

        Args:
            feat_name (str): feature name
            feat_value (any): feature value

        Returns:
            dict: single-entry dict of feature name to the (possibly
                bucketed) feature value
        """
        try:
            numeric_value = float(feat_value)
        except (ValueError, TypeError):
            # Non numerical features pass through unchanged (TypeError covers
            # None and other values float() cannot cast).
            return {feat_name: feat_value}
        mapper = self.features.get(feat_name)
        if mapper is None:
            # No mapper was fitted for this feature; keep the numeric value.
            return {feat_name: numeric_value}
        return {feat_name: mapper.map_bucket(numeric_value)}