Source code for mindmeld.markup

# -*- coding: utf-8 -*-
#
# Copyright (c) 2015 Cisco Systems, Inc. and others.  All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""The markup module contains functions for interacting with the MindMeld Markup language for
representing annotations of query text inline.
"""
import codecs
import csv
import logging
import sys

from .constants import SPACY_SYS_ENTITIES_NOT_IN_DUCKLING
from .core import Entity, NestedEntity, ProcessedQuery, QueryEntity, Span
from .exceptions import MarkupError, SystemEntityMarkupError, SystemEntityResolutionError
from .query_factory import QueryFactory

logger = logging.getLogger(__name__)

ENTITY_START = "{"
ENTITY_END = "}"
GROUP_START = "["
GROUP_END = "]"
META_SPLIT = "|"

START_CHARACTERS = frozenset({ENTITY_START, GROUP_START})
END_CHARACTERS = frozenset({ENTITY_END, GROUP_END})
SPECIAL_CHARACTERS = frozenset(
    {ENTITY_START, ENTITY_END, GROUP_START, GROUP_END, META_SPLIT}
)
TIME_FORMAT = "%Y%m%dT%H%M%S"


MINDMELD_FORMAT = "mindmeld"
BRAT_FORMAT = "brat"
MARKUP_FORMATS = frozenset({MINDMELD_FORMAT, BRAT_FORMAT})
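

# An informal sketch of the markup grammar assembled from the characters above
# (the texts and entity types in these examples are illustrative, not taken
# from a real app):
#
#   {text|entity_type}                   an entity
#   {text|entity_type|role}              an entity with a role
#   [{head|type} ... {dep|type2}|type]   an entity group, labeled with the
#                                        type of its head entity
#
# e.g. "play {Hotel California|track} by {the Eagles|artist}"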


def load_query(
    markup,
    query_factory=None,
    app_path=None,
    domain=None,
    intent=None,
    is_gold=False,
    query_options=None,
):
    """Creates a processed query object from marked up query text.

    Args:
        markup (str): The marked up query text.
        query_factory (QueryFactory, optional): An object which can create queries.
        app_path (str, optional): The dir path of the application.
        domain (str, optional): The name of the domain annotated for the query.
        intent (str, optional): The name of the intent annotated for the query.
        is_gold (bool, optional): True if the markup passed in is a reference,
            human-labeled example. Defaults to False.
        query_options (dict, optional): A dict containing options for creating
            a Query, such as `language`, `time_zone` and `timestamp`.

    Returns:
        ProcessedQuery: a processed query
    """
    query_factory = query_factory or QueryFactory.create_query_factory(app_path)
    query_options = query_options or {}

    _, query, entities = process_markup(
        markup, query_factory=query_factory, query_options=query_options
    )
    return ProcessedQuery(
        query, domain=domain, intent=intent, entities=entities, is_gold=is_gold
    )
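

# A minimal usage sketch for load_query. The app path, domain, intent, and
# entity types below are hypothetical; non-system entity types must be defined
# by the app at the given path:
#
#   pq = load_query(
#       "play {Hotel California|track} by {the Eagles|artist}",
#       app_path="path/to/my_app",
#       domain="music",
#       intent="play_song",
#       is_gold=True,
#   )
#   # pq is a ProcessedQuery; pq.entities holds one QueryEntity per
#   # annotation, sorted by span start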


def cache_query_file(
    file_path,
    query_cache,
    query_factory=None,
    app_path=None,
    domain=None,
    intent=None,
    is_gold=False,
):
    """Loads the specified query file into the query cache.

    Args:
        file_path (str): The path of the file to load.
        query_cache (QueryCache): A cache to hold the query objects.
        query_factory (QueryFactory, optional): An object which can create queries.
        app_path (str): The app path.
        domain (str, optional): The name of the domain annotated for the query.
        intent (str, optional): The name of the intent annotated for the query.
        is_gold (bool, optional): True if the markup passed in is a reference,
            human-labeled example. Defaults to False.

    Returns:
        list: The row ids of the cached queries.
    """
    query_factory = query_factory or QueryFactory.create_query_factory(app_path)
    query_ids = []
    for query_text in read_query_file(file_path):
        # skip queries prefixed with "-"
        if query_text[0] == "-":
            continue
        key = query_cache.get_key(domain, intent, query_text)
        row_id = query_cache.key_to_row_id(key)
        if not row_id:
            query = load_query(
                query_text,
                query_factory=query_factory,
                domain=domain,
                intent=intent,
                is_gold=is_gold,
            )
            row_id = query_cache.put(key, query)
        query_ids.append(row_id)
    return query_ids


def mark_down_file(file_path):
    """Reads all annotated queries from the input file and removes the annotations.

    Args:
        file_path (str): The path of the file to load.

    Yields:
        str: marked down query text for each line
    """
    for markup in read_query_file(file_path):
        yield mark_down(markup)


def read_query_file(file_path):
    """Reads the query text from each line of the given query file.

    Args:
        file_path (str): The path of the file to load.

    Yields:
        str: query text for each line
    """
    try:
        with codecs.open(file_path, encoding="utf-8") as queries_file:
            for line in queries_file:
                line = line.strip()
                # only yield the query if the line is not an empty string
                query_text = line.split("\t")[0].strip()
                if query_text:
                    yield query_text
    except IOError:
        logger.error("Problem reading file %s.", file_path)
        yield from ()


def bootstrap_query_file(input_file, output_file, nlp, **kwargs):
    """Applies predicted annotations to a file of text queries.

    Args:
        input_file (str): filename of queries to be processed
        output_file (str or None): filename for processed queries; falls back
            to stdout when None
        nlp (NaturalLanguageProcessor): an application's NLP with built models
        **kwargs: flags controlling which columns are generated
    """
    show_confidence = kwargs.get("confidence")
    with (open(output_file, "w") if output_file else sys.stdout) as csv_file:
        field_names = ["query"]
        if not kwargs.get("no_domain"):
            field_names.append("domain")
            if show_confidence:
                field_names.append("domain_conf")
        if not kwargs.get("no_intent"):
            field_names.append("intent")
            if show_confidence:
                field_names.append("intent_conf")
        if show_confidence and not kwargs.get("no_entity"):
            field_names.append("entity_conf")
        if show_confidence and not kwargs.get("no_role"):
            field_names.append("role_conf")
        csv_output = csv.DictWriter(csv_file, field_names, dialect=csv.excel_tab)
        csv_output.writeheader()
        for raw_query in mark_down_file(input_file):
            proc_query = nlp.process_query(nlp.create_query(raw_query), verbose=True)
            csv_row = bootstrap_query_row(proc_query, show_confidence, **kwargs)
            csv_output.writerow(csv_row)


def bootstrap_query_row(proc_query, show_confidence, **kwargs):
    """Produces predicted annotation values and confidences for a single query.

    Args:
        proc_query (ProcessedQuery): a labeled query
        show_confidence (bool): whether to generate confidence columns
        **kwargs: flags indicating which columns to generate

    Returns:
        dict: a single row of csv output
    """
    marked_up_query = dump_query(proc_query, **kwargs)
    csv_row = {"query": marked_up_query}
    if not kwargs.get("no_domain"):
        csv_row["domain"] = proc_query.domain
        if show_confidence:
            csv_row["domain_conf"] = proc_query.confidence["domains"][proc_query.domain]
    if not kwargs.get("no_intent"):
        csv_row["intent"] = proc_query.intent
        if show_confidence:
            csv_row["intent_conf"] = proc_query.confidence["intents"][proc_query.intent]
    if show_confidence and not kwargs.get("no_entity"):
        # the least confident entity prediction, or 1.0 when there are no entities
        csv_row["entity_conf"] = min(
            [max(e.values()) for e in proc_query.confidence["entities"]] + [1.0]
        )
    if show_confidence and not kwargs.get("no_role"):
        # the least confident role prediction, or 1.0 when no entity has roles
        csv_row["role_conf"] = min(
            [max(r.values()) for r in proc_query.confidence["roles"] if r] + [1.0]
        )
    return csv_row


def process_markup(markup, query_factory, query_options):
    """Converts marked up query text into a Query object and its annotated entities.

    Args:
        markup (str): The markup string to process.
        query_factory (QueryFactory): The factory used to construct Query objects.
        query_options (dict): A dictionary containing options for language,
            time_zone and timestamp.

    Returns:
        (str, Query, tuple): A tuple of the raw text, the Query object for the
            text, and the entities (QueryEntity) annotated in the text.
    """
    try:
        raw_text, annotations = _parse_tokens(_tokenize_markup(markup))
        query = query_factory.create_query(raw_text, **query_options)
        entities = _process_annotations(
            query,
            annotations,
            query_factory.system_entity_recognizer,
        )
    except (MarkupError, IndexError) as exc:
        msg = "Invalid markup in query {!r}: {}"
        raise MarkupError(msg.format(markup, exc)) from exc
    except SystemEntityResolutionError as exc:
        msg = "Unable to load query {!r}: {}"
        raise SystemEntityMarkupError(msg.format(markup, exc)) from exc
    return raw_text, query, entities
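

# process_markup is the workhorse behind load_query. A sketch, assuming an app
# with a system entity recognizer configured (the path and options here are
# hypothetical):
#
#   factory = QueryFactory.create_query_factory("path/to/my_app")
#   raw_text, query, entities = process_markup(
#       "cancel my {7 pm|sys_time} alarm", factory, {"time_zone": "UTC"}
#   )
#   # raw_text == "cancel my 7 pm alarm"; the sys_time span is resolved to a
#   # value by the system entity recognizer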


def _process_annotations(query, annotations, system_entity_recognizer):
    """
    Args:
        query (Query)
        annotations (list)
        system_entity_recognizer (SystemEntityRecognizer)

    Returns:
        tuple of QueryEntity: the annotated entities, sorted by span start
    """
    stack = []

    def _close_ann(ann, entities):
        if ann["ann_type"] == "group":
            try:
                head = ann["head"]
            except KeyError as exc:
                msg = "Group between {} and {} missing head".format(
                    ann["start"], ann["end"]
                )
                raise MarkupError(msg) from exc
            try:
                children = ann["children"]
            except KeyError as exc:
                msg = "Group between {} and {} missing children".format(
                    ann["start"], ann["end"]
                )
                raise MarkupError(msg) from exc
            entity = head.with_children(children)
            entities.remove(head)
            entities.append(entity)

            if ann.get("parent"):
                parent = ann.get("parent")
                children = parent.get("children", [])
                children.append(entity)
                parent["children"] = children

        if ann["ann_type"] == "entity":
            span = Span(ann["start"], ann["end"])
            if Entity.is_system_entity(ann["type"]):
                if ann["type"] in SPACY_SYS_ENTITIES_NOT_IN_DUCKLING:
                    raw_entity = Entity(
                        text=ann["text"],
                        entity_type=ann["type"],
                        value={"value": ann["text"]},
                    )
                else:
                    try:
                        raw_entity = system_entity_recognizer.resolve_system_entity(
                            query, ann["type"], span
                        ).entity
                    except SystemEntityResolutionError as e:
                        logger.warning("Unable to load query: %s", e)
                        return
                try:
                    raw_entity.role = ann["role"]
                except KeyError:
                    pass
            else:
                try:
                    value = {"children": ann["children"]}
                except KeyError:
                    value = None
                raw_entity = Entity(
                    ann["text"], ann["type"], role=ann.get("role"), value=value
                )
            if ann.get("parent"):
                parent = ann.get("parent")
                if parent["ann_type"] == "entity":
                    children = parent.get("children", [])
                    children.append(
                        NestedEntity.from_query(
                            query,
                            span.shift(-parent["start"]),
                            entity=raw_entity,
                            parent_offset=parent["start"],
                        )
                    )
                    parent["children"] = children
                if parent["ann_type"] == "group":
                    entity = QueryEntity.from_query(query, span, entity=raw_entity)
                    entities.append(entity)
                    if parent["type"] == ann["type"]:
                        # this is the head
                        parent["head"] = entity
                    else:
                        children = parent.get("children", [])
                        children.append(entity)
                        parent["children"] = children
            else:
                entities.append(QueryEntity.from_query(query, span, entity=raw_entity))

    def _open_ann(ann):
        if stack:
            ann["parent"] = stack[-1]
        stack.append(ann)

    entities = []
    for ann in annotations:
        while stack and stack[-1]["depth"] >= ann["depth"]:
            # if there are annotations on the stack of the same or greater depth,
            # they have no more children so close them
            _close_ann(stack.pop(), entities)
        _open_ann(ann)

    while stack:
        _close_ann(stack.pop(), entities)

    return tuple(sorted(entities, key=lambda e: e.span.start))


def _parse_tokens(tokens):
    text = ""
    annotations = []
    stack = []
    token_is_meta = False
    for token in tokens:
        if token in START_CHARACTERS:
            annotation = {
                "start": len(text),
                "ann_type": "group" if token == GROUP_START else "entity",
                "depth": len(stack),
            }
            stack.append(annotation)
        elif token == META_SPLIT:
            token_is_meta = True
        elif token in END_CHARACTERS:
            annotation = stack.pop()
            annotation["end"] = len(text) - 1  # the index of the last character
            annotation["text"] = text[annotation["start"] : annotation["end"] + 1]
            token_is_meta = False
            annotations.append(annotation)
        elif token_is_meta:
            annotation = stack[-1]
            if annotation["ann_type"] == "group":
                key = "type"
            else:
                key = "role" if "type" in annotation else "type"
            annotation[key] = token
        else:
            text += token

    annotations = sorted(annotations, key=lambda a: a["depth"])
    annotations = sorted(annotations, key=lambda a: a["start"])
    return text, annotations


def _tokenize_markup(markup):
    """Converts markup into a series of 'tokens'.

    A token can fall into one of 5 general categories:
     - raw text
     - a marker indicating the start of an entity or entity group
     - a marker indicating the end of an entity or entity group
     - a marker indicating the start of a label for an entity or entity group
     - a label for an entity or entity group

    Args:
        markup (str): The markup text

    Raises:
        MarkupError: When markup is invalid
    """
    token = ""
    token_is_meta = False
    open_annotations = {"group": 0, "entity": 0}
    for idx, char in enumerate(markup):
        if char in SPECIAL_CHARACTERS:
            if char in START_CHARACTERS:
                if token:
                    yield token
                    token = ""
                if char == GROUP_START:
                    open_annotations["group"] += 1
                else:
                    open_annotations["entity"] += 1
                yield char
            elif char == META_SPLIT:
                # TODO: improve this check to accept {{a|b}|c} but reject {|c}
                # and {a||c}
                # if not token:
                #     raise MarkupError(
                #         'Entity or group text is empty at position {}'.format(idx))
                if token:
                    yield token
                    token = ""
                token_is_meta = True
                yield char
            elif char in END_CHARACTERS:
                key = "group" if char == GROUP_END else "entity"
                if open_annotations[key] == 0:
                    msg = "Mismatched end for {} at position {}: {}".format(
                        key, idx, markup
                    )
                    raise MarkupError(msg)
                if not token_is_meta:
                    msg = "Missing label for {} at position {}: {}".format(
                        key, idx, markup
                    )
                    raise MarkupError(msg)
                if not token:
                    msg = "Empty label for {} at position {}: {}".format(
                        key, idx, markup
                    )
                    raise MarkupError(msg)
                open_annotations[key] -= 1
                yield token
                token = ""
                token_is_meta = False
                yield char
            continue
        token += char

    for key in open_annotations:
        if open_annotations[key]:
            raise MarkupError("Mismatched start for {}: {}".format(key, markup))

    if token:
        yield token


def dump_query(processed_query, markup_format=MINDMELD_FORMAT, **kwargs):
    """Converts a processed query into marked up query text.

    Args:
        processed_query (ProcessedQuery): The query to convert.
        markup_format (str, optional): The format to use. Valid formats include
            'mindmeld' and 'brat'. Defaults to 'mindmeld'.
        **kwargs: additional format specific parameters may be passed in as
            keyword arguments.

    Returns:
        str: A marked up representation of the query

    Raises:
        ValueError: When markup_format is not a supported format.
    """
    if markup_format not in MARKUP_FORMATS:
        raise ValueError("Invalid markup format {!r}".format(markup_format))
    return {MINDMELD_FORMAT: _dump_mindmeld, BRAT_FORMAT: _dump_brat}[markup_format](
        processed_query, **kwargs
    )
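

# dump_query is the inverse of load_query; simple annotations should
# round-trip. A sketch (the app path is hypothetical):
#
#   pq = load_query("play {Hotel California|track}", app_path="path/to/my_app")
#   dump_query(pq)                  # -> "play {Hotel California|track}"
#   dump_query(pq, no_entity=True)  # format-specific kwargs are forwarded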


def dump_queries(queries, markup_format=MINDMELD_FORMAT, **kwargs):
    """Converts a collection of processed queries into marked up query text.

    Args:
        queries (iterable): A collection of processed queries.
        markup_format (str, optional): The format to use. Valid formats include
            'mindmeld' and 'brat'. Defaults to 'mindmeld'.
        **kwargs: additional format specific parameters may be passed in as
            keyword arguments.

    Yields:
        str or tuple: A marked up representation of each query
    """
    if markup_format == BRAT_FORMAT:
        for result in _dump_brat_queries(queries, **kwargs):
            yield result
        return
    for query in queries:
        yield dump_query(query, markup_format, **kwargs)


def _dump_brat_queries(queries, **kwargs):
    entity_offset = kwargs.get("entity_offset", 0)
    relation_offset = kwargs.get("relation_offset", 0)
    char_offset = kwargs.get("char_offset", 0)

    for query in queries:
        text, annotations = _dump_brat(
            query,
            char_offset=char_offset,
            entity_offset=entity_offset,
            relation_offset=relation_offset,
        )
        yield text, annotations
        char_offset += len(text) + 1
        entity_offset += len(query.entities)
        relation_offset += len(annotations.split("\n")) - len(query.entities)


def _dump_brat(processed_query, **kwargs):
    # TODO: support nested entities
    entity_offset = kwargs.get("entity_offset", 0)
    relation_offset = kwargs.get("relation_offset", 0)
    char_offset = kwargs.get("char_offset", 0)

    text = processed_query.query.text
    annotations = []
    entity_dict = {}
    for index, entity in enumerate(processed_query.entities):
        params = {
            "index": entity_offset + index + 1,
            "entity": entity.entity.type.capitalize(),
            "start": char_offset + entity.span.start,
            "end": char_offset + entity.span.end + 1,
            "text": entity.entity.text,
        }
        entity_dict[(entity.entity.type, entity.span.start)] = params["index"]
        annotations.append("T{index}\t{entity} {start} {end}\t{text}".format(**params))

    # Loop again for dependents
    for entity in processed_query.entities:
        if entity.parent is None:
            continue
        relation_offset += 1  # increment this first so the first index is 1
        params = {
            "index": relation_offset,
            "entity": entity.entity.type,
            "head": entity_dict[(entity.parent.entity.type, entity.parent.span.start)],
            "dependent": entity_dict[(entity.entity.type, entity.span.start)],
        }
        annotation = "R{index}\t{entity} Arg1:T{head} Arg2:T{dependent}\t".format(
            **params
        )
        annotations.append(annotation)

    return (text, "\n".join(annotations))


def _dump_mindmeld(processed_query, **kwargs):
    raw_text = processed_query.query.text
    markup = _mark_up_entities(
        raw_text,
        processed_query.entities,
        exclude_entity=kwargs.get("no_entity"),
        exclude_role=kwargs.get("no_role"),
        exclude_group=kwargs.get("no_group"),
    )
    return markup
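

# The brat output above is standoff annotation: one "T" line per entity and
# one "R" line per head/dependent relation. For "play Hotel California" with
# "Hotel California" marked as a track entity, the first entity line would be
# the tab-separated string:
#
#   "T1\tTrack 5 21\tHotel California"
#
# where 5 and 21 are character offsets into the (possibly concatenated) text.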


def validate_markup(markup, query_factory):
    """Checks whether the markup text is well-formed.

    Args:
        markup (str): The marked up query text.
        query_factory (QueryFactory): An object which can create queries.

    Returns:
        bool: True if the markup is valid
    """
    del markup
    del query_factory
    return NotImplemented


def _mark_up_entities(
    query_str, entities, exclude_entity=False, exclude_group=False, exclude_role=False
):
    annotations = []
    for entity in entities or tuple():
        annotations.extend(_annotations_for_entity(entity))

    # remove duplicates from annotations
    ann_map = {}
    for ann in annotations:
        ann_key = (ann["ann_type"], ann["start"], ann["end"], ann["type"])
        if ann_key in ann_map:
            # a similar annotation has already been found
            if ann["depth"] < ann_map[ann_key]["depth"]:
                # keep the annotation already in the map
                ann = ann_map[ann_key]
        ann_map[ann_key] = ann
    annotations = ann_map.values()

    annotations = sorted(annotations, key=lambda a: a["depth"])
    annotations = sorted(annotations, key=lambda a: a["start"])

    stack = []
    cursor = 0
    tokens = []

    def _open_ann(ann, cursor):
        if cursor < ann["start"]:
            tokens.append(query_str[cursor : ann["start"]])
        if ann["ann_type"] == "group":
            if not exclude_group:
                tokens.append(GROUP_START)
        elif not exclude_entity or (not exclude_role and ann.get("role") is not None):
            tokens.append(ENTITY_START)
        stack.append(ann)
        return ann["start"]

    def _close_ann(ann, cursor):
        if cursor < ann["end"] + 1:
            tokens.append(query_str[cursor : ann["end"] + 1])
        if ann["ann_type"] == "group":
            if not exclude_group:
                tokens.append(META_SPLIT)
                tokens.append(ann["type"])
                tokens.append(GROUP_END)
        elif not exclude_entity or (not exclude_role and ann.get("role") is not None):
            if not exclude_entity:
                tokens.append(META_SPLIT)
                tokens.append(ann["type"])
            if not exclude_role and ann.get("role") is not None:
                tokens.append(META_SPLIT)
                tokens.append(ann["role"])
            tokens.append(ENTITY_END)
        cursor = ann["end"] + 1
        return cursor

    for ann in annotations:
        while stack and stack[-1]["depth"] >= ann["depth"]:
            # if there are annotations on the stack of the same depth, they have
            # no more children, so finish them
            cursor = _close_ann(stack.pop(), cursor)
        cursor = _open_ann(ann, cursor)

    while stack:
        cursor = _close_ann(stack.pop(), cursor)

    tokens.append(query_str[cursor:])

    return "".join(tokens)


def _annotations_for_entity(entity, depth=0, parent_offset=0):
    annotations = []
    start = entity.span.start + parent_offset
    end = entity.span.end + parent_offset
    if entity.children:
        # This entity is the head of a group. Add an annotation for the group.
        leftmost = entity
        while (
            leftmost.children and leftmost.children[0].span.start < leftmost.span.start
        ):
            leftmost = leftmost.children[0]
        g_start = leftmost.span.start

        rightmost = entity
        while (
            rightmost.children and rightmost.children[-1].span.end > rightmost.span.end
        ):
            rightmost = rightmost.children[-1]
        g_end = rightmost.span.end

        annotations.append(
            {
                "ann_type": "group",
                "type": entity.entity.type,
                "start": g_start,
                "end": g_end,
                "depth": depth,
            }
        )
        depth += 1

        for child in entity.children:
            # Add annotations for each of the dependents
            annotations.extend(_annotations_for_entity(child, depth))

    annotations.append(
        {
            "ann_type": "entity",
            "type": entity.entity.type,
            "role": entity.entity.role,
            "start": start,
            "end": end,
            "depth": depth,
        }
    )

    # Iterate over 'nested' entities
    if entity.entity.value and isinstance(entity.entity.value, dict):
        children = entity.entity.value.get("children", [])
    else:
        children = []
    for child in children:
        annotations.extend(_annotations_for_entity(child, depth + 1, start))

    annotations = sorted(annotations, key=lambda a: a["depth"])
    annotations = sorted(annotations, key=lambda a: a["start"])

    return annotations


def mark_down(markup):
    """Removes all entity mark up from a string.

    Args:
        markup (str): A marked up string.

    Returns:
        str: A clean string with no mark up.
    """
    text, _ = _parse_tokens(_tokenize_markup(markup))
    return text
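

# For example, mark_down("play {Hotel California|track} by {the Eagles|artist}")
# returns "play Hotel California by the Eagles".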