# -*- coding: utf-8 -*-
#
# Copyright (c) 2015 Cisco Systems, Inc. and others. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module contains the DialogflowConverter class used to convert Dialogflow projects
into MindMeld projects"""
import json
import logging
import os
import re
import importlib.util
from shutil import copyfile
from mindmeld.converter.converter import Converter
from mindmeld.converter.code_generator import MindmeldCodeGenerator
from mindmeld.components._config import DEFAULT_INTENT_CLASSIFIER_CONFIG
logger = logging.getLogger(__name__)
package_dir = os.path.dirname(os.path.abspath(__file__))
class DialogflowConverter(Converter):
    """The class is a sub class of the abstract Converter class. This class
    contains the methods required to convert a Dialogflow project into a MindMeld project
    """

    # Dialogflow system entity -> MindMeld system entity. Used when annotating
    # training phrases and when generating slot-filling handlers.
    sys_entity_map = {
        "@sys.date-time": "sys_interval",
        "@sys.date": "sys_time",
        "@sys.date-period": "sys_interval",
        "@sys.time": "sys_time",
        "@sys.time-period": "sys_duration",
        "@sys.duration": "sys_duration",
        "@sys.number": "sys_number",
        "@sys.cardinal": "sys_number",
        "@sys.ordinal": "sys_ordinal",
        "@sys.unit-currency": "sys_amount-of-money",
        "@sys.unit-volume": "sys_volume",
        "@sys.email": "sys_email",
        "@sys.phone-number": "sys_phone-number",
        "@sys.url": "sys_url",
        "@sys.temperature": "sys_temperature",
    }

    # TODO: provide support for entities listed in sys_entity_map_todo
    # Each entry appears once; the list is not read by any method visible in this file.
    sys_entity_map_todo = [
        "@sys.number-integer",
        "@sys.number-sequence",
        "@sys.flight-number",
        "@sys.unit-area",
        "@sys.unit-length",
        "@sys.unit-speed",
        "@sys.unit-information",
        "@sys.percentage",
        "@sys.age",
        "@sys.currency-name",
        "@sys.unit-area-name",
        "@sys.unit-length-name",
        "@sys.unit-speed-name",
        "@sys.unit-volume-name",
        "@sys.unit-weight-name",
        "@sys.unit-information-name",
        "@sys.address",
        "@sys.zip-code",
        "@sys.geo-capital",
        "@sys.geo-country",
        "@sys.geo-country-code",
        "@sys.geo-city",
        "@sys.geo-state",
        "@sys.place-attraction",
        "@sys.airport",
        "@sys.location",
        "@sys.given-name",
        "@sys.last-name",
        "@sys.person",
        "@sys.music-artist",
        "@sys.music-genre",
        "@sys.color",
        "@sys.language",
        "@sys.any",
    ]
def __init__(
    self,
    dialogflow_project_directory,
    mindmeld_project_directory,
    custom_config_file_path=None,
    language="en",
):
    """Set up a converter for one Dialogflow project export.

    Args:
        dialogflow_project_directory (str): Directory of the Dialogflow export; its
            "entities" and "intents" sub folders are read during conversion.
        mindmeld_project_directory (str): Directory the MindMeld project is written to.
        custom_config_file_path (str, optional): A config file that is copied into the
            MindMeld project as ``config.py`` when the project is converted.
        language (str): Language tag of the Dialogflow files to convert. Files for
            other languages are skipped.

    Raises:
        FileNotFoundError: If ``dialogflow_project_directory`` is not an existing directory.
    """
    # Check the directory itself. Checking only its parent (os.path.dirname) rejects a
    # bare relative name such as "export" and accepts a missing folder whose parent exists.
    if not os.path.isdir(dialogflow_project_directory):
        msg = "`{dialogflow_project_directory}` does not exist. Please verify."
        msg = msg.format(dialogflow_project_directory=dialogflow_project_directory)
        raise FileNotFoundError(msg)

    self.dialogflow_project_directory = dialogflow_project_directory
    self.mindmeld_project_directory = mindmeld_project_directory
    self.directory = os.path.dirname(os.path.realpath(__file__))
    # Cleaned names already handed out by clean_check, used to detect duplicates.
    self.entities_list = set()
    self.intents_list = set()
    self.code_gen = MindmeldCodeGenerator()
    self.custom_config_file_path = custom_config_file_path
    self.language = language
def create_mindmeld_directory(self):
    """Create the MindMeld project folder together with its standard sub folders."""
    root = self.mindmeld_project_directory
    # Same folders, in the same order, as before: every parent precedes its children.
    layout = (
        (),
        ("data",),
        ("domains",),
        ("domains", "app_specific"),
        ("domains", "unrelated"),
        ("entities",),
    )
    for parts in layout:
        self.create_directory(os.path.join(root, *parts))
# =========================
# create training data (entities, intents)
# =========================
def _create_entities_directories(self, entities):
    """Creates directories + files for every entity available in the app language.

    Currently does not use meta data in entityName.json files (the keys in var entities).

    Args:
        entities (dict): ``{entity_name: {language: entries_file_name}}`` where the
            file name has no ".json" extension, as built by ``_get_file_names``.
    """
    for languages in entities.values():
        # Compare the language key, not the file name, against the app language.
        for language, sub in languages.items():
            if language != self.language:
                # Each MindMeld app works on one language
                continue
            dialogflow_entity_file = os.path.join(
                self.dialogflow_project_directory, "entities", sub + ".json"
            )
            mindmeld_entity_directory_name = self.clean_check(
                sub, self.entities_list
            )
            if mindmeld_entity_directory_name is None:
                # Duplicate name: clean_check has logged it and there is no folder
                # name to use, so skip instead of failing in os.path.join.
                continue
            # remove DF entity reference "entries" from the folder name only, so a
            # project path that happens to contain "entries_" is left untouched
            mindmeld_entity_directory = os.path.join(
                self.mindmeld_project_directory,
                "entities",
                mindmeld_entity_directory_name.replace("entries_", ""),
            )
            self.create_directory(mindmeld_entity_directory)
            self._create_entity_file(
                dialogflow_entity_file, mindmeld_entity_directory
            )
@staticmethod
def _create_entity_file(dialogflow_entity_file, mindmeld_entity_directory):
    """Write ``gazetteer.txt`` and ``mapping.json`` for one Dialogflow entity file.

    Args:
        dialogflow_entity_file (str): Path of the Dialogflow entries JSON file, a list
            of objects with a "value" and a list of "synonyms".
        mindmeld_entity_directory (str): Existing directory the two files are written to.

    Raises:
        KeyError: If an entry has no "value".
    """
    # Read the source completely before creating any output, so a malformed export
    # does not leave empty target files behind.
    with open(dialogflow_entity_file, "r", encoding="utf-8") as source:
        datastore = json.load(source)

    mapping_dict = {"entities": []}
    canonical_names = []
    for item in datastore:
        value = item["value"]
        # The canonical name itself is dropped from its synonym whitelist.
        whitelist = [
            synonym for synonym in item.get("synonyms", []) if synonym != value
        ]
        mapping_dict["entities"].append({"whitelist": whitelist, "cname": value})
        canonical_names.append(value)

    # mapping.json is dumped with ensure_ascii=False, so the encoding is pinned to
    # UTF-8 instead of relying on the platform default.
    gazetteer_path = os.path.join(mindmeld_entity_directory, "gazetteer.txt")
    with open(gazetteer_path, "w", encoding="utf-8") as target_gazetteer:
        target_gazetteer.writelines(name + "\n" for name in canonical_names)

    mapping_path = os.path.join(mindmeld_entity_directory, "mapping.json")
    with open(mapping_path, "w", encoding="utf-8") as target_mapping:
        json.dump(mapping_dict, target_mapping, ensure_ascii=False, indent=2)
def _create_intents_directories(self, intents):
    """Creates directories + training files for every intent available in the app language.

    Args:
        intents (dict): ``{intent_name: {language: usersays_file_name}}`` where the
            file name has no ".json" extension, as built by ``_get_file_names``.
    """
    for languages in intents.values():
        for language, sub in languages.items():
            if language != self.language:
                # Each MindMeld app works on one language
                continue
            dialogflow_intent_file = os.path.join(
                self.dialogflow_project_directory, "intents", sub + ".json"
            )
            mindmeld_intent_directory_name = self.clean_check(
                sub, self.intents_list
            )
            if mindmeld_intent_directory_name is None:
                # Duplicate name: clean_check has logged it and there is no folder
                # name to use, so skip instead of failing on the checks below.
                continue
            # DF has "default" intents like "default_fallback" and "default_greeting"
            # which are in-built intents. We map these intents to the "unrelated" domain
            # compared to the other app specific intents being mapped to the "app_specific"
            # domain.
            if "default" in mindmeld_intent_directory_name:
                domain = "unrelated"
            else:
                domain = "app_specific"
            # remove DF intent reference "usersays_" from the folder name only, so a
            # project path that happens to contain "usersays_" is left untouched
            mindmeld_intent_directory = os.path.join(
                self.mindmeld_project_directory,
                "domains",
                domain,
                mindmeld_intent_directory_name.replace("usersays_", ""),
            )
            self.create_directory(mindmeld_intent_directory)
            self._create_intent_file(
                dialogflow_intent_file, mindmeld_intent_directory, language
            )
def _create_intent_file(
    self, dialogflow_intent_file, mindmeld_intent_directory, language
):
    """Convert one Dialogflow "usersays" file into a MindMeld ``train.txt``.

    Args:
        dialogflow_intent_file (str): Path of the Dialogflow usersays JSON file.
        mindmeld_intent_directory (str): Existing directory that receives ``train.txt``.
        language (str): Language tag appended to non-system entity types.
    """
    # Read the source before creating train.txt, so a malformed export does not
    # leave an empty training file behind.
    with open(dialogflow_intent_file, "r", encoding="utf-8") as source:
        datastore = json.load(source)

    all_text = [self._annotate_usersay(usersay, language) for usersay in datastore]

    # Dialogflow's built-in intents are topped up with training files that sit next
    # to this module.
    default_intent_to_training_file = {
        "default_fallback_intent": "unrelated.txt",
        "default_welcome_intent": "greetings.txt",
    }
    for key, file_name in default_intent_to_training_file.items():
        if key in mindmeld_intent_directory:
            with open(os.path.join(package_dir, file_name)) as fp:
                for line in fp:
                    all_text.append(line.strip())

    # Double the size of the training set if there are less than the number of
    # folds for cross-val in the config.py file
    intent_config = DEFAULT_INTENT_CLASSIFIER_CONFIG
    if self.custom_config_file_path:
        config_path = os.path.join(self.mindmeld_project_directory, "config.py")
        spec = importlib.util.spec_from_file_location("mindmeld_app", config_path)
        config = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(config)
        # NOTE(review): MindMeld app configs appear to name this setting
        # INTENT_CLASSIFIER_CONFIG - confirm. The originally used name still wins
        # when it is present.
        intent_config = getattr(
            config,
            "INTENT_RECOGNIZER_CONFIG",
            getattr(config, "INTENT_CLASSIFIER_CONFIG", intent_config),
        )
    param_selection = intent_config.get("param_selection") or {}
    folds = param_selection.get("k") or 0
    # An empty list can never reach the fold count, so it must not enter the loop.
    while all_text and len(all_text) < folds:
        all_text = all_text * 2

    train_path = os.path.join(mindmeld_intent_directory, "train.txt")
    with open(train_path, "w", encoding="utf-8") as target_train:
        target_train.write("\n".join(all_text))


def _annotate_usersay(self, usersay, language):
    """Return one Dialogflow training phrase as a MindMeld annotated query string."""
    parts = []
    for texts in usersay["data"]:
        df_text = texts["text"]
        if "meta" not in texts or texts["meta"] == "@sys.ignore":
            # Plain text chunk, copied verbatim.
            parts.append(df_text)
            continue

        df_meta = texts["meta"]
        role_type = texts["alias"].replace("-", "_")
        # The dot is escaped so that custom entities whose name merely starts with
        # "sys" (e.g. "@system") are not mistaken for Dialogflow system entities.
        if re.match(r"@sys\..+", df_meta):
            if df_meta in self.sys_entity_map:
                entity_type = self.sys_entity_map[df_meta]
            else:
                mm_meta = "[DNE: {sysEntity}]".format(sysEntity=df_meta[1:])
                logger.info(
                    "Unfortunately mindmeld does not currently support "
                    "%s as a sys entity. "
                    "Please create an entity for this.",
                    df_meta[1:],
                )
                entity_type = self.clean_name(mm_meta) + "_" + language
        else:
            entity_type = self.clean_name(df_meta[1:]) + "_" + language
        parts.append("{" + df_text + "|" + entity_type + "|" + role_type + "}")
    return "".join(parts)
def _get_file_names(self, level):
    """Gets the names of the entities or intents from Dialogflow as a dictionary.

    Args:
        level (str): either "entities" or "intents"

    Returns:
        dict: ``{base_name: {language: file_name_without_extension}}``. A base name
            without per-language files maps to an empty dict.
            ex. if we had the following files in our entities directory:
                ["test.json", "test_entries_en.json", "test_entries_de.json"]
            it returns:
                {'test': {'en': 'test_entries_en', 'de': 'test_entries_de'}}

    Raises:
        KeyError: If ``level`` is neither "entities" nor "intents".
    """
    directory = os.path.join(self.dialogflow_project_directory, level)
    # Per-language files are named <base>_<marker>_<language>.json
    marker = "_" + {"entities": "entries", "intents": "usersays"}[level] + "_"

    info = {}
    # Sorted so the result does not depend on the order os.listdir returns.
    for name in sorted(os.listdir(directory)):
        stem, extension = os.path.splitext(name)
        if extension != ".json":
            # Stray files (.DS_Store, notes, backups) are not part of the export and
            # would otherwise show up as bogus entity/intent names.
            continue
        # Split on the last marker, matching the greedy regex this replaces.
        base, found, language = stem.rpartition(marker)
        if found:
            info.setdefault(base, {})[language] = stem
        else:
            info.setdefault(stem, {})
    return info
def create_mindmeld_training_data(self):
    """Convert the Dialogflow entities, then the intents, into MindMeld training data."""
    # Each listing is taken right before it is used, entities first.
    self._create_entities_directories(self._get_file_names("entities"))
    self._create_intents_directories(self._get_file_names("intents"))
# =========================
# create init
# =========================
@staticmethod
def clean_name(name):
    """Turn a string into a valid folder name (lowercase, underscores for separators)."""
    # Keep only word characters, whitespace and hyphens.
    kept = re.sub(r"[^\w\s-]", "", name)
    normalized = kept.strip().lower()
    # Every run of hyphens and/or whitespace becomes a single underscore.
    return re.sub(r"[-\s]+", "_", normalized)
def clean_check(self, name, lst):
    """Clean ``name`` and register it in ``lst`` unless it was registered before.

    Args:
        name (str): Raw name to clean with ``clean_name``.
        lst (set): Cleaned names handed out so far; a new name is added in place.

    Returns:
        str or None: The cleaned name, or None (after logging an error) when the
            cleaned name is already in ``lst``.
    """
    cleaned = self.clean_name(name)
    if cleaned in lst:
        logger.error(
            "%s name has been created twice. Please ensure there "
            "are no duplicate names in the dialogflow files and "
            "filenames are valid (no spaces or special characters)",
            cleaned,
        )
        return None
    lst.add(cleaned)
    return cleaned
def create_mindmeld_init(self):
    """Generate the MindMeld project's ``__init__.py`` with a handler per intent response.

    Every intent base file ``<intent>.json`` in the Dialogflow "intents" folder is
    read and each of its responses is passed to ``generate_handlers``. Generation is
    aborted, leaving ``__init__.py`` empty, when a base name still contains
    "usersays", i.e. a training file that does not follow the naming scheme.
    """
    target_path = os.path.join(self.mindmeld_project_directory, "__init__.py")
    # The generated module may contain non-ASCII replies; Python source is UTF-8.
    with open(target_path, "w", encoding="utf-8") as target:
        self.code_gen.begin(tab=" ")
        self.code_gen.generate_top_block()
        intents = self._get_file_names("intents")
        for main in intents:
            # Check the intent name, not the full path: a project directory whose
            # path contains "usersays" must not abort the conversion.
            if "usersays" in main:
                logger.error(
                    "Please check if your intent file"
                    "names are correctly labeled."
                )
                return
            df_main = os.path.join(
                self.dialogflow_project_directory, "intents", main + ".json"
            )
            with open(df_main, encoding="utf-8") as source:
                datastore = json.load(source)
            intent = self.clean_name(datastore["name"])
            for response in datastore["responses"]:
                self.generate_handlers(intent, response)
        target.write(self.code_gen.end())
        target.write("\n")
def generate_handlers(self, intent, response):
    """Generate the dialogue handler code for one response of a Dialogflow intent.

    Only the first message of the response is used, and nothing is generated when
    that message has no "speech". Replies containing ``$slot`` references produce a
    follow-up (slot filling) handler; all other replies produce a plain handler.

    Args:
        intent (str): Cleaned intent name.
        response (dict): One entry of the intent's "responses" list, holding
            "messages" and "parameters".
    """
    message = response["messages"][0]
    language = message["lang"]
    intent_lang = "%s_%s" % (intent, language)

    # {entity_type: {role: [prompt, ...]}} for every required parameter
    entity_role_prompts = {}
    for param in response["parameters"]:
        if not param["required"]:
            continue
        data_type = param["dataType"]
        if data_type in self.sys_entity_map:
            entity = self.sys_entity_map[data_type]
        else:
            entity = data_type.replace("@", "").replace("-", "_")
            entity = "%s_%s" % (entity, language)
        role = param["name"].replace("@", "").replace("-", "_")
        if "prompts" in param:
            prompts = [x["value"] for x in param["prompts"]]
        else:
            prompts = ["What is the " + param["name"]]
        entity_role_prompts.setdefault(entity, {})[role] = prompts
    intent_entity_role_replies = {intent_lang: entity_role_prompts}

    if "speech" not in message:
        return

    data = message["speech"]
    replies = data if isinstance(data, list) else [data]

    def to_placeholder(match):
        # "$some-slot" -> "{some_slot}"
        return "{" + match.group(1).replace("-", "_") + "}"

    # Each reference is substituted where it was matched. Replacing slot names one
    # after another with str.replace corrupts a longer slot that starts with a
    # shorter one ("$date" inside "$date-time").
    slot_pattern = re.compile(r"\$([\w\-]+)")
    slot_templated_replies = [
        slot_pattern.sub(to_placeholder, reply) for reply in replies
    ]
    is_slot_template = any(
        template != reply
        for template, reply in zip(slot_templated_replies, replies)
    )

    handle = "intent='%s_%s'" % (intent, language)
    function_name = intent + "_" + language + "_handler"
    if is_slot_template:
        self.code_gen.generate_followup_function_code_block(
            handle,
            function_name,
            intent_entity_role_replies,
            slot_templated_replies,
        )
    else:
        self.code_gen.generate_function(
            handle=handle,
            function_name=function_name,
            replies=replies,
        )
# =========================
# convert project
# =========================
def convert_project(self):
    """Converts a Dialogflow project into a MindMeld project.

    Dialogflow projects consist of entities and intents.

    Note on languages:
        Dialogflow supports multiple languages and locales and stores the training
        data for each language in its own file. The name of each training file
        therefore ends with a meta tag: two letters for the language, plus two more
        letters for the dialect where applicable. For example, a file ending in
        "_en-au" is in English (Australia). Below, "la" stands for this meta tag.

    The entities folder contains:
        entityName.json - Meta data about entityName for all languages.
        entityName_entries_la.json - One for each language, contains entity mappings.

    The intents folder contains:
        intentName.json - Rules, information about conversation flow and meta data,
            together with the responses for all languages.
        intentName_usersays_la.json - One for each language, contains the training
            data to recognize intentName.

    Limitations:
        - The converter is unable to create an entity when it encounters an
          unrecognized entity (an entity not defined under the entities folder
          or in the system entities), and labels such entities as DNE in the
          training data.
        - The converter currently does not automatically convert features like
          slot filling, contexts, and follow-up intents. Users can still implement
          such features and more.
        - Information in agent.json is not copied over.
        - There is no official support for different languages. Users can still
          implement this. The converter is able to successfully convert Dialogflow
          bots that support multiple languages.

    MindMeld:
        - Users can store data locally.
        - Users can build a knowledge base (currently beta in Dialogflow).
        - Users can configure the machine learning models to best suit their needs.
        - Users have more flexibility in defining their own features, including
          ones like slot filling, contexts, and follow-up intents.
    """
    logger.info("Converting project.")

    # Project folder and sub folders come first; every later step writes into them.
    self.create_mindmeld_directory()

    # A user supplied config becomes the config.py of the MindMeld project.
    custom_config = self.custom_config_file_path
    if custom_config:
        config_target = os.path.join(self.mindmeld_project_directory, "config.py")
        copyfile(custom_config, config_target)

    converter_dir = os.path.dirname(os.path.realpath(__file__))
    self.create_main(self.mindmeld_project_directory, converter_dir)
    self.create_mindmeld_init()

    # Carry the Dialogflow training data over, reformatted for MindMeld.
    self.create_mindmeld_training_data()

    logger.info("Project converted.")