A data model is a central concept in NLPCraft that defines a natural language interface to your data sources, such as a database or a SaaS application. NLPCraft employs a model-as-a-code approach where the entire data model is an implementation of the NCModel interface, which can be developed using any JVM programming language such as Java, Scala, Kotlin, or Groovy.
A data model defines:
Note that the model-as-a-code approach natively supports any software lifecycle tools and frameworks such as build tools, CI/SCM tools, IDEs, etc. You don't have to use additional web-based tools to manage any aspect of your data models - your entire model and all of its components are part of your project's source code.
Here are two quick examples of fully functional data model implementations (from the Light Switch and Alarm Clock examples). You will find specific details about these implementations in the following sections:
```scala
package org.apache.nlpcraft.examples.lightswitch

import org.apache.nlpcraft.model.{NCIntentTerm, _}

class LightSwitchModel extends NCModelFileAdapter("lightswitch_model.yaml") {
    @NCIntentRef("ls")
    @NCIntentSample(Array(
        "Turn the lights off in the entire house.",
        "Switch on the illumination in the master bedroom closet.",
        "Get the lights on.",
        "Lights up in the kitchen.",
        "Please, put the light out in the upstairs bedroom.",
        "Set the lights on in the entire house.",
        "Turn the lights off in the guest bedroom.",
        "Could you please switch off all the lights?",
        "Dial off illumination on the 2nd floor.",
        "Please, no lights!",
        "Kill off all the lights now!",
        "No lights in the bedroom, please.",
        "Light up the garage, please!"
    ))
    def onMatch(
        @NCIntentTerm("act") actTok: NCToken,
        @NCIntentTerm("loc") locToks: List[NCToken]
    ): NCResult = {
        val status = if (actTok.getId == "ls:on") "on" else "off"
        val locations =
            if (locToks.isEmpty) "entire house"
            else locToks.map(_.meta[String]("nlpcraft:nlp:origtext")).mkString(", ")

        // Add HomeKit, Arduino or other integration here.
        // By default - return a descriptive action string.
        NCResult.text(s"Lights are [$status] in [${locations.toLowerCase}].")
    }
}
```
id: "nlpcraft.lightswitch.ex" name: "Light Switch Example Model" version: "1.0" description: "NLI-powered light switch example model." macros: - name: "<ACTION>" macro: "{turn|switch|dial|let|set|get|put}" - name: "<KILL>" macro: "{shut|kill|stop|eliminate}" - name: "<ENTIRE_OPT>" macro: "{entire|full|whole|total|_}" - name: "<FLOOR_OPT>" macro: "{upstairs|downstairs|{1st|first|2nd|second|3rd|third|4th|fourth|5th|fifth|top|ground} floor|_}" - name: "<TYPE>" macro: "{room|closet|attic|loft|{store|storage} {room|_}}" - name: "<LIGHT>" macro: "{all|_} {it|them|light|illumination|lamp|lamplight}" enabledBuiltInTokens: [] # This example doesn't use any built-in tokens. # # Allows for multi-word synonyms in this entire model # to be sparse and permutate them for better detection. # These two properties generally enable a free-form # natural language comprehension. # permutateSynonyms: true sparse: true elements: - id: "ls:loc" description: "Location of lights." synonyms: - "<ENTIRE_OPT> <FLOOR_OPT> {kitchen|library|closet|garage|office|playroom|{dinning|laundry|play} <TYPE>}" - "<ENTIRE_OPT> <FLOOR_OPT> {master|kid|children|child|guest|_} {bedroom|bathroom|washroom|storage} {<TYPE>|_}" - "<ENTIRE_OPT> {house|home|building|{1st|first} floor|{2nd|second} floor}" - id: "ls:on" groups: - "act" description: "Light switch ON action." synonyms: - "<ACTION> {on|up|_} <LIGHT> {on|up|_}" - "<LIGHT> {on|up}" - id: "ls:off" groups: - "act" description: "Light switch OFF action." synonyms: - "<ACTION> <LIGHT> {off|out|down}" - "{<ACTION>|<KILL>} {off|out|down} <LIGHT>" - "<KILL> <LIGHT>" - "<LIGHT> <KILL>" - "{out|no|off|down} <LIGHT>" - "<LIGHT> {out|off|down}" intents: - "intent=ls term(act)={has(tok_groups, 'act')} term(loc)={# == 'ls:loc'}*"
```java
package org.apache.nlpcraft.examples.alarm;

import org.apache.nlpcraft.model.*;

import java.time.*;
import java.time.format.DateTimeFormatter;
import java.util.*;

import static java.time.temporal.ChronoUnit.MILLIS;

public class AlarmModel extends NCModelFileAdapter {
    private static final DateTimeFormatter FMT =
        DateTimeFormatter.ofPattern("HH'h' mm'm' ss's'").withZone(ZoneId.systemDefault());

    private final Timer timer = new Timer();

    public AlarmModel() {
        // Loading the model from the file.
        super("alarm_model.json");
    }

    @NCIntentRef("alarm") // Intent is defined in JSON model file (alarm_model.json and intents.idl).
    @NCIntentSampleRef("alarm_samples.txt") // Samples supplied in an external file.
    NCResult onMatch(
        NCIntentMatch ctx,
        @NCIntentTerm("nums") List<NCToken> numToks
    ) {
        long ms = calculateTime(numToks);

        assert ms >= 0;

        timer.schedule(
            new TimerTask() {
                @Override
                public void run() {
                    System.out.println(
                        "BEEP BEEP BEEP for: " + ctx.getContext().getRequest().getNormalizedText()
                    );
                }
            },
            ms
        );

        return NCResult.text("Timer set for: " + FMT.format(LocalDateTime.now().plus(ms, MILLIS)));
    }

    @Override
    public void onDiscard() {
        // Clean up when model gets discarded (e.g. during testing).
        timer.cancel();
    }

    public static long calculateTime(List<NCToken> numToks) {
        LocalDateTime now = LocalDateTime.now();
        LocalDateTime dt = now;

        for (NCToken num : numToks) {
            String unit = num.meta("nlpcraft:num:unit");

            // Skip possible fractional part to simplify.
            long v = ((Double)num.meta("nlpcraft:num:from")).longValue();

            if (v <= 0)
                throw new NCRejection("Value must be positive: " + unit);

            switch (unit) {
                case "second": { dt = dt.plusSeconds(v); break; }
                case "minute": { dt = dt.plusMinutes(v); break; }
                case "hour": { dt = dt.plusHours(v); break; }
                case "day": { dt = dt.plusDays(v); break; }
                case "week": { dt = dt.plusWeeks(v); break; }
                case "month": { dt = dt.plusMonths(v); break; }
                case "year": { dt = dt.plusYears(v); break; }

                default:
                    // It shouldn't be an assertion, because 'datetime' unit can be extended outside.
                    throw new NCRejection("Unsupported time unit: " + unit);
            }
        }

        return now.until(dt, MILLIS);
    }
}
```
```
// Fragments (mostly for demo purposes here).
fragment=buzz term~{# == 'x:alarm'}
fragment=when
    term(nums)~{
        // Demonstrating term variables.
        @type = meta_tok('nlpcraft:num:unittype')
        @iseq = meta_tok('nlpcraft:num:isequalcondition') // Excludes conditional statements.

        # == 'nlpcraft:num' && @type == 'datetime' && @iseq == true
    }[1,7]

// Intents (using fragments).
intent=alarm
    fragment(buzz)
    fragment(when)
```
{ "id": "nlpcraft.alarm.ex", "name": "Alarm Example Model", "version": "1.0", "description": "Alarm example model.", "enabledBuiltInTokens": [ "nlpcraft:num" ], "elements": [ { "id": "x:alarm", "description": "Alarm token indicator.", "synonyms": [ "{ping|buzz|wake|call|hit} {me|up|me up|_}", "{set|_} {my|_} {wake|wake up|_} {alarm|timer|clock|buzzer|call} {clock|_} {up|_}" ] } ], "intents": [ "import('intents.idl')" // Import intents from external file. ] }
The following sub-sections provide details on the model's static configuration and its dynamic, programmable logic implementation.
Let's review the general dataflow of a user request in NLPCraft. The user request starts with the user application (like a chatbot or an NLI-based system) making a REST call using the NLPCraft REST API. That REST call carries, among other things, the input text and the data model ID, and it arrives first at the REST server.
Upon receiving the user request, the REST server performs NLP pre-processing converting the input text into a sequence of tokens and enriching them with additional information. Once finished, the sequence of tokens is sent further down to the probe where the requested data model is deployed.
Upon receiving that sequence of tokens, the data probe further enriches it based on the user data model and matches it against the declared intents. When a matching intent is found, its callback method is called and its result travels back from the data probe to the REST server and eventually to the user that made the REST call.
Security & Isolation
Note that in this architecture the user-defined data model is fully isolated from the REST server that accepts user calls. Users never access data probes, and hence data models, directly. Typically, the REST server should be deployed in a DMZ, and only ingress connectivity is required from the REST server to the data probes.
A data model is an implementation of the NCModel interface. The NCModel interface has defaults for most of its methods; the only methods that must be implemented by its sub-class are getId(), getName() and getVersion().
You can either implement the NCModel interface directly or use one of the adapters, NCModelAdapter or NCModelFileAdapter (recommended in most cases).
Note that you can also use 3rd party IoC frameworks like Spring to construct your data models. See NCModelFactory for more information.
Using Adapters
It is recommended to use one of the adapter classes when defining your own data model in most use cases.
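For illustration, here is a minimal sketch of a fully programmatic model built on the NCModelAdapter adapter: the mandatory ID, name and version are passed to the adapter's constructor, and a single annotated intent matches any free-form input. The identifiers and greeting text are illustrative only.

```java
package com.example;

import org.apache.nlpcraft.model.*;

public class HelloWorldModel extends NCModelAdapter {
    public HelloWorldModel() {
        // Mandatory model ID, name and version (illustrative values).
        super("hello.world.ex", "HelloWorld Example Model", "1.0");
    }

    // Single intent that matches any sequence of free words ('nlpcraft:nlp' tokens).
    @NCIntent("intent=greeting term={# == 'nlpcraft:nlp'}*")
    NCResult onGreeting(NCIntentMatch ctx) {
        return NCResult.text("Hello!");
    }
}
```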
Data models get deployed to and hosted by the data probes - lightweight containers whose job is to host data models and securely transfer requests between the REST server and the data models. When a data probe starts it reads its configuration to see which models to deploy.
Note that data probes don't support hot redeployment: to redeploy a data model you need to restart its data probe. Note also that a data probe can be started in embedded mode, i.e. from within an existing JVM process such as the user application itself.
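As a sketch of the embedded mode, the snippet below starts a data probe from within a regular JVM process and deploys the Light Switch model shown above; the exact NCEmbeddedProbe.start(...) overload should be verified against the Javadoc.

```java
package com.example;

import org.apache.nlpcraft.examples.lightswitch.LightSwitchModel;
import org.apache.nlpcraft.probe.embedded.NCEmbeddedProbe;

public class EmbeddedProbeRunner {
    public static void main(String[] args) {
        // Start the data probe in embedded mode inside this JVM and deploy the model.
        NCEmbeddedProbe.start(LightSwitchModel.class);

        // ... host application logic runs here ...

        // Stop the embedded probe when the application shuts down.
        Runtime.getRuntime().addShutdownHook(new Thread(NCEmbeddedProbe::stop));
    }
}
```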
There are two lifecycle callbacks on the NCModel interface (by way of extending the NCLifecycle interface) that you can override to affect the default lifecycle behavior: onInit() and onDiscard().
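A minimal sketch of overriding both lifecycle callbacks (the external configuration file name is hypothetical):

```java
package com.example;

import org.apache.nlpcraft.model.*;

public class LifecycleAwareModel extends NCModelFileAdapter {
    public LifecycleAwareModel() {
        super("my_model.yaml"); // Hypothetical external configuration file.
    }

    @Override
    public void onInit() {
        // Called once after the model is deployed in the data probe -
        // a good place to open connections or warm up caches.
    }

    @Override
    public void onDiscard() {
        // Called when the model gets discarded (probe shutdown, testing, etc.) -
        // release whatever onInit() acquired (see AlarmModel above).
    }
}
```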
There are also several callbacks that you can override to affect model behavior during intent matching - to perform logging, debugging, statistics or usage collection, explicit update or initialization of the conversation context, security audit or validation.
Callbacks onContext(...) and onMatchedIntent(...) are especially handy for performing a soft reset on the conversation context. Read their Javadoc documentation to understand these callbacks' protocol.
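For example, here is a hedged sketch of a soft reset in onMatchedIntent(...): if the newly matched intent differs from the previously matched one, the short-term memory is cleared so stale tokens don't leak into the new topic. The field tracking the last intent ID is hypothetical, and the exact callback contract should be checked against the Javadoc.

```java
package com.example;

import org.apache.nlpcraft.model.*;

public class SoftResetModel extends NCModelFileAdapter {
    // Hypothetical field tracking the ID of the last matched intent.
    private volatile String lastIntentId;

    public SoftResetModel() {
        super("my_model.yaml"); // Hypothetical external configuration file.
    }

    @Override
    public boolean onMatchedIntent(NCIntentMatch ctx) {
        String intentId = ctx.getIntentId();

        // Soft reset: clear short-term memory when the conversation switches intents.
        if (lastIntentId != null && !lastIntentId.equals(intentId))
            ctx.getContext().getConversation().clearStm(tok -> true);

        lastIntentId = intentId;

        return true; // Continue with the normal intent callback invocation.
    }
}
```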
Note that both the server and the probe provide their own lifecycle component support. When registered in the probe or server configuration, the lifecycle components will be called during various stages of the probe or server startup and shutdown procedures. These callbacks can be used to control the lifecycle of external libraries and systems that the data probe or the server rely on, e.g. OpenCensus exporters, security environment, devops hooks, etc.
See server and probe configuration.
Apart from the mandatory model ID, name and version, there is a number of static model configuration properties that you can set. All of these properties have sensible defaults that you can override, when required, either in sub-classes or via external JSON/YAML declaration.
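For example, a minimal sketch of overriding two such properties in a sub-class; both methods are referenced later in this section, and the configuration file name is hypothetical:

```java
package com.example;

import org.apache.nlpcraft.model.*;

public class MyModel extends NCModelFileAdapter {
    public MyModel() {
        super("my_model.yaml"); // Hypothetical external configuration file.
    }

    // Override static configuration defaults in code - the same properties
    // can also be set in the external JSON/YAML declaration.
    @Override
    public boolean isNonEnglishAllowed() {
        return false; // Reject non-English input.
    }

    @Override
    public boolean isSwearWordsAllowed() {
        return false; // Reject input containing swear words.
    }
}
```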
You can move all of the static model configuration out into an external JSON or YAML file. To load that configuration you need to use the NCModelFileAdapter adapter when creating your data model. Here are JSON and YAML sample templates; you can find more details in the NCModel Javadoc and in the examples.
{ "id": "user.defined.id", "name": "User Defined Name", "version": "1.0", "description": "Short model description.", "enabledBuiltInTokens": ["google:person", "google:location"] "macros": [], "metadata": {}, "elements": [ { "id": "x:id", "description": "", "groups": [], "parentId": "", "synonyms": [], "metadata": {}, "values": [] } ], ... "intents": [] }
id: "user.defined.id" name: "User Defined Name" version: "1.0" description: "Short model description." macros: enabledBuiltInTokens: elements: - id: "x:id" description: "" synonyms: groups: values: parentId: metadata: ... intents:
A named entity, also known as a model element or a token, is one of the main components defined by the NLPCraft data model. A named entity is one or more individual words that have a consistent semantic meaning and typically denote a real-world object, such as a person, location, number, date and time, organization, product, etc. Such an object can be abstract or have a physical existence.
For example, in the following sentence:
the following named entities can be detected:
Words | Type | Normalized Value |
---|---|---|
Top 20 | nlpcraft:limit | top 20 |
best pages | user:element | best pages |
California USA | nlpcraft:geo | USA, California |
last 3 months | nlpcraft:date | 1/1/2021 - 4/1/2021 |
In most cases named entities will have an associated normalized value. This is especially important for named entities that have many notational forms, such as time and date, currency, geographical locations, etc. For example, New York, New York City and NYC all refer to the same "New York City, NY USA" location, which is the standard normalized form.
The process of detecting named entities is called Named Entity Recognition (NER). There are many ways a certain named entity can be detected: through a list of synonyms, by name, rule-based, or by using statistical techniques like neural networks with a large corpus of predefined data. NLPCraft natively supports synonym-based named entity definition as well as the ability to compose new named entities through the powerful Intent Definition Language (IDL), combining other named entities including those from external projects such as OpenNLP, spaCy or Stanford CoreNLP.
Named entities allow you to abstract away from basic linguistic forms like nouns and verbs and deal with higher-level semantic abstractions like geographical location or time when you are trying to understand the meaning of a sentence. One of the main goals of named entities is to act as input ingredients for intent matching.
😀 User Input → Named Entities → Parsing Variants → Intent Matcher → Winning Intent 🚀
User input is parsed into a list of named entities. That list is then further transformed into one or more parsing variants where each variant represents a particular order and combination of the detected named entities. Finally, the list of variants acts as the input to intent matching, where each variant is matched against every intent in the process of detecting the best matching intent for the original user input.
A data model element defines a named entity that will be detected in the user input. A model element is an implementation of the NCElement interface. NCModel provides its elements via the getElements() method. Typically, you create model elements either by declaring them in the external JSON/YAML model configuration or by implementing NCElement programmatically.
Note that when you use external static model configuration with JSON or YAML you can still modify it after it was loaded using the NCModelFileAdapter adapter. This is particularly convenient when synonyms or values are loaded separately from, or in addition to, the model elements themselves, e.g. from a database or another file.
Model Element & Named Entity & Token
Terms 'model element', 'named entity' and 'token' are used throughout this documentation relatively interchangeably:
Although model element and named entity describe a similar concept, NLPCraft model elements provide a much more powerful instrument. Unlike named entity support in other projects, NLPCraft model elements have a number of unique capabilities:
In addition to the model elements that are defined by the user in the data model (i.e. user model elements), NLPCraft provides its own built-in named entities as well as integration with a number of 3rd party projects. You can think of these built-in elements as if they were implicitly defined in your model - you can use them in exactly the same way as if you had defined them yourself. You can find more information on how to configure external token providers in the Integrations section.
Note that you can't directly change the group membership, parent-child relationship or metadata of the built-in elements. You can, however, "wrap" a built-in entity into your own one using the ^^{tok_id() == 'external.id'}^^ IDL expression as its synonym, where you can define all necessary additional configuration properties (more on that below).
NLPCraft uses fully deterministic named entity recognition and is not based on statistical approaches that would require pre-existing marked-up data sets and extensive training. For each model element you can either provide a set of synonyms to match on or specify a piece of code that is responsible for detecting that named entity (discussed below). A synonym can have one or more individual words. Note that the element's ID is its implicit synonym, so that even if no additional synonyms are defined at least one synonym always exists. Note also that synonym matching is performed on normalized and stemmatized forms of both the synonym and the user input.
Here's an example of a simple model element definition in JSON:
... "elements": [ { "id": "transport.vehicle", "description": "Transportation vehicle", "synonyms": [ "car", "truck", "light duty truck" "heavy duty truck" "sedan", "coupe" ] } ] ...
While adding multi-word synonyms looks somewhat trivial, in real models the naive approach can lead to thousands and even tens of thousands of possible synonyms due to word, grammar, and linguistic permutations - which quickly becomes untenable if performed manually.
NLPCraft provides an effective tool for compact synonym representation. Instead of listing all possible multi-word synonyms one by one you can use a combination of the following techniques: macros, option groups, regular expressions and IDL expressions.
Each whitespace-separated string in a synonym can be either a regular word (like in the above transportation example, where it will be matched on using its normalized and stemmatized form) or one of the above expressions.
Note that this synonym definition is also used in the following NCElement methods:

- getSynonyms() - gets synonyms to match on.
- getValues() - gets values to match on (see below).

A model element can have an optional set of special synonyms called values, or "proper nouns", for this element. Unlike basic synonyms, each value is a pair of a name and a set of standard synonyms by which that value, and ultimately its element, can be recognized in the user input. Note that the value name itself acts as an implicit synonym even when no additional synonyms are added for that value.
When a model element is recognized it is made available to the model's matching logic as an instance of the NCToken interface. This interface has a method getValue() which returns the name of the value, if any, by which that model element was recognized. That value name can be further used in intent matching.
To understand the importance of the values consider the following changes to our transportation example model:
... "macros": [ { "name": "<TRUCK_TYPE>", "macro": "{light duty|heavy duty|half ton|1/2 ton|3/4 ton|one ton|super duty}" } ] "elements": [ { "id": "transport.vehicle", "description": "Transportation vehicle", "synonyms": [ "car", "{<TRUCK_TYPE>|_} {pickup|_} truck" "sedan", "coupe" ], "values": [ { "value": "mercedes", "synonyms": ["mercedes-ben{z|s}", "mb", "ben{z|s}"] }, { "value": "bmw", "synonyms": ["{bimmer|bimer|beemer}", "bayerische motoren werke"] } { "value": "chevrolet", "synonyms": ["chevy"] } ] } ] ...
With that setup the transport.vehicle element will be recognized by any of the following input strings:

- car
- benz (with value mercedes)
- 3/4 ton pickup truck
- light duty truck
- chevy (with value chevrolet)
- bimmer (with value bmw)
- transport.vehicle
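For illustration, here is a hedged sketch of an intent callback that branches on the value detected for the transport.vehicle element; the intent name, term name and configuration file name are assumptions made for this example.

```java
package com.example;

import org.apache.nlpcraft.model.*;

public class VehicleModel extends NCModelFileAdapter {
    public VehicleModel() {
        super("vehicle_model.json"); // Hypothetical file holding the element above.
    }

    // Intent with a single term matching the 'transport.vehicle' element.
    @NCIntent("intent=vehicle term(vehicle)={# == 'transport.vehicle'}")
    NCResult onVehicle(@NCIntentTerm("vehicle") NCToken tok) {
        // getValue() returns the value name, e.g. "bmw" for the input "bimmer",
        // or null if the element was matched by a plain synonym like "car".
        String value = tok.getValue();

        return NCResult.text(value == null ? "Some vehicle." : "Vehicle brand: " + value);
    }
}
```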
Each model element always belongs to one or more groups. A model element provides its groups via the getGroups() method. By default, if an element group is not specified, the element ID will act as its default group ID. Group membership is a quick and easy way to organize similar model elements together and use this categorization in IDL intents.
Note that proper grouping of the elements is also necessary for the correct operation of the Short-Term Memory (STM) in the conversational context. Consider an NCToken that represents a previously found model element stored in the conversation. Such a token will be overridden in the conversation by a more recent token from the same group - a critical rule for maintaining the proper conversational context. See NCConversation for more details.
Element Parent

Each model element can form an optional hierarchical relationship with other elements by specifying its parent element ID via the getParentId() method. The main idea here is that sometimes model elements can act not only individually but their place in the hierarchy can be important too.
For example, we could have designed our transportation example model in a different way by using multiple model elements linked with this hierarchy:
```
+-- vehicle
|   +-- truck
|   |   |-- light.duty.truck
|   |   |-- heavy.duty.truck
|   |   +-- medium.duty.truck
|   +-- car
|   |   |-- coupe
|   |   |-- sedan
|   |   |-- hatchback
|   |   +-- wagon
```
Then in our intent, for example, we could look for any token with the root parent ID vehicle, or with the immediate parent ID truck or car, without the need to match on all current and future individual sub-IDs. For example:
```
intent=vehicle.intent term~{has(tok_ancestors, 'vehicle')}
intent=truck.intent term~{tok_parent == 'truck'}
intent=car.intent term~{tok_parent == 'car'}
```
Listing all possible multi-word synonyms for a given element can be a time-consuming task. Macros, together with option groups, allow for significant simplification of this task. Macros allow you to give a name to an often-used set of words or option groups and reuse it without repeating those words or option groups again and again. A model provides its list of macros via the getMacros() method. Each macro has a name in the form of <X>, where X is any string, and a string value. Note that macros can be nested (but not recursive), i.e. a macro value can include references to other macros. When the macro name X is encountered in a synonym it gets recursively replaced with its value.
Here's a code snippet of macro definitions using JSON definition:
"macros": [ { "name": "<A>", "macro": "aaa" }, { "name": "<B>", "macro": "<A> bbb" }, { "name": "<C>", "macro": "<A> bbb {z|w}" } ]Option Groups
Option groups are similar to wildcard patterns that operate on a single-word basis. One line with option groups expands into one or more individual synonyms. Option groups are the key mechanism for shortened synonym notation. The following examples demonstrate how to use option groups.
Consider the following macros (note that macros <B> and <C> are nested):
Name | Value |
---|---|
<A> | aaa |
<B> | <A> bbb |
<C> | <A> bbb {z|w} |
Then the following option group expansions will occur in these examples:
Synonym | Synonym Expansions |
---|---|
<A> {b|_} c | "aaa b c" "aaa c" |
<A> {b|a}[1,2] c | "aaa b c" "aaa b b c" "aaa a c" "aaa a a c" "aaa c" |
<B> {b|_} c or <B> {b}[0,1] c | "aaa bbb b c" "aaa bbb c" |
{b|\{\_\}} | "b" "b {_}" |
a {b|_}. c | "a b. c" "a . c" |
a .{b, |_}. c | "a .b, . c" "a .. c" |
a {{b|c}|_}. | "a ." "a b." "a c." |
a {{{<C>}}|{_}} c | "a aaa bbb z c" "a aaa bbb w c" "a c" |
{{{a}}} {b||_|{{_}}||_} | "a b" "a" |
Specifically:
- {A|B} denotes either A or B.
- {A|B|_} denotes either A or B or nothing.
- _ can appear anywhere in the list of options, i.e. {A|B|_} is equal to {A|_|B}.
- {C}[x,y] denotes an option group with a quantifier, i.e. group C appearing from x to y times inclusive.
- {C}[1,3] is the same as the {C|C C|C C C} notation.
- {C|_} is equal to {C}[0,1].
- '\' (backslash) can be used to escape the '{', '}', '|' and '_' special symbols used by option groups.

We can rewrite our transportation model element in a more efficient way using macros and option groups. Even though the actual length of the definition hasn't changed much, it now auto-generates many dozens of synonyms that we would otherwise have to write out manually:
... "macros": [ { "name": "<TRUCK_TYPE>", "macro": "{ {light|super|heavy|medium} duty|half ton|1/2 ton|3/4 ton|one ton}" } ] "elements": [ { "id": "transport.vehicle", "description": "Transportation vehicle", "synonyms": [ "car", "{<TRUCK_TYPE>|_} {pickup|_} truck" "sedan", "coupe" ] } ] ...Regular Expressions
Any individual synonym word that starts and ends with // (two forward slashes) is considered to be a Java regular expression as defined in java.util.regex.Pattern. Note that a regular expression can only span a single word, i.e. only individual words from the user input will be matched against the given regular expression, and no whitespaces are allowed within the regular expression. Note also that the option group special symbols {, }, | and _ have to be escaped in the regular expression using \ (backslash).
For example, the following synonym:
"synonyms": [ "{foo|//[bar].+//}}" ]
will match the word foo or any other single word that matches the [bar].+ regular expression, as long as that word doesn't contain whitespaces.
It's important to note that regular expressions can significantly affect the performance of NLPCraft processing if used uncontrolled. Use them with caution and test the performance of your model to ensure it meets your requirements.
Any individual synonym word that starts and ends with ^^ is an IDL expression. The IDL expression inside the ^^ ... ^^ markers allows you to define a predicate on an already parsed and detected token. It is very important to note that, unlike all other synonyms, an IDL expression operates on an already detected token, not on an individual unparsed word.
IDL expressions allow you to compose named entities, i.e. use one named entity when defining another one. For example, we could define a model element for a race car using our previous transportation example (note how the race.vehicle synonym references the transport.vehicle element defined above):
... "elements": [ { "id": "transport.vehicle", "description": "Transportation vehicle", "synonyms": [ "car", "truck", "{light|heavy|super|medium} duty {pickup|_} truck" "sedan", "coupe" ] }, { "id": "race.vehicle", "description": "Race vehicle", "synonyms": [ "{race|speed|track} ^^{# == 'transport.vehicle'}^^" ] } ] ...
Greedy NERs & Synonyms Conflicts
Note that in the above example you need to ensure that the words race, speed or track are not part of the transport.vehicle token. This is particularly important for 3rd party NERs where the specific rules about which words can or cannot be part of the token are unclear or undefined. In such cases the only remedy is to test extensively with the 3rd party NERs and verify the synonym recognition in the data probe logs.
Another use case is to wrap 3rd party named entities to add group membership, metadata or a hierarchical relationship to the externally defined named entity. For example, you can wrap the google:location token and add membership in the my_group group:
... "elements": [ { "id": "google.loc.wrap", "description": "Wrapper for google location", "groups": ["my_group"], "synonyms": [ "^^{# == 'google:location'}^^" ] } ] ...IDL Expression Syntax
IDL expressions are a subset of the overall IDL syntax. You can review the formal ANTLR4 grammar, but basically an IDL expression for a synonym is a term expression with an optional alias at the beginning. Here's an example of an IDL expression defining a synonym for the population of any city in France:
"synonyms": [ "population {of|for} ^^[city]{# == 'nlpcraft:city' && lowercase(meta_tok('city:country')) == 'france'}^^" ]NOTES:
- The alias city can be used to access the constituent part token (with ID nlpcraft:city).
- The expression inside the { and } brackets is a standard IDL term expression.

By default, the data model detects its elements by their synonyms, regular expressions or IDL expressions. However, in some cases these methods are either not expressive enough or cannot be used at all - for example, detecting model elements with a neural network or integrating a non-standard 3rd-party NER component. In such cases, a custom parser can be defined for the model, allowing the user to plug in arbitrary NER logic that detects model elements in the user input programmatically. Note that a custom parser can detect any number of model elements.
Model provides its custom parsers via getParsers() method.
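A hedged sketch of registering such a parser is shown below; MyNerParser is a hypothetical class implementing NCCustomParser with your own detection logic (e.g. a neural-network based NER), and the configuration file name is illustrative.

```java
package com.example;

import org.apache.nlpcraft.model.*;

import java.util.Collections;
import java.util.List;

public class CustomParserModel extends NCModelFileAdapter {
    public CustomParserModel() {
        super("my_model.yaml"); // Hypothetical external configuration file.
    }

    @Override
    public List<NCCustomParser> getParsers() {
        // Register a user-defined parser that programmatically detects model elements.
        // MyNerParser is a hypothetical NCCustomParser implementation.
        return Collections.singletonList(new MyNerParser());
    }
}
```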
When a user sends a request via the REST API it is received by the REST server. Upon receipt, the REST server does the basic NLP processing and enrichment. Once finished, the REST server sends the enriched request down to the specific data probe selected based on the requested data model.
The model logic is defined in intents, specifically in the intent callbacks that get called when their intent is chosen as the winning match for the user request. Below we will quickly discuss the key APIs that are essential for developing intent callbacks. Note that this does not replace the more detailed Javadoc documentation that you are encouraged to read through as well:
This interface provides a read-only view on the data model. The model view defines the declarative, or configurable, part of the model. All properties in this interface can be defined or overridden in the JSON/YAML external presentation when used with the NCModelFileAdapter adapter.
This interface defines a context of a particular intent match. It can be passed into the callback of the matched intent and provides the following:
This interface provides all available data about the parsed user input and all its supplemental information. It's accessible from the NCIntentMatch interface and provides a large amount of information to the intent callback logic:
The NCRequest interface is one of several important entities in the Data Model API that you, as a model developer, will be working with. You should review its Javadoc, but here is an outline of the information it provides:
The NCToken object is another key abstraction in the Data Model API. A token is a detected model element and is a part of the fully parsed user input. A sequence of tokens represents the parsed user input. A single token corresponds to one or more words, sequential or not, in the user sentence.
Most of the token's information is stored in map-based metadata accessible via the getMetadata() method. Depending on the token ID, each token will have a different set of metadata properties. Some common NLP properties are always present for tokens of all types.
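For example, a small sketch of reading a few of the common nlpcraft:nlp:xxx metadata properties from a token via the generic meta(...) accessor (the same accessor is used in the Alarm Clock example above):

```java
package com.example;

import org.apache.nlpcraft.model.NCToken;

final class TokenMetaUtils {
    // Reads a few common NLP metadata properties from a detected token.
    static String describe(NCToken tok) {
        String orig  = tok.meta("nlpcraft:nlp:origtext"); // Original user text.
        String lemma = tok.meta("nlpcraft:nlp:lemma");    // Canonical word form.
        Boolean stop = tok.meta("nlpcraft:nlp:stopword"); // Stopword flag.
        Integer idx  = tok.meta("nlpcraft:nlp:index");    // Token index in the sentence.

        return String.format("'%s' (lemma: %s, stopword: %s, index: %d)", orig, lemma, stop, idx);
    }
}
```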
This class defines the result returned from the model's intent callbacks. A result consists of the text body and the type. The result types are similar in notion to MIME types and have specific meaning only for REST applications that interpret them accordingly. For example, a REST client interfacing between NLPCraft and Amazon Alexa or Apple HomeKit could accept only the text result type and ignore everything else.
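A small sketch of producing results of different types is shown below; NCResult.text() is used throughout the examples above, while the html() and json() factory methods are assumed here based on the result-type discussion and should be verified against the NCResult Javadoc.

```java
package com.example;

import org.apache.nlpcraft.model.NCResult;

final class Results {
    static NCResult asText() { return NCResult.text("Lights are on."); }        // Plain text result.
    static NCResult asHtml() { return NCResult.html("<b>Lights are on.</b>"); } // Assumed HTML factory.
    static NCResult asJson() { return NCResult.json("{\"lights\": \"on\"}"); }  // Assumed JSON factory.
}
```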
The NCMetadata interface provides support for mutable, runtime-only metadata. It can be used to attach user-defined runtime data to a variety of different objects in the NLPCraft API. This interface is implemented by the following types:
NLPCraft provides a number of built-in model elements (i.e. tokens), including integration with several popular 3rd party NER frameworks. The table below provides information about these built-in tokens. The section about token metadata provides further information about the metadata that each type of token carries.
Built-in tokens have to be explicitly enabled both on the REST server and in the model. See the nlpcraft.server.tokenProviders configuration property and the NCModelView#getEnabledBuiltInTokens() method for more details. By default, only NLPCraft tokens are enabled (token IDs starting with nlpcraft).
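For example, a sketch of enabling additional built-in tokens programmatically (the model descriptor values are illustrative; the same list can be supplied via the enabledBuiltInTokens property in JSON/YAML):

```java
package com.example;

import org.apache.nlpcraft.model.*;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class GeoModel extends NCModelAdapter {
    public GeoModel() {
        super("geo.model.id", "Geo Model", "1.0"); // Illustrative model descriptor.
    }

    @Override
    public Set<String> getEnabledBuiltInTokens() {
        // Enable numeric, date and Google location tokens for this model.
        return new HashSet<>(Arrays.asList("nlpcraft:num", "nlpcraft:date", "google:location"));
    }
}
```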
Token ID | Description |
---|---|
nlpcraft:nlp | This token denotes a word (always a single word) that is not a part of any other token. It is also called a free word, i.e. a word that is not linked to any other detected model element. NOTE: the metadata from this token defines a common set of NLP properties and is present in every other token as well. |
nlpcraft:date | This token denotes a date range. It recognizes dates from 1900 up to 2023. Note that it does not currently recognize the time component. |
nlpcraft:num | This token denotes a single numeric value or numeric condition. |
nlpcraft:continent | This token denotes a geographical continent. |
nlpcraft:subcontinent | This token denotes a geographical subcontinent. |
nlpcraft:region | This token denotes a geographical region/state. |
nlpcraft:country | This token denotes a country. |
nlpcraft:city | This token denotes a city. |
nlpcraft:metro | This token denotes a metro area. |
nlpcraft:sort | This token denotes a sorting or ordering function. |
nlpcraft:limit | This token denotes a numerical limit. |
nlpcraft:coordinate | This token denotes latitude and longitude coordinates. |
nlpcraft:relation | This token denotes a relation function: compare or correlate. Note that this token always needs two other tokens that it references. |
google:xxx | These tokens denote named entities detected by Google Natural Language (e.g. google:person, google:location). See the integration section for more details on how to configure the Google named entity provider. |
opennlp:xxx | These tokens denote named entities detected by Apache OpenNLP (e.g. opennlp:person, opennlp:money). See the integration section for more details on how to configure the Apache OpenNLP named entity provider. |
spacy:xxx | These tokens denote named entities detected by spaCy (e.g. spacy:person, spacy:location). See the integration section for more details on how to configure the spaCy named entity provider. |
stanford:xxx | These tokens denote named entities detected by Stanford CoreNLP (e.g. stanford:person, stanford:location). See the integration section for more details on how to configure the Stanford CoreNLP named entity provider. |
Each token has a different set of metadata. The sections below describe the metadata for each built-in token supported by NLPCraft:
- nlpcraft:nlp
- nlpcraft:date
- nlpcraft:num
- nlpcraft:city
- nlpcraft:continent
- nlpcraft:subcontinent
- nlpcraft:region
- nlpcraft:country
- nlpcraft:metro
- nlpcraft:coordinate
- nlpcraft:sort
- nlpcraft:limit
- nlpcraft:relation
- stanford:xxx
- spacy:xxx
- google:xxx
- opennlp:xxx
Metadata Name Conflicts
Note that model element metadata gets merged into the same map container as the common NLP token metadata (see the nlpcraft:nlp:xxx properties below). In other words, they share the same namespace. It is important to remember that and choose unique names for user-defined metadata properties. One possible approach, used by NLPCraft internally, is to prefix the metadata name with a unique prefix based on the token ID.
nlpcraft:nlp
This token's metadata provides the common basic NLP properties that are part of any token. All tokens without exception have these metadata properties; they represent a common set of NLP properties for a given token. All these metadata properties are mandatory. Note also that the NCToken interface provides direct access to most of these properties.
Property | Java Type | Description |
---|---|---|
nlpcraft:nlp:unid | java.lang.String | Internal globally unique system ID of the token. |
nlpcraft:nlp:bracketed | java.lang.Boolean | Whether or not this token is surrounded by any of '[' , ']' , '{' , '}' , '(' , ')' brackets. |
nlpcraft:nlp:freeword | java.lang.Boolean | Whether or not this token represents a free word. A free word is a token that was detected neither as a part of a user-defined token nor as a part of a system token. |
nlpcraft:nlp:direct | java.lang.Boolean | Whether or not this token was matched on direct (not permutated) synonym. |
nlpcraft:nlp:english | java.lang.Boolean | Whether this token represents an English word. Note that this only checks that token's text consists of characters of English alphabet, i.e. the text doesn't have to be necessary a known valid English word. See NCModelView.isNonEnglishAllowed() method for corresponding model configuration. |
nlpcraft:nlp:lemma | java.lang.String | Lemma of this token, i.e. a canonical form of this word. Note that stemming and lemmatization allow to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. Lemmatization refers to the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma. Learn more at https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html |
nlpcraft:nlp:stem | java.lang.String | Stem of this token. Note that stemming and lemmatization allow to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. Unlike lemma, stemming is a basic heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Learn more at https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html |
nlpcraft:nlp:pos | java.lang.String | Penn Treebank POS tag for this token. Note that additionally to standard Penn Treebank POS tags NLPCraft introduced '---' synthetic tag to indicate a POS tag for multiword tokens. Learn more at http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html |
nlpcraft:nlp:posdesc | java.lang.String | Description of Penn Treebank POS tag. Learn more at http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html |
nlpcraft:nlp:swear | java.lang.Boolean | Whether or not this token is a swear word. NLPCraft has built-in list of common English swear words. See NCModelView.isSwearWordsAllowed() for corresponding model configuration |
nlpcraft:nlp:origtext | java.lang.String | Original user input text for this token. |
nlpcraft:nlp:normtext | java.lang.String | Normalized user input text for this token. |
nlpcraft:nlp:sparsity | java.lang.Integer | Numeric value of how sparse the token is. Sparsity zero means that all individual words in the token follow each other. |
nlpcraft:nlp:minindex | java.lang.Integer | Index of the first word in this token. Note that token may not be contiguous. |
nlpcraft:nlp:maxindex | java.lang.Integer | Index of the last word in this token. Note that token may not be contiguous. |
nlpcraft:nlp:wordindexes | java.util.List<Integer> | List of original word indexes in this token. Note that a token can have words that are not contiguous in the original sentence. Always has at least one element in it. |
nlpcraft:nlp:wordlength | java.lang.Integer | Number of individual words in this token. Equal to the size of wordindexes list. |
nlpcraft:nlp:contiguous | java.lang.Boolean | Whether or not this token has zero sparsity, i.e. consists of contiguous words. |
nlpcraft:nlp:start | java.lang.Integer | Start character index of this token. |
nlpcraft:nlp:end | java.lang.Integer | End character index of this token. |
nlpcraft:nlp:index | java.lang.Integer | Index of this token in the sentence. |
nlpcraft:nlp:charlength | java.lang.Integer | Character length of this token. |
nlpcraft:nlp:quoted | java.lang.Boolean | Whether or not this token is surrounded by single or double quotes. |
nlpcraft:nlp:stopword | java.lang.Boolean | Whether or not this token is a stopword. Stopwords are some extremely common words which add little value in helping understanding user input and are excluded from the processing entirely. For example, words like a, the, can, of, about, over, etc. are typical stopwords in English. NLPCraft has built-in set of stopwords. |
nlpcraft:nlp:dict | java.lang.Boolean | Whether or not this token is found in Princeton WordNet database. |
nlpcraft:date
This token denotes a date range, including single days. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory.
Property | Java Type | Description |
---|---|---|
nlpcraft:date:from | java.lang.Long | Start timestamp of the datetime range. |
nlpcraft:date:to | java.lang.Long | End timestamp of the datetime range. |
nlpcraft:num
This token denotes a single numerical value or a numeric condition. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
nlpcraft:num:from | java.lang.Double | Start of the numeric range that satisfies the condition. Note that if from and to are the same, this token represents a single value (whole or fractional), in which case isequalcondition will be true. |
nlpcraft:num:to | java.lang.Double | End of the numeric range that satisfies the condition. Note that if from and to are the same, this token represents a single value (whole or fractional), in which case isequalcondition will be true. |
nlpcraft:num:fromincl | java.lang.Boolean | Whether or not the start of the numeric range is inclusive. |
nlpcraft:num:toincl | java.lang.Boolean | Whether or not the end of the numeric range is inclusive. |
nlpcraft:num:isequalcondition | java.lang.Boolean | Whether this is an equality condition. Note that single numeric values also default to equality condition and this property will be true . Indeed, A is equal to 2 and A is 2 have the same meaning. |
nlpcraft:num:isnotequalcondition | java.lang.Boolean | Whether this is a not-equality condition. |
nlpcraft:num:isfromnegativeinfinity | java.lang.Boolean | Whether this range is from negative infinity. |
nlpcraft:num:israngecondition | java.lang.Boolean | Whether this is a range condition. |
nlpcraft:num:istopositiveinfinity | java.lang.Boolean | Whether this range is to positive infinity. |
nlpcraft:num:isfractional | java.lang.Boolean | Whether this token's value (single numeric value of a range) is a whole or a fractional number. |
nlpcraft:num:unit opt. | java.lang.String | Optional numeric value unit name (see below). |
nlpcraft:num:unittype opt. | java.lang.String | Optional numeric value unit type (see below). |
The following table provides possible values for the nlpcraft:num:unit and nlpcraft:num:unittype properties:
num:unittype | num:unit possible values |
---|---|
mass | feet per second grams kilogram grain dram ounce pound hundredweight ton tonne slug |
torque | newton meter |
area | square meter acre are hectare square inches square feet square yards square miles |
paper quantity | paper bale |
force | kilopond pond |
pressure | pounds per square inch |
solid angle | steradian |
pressure stress | pascal |
luminous flux | lumen |
amount of substance | mole |
luminance | candela per square metre |
angle | radian degree |
magnetic flux density magnetic field | tesla |
power radiant flux | watt |
datetime | second minute hour day week month year |
electrical inductance | henry |
electric charge | coulomb |
temperature | kelvin centigrade fahrenheit |
voltage electrical | volt |
momentum | kilogram meters per second |
amount of heat | calorie |
electrical capacitance | farad |
radioactive decay | becquerel |
electrical conductance | siemens |
luminous intensity | candela |
work energy | joule |
quantities | dozen |
density | density |
sound | decibel |
electrical resistance impedance | ohm |
force weight | newton |
light quantity | lumen seconds |
length | meter millimeter centimeter decimeter kilometer astronomical unit light year parsec inch foot yard mile nautical mile |
refractive index | diopter |
frequency | hertz angular frequency |
power | kilowatt horsepower bar |
magnetic flux | weber |
current | ampere |
acceleration of gravity | gravity imperial gravity metric |
volume | cubic meter liter milliliter centiliter deciliter hectoliter cubic inch cubic foot cubic yard acre-foot teaspoon tablespoon fluid ounce cup gill pint quart gallon |
speed | miles per hour meters per second |
illuminance | lux |
nlpcraft:city
This token denotes a city. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
nlpcraft:city:city | java.lang.String | Name of the city. |
nlpcraft:city:continent | java.lang.String | Continent name. |
nlpcraft:city:subcontinent | java.lang.String | Subcontinent name. |
nlpcraft:city:countrymeta | java.util.Map | Supplemental metadata for city's country (see below). |
nlpcraft:city:citymeta | java.util.Map | Supplemental metadata for city (see below). |
The following table provides possible values for the nlpcraft:city:countrymeta map. The data is obtained from The United Nations Statistics Division datasets:
Key | Java Type | Description |
---|---|---|
iso | java.lang.String | ISO country code. |
iso3 | java.lang.String | ISO 3166 country code. |
isocode | java.lang.String | ISO country code. |
capital opt. | java.lang.String | Optional country capital city name. |
area opt. | java.lang.Double | Optional country surface area. |
population opt. | java.lang.Long | Optional country population. |
continent | java.lang.String | Optional country continent. |
currencycode | java.lang.String | Country currency code. |
currencyname | java.lang.String | Country currency name. |
phone opt. | java.lang.String | Optional country phone code. |
postalcodeformat opt. | java.lang.String | Optional country postal code format. |
postalcoderegex opt. | java.lang.String | Optional country postal code regular expression. |
languages opt. | java.lang.String | Optional country list of languages. |
neighbours opt. | java.lang.String | Optional country list of neighbours. |
The following table provides possible values for the nlpcraft:city:citymeta map. The data is obtained from The United Nations Statistics Division datasets:
Key | Java Type | Description |
---|---|---|
latitude | java.lang.Double | City latitude. |
longitude | java.lang.Double | City longitude. |
population | java.lang.Long | City population. |
elevation opt. | java.lang.Integer | Optional city elevation in meters. |
timezone | java.lang.String | City timezone. |
nlpcraft:continent
This token denotes a continent. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
nlpcraft:continent:continent | java.lang.String | Name of the continent. |
nlpcraft:subcontinent
This token denotes a subcontinent. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
nlpcraft:subcontinent:continent | java.lang.String | Name of the continent. |
nlpcraft:subcontinent:subcontinent | java.lang.String | Name of the subcontinent. |
nlpcraft:metro
This token denotes a metro area. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
nlpcraft:metro:metro | java.lang.String | Name of the metro area. |
nlpcraft:region
This token denotes a geographical region. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
nlpcraft:region:region | java.lang.String | Name of the region. |
nlpcraft:region:continent | java.lang.String | Continent name. |
nlpcraft:region:subcontinent | java.lang.String | Subcontinent name. |
nlpcraft:region:countrymeta | java.util.Map | Supplemental metadata for region's country (see below). |
The following table provides possible values for the nlpcraft:region:countrymeta map. The data is obtained from The United Nations Statistics Division datasets:
Key | Java Type | Description |
---|---|---|
iso | java.lang.String | ISO country code. |
iso3 | java.lang.String | ISO 3166 country code. |
isocode | java.lang.String | ISO country code. |
capital opt. | java.lang.String | Optional country capital city name. |
area opt. | java.lang.Double | Optional country surface area. |
population opt. | java.lang.Long | Optional country population. |
continent | java.lang.String | Optional country continent. |
currencycode | java.lang.String | Country currency code. |
currencyname | java.lang.String | Country currency name. |
phone opt. | java.lang.String | Optional country phone code. |
postalcodeformat opt. | java.lang.String | Optional country postal code format. |
postalcoderegex opt. | java.lang.String | Optional country postal code regular expression. |
languages opt. | java.lang.String | Optional country list of languages. |
neighbours opt. | java.lang.String | Optional country list of neighbours. |
nlpcraft:country
This token denotes a country. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
nlpcraft:country:country | java.lang.String | Name of the country. |
nlpcraft:country:continent | java.lang.String | Continent name. |
nlpcraft:country:subcontinent | java.lang.String | Subcontinent name. |
nlpcraft:country:countrymeta | java.util.Map | Supplemental metadata for the country (see below). |
The following table provides possible values for the nlpcraft:country:countrymeta map. The data is obtained from The United Nations Statistics Division datasets:
Key | Java Type | Description |
---|---|---|
iso | java.lang.String | ISO country code. |
iso3 | java.lang.String | ISO 3166 country code. |
isocode | java.lang.String | ISO country code. |
capital opt. | java.lang.String | Optional country capital city name. |
area opt. | java.lang.Double | Optional country surface area. |
population opt. | java.lang.Long | Optional country population. |
continent | java.lang.String | Optional country continent. |
currencycode | java.lang.String | Country currency code. |
currencyname | java.lang.String | Country currency name. |
phone opt. | java.lang.String | Optional country phone code. |
postalcodeformat opt. | java.lang.String | Optional country postal code format. |
postalcoderegex opt. | java.lang.String | Optional country postal code regular expression. |
languages opt. | java.lang.String | Optional country list of languages. |
neighbours opt. | java.lang.String | Optional country list of neighbours. |
nlpcraft:coordinate
This token denotes a latitude and longitude coordinate. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
coordinate:latitude | java.lang.Double | Coordinate latitude. |
coordinate:longitude | java.lang.Double | Coordinate longitude. |
nlpcraft:sort
This token denotes a sorting or ordering function. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
nlpcraft:sort:subjindexes | java.util.List<Integer> | One or more indexes of the target tokens (i.e. the tokens being sorted). |
nlpcraft:sort:byindexes | java.util.List<Integer> | Zero or more (i.e. optional) indexes of the reference tokens (i.e. the tokens being sorted by). |
nlpcraft:sort:asc | java.lang.Boolean | Whether sorting is in ascending or descending order. |
nlpcraft:limit
This token denotes a numeric limit value (like in "top 10" or "bottom five"). In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
nlpcraft:limit:indexes | java.util.List<Integer> | Index (always only one) of the reference token (i.e. the token being limited). |
nlpcraft:limit:asc | java.lang.Boolean | Whether limit order is ascending or descending. |
nlpcraft:limit:limit | java.lang.Integer | Numeric value of the limit. |
nlpcraft:relation
This token denotes a relation function (compare or correlate) between other tokens. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
nlpcraft:relation:indexes | java.util.List<Integer> | Index (always only one) of the reference token (i.e. the token being related to). |
nlpcraft:relation:type | java.lang.String | Type of the relation: compare or correlate. |
google:xxx
These tokens denote named entities detected by Google Natural Language, where xxx is the lower case name of the named entity in the Google APIs, i.e. google:person, google:location, etc. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
google:salience | java.lang.Double | Correctness probability of this token by Google Natural Language. |
google:meta | java.util.Map | Map-based container for Google Natural Language specific properties. |
google:mentionsbeginoffsets | java.util.List<String> | List of the mention begin offsets in the original normalized text. |
google:mentionscontents | java.util.List<String> | List of the mentions. |
google:mentionstypes | java.util.List<String> | List of the mention types. |
stanford:xxx
These tokens denote named entities detected by Stanford CoreNLP, where xxx is the lower case name of the named entity in Stanford CoreNLP, i.e. stanford:person, stanford:location, etc. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
stanford:confidence | java.lang.Double | Correctness probability of this token by Stanford CoreNLP. |
stanford:nne | java.lang.String | Normalized Named Entity (NNE) text. |
spacy:xxx
These tokens denote named entities detected by spaCy, where xxx is the lower case name of the named entity in spaCy, i.e. spacy:person, spacy:location, etc. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
spacy:vector | java.lang.Double | spaCy span vector. |
spacy:sentiment | java.lang.Double | A scalar value indicating the positivity or negativity of the token. |
opennlp:xxx
These tokens denote named entities detected by Apache OpenNLP, where xxx is the lower case name of the named entity in Apache OpenNLP, i.e. opennlp:person, opennlp:money, etc. In addition to the nlpcraft:nlp:xxx properties, this type of token will have the following metadata properties, all of which are mandatory unless otherwise noted.
Property | Java Type | Description |
---|---|---|
opennlp:probability | java.lang.Double | Correctness probability of this token by OpenNLP. |