Smart Search using Elasticsearch
Features
- Weightage on specific fields
- Multi-field query
- Synonym & Acronym implementation
- Exact search, Autocomplete, Typo (did-you-mean)
Implementation
- Settings
- Mapping
- Query
Settings
- Filters
- Tokenizer & Analyzer
Filters
Elasticsearch provides many token filters. We will use:
- the Synonym Token Filter for the synonym & acronym features.
- the Ngram Token Filter for the autocomplete feature.
- the Lowercase Token Filter to make queries case-insensitive.
PUT /synonym_test
{
  "settings": {
    "index": {
      "max_ngram_diff": 99,
      "analysis": {
        "analyzer": {
          "synonym_analyzer": {
            "type": "custom",
            "tokenizer": "whitespace",
            "filter": [
              "lowercase",
              "synonym"
            ]
          },
          "autocomplete_analyzer": {
            "type": "custom",
            "tokenizer": "whitespace",
            "filter": [
              "lowercase",
              "autocomplete_filter"
            ]
          }
        },
        "filter": {
          "synonym": {
            "type": "synonym",
            "synonyms": [
              "unlimited, endlessness => infinity",
              "chaos, conflict, bloodshed => war"
            ]
          },
          "autocomplete_filter": {
            "type": "ngram",
            "min_gram": 1,
            "max_gram": 20
          }
        }
      }
    }
  }
}
Synonym Filter
- Give a name to the filter. Here we use "synonym".
- Set the filter "type" : "synonym" .
- Set the synonym/acronym words and the standard keyword. For example, 'unlimited' and 'endlessness' is the words, 'infinity' is the standard keyword.
- We can put multiple line of synonym/acronym words.
-
If we have a lot of synonym, we can use
"synonyms_path" : "synonym.txt"
Ngram & Lowercase Filter
- Give a name to the filter. Here we use "autocomplete_filter".
- Set the filter "type" : "ngram" .
- Set "min_gram": 1 and "max_gram": 20 .
- By default the difference between min_gram and max_gram should not more than 1. To overcome this we use "max_gram_diff": 99 .
- Lowercase filter is built in function in elasticsearch. We can implement it directly into analyzer.
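Once the index above exists, the _analyze API shows what autocomplete_analyzer emits:
GET /synonym_test/_analyze
{
  "analyzer": "autocomplete_analyzer",
  "text": "thor"
}
This returns every 1- to 4-character substring of "thor" (t, th, tho, thor, h, ho, ...), which is what lets a short query like "tho" match the indexed term later.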
Tokenizer/Analyzer
- Elasticsearch has many tokenizers and analyzers. We will set the tokenizer inside the analyzer.
- We will use the Whitespace Tokenizer, which breaks text into terms whenever it encounters a whitespace character.
- Then, we will define custom analyzers to combine several filters.
- We will build two separate analyzers: autocomplete_analyzer at index time and synonym_analyzer at search time. The ngram filter should run only when indexing; applying it to the search query as well would break the query into fragments and match far too much.
- Give a name to the analyzer. Here we use "synonym_analyzer".
- Set "tokenizer": "whitespace".
- Then, we set "filter": ["lowercase", "synonym"] for synonym_analyzer.
- Repeat the same steps for autocomplete_analyzer, using "filter": ["lowercase", "autocomplete_filter"]. The search-time behaviour can be verified as shown below.
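As a quick check, the _analyze API confirms that synonym_analyzer rewrites synonyms into the standard keyword:
GET /synonym_test/_analyze
{
  "analyzer": "synonym_analyzer",
  "text": "chaos ENDLESSNESS"
}
The lowercase filter runs first, then the synonym filter, so the output tokens are war and infinity.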
Mapping
- Field and Data Type
- Analyzer and search analyzer.
Field & Datatype
- Set the field name as "movie_name".
- Set the data type with "type": "text". Analyzers can only be applied to text fields; the keyword data type does not support them.
- To keep exact values available as well, we add a keyword sub-field (a multi-field) to each field: "fields": { "keyword": { "type": "keyword" } }
- Repeat the step for as many fields as we want to add.
PUT /synonym_test/_mapping/doc
{
  "properties": {
    "movie_name": {
      "type": "text",
      "fields": {
        "keyword": {
          "type": "keyword"
        }
      },
      "analyzer": "autocomplete_analyzer",
      "search_analyzer": "synonym_analyzer"
    },
    "year": {
      "type": "text",
      "fields": {
        "keyword": {
          "type": "keyword"
        }
      },
      "analyzer": "autocomplete_analyzer",
      "search_analyzer": "synonym_analyzer"
    },
    "subtitle": {
      "type": "text",
      "fields": {
        "keyword": {
          "type": "keyword"
        }
      },
      "analyzer": "autocomplete_analyzer",
      "search_analyzer": "synonym_analyzer"
    },
    "weight": {
      "type": "integer",
      "fields": {
        "keyword": {
          "type": "keyword"
        }
      }
    }
  }
}
Analyzer in Mapping
- After setting the data type for each field, we attach the analyzers we already built.
- For the index-time (default) analyzer we use "analyzer": "autocomplete_analyzer".
- For the search analyzer we set "search_analyzer": "synonym_analyzer".
- We use autocomplete_analyzer at index time because we want the stored data tokenized into ngrams first; the synonym/acronym matching then happens at search time through the search_analyzer. We can confirm which analyzer a field uses, as shown below.
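Once the mapping is applied, the _analyze API can also be pointed at a field, which runs that field's index-time analyzer (autocomplete_analyzer here):
GET /synonym_test/_analyze
{
  "field": "movie_name",
  "text": "marv"
}
The response lists the ngrams m, ma, mar, marv, a, ar, arv, r, rv, v.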
Indexing
POST /synonym_test/doc
{
  "movie_name": "marvels INFINITY WAR",
  "year": "2018",
  "subtitle": "english",
  "weight": 5
}
POST /synonym_test/doc
{
  "movie_name": "marvels thor",
  "year": "2014",
  "subtitle": "malay",
  "weight": 1
}
POST /synonym_test/doc
{
  "movie_name": "marvels the avengers",
  "year": "2016",
  "subtitle": "chinese",
  "weight": 3
}
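A quick match_all search verifies that all three documents are indexed and searchable:
GET /synonym_test/_search
{
  "query": {
    "match_all": {}
  }
}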
Query
- Autocomplete
- Synonym
- Weightage
- Exact search
- Typo
Autocomplete & Synonym
- For the autocomplete query, we will use the multi-match query.
- The multi-match query allows us to query multiple fields at once.
- We can also use a synonym/acronym in our query. For example, the query "bloodshed ENDLESSNESS" returns "marvels INFINITY WAR".
- Mixed case still works because we already set the "lowercase" filter in our analyzers.
GET /synonym_test/_search
{
  "query": {
    "multi_match": {
      "query": "marv",
      "fields": [ "movie_name", "year", "subtitle" ]
    }
  }
}
GET /synonym_test/_search
{
  "query": {
    "multi_match": {
      "query": "bloodshed ENDLESSNESS",
      "fields": [ "movie_name", "year", "subtitle" ]
    }
  }
}
Weightage
- We can implement weightage in our query using Elasticsearch's Boosting by Popularity feature.
- We put the "multi_match" query inside a "function_score" object.
- Then, inside "field_value_factor", we set "field": "weight".
- We can also use any field other than "weight" that we set up during the mapping, as long as it holds a numeric data type.
GET /synonym_test/_search
{
  "query": {
    "function_score": {
      "query": {
        "multi_match": {
          "query": "inf",
          "fields": [ "movie_name", "year", "subtitle" ]
        }
      },
      "field_value_factor": {
        "field": "weight",
        "modifier": "log1p"
      }
    }
  }
}
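The log1p modifier takes the common logarithm of (1 + field value), which damps the effect of large weights. For the "marvels INFINITY WAR" document with weight 5, the factor is log10(1 + 5) ≈ 0.78, and since function_score multiplies the query score by the function result by default, the final score is the multi_match score times 0.78.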
Exach Search & Typo
- To implement exact search, we can add "type": "phrase" in the multi-match query.
- It will search all the included fields for the exact query. It will return result if any one of the fields exactly match with the query.
- We can set "fuzziness": "auto" if we want to use typo/did-you-mean feature.
GET /synonym_test/_search
{
  "query": {
    "multi_match": {
      "query": "marv",
      "type": "phrase",
      "fields": [ "movie_name", "year", "subtitle" ]
    }
  }
}
GET /synonym_test/_search
{
  "query": {
    "multi_match": {
      "query": "marv",
      "fields": [ "movie_name", "year", "subtitle" ],
      "fuzziness": "auto"
    }
  }
}
Best Practice Autocomplete
1. Index name: city_weight
2. Settings:
- filter: lowercase, synonym, autocomplete_filter.
- analyzer: synonym_analyzer, autocomplete_analyzer.
3. Mapping:
- multi-field: text with a keyword sub-field.
- analyzer: autocomplete_analyzer
- search_analyzer: synonym_analyzer
{
  "template": "city_weight",
  "order": 1,
  "settings": {
    "index": {
      "max_ngram_diff": 99,
      "analysis": {
        "analyzer": {
          "synonym_analyzer": {
            "type": "custom",
            "tokenizer": "whitespace",
            "filter": [
              "lowercase",
              "synonym"
            ]
          },
          "autocomplete_analyzer": {
            "type": "custom",
            "tokenizer": "whitespace",
            "filter": [
              "lowercase",
              "autocomplete_filter"
            ]
          }
        },
        "filter": {
          "synonym": {
            "type": "synonym",
            "synonyms": [
              "AYER => AIR",
              "ALUR => ALOR",
              "AMPG => AMPANG",
              "EMPAT => AMPAT",
              "HANTU => ANTU",
              "ASHN => ASAHAN",
              "ATS => ATAS",
              "AIR => AYER",
              "BGN => BAGAN",
              "BARU, BAHARU, BHARU => BAHRU",
              "BLK, BALEK => BALIK",
              "BDR, B., BNDR => BANDAR"
            ]
          },
          "autocomplete_filter": {
            "type": "ngram",
            "min_gram": 1,
            "max_gram": 20
          }
        }
      }
    }
  },
  "mappings": {
    "doc": {
      "properties": {
        "city_name": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          },
          "analyzer": "autocomplete_analyzer",
          "search_analyzer": "synonym_analyzer"
        },
        "state": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          },
          "analyzer": "autocomplete_analyzer",
          "search_analyzer": "synonym_analyzer"
        },
        "filter": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          },
          "analyzer": "autocomplete_analyzer",
          "search_analyzer": "synonym_analyzer"
        },
        "weight": {
          "type": "long",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          }
        }
      }
    }
  }
}
4. Query:
- weightage: function_score, field_value_factor
- exact match: "type": "phrase"
- multiple fields: multi_match query
GET /city_weight/_search
{
  "query": {
    "function_score": {
      "query": {
        "multi_match": {
          "query": "kuala",
          "fields": [ "city_name", "state", "filter" ]
        }
      },
      "field_value_factor": {
        "field": "weight"
      },
      "boost_mode": "max"
    }
  }
}
Best Practice Did-you-mean
1. Query:
- multiple fields: multi_match query
- fuzziness: auto
GET /city_weight/_search
{
  "query": {
    "multi_match": {
      "query": "pahag",
      "fields": [ "city_name", "state", "filter" ],
      "fuzziness": "auto"
    }
  }
}
Best Practice NodeJS
const elasticsearch = require('elasticsearch');

// Connect to the Elasticsearch cluster.
const client = new elasticsearch.Client({
  host: '103.245.90.189:3002',
});

const index = 'city_weight';

const simpleQuery = async () => {
  try {
    // Weighted exact-phrase search across several fields.
    const response = await client.search({
      index: index,
      body: {
        query: {
          function_score: {
            query: {
              multi_match: {
                query: 'pahang',
                type: 'phrase',
                fields: ['city_name', 'state', 'filter'],
              },
            },
            field_value_factor: {
              field: 'weight',
            },
          },
        },
      },
    });
    console.dir(response, { depth: null, colors: true });
  } catch (error) {
    console.log(error.message);
  }
};

simpleQuery();
By Muhammad Izzuddin Abdul Mutalib