Elasticsearch: Query Basics

Han Yi

April 30, 2018

RESTful API of Search

  • Basic Search
    • URI based search
    • Request body based search
// URI based search: the query is passed in the q parameter
GET products/_doc/_search?q=rating:5

// Request body based search: the query is expressed as a JSON body
POST products/_doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "score_mode": "max",
      "query": {
        "simple_query_string": {
          "query": "red hat",
          "fields": [
            "child.color^2.0",
            "child.name^1.0"
          ]
        }
      }
    }
  }
}

Filter and Query Context

  • Filter Context
    • Does this document match this query clause?
    • case 1: bool filter/must_not
    • case 2: constant_score filter
    • case 3: aggregation filter
    • case 4: score functions
  • Query Context
    • How well does this document match this query clause?

Filter Context Examples

//1. filter inside a bool query: the "filter" clause sits next to the "must" clause
POST context/_doc/_search
{
  "query": {
    "bool": {
      "must": { "match": { "name": "red hat" } },
      "filter": { "term": { "context": "generic" } }
    }
  }
}

//2. constant_score: wraps a single term filter on "status"
POST products/_doc/_search
{
  "query": {
    "constant_score": {
      "filter": { "term": { "status": "active" } }
    }
  }
}

//3. aggregation filter: a filter aggregation inside a nested aggregation on "child"
POST products/_doc/_search
{
  "aggs": {
    "nested_aggs": {
      "nested": { "path": "child" },
      "aggs": {
        "filtered_aggs": {
          "filter": {
            "bool": {
              "must": [
                { "term": { "child.color": "red" } }
              ]
            }
          },
          "aggs": {
            "lvl": {
              "terms": {
                "field": "child.lvl",
                "order": { "count": "desc" }
              },
              "aggs": {
                "count": { "reverse_nested": {} }
              }
            }
          }
        }
      }
    }
  }
}

//4. Score functions: every entry in "functions" carries its own filter
POST products/_doc/_search
{
  "query": {
    "function_score": {
      "query": { "match_all": {} },
      "boost": "5",
      "functions": [
        {
          "filter": { "match": { "test": "bar" } },
          "random_score": {},
          "weight": 23
        },
        {
          "filter": { "match": { "test": "cat" } },
          "weight": 42
        }
      ],
      "max_boost": 42,
      "score_mode": "max",
      "boost_mode": "multiply",
      "min_score": 42
    }
  }
}

Pros and Cons of Filter Context 

  • Performance
  • Caching
    • Frequently used filters will be cached
    • Cache is stored at the node level
    • Defaults to 10% of the heap
    • Can be configured by indices.queries.cache.size: 10%
  • No relevancy

Basic Query Patterns

  • Term Level Query
    • term(s), range, exists, prefix, wildcard, regexp, fuzzy, type, ids
  • Full Text Query
    • (multi) match (phrase), common terms, (simple) query string
  • Compound Query
    • constant score, bool, dis max, function score, boosting

Term Level Query

//1. term vs terms query
// term: a single value for the "triggers" field
POST expmgrrules/doc/_search
{
  "query": {
    "term": { "triggers": "26972" }
  }
}

// terms: a list of values for the "triggers" field
POST expmgrrules/doc/_search
{
  "query": {
    "terms": { "triggers": ["26972", "20676"] }
  }
}

//2. range query on a field of the nested "child" documents
POST products/doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "range": {
          "child.price": { "lte": 5 }
        }
      }
    }
  }
}

//3. range with date query
// The nested path must name the nested object ("child", as in the other
// examples), and fields inside a nested query are referenced by full path.
POST products/doc/_search
{
  "from": 0,
  "size": 24,
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "range": {
          "child.releaseDate": {
            "gt": "now-1h"
          }
        }
      }
    }
  }
}

//4. exists query
// Fields inside a nested query are referenced by their full path.
POST products/doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "exists": {
          "field": "child.special"
        }
      }
    }
  }
}

//5. wildcard/regex query
// * matches any sequence of characters (including none), ? matches exactly one character
// Fields inside a nested query are referenced by their full path.
POST products/doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "wildcard": {
          "child.name": "boys*"
        }
      }
    }
  }
}

Term Level Query vs Full-Text Query

  • Term vs Match
    • Term-level queries can be used on number, boolean, date, and text fields, but they do not analyze the input according to the field mapping; they only match exact low-level terms in the inverted index
    • Match applies the field's mapping (analysis) to the query input, just as it was applied to the document field, and then builds a compound term-level query to look up the inverted index
    • By default, match combines the generated term queries with "should"; use {"operator": "and"} to combine them with "must" instead

Full-Text Query: Match

//1. basic match on a field of the nested "child" documents
POST products/_doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "match": { "child.name": "men hat" }
      }
    }
  }
}

//2. complete match: the operator is set to AND
POST products/_doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "match": {
          "child.name": { "query": "men hat", "operator": "AND" }
        }
      }
    }
  }
}

//3. multi match: one query text, several boosted fields
POST products/_doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "multi_match": {
          "query": "red hat",
          "fields": [
            "child.name^1.0",
            "child.color^2.0"
          ]
        }
      }
    }
  }
}

//4. match phrase with a slop of 2
POST products/doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "match_phrase": {
          "child.name": { "query": "men hat", "slop": 2 }
        }
      }
    }
  }
}

Full-Text Query: (Simple) Query String

//1. query string: the query text uses the query_string syntax (OR, grouping)
POST products/_doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "score_mode": "max",
      "query": {
        "query_string": {
          "query": "(red hat) OR (blue hat)",
          "fields": [
            "child.color^2.0",
            "child.name^1.0"
          ]
        }
      }
    }
  }
}

//2. simple query string
// simple_query_string has no OR keyword; "|" is its OR operator
// (a literal "OR" would be searched as an ordinary term).
POST products/_doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "score_mode": "max",
      "query": {
        "simple_query_string": {
          "query": "(red hat) | (blue hat)",
          "fields": [
            "child.color^2.0",
            "child.name^1.0"
          ]
        }
      }
    }
  }
}

Full-Text Query: (Simple) Query String

  • Query String Syntax
    • Field names: "rating:(4 OR 5)"
    • Wildcards: "g*l ha?"
    • Regular expression: "name:/red.*hat/"
    • Fuzziness: "synoyms~1"
    • Proximity searches: "red hat~5"
    • Ranges: "rating:[3 TO 5]"
    • Boosting
      • "color^2.0" or "red^2.0 hat"
    • Boolean operators: "+red hat -shirt"
    • Grouping: "(red OR blue) hat"

Full-Text Query: (Simple) Query String

  • Query String vs Simple Query String
    • The query_string query parses the input and splits text around operators
    • Each textual part is analyzed independently of each other
    • Query string contains a lot of reserved characters, which could lead to a syntax error that prevents the query from running
    • Unlike the regular query_string query, the Simple query string query will never throw an exception and just discards invalid parts of the query

Compound Query

  • bool query structure
    • must and should run in query context and contribute to the score
    • filter (must match) and must_not (must not match) run in filter context, without scoring
{
  "query": {
    "bool": {
      "must": [],
      "must_not": [],
      "should": [],
      "filter": []
    }
  }
}

Manual Intervention Scoring

  • Multi match / Query string
  • Function score query
  • Compound query boost
  • Rescore
  • Debugging score calculation
  • Pros and Cons of Query Context search

Function Score Query

  • Modify the score of documents that are retrieved by a query
  • Score function works on a filtered set of documents
// function_score with a single random_score function, combined via boost_mode "multiply"
GET products/_doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "function_score": {
          "query": { "match": { "child.color": "red" } },
          "boost": "1",
          "random_score": {},
          "boost_mode": "multiply"
        }
      }
    }
  }
}

Function Score Query

  • Score calculation with multiple filter context query
// function_score with two weighted functions, each with its own filter
GET products/_doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "function_score": {
          "query": { "match": { "child.color": "red" } },
          "functions": [
            {
              "filter": { "match": { "child.lvl1": "men" } },
              "weight": 2
            },
            {
              "filter": { "match": { "child.lvl1": "women" } },
              "weight": 5
            }
          ],
          "boost": "1",
          "boost_mode": "multiply"
        }
      }
    }
  }
}

Function Score Query

  • Score calculation based on script score function
    • boost_mode: multiply, replace, sum, avg, max, min
// function_score with a script_score that reads child.price; boost_mode is "sum"
GET products/_doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "function_score": {
          "query": { "match": { "child.color": "red" } },
          "script_score": {
            "script": { "source": "doc['child.price'].value" }
          },
          "boost": 0.1,
          "boost_mode": "sum"
        }
      }
    }
  }
}

Compound Query Boost

  • Score calculation based on script score function
    • boost_mode: multiply, replace, sum, avg, max, min
// bool query whose "should" range clauses carry different boost values
GET products/_doc/_search
{
  "query": {
    "bool": {
      "must": { "term": { "name": "jacket" } },
      "should": [
        { "range": { "price": { "lte": 20, "boost": 0.5 } } },
        { "range": { "price": { "lte": 15, "boost": 2 } } }
      ]
    }
  }
}

Rescore

  • Rescore only the top N documents to improve precision while keeping the cost low; window_size is the number of top documents taken from each shard
// The first pass matches everything; the rescore query then re-scores the
// top window_size hits with a script.
// The nested query takes "path" (not "type"), the script body goes under
// "source" (not the deprecated "inline"), and nested fields use their full path.
POST products/doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "match_all": {}
      }
    }
  },
  "rescore": {
    "window_size": 500,
    "query": {
      "rescore_query": {
        "nested": {
          "path": "child",
          "query": {
            "function_score": {
              "script_score": {
                "script": {
                  "source": "doc['child.price'].value"
                }
              }
            }
          }
        }
      }
    }
  }
}

Debugging relevancy calculation

  • Explain scoring for specific document
  • Explain scoring for query
// Explain endpoint for one specific document; {_id} is a placeholder for the document id
GET /products/_doc/{_id}/_explain

// Search request with the "explain" flag set to true
POST products/_doc/_search
{
  "explain": true,
  "query": {
    //......
  }
}
  • Supports manually influencing the order of search results
  • Caching on shard
    • The results of the entire query are cached here
    • Only hit counts, aggregations, and suggestions are cached
    • Results are only cached when size is 0, i.e. when no hits/documents are returned
    • Query json will be used as cache key
    • Defaults to 1% of the heap
    • Can be configured by indices.requests.cache.size: 1%

Pros and Cons of Query Context 

Special processing in common query

  • Source fields selection
  • Sort
  • Pagination
  • Highlight

Selecting the fields in the response

  • White list in _source
    • can use wildcard patterns such as "*"
// "_source" lists the field patterns to return ("pro*" is a wildcard pattern)
POST products/doc/_search
{
  "from": 0,
  "size": 24,
  "_source": ["pro*"],
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "term": { "child.productName": "hat" }
      }
    }
  }
}

Selecting the fields in the response

  • script_fields to format output programmatically
// script_fields adds a computed field ("max_price_including_unit") to each hit.
// The script body goes under "source" (not the deprecated "inline"), and
// fields inside a nested query are referenced by their full path.
POST products/doc/_search
{
  "from": 0,
  "size": 24,
  "_source": [],
  "script_fields": {
    "max_price_including_unit": {
      "script": {
        "source": "'$' + params['_source']['price']"
      }
    }
  },
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "term": {
          "child.name": "hat"
        }
      }
    }
  }
}

Sorting

  • Sort
    • either by single field or multiple fields
// Sort by several fields, applied in the order listed.
// The sort fields live in the nested "child" documents, so each sort entry
// declares the nested path it reads from.
POST products/doc/_search
{
  "from": 0,
  "size": 24,
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "term": {
          "child.name": "hat"
        }
      }
    }
  },
  "sort": [
    {
      "child.price": {
        "order": "asc",
        "nested": { "path": "child" }
      }
    },
    {
      "child.no": {
        "order": "asc",
        "nested": { "path": "child" }
      }
    }
  ]
}

Pagination

  • Pagination
    • Default page size is 10
// "from" is the offset of the first hit and "size" is the page size.
// Fields inside a nested query are referenced by their full path.
POST products/doc/_search
{
  "from": 0,
  "size": 24,
  "query": {
    "nested": {
      "path": "child",
      "query": {
        "term": {
          "child.name": "hat"
        }
      }
    }
  }
}

Highlighting

// Highlight the matching nested documents through inner_hits.
// The nested query takes "path" (not "type"), and both the searched and the
// highlighted fields are referenced by their full nested path.
POST products/doc/_search
{
  "query": {
    "nested": {
      "path": "child",
      "score_mode": "sum",
      "query": {
        "simple_query_string": {
          "fields": ["child.color^2", "child.size^1"],
          "query": "Red"
        }
      },
      "inner_hits": {
        "size": 1,
        "highlight": {
          "fields": {
            "child.color": {},
            "child.size": {}
          }
        }
      }
    }
  }
}

Thanks

Elasticsearch: Query Basics

By hanyi8000

Elasticsearch: Query Basics

  • 2,168