Elasticsearch: Data Modeling
Han Yi
March 29, 2018
Determine your position
2.Data Model
2.1 Logical Model
2.2 Physical Model
1.Domain Model
?
Master your kits
- Clusters
- Indices
- Mapping
- Analysis
Master your kits
- Cross-Cluster Search
Master your kits
- Cross-Cluster Search
PUT _cluster/settings
{
"persistent": {
"search": {
"remote": {
"cluster_one": {
"seeds": [
"127.0.0.1:9300"
]
},
"cluster_two": {
"seeds": [
"127.0.0.1:9301"
]
},
"cluster_three": {
"seeds": [
"127.0.0.1:9302"
]
}
}
}
}
}
Master your kits
- Cross-Cluster Search
GET /cluster_one:twitter,twitter/_search
{
"query": {
"match": {
"user": "kimchy"
}
}
}
- Flexibility & Scalability
- Performance
- Software Upgrades
- A/B testing
Master your kits
- Indices
Master your kits
- Mapping
- Field data type ("schema")
{
"internet_products-2018-06-27-06-25-10": {
"mappings": {
"_doc": {
"properties": {
"alternateImageUrl": {
"type": "keyword"
},
"availableProductColors": {
"type": "nested",
"properties": {
"colorSwatchImageName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"colorSwatchName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"familyName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"productColorAlternateImageUrl": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"productColorImageUrl": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"averageRatingNumber": {
"type": "float"
},
"imageUrl": {
"type": "keyword"
},
"maxOriginalPrice": {
"type": "float"
},
"maxProductPrice": {
"type": "float"
},
"minOriginalPrice": {
"type": "float"
},
"minProductPrice": {
"type": "float"
},
"newProductEndDate": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
},
"productBeginLiveDate": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
},
"productNumber": {
"type": "keyword"
},
"productUrl": {
"type": "keyword"
},
"reviewQuantity": {
"type": "long"
},
"sku": {
"type": "nested",
"properties": {
"ageRangeList": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "commonFieldAnalyzer"
},
"alternativeImageUrl": {
"type": "keyword"
},
"categories": {
"type": "text",
"analyzer": "categoryFieldAnalyzer"
},
"categoryLevel": {
"properties": {
"lvl1": {
"type": "keyword"
},
"lvl2": {
"type": "keyword"
},
"lvl3": {
"type": "keyword"
},
"lvl4": {
"type": "keyword"
}
}
},
"colorFamilyName": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "commonFieldAnalyzer"
},
"colorName": {
"type": "text",
"analyzer": "colorFieldAnalyzer"
},
"combineColorList": {
"type": "text",
"analyzer": "commonFieldAnalyzer"
},
"combineSearchableList": {
"type": "text",
"analyzer": "commonFieldAnalyzer"
},
"currentPrice": {
"type": "float"
},
"imageUrl": {
"type": "keyword"
},
"productFeatureUrl": {
"type": "keyword"
},
"searchTermText": {
"type": "text",
"analyzer": "commonFieldAnalyzer"
},
"shoeWidthList": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "commonFieldAnalyzer"
},
"sizeDescription": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "commonFieldAnalyzer"
},
"sizeRangeList": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "commonFieldAnalyzer"
},
"styleName": {
"type": "text",
"analyzer": "commonFieldAnalyzer"
},
"styleNumber": {
"type": "keyword"
}
}
}
}
}
}
}
}
Master your kits
- Mapping
- Meta-Fields
PUT logs
{
"mappings": {
"_doc": {
"_source": {
"includes": [
"*.count",
"meta.*"
],
"excludes": [
"meta.description",
"meta.other.*"
]
}
}
}
}
Master your kits
- Mapping
-
Dynamic Mapping
- boolean, float, long, text, date and object fields can be detected automatically
- the type of a JSON array field is determined by the first non-null value in that array
- a JSON string becomes either a date, double or long field (when detection matches), or a text field with a keyword sub-field
- date/numeric detection from strings can be turned on/off
- dynamic mapping itself can be turned on/off
-
Dynamic Mapping
Master your kits
- Mapping
- Dynamic Mapping Template
{
"mappings":{
"_doc":{
"dynamic_templates":[
{
"features":{
"match":"*_features",
"match_mapping_type":"string",
"mapping":{
"type":"text",
//should use synonym, stemming
"analyzer":"features_analyzer"
}
}
}
]
}
}
}
Master your kits
- Analysis
Deeper in Mapping
{
"internet_products-2018-06-27-06-25-10": {
"aliases": {
"internet_products": {}
},
"settings": {
"index": {
"number_of_shards": "1",
"provided_name": "internet_products-2018-06-27-06-25-10",
"creation_date": "1530080710871",
"analysis": {
"filter": {
"stemmer": {
"type": "snowball",
"language": "English"
},
"colorSynonym": {
"type": "synonym",
"synonyms": [
//...
]
},
"commonFieldSynonym": {
"type": "synonym",
"synonyms": [
//...
]
},
"categoryFieldSynonym": {
"type": "synonym",
"synonyms": [
//...
]
}
},
"analyzer": {
"commonFieldAnalyzer": {
"filter": [
"standard",
"lowercase",
"stemmer",
"commonFieldSynonym"
],
"tokenizer": "standard"
},
"colorFieldAnalyzer": {
"filter": [
"standard",
"lowercase",
"stemmer",
"colorSynonym"
],
"tokenizer": "standard"
},
"categoryFieldAnalyzer": {
"filter": [
"standard",
"lowercase",
"stemmer",
"categoryFieldSynonym"
],
"tokenizer": "standard"
}
}
},
"number_of_replicas": "0",
"uuid": "Cl9s4MqKRhWgqEICR97iKw",
"version": {
"created": "6020299"
}
}
},
"mappings": {
"_doc": {
"properties": {
"alternateImageUrl": {
"type": "keyword"
},
"availableProductColors": {
"type": "nested",
"properties": {
"colorSwatchImageName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"colorSwatchName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"familyName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"productColorAlternateImageUrl": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"productColorImageUrl": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"averageRatingNumber": {
"type": "float"
},
"imageUrl": {
"type": "keyword"
},
"maxOriginalPrice": {
"type": "float"
},
"maxProductPrice": {
"type": "float"
},
"minOriginalPrice": {
"type": "float"
},
"minProductPrice": {
"type": "float"
},
"newProductEndDate": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
},
"productBeginLiveDate": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
},
"productNumber": {
"type": "keyword"
},
"productUrl": {
"type": "keyword"
},
"reviewQuantity": {
"type": "long"
},
"sku": {
"type": "nested",
"properties": {
"ageRangeList": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "commonFieldAnalyzer"
},
"alternativeImageUrl": {
"type": "keyword"
},
"categories": {
"type": "text",
"analyzer": "categoryFieldAnalyzer"
},
"categoryLevel": {
"properties": {
"lvl1": {
"type": "keyword"
},
"lvl2": {
"type": "keyword"
},
"lvl3": {
"type": "keyword"
},
"lvl4": {
"type": "keyword"
}
}
},
"colorFamilyName": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "commonFieldAnalyzer"
},
"colorName": {
"type": "text",
"analyzer": "colorFieldAnalyzer"
},
"combineColorList": {
"type": "text",
"analyzer": "commonFieldAnalyzer"
},
"combineSearchableList": {
"type": "text",
"analyzer": "commonFieldAnalyzer"
},
"currentPrice": {
"type": "float"
},
"imageUrl": {
"type": "keyword"
},
"productFeatureUrl": {
"type": "keyword"
},
"searchTermText": {
"type": "text",
"analyzer": "commonFieldAnalyzer"
},
"shoeWidthList": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "commonFieldAnalyzer"
},
"sizeDescription": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "commonFieldAnalyzer"
},
"sizeRangeList": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "commonFieldAnalyzer"
},
"styleName": {
"type": "text",
"analyzer": "commonFieldAnalyzer"
},
"styleNumber": {
"type": "keyword"
}
}
}
}
}
}
}
}
Deeper in Mapping: Analyzers
- Pain point of traditional databases
- Given a product name such as "School Uniform Girls Solid Cable Knee Socks"
- What if the user inputs "girl knee sock"?
- Analyzers help make both the indexed text and the input query analyzable and searchable
- By default standard analyzer will be used (Standard Tokenizer, Standard Token Filter, Lower Case Token Filter, etc)
Deeper in Mapping: Customized Analyzers
- Character Filter
- Optional pre-processor
- HTML Strip Character Filter
- Mapping Character Filter
- Pattern Replace Character Filter
POST _analyze
{
"tokenizer": "keyword",
"char_filter": [ "html_strip" ],
"text": "<p>I'm so <b>happy</b>!</p>"
}
[ \nI'm so happy!\n ]
//with the standard tokenizer instead of keyword (html_strip still applied), the result would be
[ I'm, so, happy ]
PUT my_index
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "standard",
"char_filter": [
"my_char_filter"
]
}
},
"char_filter": {
"my_char_filter": {
"type": "mapping",
"mappings": [
":) => _happy_",
":( => _sad_"
]
}
}
}
}
}
POST my_index/_analyze
{
"analyzer": "my_analyzer",
"text": "I'm delighted about it :("
}
[ I'm, delighted, about, it, _sad_ ]
PUT my_index
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "standard",
"char_filter": [
"my_char_filter"
]
}
},
"char_filter": {
"my_char_filter": {
"type": "pattern_replace",
"pattern": "(\\d+)-(?=\\d)",
"replacement": "$1_"
}
}
}
}
}
POST my_index/_analyze
{
"analyzer": "my_analyzer",
"text": "My credit card is 123-456-789"
}
[ My, credit, card, is, 123_456_789 ]
- Tokenizer
- Breaks up text into individual tokens (usually individual words), and outputs a stream of tokens
- Recording the order or position of each term and the start and end character offsets of the original word
- Default tokenizer is "standard", based on the Unicode Text Segmentation algorithm
POST _analyze
{
"tokenizer": "standard",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
Deeper in Mapping: Customized Analyzers
- Token Filters
- Token filters accept a stream of tokens from a tokenizer and can modify tokens (e.g. lowercasing), delete tokens (e.g. removing stopwords) or add tokens (e.g. synonyms)
- The standard token filter is applied by default and currently does nothing
{
"internet_products-2018-06-27-06-25-10": {
"aliases": {
"internet_products": {}
},
"settings": {
"index": {
"number_of_shards": "1",
"provided_name": "internet_products-2018-06-27-06-25-10",
"creation_date": "1530080710871",
"analysis": {
"filter": {
"stemmer": {
"type": "snowball",
"language": "English"
},
"commonFieldSynonym": {
"type": "synonym",
"synonyms": [
//...
]
}
},
"analyzer": {
"commonFieldAnalyzer": {
"filter": [
"standard",
"lowercase",
"stemmer",
"commonFieldSynonym"
],
"tokenizer": "standard"
}
}
},
"number_of_replicas": "0",
"uuid": "Cl9s4MqKRhWgqEICR97iKw",
"version": {
"created": "6020299"
}
}
},
"mappings": {
"_doc": {
"properties": {
"sku": {
"type": "nested",
"properties": {
"ageRangeList": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
},
"analyzer": "commonFieldAnalyzer"
}
}
}
}
}
}
}
}
Deeper in Mapping: Customized Analyzers
- Chinese Characters
- smartcn supported by Lucene
- elasticsearch-analysis-ik
Deeper in Mapping: Customized Analyzers
GET _analyze
{
"analyzer": "smartcn",
"text":"战车翻了!德国0-2爆冷出局 中超外援进首球"
}
{
"tokens": [
{
"token": "战车",
"start_offset": 0,
"end_offset": 2,
"type": "word",
"position": 0
},
{
"token": "翻",
"start_offset": 2,
"end_offset": 3,
"type": "word",
"position": 1
},
{
"token": "了",
"start_offset": 3,
"end_offset": 4,
"type": "word",
"position": 2
},
{
"token": "德国",
"start_offset": 5,
"end_offset": 7,
"type": "word",
"position": 4
},
{
"token": "0",
"start_offset": 7,
"end_offset": 8,
"type": "word",
"position": 5
},
{
"token": "2",
"start_offset": 9,
"end_offset": 10,
"type": "word",
"position": 7
},
{
"token": "爆冷",
"start_offset": 10,
"end_offset": 12,
"type": "word",
"position": 8
},
{
"token": "出局",
"start_offset": 12,
"end_offset": 14,
"type": "word",
"position": 9
},
{
"token": "中",
"start_offset": 15,
"end_offset": 16,
"type": "word",
"position": 10
},
{
"token": "超",
"start_offset": 16,
"end_offset": 17,
"type": "word",
"position": 11
},
{
"token": "外援",
"start_offset": 17,
"end_offset": 19,
"type": "word",
"position": 12
},
{
"token": "进",
"start_offset": 19,
"end_offset": 20,
"type": "word",
"position": 13
},
{
"token": "首",
"start_offset": 20,
"end_offset": 21,
"type": "word",
"position": 14
},
{
"token": "球",
"start_offset": 21,
"end_offset": 22,
"type": "word",
"position": 15
}
]
}
Field Types
- Core data types
- Text, Keyword
- Date
- Numeric
- Boolean
- Binary
- Range
-
Complex data types
- Array
- Object (JSON)
- Nested
- Geo data types
- Geo point
- Geo shape
-
Specialized data types
- IP
- Join
- Percolator
- Token count
Example: Complex Data Types
-
Complex data types
- Array
- any type of fields can be multi-value
- Object
- Nested: Internally, nested objects index each object in the array as a separate hidden document
- Array
//Array of Object
PUT my_index/_doc/1
{
"group" : "fans",
"user" : [
{
"first" : "John",
"last" : "Smith"
},
{
"first" : "Alice",
"last" : "White"
}
]
}
//Internally the array of objects is flattened into
{
"group" : "fans",
"user.first" : [ "alice", "john" ],
"user.last" : [ "smith", "white" ]
}
//Query
GET my_index/_search
{
"query": {
"bool": {
"must": [
{ "match": { "user.first": "Alice" }},
{ "match": { "user.last": "Smith" }}
]
}
}
}
-
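join requires all children to be stored on the same shard as their parent
- shard = hash(_routing) % number_of_primary_shards (_routing defaults to the document _id, so children must be indexed with routing=<parent id>)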
PUT my_index
{
"mappings": {
"_doc": {
"properties": {
"my_join_field": {
"type": "join",
"relations": {
"question": "answer"
}
}
}
}
}
}
PUT my_index/_doc/1?refresh
{
"text": "This is a question",
"my_join_field": {
"name": "question"
}
}
PUT my_index/_doc/3?routing=1&refresh
{
"text": "This is an answer",
"my_join_field": {
"name": "answer",
"parent": "1"
}
}
GET /_search
{
"query": {
"has_child": {
"type": "answer",
"query": {
"term": {
"text": "answer"
}
}
}
}
}
Parent children pattern: join vs nested
-
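join requires all children to be stored on the same shard as their parent
- shard = hash(_routing) % number_of_primary_shards (_routing defaults to the document _id, so children must be indexed with routing=<parent id>)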
PUT my_index
{
"mappings": {
"_doc": {
"properties": {
"user": {
"type": "nested"
}
}
}
}
}
PUT my_index/_doc/1
{
"group" : "fans",
"user" : [
{
"first" : "John",
"last" : "Smith"
},
{
"first" : "Alice",
"last" : "White"
}
]
}
GET my_index/_search
{
"query": {
"nested": {
"path": "user",
"query": {
"bool": {
"must": [
{ "match": { "user.first": "Alice" }},
{ "match": { "user.last": "Smith" }}
]
}
}
}
}
}
Parent children pattern: join vs nested
- fields parameter
{
"properties":{
"DIM_PATH_INDX":{
"type":"text",
"analyzer":"dimension_path_analyzer",
"fields":{
"reverse":{
"type":"text",
"analyzer":"reverse_dimension_path_analyzer"
}
}
},
"INET_PRDT_NUM":{
"type":"keyword"
}
}
}
Example: Multiple Sub-Fields
Thanks
Elasticsearch: Data Modeling
By hanyi8000
Elasticsearch: Data Modeling
- 2,006