Elasticsearch: Data Modeling

Han Yi

March 29, 2018

Determine your position

2.Data Model

2.1 Logical Model

2.2 Physical Model

1.Domain Model

?

Master your kits 

  • Clusters
  • Indices
  • Mapping
  • Analysis

Master your kits 

  • Cross Cluster Search

Master your kits 

  • Cross Cluster Search
PUT _cluster/settings
{
  "persistent": {
    "search": {
      "remote": {
        "cluster_one": {
          "seeds": [
            "127.0.0.1:9300"
          ]
        },
        "cluster_two": {
          "seeds": [
            "127.0.0.1:9301"
          ]
        },
        "cluster_three": {
          "seeds": [
            "127.0.0.1:9302"
          ]
        }
      }
    }
  }
}

Master your kits 

  • Cross Cluster Search
GET /cluster_one:twitter,twitter/_search
{
  "query": {
    "match": {
      "user": "kimchy"
    }
  }
}
  • Flexibility & Scalability
  • Performance
  • Software Upgrades
  • A/B testing

Master your kits 

  • Indices

Master your kits 

  • Mapping
    • Field data type ("schema")
{
  "internet_products-2018-06-27-06-25-10": {
    "mappings": {
      "_doc": {
        "properties": {
          "alternateImageUrl": {
            "type": "keyword"
          },
          "availableProductColors": {
            "type": "nested",
            "properties": {
              "colorSwatchImageName": {
                "type": "text",
                "fields": {
                  "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                  }
                }
              },
              "colorSwatchName": {
                "type": "text",
                "fields": {
                  "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                  }
                }
              },
              "familyName": {
                "type": "text",
                "fields": {
                  "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                  }
                }
              },
              "productColorAlternateImageUrl": {
                "type": "text",
                "fields": {
                  "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                  }
                }
              },
              "productColorImageUrl": {
                "type": "text",
                "fields": {
                  "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                  }
                }
              }
            }
          },
          "averageRatingNumber": {
            "type": "float"
          },
          "imageUrl": {
            "type": "keyword"
          },
          "maxOriginalPrice": {
            "type": "float"
          },
          "maxProductPrice": {
            "type": "float"
          },
          "minOriginalPrice": {
            "type": "float"
          },
          "minProductPrice": {
            "type": "float"
          },
          "newProductEndDate": {
            "type": "date",
            "format": "yyyy-MM-dd HH:mm:ss"
          },
          "productBeginLiveDate": {
            "type": "date",
            "format": "yyyy-MM-dd HH:mm:ss"
          },
          "productNumber": {
            "type": "keyword"
          },
          "productUrl": {
            "type": "keyword"
          },
          "reviewQuantity": {
            "type": "long"
          },
          "sku": {
            "type": "nested",
            "properties": {
              "ageRangeList": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                },
                "analyzer": "commonFieldAnalyzer"
              },
              "alternativeImageUrl": {
                "type": "keyword"
              },
              "categories": {
                "type": "text",
                "analyzer": "categoryFieldAnalyzer"
              },
              "categoryLevel": {
                "properties": {
                  "lvl1": {
                    "type": "keyword"
                  },
                  "lvl2": {
                    "type": "keyword"
                  },
                  "lvl3": {
                    "type": "keyword"
                  },
                  "lvl4": {
                    "type": "keyword"
                  }
                }
              },
              "colorFamilyName": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                },
                "analyzer": "commonFieldAnalyzer"
              },
              "colorName": {
                "type": "text",
                "analyzer": "colorFieldAnalyzer"
              },
              "combineColorList": {
                "type": "text",
                "analyzer": "commonFieldAnalyzer"
              },
              "combineSearchableList": {
                "type": "text",
                "analyzer": "commonFieldAnalyzer"
              },
              "currentPrice": {
                "type": "float"
              },
              "imageUrl": {
                "type": "keyword"
              },
              "productFeatureUrl": {
                "type": "keyword"
              },
              "searchTermText": {
                "type": "text",
                "analyzer": "commonFieldAnalyzer"
              },
              "shoeWidthList": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                },
                "analyzer": "commonFieldAnalyzer"
              },
              "sizeDescription": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                },
                "analyzer": "commonFieldAnalyzer"
              },
              "sizeRangeList": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                },
                "analyzer": "commonFieldAnalyzer"
              },
              "styleName": {
                "type": "text",
                "analyzer": "commonFieldAnalyzer"
              },
              "styleNumber": {
                "type": "keyword"
              }
            }
          }
        }
      }
    }
  }
}

Master your kits 

  • Mapping
    • Meta-Fields
PUT logs
{
  "mappings": {
    "_doc": {
      "_source": {
        "includes": [
          "*.count",
          "meta.*"
        ],
        "excludes": [
          "meta.description",
          "meta.other.*"
        ]
      }
    }
  }
}

Master your kits 

  • Mapping
    • Dynamic Mapping
      • boolean, float, long, text, date, object can be automatically detected
      • the type of a JSON array is determined by the first non-null value in that array
      • a string is mapped to a date, double, or long field when detection matches, otherwise to a text field with a keyword sub-field
      • date/numeric detection from string can be turned on/off
      • Dynamic mapping can be turned on/off

Master your kits 

  • Mapping
    • Dynamic Mapping Template
{
    "mappings":{
        "_doc":{
            "dynamic_templates":[
                {
                    "features":{
                        "match":"*_features",
                        "match_mapping_type":"string",
                        "mapping":{
                            "type":"text",
                            //should use synonym, stemming
                            "analyzer":"features_analyzer"
                        }
                    }
                }
            ]
        }
    }
}

Master your kits 

  • Analysis

Deeper in Mapping

{
  "internet_products-2018-06-27-06-25-10": {
    "aliases": {
      "internet_products": {}
    },
    "settings": {
      "index": {
        "number_of_shards": "1",
        "provided_name": "internet_products-2018-06-27-06-25-10",
        "creation_date": "1530080710871",
        "analysis": {
          "filter": {
            "stemmer": {
              "type": "snowball",
              "language": "English"
            },
            "colorSynonym": {
              "type": "synonym",
              "synonyms": [               
              //...
              ]
            },
            "commonFieldSynonym": {
              "type": "synonym",
              "synonyms": [              
              //...
              ]
            },
            "categoryFieldSynonym": {
              "type": "synonym",
              "synonyms": [  
              //...             
              ]
            }
          },
          "analyzer": {
            "commonFieldAnalyzer": {
              "filter": [
                "standard",
                "lowercase",
                "stemmer",
                "commonFieldSynonym"
              ],
              "tokenizer": "standard"
            },
            "colorFieldAnalyzer": {
              "filter": [
                "standard",
                "lowercase",
                "stemmer",
                "colorSynonym"
              ],
              "tokenizer": "standard"
            },
            "categoryFieldAnalyzer": {
              "filter": [
                "standard",
                "lowercase",
                "stemmer",
                "categoryFieldSynonym"
              ],
              "tokenizer": "standard"
            }
          }
        },
        "number_of_replicas": "0",
        "uuid": "Cl9s4MqKRhWgqEICR97iKw",
        "version": {
          "created": "6020299"
        }
      }
    },
    "mappings": {
      "_doc": {
        "properties": {
          "alternateImageUrl": {
            "type": "keyword"
          },
          "availableProductColors": {
            "type": "nested",
            "properties": {
              "colorSwatchImageName": {
                "type": "text",
                "fields": {
                  "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                  }
                }
              },
              "colorSwatchName": {
                "type": "text",
                "fields": {
                  "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                  }
                }
              },
              "familyName": {
                "type": "text",
                "fields": {
                  "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                  }
                }
              },
              "productColorAlternateImageUrl": {
                "type": "text",
                "fields": {
                  "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                  }
                }
              },
              "productColorImageUrl": {
                "type": "text",
                "fields": {
                  "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                  }
                }
              }
            }
          },
          "averageRatingNumber": {
            "type": "float"
          },
          "imageUrl": {
            "type": "keyword"
          },
          "maxOriginalPrice": {
            "type": "float"
          },
          "maxProductPrice": {
            "type": "float"
          },
          "minOriginalPrice": {
            "type": "float"
          },
          "minProductPrice": {
            "type": "float"
          },
          "newProductEndDate": {
            "type": "date",
            "format": "yyyy-MM-dd HH:mm:ss"
          },
          "productBeginLiveDate": {
            "type": "date",
            "format": "yyyy-MM-dd HH:mm:ss"
          },
          "productNumber": {
            "type": "keyword"
          },
          "productUrl": {
            "type": "keyword"
          },
          "reviewQuantity": {
            "type": "long"
          },
          "sku": {
            "type": "nested",
            "properties": {
              "ageRangeList": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                },
                "analyzer": "commonFieldAnalyzer"
              },
              "alternativeImageUrl": {
                "type": "keyword"
              },
              "categories": {
                "type": "text",
                "analyzer": "categoryFieldAnalyzer"
              },
              "categoryLevel": {
                "properties": {
                  "lvl1": {
                    "type": "keyword"
                  },
                  "lvl2": {
                    "type": "keyword"
                  },
                  "lvl3": {
                    "type": "keyword"
                  },
                  "lvl4": {
                    "type": "keyword"
                  }
                }
              },
              "colorFamilyName": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                },
                "analyzer": "commonFieldAnalyzer"
              },
              "colorName": {
                "type": "text",
                "analyzer": "colorFieldAnalyzer"
              },
              "combineColorList": {
                "type": "text",
                "analyzer": "commonFieldAnalyzer"
              },
              "combineSearchableList": {
                "type": "text",
                "analyzer": "commonFieldAnalyzer"
              },
              "currentPrice": {
                "type": "float"
              },
              "imageUrl": {
                "type": "keyword"
              },
              "productFeatureUrl": {
                "type": "keyword"
              },
              "searchTermText": {
                "type": "text",
                "analyzer": "commonFieldAnalyzer"
              },
              "shoeWidthList": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                },
                "analyzer": "commonFieldAnalyzer"
              },
              "sizeDescription": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                },
                "analyzer": "commonFieldAnalyzer"
              },
              "sizeRangeList": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                },
                "analyzer": "commonFieldAnalyzer"
              },
              "styleName": {
                "type": "text",
                "analyzer": "commonFieldAnalyzer"
              },
              "styleNumber": {
                "type": "keyword"
              }
            }
          }
        }
      }
    }    
  }
}

Deeper in Mapping: Analyzers

  • Pain point of traditional database
    • Given product name as "School Uniform Girls Solid Cable Knee Socks"
    • What if the user inputs "girl keee sock"?
  • Analyzers help make indexed text and input queries analyzable and searchable
    • By default standard analyzer will be used (Standard Tokenizer, Standard Token Filter, Lower Case Token Filter, etc)

Deeper in Mapping: Customized Analyzers

  • Character Filter
    • Optional pre-processor
    • HTML Strip Character Filter
    • Mapping Character Filter
    • Pattern Replace Character Filter
POST _analyze
{
  "tokenizer":      "keyword", 
  "char_filter":  [ "html_strip" ],
  "text": "<p>I&apos;m so <b>happy</b>!</p>"
}

[ \nI'm so happy!\n ]

//default standard analyzer will give
[ I'm, so, happy ]
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "standard",
          "char_filter": [
            "my_char_filter"
          ]
        }
      },
      "char_filter": {
        "my_char_filter": {
          "type": "mapping",
          "mappings": [
            ":) => _happy_",
            ":( => _sad_"
          ]
        }
      }
    }
  }
}

POST my_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "I'm delighted about it :("
}

[ I'm, delighted, about, it, _sad_ ]
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "standard",
          "char_filter": [
            "my_char_filter"
          ]
        }
      },
      "char_filter": {
        "my_char_filter": {
          "type": "pattern_replace",
          "pattern": "(\\d+)-(?=\\d)",
          "replacement": "$1_"
        }
      }
    }
  }
}

POST my_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "My credit card is 123-456-789"
}

[ My, credit, card, is, 123_456_789 ]
  • Tokenizer
    • Breaks up text into individual tokens (usually individual words), and outputs a stream of tokens
    • Records the order or position of each term and the start and end character offsets of the original word
    • Default tokenizer is "standard", based on the Unicode Text Segmentation algorithm
POST _analyze
{
  "tokenizer": "standard",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}

[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]

Deeper in Mapping: Customized Analyzers

  • Token Filters
    • Token filters accept a stream of tokens from a tokenizer and can modify tokens (e.g. lowercasing), delete tokens (e.g. removing stopwords) or add tokens (e.g. synonyms)
    • The standard analyzer applies the standard token filter by default; that filter currently does nothing
{
  "internet_products-2018-06-27-06-25-10": {
    "aliases": {
      "internet_products": {}
    },
    "settings": {
      "index": {
        "number_of_shards": "1",
        "provided_name": "internet_products-2018-06-27-06-25-10",
        "creation_date": "1530080710871",
        "analysis": {
          "filter": {
            "stemmer": {
              "type": "snowball",
              "language": "English"
            },
            "commonFieldSynonym": {
              "type": "synonym",
              "synonyms": [              
              //...
              ]
            }
          },
          "analyzer": {
            "commonFieldAnalyzer": {
              "filter": [
                "standard",
                "lowercase",
                "stemmer",
                "commonFieldSynonym"
              ],
              "tokenizer": "standard"
            }
          }
        },
        "number_of_replicas": "0",
        "uuid": "Cl9s4MqKRhWgqEICR97iKw",
        "version": {
          "created": "6020299"
        }
      }
    },
    "mappings": {
      "_doc": {
        "properties": {
          "sku": {
            "type": "nested",
            "properties": {
              "ageRangeList": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                },
                "analyzer": "commonFieldAnalyzer"
              }
            }
          }
        }
      }
    }    
  }
}

Deeper in Mapping: Customized Analyzers

  • Chinese Characters
    • smartcn supported by Lucene
    • elasticsearch-analysis-ik

Deeper in Mapping: Customized Analyzers

GET _analyze
{
  "analyzer": "smartcn",
  "text":"战车翻了!德国0-2爆冷出局 中超外援进首球"
}
{
  "tokens": [
    {
      "token": "战车",
      "start_offset": 0,
      "end_offset": 2,
      "type": "word",
      "position": 0
    },
    {
      "token": "翻",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 1
    },
    {
      "token": "了",
      "start_offset": 3,
      "end_offset": 4,
      "type": "word",
      "position": 2
    },
    {
      "token": "德国",
      "start_offset": 5,
      "end_offset": 7,
      "type": "word",
      "position": 4
    },
    {
      "token": "0",
      "start_offset": 7,
      "end_offset": 8,
      "type": "word",
      "position": 5
    },
    {
      "token": "2",
      "start_offset": 9,
      "end_offset": 10,
      "type": "word",
      "position": 7
    },
    {
      "token": "爆冷",
      "start_offset": 10,
      "end_offset": 12,
      "type": "word",
      "position": 8
    },
    {
      "token": "出局",
      "start_offset": 12,
      "end_offset": 14,
      "type": "word",
      "position": 9
    },
    {
      "token": "中",
      "start_offset": 15,
      "end_offset": 16,
      "type": "word",
      "position": 10
    },
    {
      "token": "超",
      "start_offset": 16,
      "end_offset": 17,
      "type": "word",
      "position": 11
    },
    {
      "token": "外援",
      "start_offset": 17,
      "end_offset": 19,
      "type": "word",
      "position": 12
    },
    {
      "token": "进",
      "start_offset": 19,
      "end_offset": 20,
      "type": "word",
      "position": 13
    },
    {
      "token": "首",
      "start_offset": 20,
      "end_offset": 21,
      "type": "word",
      "position": 14
    },
    {
      "token": "球",
      "start_offset": 21,
      "end_offset": 22,
      "type": "word",
      "position": 15
    }
  ]
}

Field Types

  • Core data types
    • Text, Keyword
    • Date
    • Numeric
    • Boolean
    • Binary
    • Range
  • Complex data types
    • Array
    • Object (JSON)
    • Nested
  • Geo data types
    • Geo point
    • Geo shape
  • Specialized data types
    • IP
    • Join
    • Percolator
    • Token count

Example: Complex Data Types

  • Complex data types
    • Array
      • a field of any type can hold multiple values
    • Object
    • Nested: Internally, nested objects index each object in the array as a separate hidden document
//Array of Object
PUT my_index/_doc/1
{
  "group" : "fans",
  "user" : [ 
    {
      "first" : "John",
      "last" :  "Smith"
    },
    {
      "first" : "Alice",
      "last" :  "White"
    }
  ]
}

//Internally flattened into
{
  "group" :        "fans",
  "user.first" : [ "alice", "john" ],
  "user.last" :  [ "smith", "white" ]
}

//Query
GET my_index/_search
{
  "query": {
    "bool": {
      "must": [
        { "match": { "user.first": "Alice" }},
        { "match": { "user.last":  "Smith" }}
      ]
    }
  }
}
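  • join requires all children to be indexed on the same shard as their parent
    • shard = hash(_routing) % number_of_primary_shards (_routing defaults to the document id, so a child must be indexed with routing set to its parent's id)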
PUT my_index
{
  "mappings": {
    "_doc": {
      "properties": {
        "my_join_field": { 
          "type": "join",
          "relations": {
            "question": "answer" 
          }
        }
      }
    }
  }
}
PUT my_index/_doc/1?refresh
{
  "text": "This is a question",
  "my_join_field": {
    "name": "question" 
  }
}

PUT my_index/_doc/3?routing=1&refresh
{
  "text": "This is an answer",
  "my_join_field": {
    "name": "answer", 
    "parent": "1" 
  }
}
GET /_search
{
  "query": {
    "has_child": {
      "type": "answer",
      "query": {
        "term": {
          "text": "answer"
        }
      }
    }
  }
}

Parent children pattern: join vs nested

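  • join requires all children to be indexed on the same shard as their parent
    • shard = hash(_routing) % number_of_primary_shards (_routing defaults to the document id, so a child must be indexed with routing set to its parent's id)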
PUT my_index
{
  "mappings": {
    "_doc": {
      "properties": {
        "user": {
          "type": "nested" 
        }
      }
    }
  }
}
PUT my_index/_doc/1
{
  "group" : "fans",
  "user" : [
    {
      "first" : "John",
      "last" :  "Smith"
    },
    {
      "first" : "Alice",
      "last" :  "White"
    }
  ]
}
GET my_index/_search
{
  "query": {
    "nested": {
      "path": "user",
      "query": {
        "bool": {
          "must": [
            { "match": { "user.first": "Alice" }},
            { "match": { "user.last":  "Smith" }} 
          ]
        }
      }
    }
  }
}

Parent children pattern: join vs nested

  • fields parameter
{
  "properties":{
    "DIM_PATH_INDX":{
      "type":"text",
      "analyzer":"dimension_path_analyzer",
      "fields":{
        "reverse":{
          "type":"text",
          "analyzer":"reverse_dimension_path_analyzer"
        }
      }
    },
    "INET_PRDT_NUM":{
      "type":"keyword"
    }
  }
}

Example: Multiple Sub-Fields

Thanks

Elasticsearch: Data Modeling

By hanyi8000

Elasticsearch: Data Modeling

  • 1,856