ElasticSearch Schema for Jarvis
"User" data flow Pattern
ORGANIZATION HAS_MANY DOCUMENTS
Problems with /<ORGANIZATION>/DOCUMENT/:ID
Cons
- Each shard is capable of storing a lot of data.
- Small shards waste resources
- More indexes = more shards = more nodes required = more expensive
Pros
- Deleting data is easy, just delete the index
- Easy to rebuild/reindex since each index is much smaller
- No need for filtering when searching for a User's documents
How to adopt organizations/document/:id?
ALIASES TO THE RESCUE!
# Register a filtered, routed alias so organization 1 can be addressed as if it had its own index.
curl -XPOST localhost:9200/_aliases -d '{
  "actions": [{
    "add": {
      "index": "organizations",
      "alias": "organization_1",
      "filter": {"term": {"organization_id": 1}},
      "routing": 1
    }
  }]
}'
# Search one organization, or several at once, through their aliases.
# The alias name must match the one registered above (organization_<id>);
# organization_2 is assumed to have been registered the same way.
curl -XGET localhost:9200/organization_1/document/_search
curl -XGET localhost:9200/organization_1,organization_2/document/_search
If any of the organizations gets too big, we can easily move that organization into its own index and update the alias to point to it.
{
"mappings": {
"document": {
"properties": {
"segments": {
"type": "nested"
},
"organization_id": {
"type": "integer",
"index": "not_analyzed"
}
}
}
}
}
Treat Segments as a nested object
{
"organization_id": 1,
"title": "Singapore Model",
"document_type": "pdf",
"file_name": "MyForumPostExport",
"segments": [
{
"segment_id": 1,
"content": "Some lessons I would like to share about econom....",
"tags": ["singapore", "education", "english"]
},
{
"segment_id": 2,
"content": "hope my own growth could move in tandem with the nation growth. Happy New Year!",
"tags": ["chinese", "student"]
},
{
"segment_id": 3,
"content": " individual schools do celebrate the success of their top scorers at the school level, except they cheer them in groups instead. Read",
"tags": ["school", "psle"]
}
]
}
Treat Segments as a nested object
GET /organization_1/document/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "nested": {
            "path": "segments",
            "query": {
              "match_phrase": {
                "segments.content": "growth"
              }
            }
          }
        },
        {
          "match_phrase": {
            "title": "model"
          }
        }
      ]
    }
  }
}
Searching by SEGMENTS
Benchmarks!!
require 'elasticsearch'
require 'ffaker'
# Number of organizations the benchmark seeds into the shared index.
ORGANIZATION_COUNT = 100
# Number of documents generated for each organization.
DOCUMENT_COUNT = 1_000
# Generator of random forum-post attributes used to seed the benchmark index.
# Every call to an accessor produces a fresh random value; nothing is memoized.
class ForumPost
  # File types a generated document may claim to be.
  DOCUMENT_TYPES = %w[pdf doc xls].freeze

  # @return [String] one random word
  def title
    FFaker::Lorem.words.sample
  end

  # @return [String] a random paragraph of filler text
  def content
    FFaker::Lorem.paragraph
  end

  # @return [String] a random person name
  def nickname
    FFaker::Name.name
  end

  # @return a random date as produced by FFaker::Time.date
  def date
    FFaker::Time.date
  end

  # @return [Array<String>] two random colour names used as tags
  def tags
    Array.new(2) { FFaker::Color.name }
  end

  # @return [String] one of "pdf", "doc" or "xls", chosen at random
  def type
    DOCUMENT_TYPES.sample
  end
end
# Benchmark driver: seeds one shared index with documents for many
# organizations, exposes each organization through a filtered + routed alias,
# then compares search latency through an alias against the whole index.
client = Elasticsearch::Client.new
index_name = 'organizations'
doc_type = 'document'

puts "Cleaning all indexes on existing cluster...."
# Use the client rather than shelling out to curl so the cleanup targets the
# same cluster the client is configured for and raises on failure.
client.indices.delete(index: '_all')
puts "\n"

# Segments are mapped as nested objects so each segment is matched on its own.
client.indices.create(
  index: index_name,
  body: {
    mappings: {
      document: {
        properties: {
          segments: {
            type: 'nested'
          },
          organization_id: {
            type: 'integer',
            index: 'not_analyzed'
          }
        }
      }
    }
  }
)

1.upto(ORGANIZATION_COUNT) do |organization_id|
  DOCUMENT_COUNT.times do
    forum_post = ForumPost.new
    client.index(
      index: index_name,
      type: doc_type,
      # The aliases created below search with routing = organization id, so
      # every document must be stored with the same routing value; otherwise
      # an alias search only visits one shard and misses documents that were
      # placed on other shards.
      routing: organization_id,
      body: {
        organization_id: organization_id,
        title: forum_post.title,
        document_type: forum_post.type,
        file_name: forum_post.title,
        created_at: forum_post.date,
        segments: [
          {
            content: forum_post.content,
            tags: forum_post.tags
          },
          {
            content: forum_post.content,
            tags: forum_post.tags
          }
        ]
      }
    )
  end
end

# One alias per organization: filters on organization_id and routes to the
# shard that organization's documents were written to.
1.upto(ORGANIZATION_COUNT) do |organization_id|
  client.indices.put_alias(
    index: index_name,
    name: "organization_#{organization_id}",
    body: {
      filter: { term: { organization_id: organization_id } },
      routing: organization_id
    }
  )
end

# A single document with unique content, indexed through the alias so the
# alias routing is applied automatically.
client.index(
  index: 'organization_1',
  type: doc_type,
  body: {
    organization_id: 1,
    title: "donkey is good",
    document_type: "pdf",
    file_name: "king kong",
    created_at: Date.today,
    segments: [
      {
        content: "uniquecontenthere",
        tags: ["rainbow", "and", "unicorns"]
      }
    ]
  }
)

# Make the freshly indexed document searchable deterministically instead of
# sleeping and hoping the periodic refresh has already happened.
client.indices.refresh(index: index_name)

# Runs the nested segment search against +target+ (an index or alias name)
# and returns the "took" value from the response.
timed_search = lambda do |target|
  response = client.search(
    index: target,
    type: doc_type,
    body: {
      query: {
        nested: {
          path: "segments",
          query: {
            match_phrase: {
              "segments.content": "uniquecontenthere"
            }
          }
        }
      }
    }
  )
  response["took"]
end

puts "Searching for a Document through alias took #{timed_search.call('organization_1')} ms"
puts "Searching for a Document through entire index took #{timed_search.call(index_name)} ms"
/Organizations/documents/:ID
Results
Searching for a Document through alias took 37 ms
Searching for a Document through entire index took 27 ms
ElasticSearch
By tgxworld
ElasticSearch
- 1,286