Elasticsearch Schema for Jarvis

"User" data flow Pattern

ORGANIZATION HAS_MANY DOCUMENTS

Problems with /<ORGANIZATION>/DOCUMENT/:ID

Cons

Each shard is capable of storing a lot of data.
Small shards waste resources
More indexes = more shards = more nodes required = more expensive

Pros

Deleting data is easy, just delete the index
Easy to rebuild/reindex since each index is much smaller
No need for filtering when searching for a User's documents

How to adopt organizations/document/:id?

ALIASES TO THE RESCUE!

# Create a filtered, routed alias "organization_1" on top of the shared "organizations" index.
curl -XPOST localhost:9200/_aliases -d '{
    "actions": [{
        "add": {
            "index": "organizations",
            "alias": "organization_1",
            "filter": {"term": {"organization_id": 1}},
            "routing": 1
        }
    }]
}'

# Search through one alias, or through several aliases at once.
# The names must match the alias created above ("organization_<id>").
curl -XGET localhost:9200/organization_1/document/_search
curl -XGET localhost:9200/organization_1,organization_2/document/_search

If any organization gets too big, we can easily move it into its own index and update the alias to point to it.

{
  "mappings": {
    "document": {
      "properties": {
        "segments": {
          "type": "nested"
        },
        "organization_id": {
          "type": "integer",
          "index": "not_analyzed"
        }
      }
    }
  }
}

Treat Segments as a nested object

{
  "organization_id": 1,
  "title": "Singapore Model",
  "document_type": "pdf",
  "file_name": "MyForumPostExport",
  "segments": [ 
    {
      "segment_id": 1,
      "content": "Some lessons I would like to share about econom....",
      "tags": ["singapore", "education", "english"]
    },
    {
      "segment_id": 2,
      "content": "hope my own growth could move in tandem with the nation growth. Happy New Year!",
      "tags": ["chinese", "student"]
    },
    {
      "segment_id": 3,
      "content": " individual schools do celebrate the success of their top scorers at the school level, except they cheer them in groups instead. Read",
      "tags": ["school", "psle"]
    }
  ]
}

Treat Segments as a nested object

GET /organization_1/document/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "nested": {
            "path": "segments",
            "query": {
              "match_phrase": {
                "segments.content": "growth"
              }
            }
          }
        },
        {
          "match_phrase": {
            "title": "model"
          }
        }
      ]
    }
  }
}

Searching by SEGMENTS

Benchmarks!!

require 'date'

require 'elasticsearch'
require 'ffaker'

# Benchmark size: ORGANIZATION_COUNT * DOCUMENT_COUNT generated documents in total.
ORGANIZATION_COUNT = 100 # number of organizations to generate data for
DOCUMENT_COUNT = 1_000   # generated documents per organization

# Source of random field values for a fake forum post.
#
# Nothing is memoized: every call to a method produces a fresh value, so two
# calls to the same method on one instance generally return different data.
class ForumPost
  # @return one entry sampled from a batch of FFaker lorem words
  def title
    candidate_words = FFaker::Lorem.words
    candidate_words.sample
  end

  # @return a lorem paragraph from FFaker
  def content
    FFaker::Lorem.paragraph
  end

  # @return a fake name from FFaker
  def nickname
    FFaker::Name.name
  end

  # @return a date value from FFaker
  def date
    FFaker::Time.date
  end

  # @return [Array] two independently generated FFaker colour names
  def tags
    Array.new(2) { FFaker::Color.name }
  end

  # @return [String] one of "pdf", "doc" or "xls", chosen at random
  def type
    %w[pdf doc xls].sample
  end
end

# Client used for every request in this benchmark. It is built with no
# arguments, so it talks to the gem's default host (localhost:9200, the same
# address the curl examples use).
client = Elasticsearch::Client.new

# WARNING: destructive. Every index on the cluster is removed so the benchmark
# starts from an empty state.
# The delete goes through the client instead of shelling out to curl: no
# external binary is needed, and a failed cleanup raises instead of letting
# the benchmark silently run against leftover data.
puts "Cleaning all indexes on existing cluster...."
client.indices.delete(index: '_all')
puts "\n"

# Mapping for the "document" type: segments are stored as nested objects and
# organization_id is an integer indexed as-is (not analyzed).
document_mapping = {
  properties: {
    segments: { type: 'nested' },
    organization_id: { type: 'integer', index: 'not_analyzed' }
  }
}

# Create the single index that holds the documents of all organizations.
client.indices.create(
  index: 'organizations',
  body: { mappings: { document: document_mapping } }
)

# Fill the shared index with DOCUMENT_COUNT random documents for each
# organization. Organization ids run from 1 to ORGANIZATION_COUNT and every
# document carries two generated segments.
1.upto(ORGANIZATION_COUNT) do |organization_id|
  DOCUMENT_COUNT.times do
    forum_post = ForumPost.new

    client.index(
      index: 'organizations',
      type: 'document',
      # Route by organization id, the same value the per-organization aliases
      # use as their routing. Without it the documents are spread over the
      # shards by document id, and a routed alias search (which only visits
      # the shard selected by its routing value) can miss them.
      routing: organization_id,
      body: {
        organization_id: organization_id,
        title: forum_post.title,
        document_type: forum_post.type,
        file_name: forum_post.title,
        created_at: forum_post.date,
        # Each segment gets its own freshly generated content and tags.
        segments: Array.new(2) do
          { content: forum_post.content, tags: forum_post.tags }
        end
      }
    )
  end
end

# Give every organization its own alias, "organization_<id>", on top of the
# shared index. Each alias filters on that organization's id and uses the id
# as its routing value.
(1..ORGANIZATION_COUNT).each do |organization_id|
  alias_settings = {
    filter: { term: { organization_id: organization_id } },
    routing: organization_id
  }

  client.indices.put_alias(
    index: 'organizations',
    name: "organization_#{organization_id}",
    body: alias_settings
  )
end

# Index one hand-written document through the organization_1 alias. Its
# segment content is a fixed marker string that the benchmark searches for
# afterwards.
# Date is provided by the 'date' standard library, which the script requires
# explicitly rather than relying on a gem loading it as a side effect.
client.index(
  index: 'organization_1',
  type: 'document',
  body: {
    organization_id: 1,
    title: 'donkey is good',
    document_type: 'pdf',
    file_name: 'king kong',
    created_at: Date.today,
    segments: [
      {
        content: 'uniquecontenthere',
        tags: %w[rainbow and unicorns]
      }
    ]
  }
)

# Make everything indexed so far visible to search. An explicit refresh is
# deterministic, whereas sleeping for a fixed time only hopes that the
# periodic refresh has already happened.
client.indices.refresh(index: 'organizations')

# Nested phrase query for the marker document; both measurements send the
# exact same body so that only the search target differs.
segment_query = {
  query: {
    nested: {
      path: "segments",
      query: {
        match_phrase: {
          "segments.content": "uniquecontenthere"
        }
      }
    }
  }
}

# Measurement 1: search through the filtered, routed per-organization alias.
# "took" is the duration reported in the search response, in milliseconds.
# NOTE(review): this query runs first, so it may also pay any warm-up cost.
response = client.search(
  index: 'organization_1',
  type: 'document',
  body: segment_query
)

# puts response
puts "Searching for a Document through alias took #{response["took"]} ms"

# Measurement 2: the same query against the whole shared index.
response = client.search(
  index: 'organizations',
  type: 'document',
  body: segment_query
)

# puts response
puts "Searching for a Document through entire index took #{response["took"]} ms"

/Organizations/documents/:ID

Results

Searching for a Document through alias took 37 ms
Searching for a Document through entire index took 27 ms