The what and why of Python and Elasticsearch

@honzakral

Elasticsearch

HTTP REST

import requests

# Base address of the Elasticsearch HTTP endpoint used by every request below.
ES_URL = 'http://localhost:9200'

# PUT a JSON document to the /i/t/42 path.
requests.put('{}/i/t/42'.format(ES_URL), json={"title": "PyCon Belarus"})

# Match query on the "title" field, sent as the JSON body of the search.
query = {"query": {"match": {"title": "pycon"}}}

# Call /i/_search with that body and decode the JSON response.
data = requests.get('{}/i/_search'.format(ES_URL), json=query).json()

I know http!

well, actually...

Rich API

Distributed

Official Client(s)

  • maintained
  • full API support
  • load balancing
  • node failures
  • different transports
  • no opinions

No Opinions?!

"Nobody should have a reason not to use the official client."

1-to-1 mapping


# The Python call mirrors the REST request one-to-one: `index` names the
# index being searched and `body` carries the same JSON query document that
# the equivalent curl command sends.
response = client.search(
  index="my-index",
  body={
    "query": {
      "bool": {
        "must": {"match": {"title": "python"}},
        "must_not": {"match": {"description": "beta"}}
      }
    }
  }
)
# Equivalent raw REST request. The Content-Type header is stated explicitly:
# Elasticsearch 6.0+ rejects request bodies that do not declare one.
curl -X GET localhost:9200/my-index/_search -H 'Content-Type: application/json' -d '{
  "query": {
    "bool": {
      "must": {"match": {"title": "python"}},
      "must_not": {"match": {"description": "beta"}}
    }
  }
}'

Component architecture

from elasticsearch import Elasticsearch
from elasticsearch_async import AsyncTransport

# The transport layer is a pluggable component: passing `transport_class`
# makes the client use the AsyncTransport imported from elasticsearch_async.
client = Elasticsearch(transport_class=AsyncTransport)

Unified test suite

elasticsearch-py


# Low-level client: the whole request body is spelled out as nested dicts.
response = client.search(
    index="my-index",
    body={
        "query": {
            "bool": {
                # we want python in title
                "must": [{"match": {"title": "python"}}],
                # and no beta releases
                "must_not": [{"match": {"description": "beta"}}],
                # filter only search
                "filter": [{"term": {"category": "search"}}],
            }
        },
        "aggs": {
            # aggregate on tags
            "per_tag": {
                "terms": {"field": "tags"},
                "aggs": {
                    # max lines per tag
                    "max_lines": {"max": {"field": "lines"}},
                },
            }
        },
    },
)

# The response is navigated by key: each hit carries its score and its
# stored document under "_source".
for hit in response['hits']['hits']:
    print(hit['_score'], hit['_source']['title'])

elasticsearch-dsl

from elasticsearch_dsl import Q, Search

# Each filter()/query() result is assigned back to `s`, so the search is
# built up step by step.
s = Search(using=client, index="my-index")
# filter only search
s = s.filter("term", category="search")
# we want python in title
s = s.query("match", title="python")
# and no beta releases
s = s.query(~Q("match", description="beta"))

# Aggregations are attached through `s.aggs` without reassigning `s`.
# aggregate on tags
s.aggs.bucket('per_tag', 'terms', field='tags')
# max lines per tag
s.aggs['per_tag'].metric('max_lines', 'max', field='lines')

Hide mechanics,

not meaning!

Mechanics


# Low-level client: the whole request body is spelled out as nested dicts.
response = client.search(
    index="my-index",
    body={
        "query": {
            "bool": {
                # we want python in title
                "must": [{"match": {"title": "python"}}],
                # and no beta releases
                "must_not": [{"match": {"description": "beta"}}],
                # filter only search
                "filter": [{"term": {"category": "search"}}],
            }
        },
        "aggs": {
            # aggregate on tags
            "per_tag": {
                "terms": {"field": "tags"},
                "aggs": {
                    # max lines per tag
                    "max_lines": {"max": {"field": "lines"}},
                },
            }
        },
    },
)

# The response is navigated by key: each hit carries its score and its
# stored document under "_source".
for hit in response['hits']['hits']:
    print(hit['_score'], hit['_source']['title'])

Meaning

# Build the search as one fluent chain: filter on category, require
# "python" in the title, and exclude anything described as beta.
s = (
    Search(using=client, index="my-index")
    .filter("term", category="search")
    .query("match", title="python")
    .query(~Q("match", description="beta"))
)

# Bucket per tag, then record the maximum "lines" value inside each bucket.
s.aggs.bucket('per_tag', 'terms', field='tags')
s.aggs['per_tag'].metric('max_lines', 'max', field='lines')

# Iterate the search object itself; score and fields are read as attributes.
for hit in s:
    print(hit.meta.score, hit.title)

Integrations

DocType

from datetime import date
from elasticsearch_dsl import DocType, Text, Keyword, Date

class BlogPost(DocType):
    """Blog post document declared with elasticsearch-dsl field types."""

    # Both text fields are configured with the "english" analyzer.
    title = Text(analyzer="english")
    body = Text(analyzer="english")

    published_date = Date()

    # multi=True marks the field as multi-valued.
    tags = Keyword(multi=True)

# search() itself takes no query arguments; it hands back a search object
# for this document type, and the terms filter is added on that object.
BlogPost.search().filter('terms', tags=['python', 'belarus'])

# Create a document, add a tag to its multi-valued field, and persist it.
bp = BlogPost(title='PyCon Belarus', published_date=date.today())
bp.tags.append('python')
bp.save()
  

Django Integration?

from django.db import models
from .search import BlogPost as BlogPostDoc


class BlogPost(models.Model):
    """Django model for a blog post that can render itself as a search document."""

    title = models.CharField(max_length=200)
    ...

    def to_search(self):
        """Return the search document (``BlogPostDoc``) built from this instance.

        The model instance's ``id`` is passed as the document ``_id`` and the
        related tags are flattened to a list of their names.
        """
        return BlogPostDoc(
            _id=self.id,
            title=self.title,
            tags=[t.name for t in self.tags.all()],
            # ... remaining document fields elided ...
        )


def update_search(instance, **kwargs):
    """Save the search document built from ``instance``.

    Used as a signal receiver; any extra keyword arguments passed by the
    signal are accepted and ignored.
    """
    document = instance.to_search()
    document.save()

def remove_from_search(instance, **kwargs):
    """Delete the search document built from ``instance``.

    Used as a signal receiver; any extra keyword arguments passed by the
    signal are accepted and ignored.
    """
    document = instance.to_search()
    document.delete()

from django.db.models.signals import post_save, pre_delete

# Keep the search index in step with the database: save the document after
# a BlogPost is saved, and delete it before the BlogPost row is deleted.
post_save.connect(update_search, sender=BlogPost)
pre_delete.connect(remove_from_search, sender=BlogPost)

Faceted Search

Faceted Search

from elasticsearch_dsl import FacetedSearch, TermsFacet, DateHistogramFacet

class BlogSearch(FacetedSearch):
    """Faceted search configuration for blog posts."""

    # Document classes this search is run against.
    doc_types = [BlogPost]

    # Document fields listed for the text query.
    fields = ['tags', 'title', 'body']

    # Facet name -> facet definition.
    facets = dict(
        tags=TermsFacet(field='tags'),
        months=DateHistogramFacet(field='published_date', interval='month'),
    )

# Text query plus a pre-selected value for the "months" facet. datetime.date
# requires year, month and day, so June 2015 is given as its first day.
bs = BlogSearch('python web', {'months': date(2015, 6, 1)})

Future?

Thanks!

@honzakral

The What and Why of Python and Elasticsearch

By Honza Král

The What and Why of Python and Elasticsearch

  • 2,353