Query better with Django ORM
Data Science Summit 2020
Mafinar Khan
- Programming with Python since 2001
- Django since 2009
- Senior Backend Engineer at Coffee Meets Bagel
- Talk to me about: Databases, Domain Driven Design, GraphQL, Functional Programming and GIS
- And also- Elixir, Dart, Phoenix
What we'll cover
- Fundamentals
- Functions and Aggregators
- Expressions
- Postgres specific field
Topic cloud
n+1
window
aggregates
annotate
constraints
function
postgres
admin
json
fulltext
queryset
explain
lookups
index
Expressions
Q
F
Arrays
range
explain/analyze
django-debug-tool
managers
search
fundamentals
queryset anatomy
- Model
- Manager
- Methods
- Look-ups
Product.objects.filter(name__startswith="A")
lazy
chainable
iterable
tools
SQL Representation
Explain
Frequently used methods
- all, iteration
- filter, exclude
- slice operators, order_by
- annotate, values, values_list, extra
- aggregate
- earliest, first, last, latest, dates
- select_related, prefetch_related
Schema
# Get a list of categories
Category.objects.all()
# Get an iterator of categories
Category.objects.iterator()
# Get product with categories filled in
Product.objects.select_related("category").all()
# Get all dates stocks were in
ProductItem.objects.dates("stock_in_date", "day")
# Find all products that end with "phone"
Product.objects.filter(name__endswith="phone")
# Find all product items
ProductItems.objects.count()
Note
- all vs iterator
- select_related and prefetch_related
- count and exist
functions and aggregators
Functions and aggregates
- DBMS is usually faster, make it do all the work
- Use Database functions whenever you can easily
- Annotations are your friends
Annotations and VAlues
- Annotations attach each object with provided data
- values and values_list picks up fields from data
Product.objects\
.select_related("category")\
.values("category__name")\
.annotate(count=Count("pk")
functions and aggregate example
def purchases_by_month():
return ProductItem.objects\
.annotate(month=ExtractMonth("stock_in_date"))\
.values("month")\
.annotate(sales=Avg("product__price"))
def total_sale():
return ProductItem.objects\
.select_related("product")\
.aggregate(total=Sum('product__price'))
expressions
composable, nestable represenation of computation
expression
- Query Expression
- Conditional Expression
- F and Q Objects
- SubQuery
find number of products, three product names, and average price per category
Category.objects.annotate(
total=Subquery(
Product.objects.filter(
category=OuterRef("pk")).values("category").annotate(
total=Count("pk")).values("total")),
product_names=Subquery(
Product.objects.filter(
category=OuterRef("pk")).values("category").annotate(
names=StringAgg("name", "||")).values("names")),
average_price=Subquery(
Product.objects.filter(
category=OuterRef("pk")).values("category").annotate(
avg_price=Avg("price")).values("avg_price")))
Conditional statement
- Aka CASE statement
- Helps with putting conditionally acquired data into DB context
window functions
- Apply Function on Partitions
- Expression (What to do)
- Partition (What group to apply one)
- Order (On What Order)
def average_price():
return Product.objects.annotate(
average_by_category=Window(
expression=Avg('price'),
partition_by=[F('category')],
order_by=F('category').asc()
)
).values(
'name', 'category__name', 'average_by_category')
new schema time!!!
class Message(models.Model):
when = models.DateTimeField(auto_now=True, db_index=True)
sender = models.CharField(max_length=127)
recipient = models.CharField(max_length=127)
objects = MessageManager()
def __str__(self):
return f'[{self.when.ctime()}] {self.sender} -> {self.recipient}'
Find last messages between two individuals
class MessageManager(models.Manager):
def latest_distinct(self):
return super().get_queryset().annotate(
user_1=Case(
When(sender__gt=F('recipient'), then=F('recipient')),
default=F('sender')
),
user_2=Case(
When(sender__gt=F('recipient'), then=F('sender')),
default=F('recipient')
)
).order_by('user_1', 'user_2', '-when').distinct('user_1', 'user_2')
WHY?
postgres specific fields
Postgres specific fields
- Citext, JSONB, HStore, Range
- Ranges
- Searches
from django.db import models
from django.contrib.postgres.fields import CITextField, CICharField, JSONField
from phonenumber_field.modelfields import PhoneNumberField
from common.models import TimeStampedModel
class Customer(TimeStampedModel):
name = models.CharField(max_length=127)
email = models.EmailField(unique=True)
phone_number = PhoneNumberField(unique=True)
address = CITextField(blank=True)
bio = CICharField(blank=True, max_length=31)
preferences = JSONField(null=True)
def __str__(self):
return self.name
Customer.objects.filter(preferences__send_email=True)
Customer.objects.filter(preferences__languages__contains="python")
Customer.objects.filter(bio__contains="pYThOn")
yet another schema!!!
This one will be fun!!!
write an attendance system with clash detection
class Event(TimeStampedModel):
user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
time_range = DateTimeRangeField(db_index=True, default=default_time_range)
remarks = models.TextField(blank=True)
objects = EventManager()
def __str__(self):
return f'{self.user.username}: ({format_time_range(self.time_range)})'
class Meta:
ordering = '-time_range',
# Uncomment in the event of Python 3.0!!!
# constraints = [
# ExclusionConstraint(
# name='exclude_overlapping_reservations',
# expressions=[
# ('time_range', RangeOperators.OVERLAPS),
# ('user', RangeOperators.EQUAL),
# ]
# ),
# ]
def create_event(user, start_time=None, end_time=None) -> Event:
return Event.objects.create(
user=user, time_range=DateTimeTZRange(lower=start_time or now(), upper=end_time))
def update_event(pk, start_time, end_time) -> Event:
return Event.objects.get(id=pk).update(
time_range=DateTimeTZRange(lower=start_time, upper=end_time))
class EventManager(models.Manager):
def get_queryset(self):
return super().get_queryset().annotate(
start_time=Lower('time_range'),
end_time=Upper('time_range'),
is_current=Case(
When(
end_time__isnull=True,
then=True),
default=False,
output_field=BooleanField()),
duration=Case(
When(is_current=True,
then=ExpressionWrapper(Now() - F('start_time'),
output_field=DurationField())),
default=ExpressionWrapper(F('end_time') - F('start_time'),
output_field=DurationField()))
)
Let's search now!!!
search query and search vector
def find_in_category_or_product(query):
return Product.objects.select_related('category').annotate(
category_name=F('category__name'),
).annotate(
search=SearchVector('name') + SearchVector('category_name')
).filter(search=query)
# FIND Pixel Phones that does not have TAB in it
services.find_in_category_or_product(
SearchQuery('pixel') & ~SearchQuery('tab'))
# FIND ANYTHING WHERE 'phone' IS IN 'name' or 'category name'
services.find_in_category_or_product('phone')
trigram similarity
def fuzzy_name_finder(term, model=Product):
return model.objects.annotate(
similarity=TrigramSimilarity('name', term)
).filter(similarity__gt=0.1).order_by('-similarity')
# SHOULD GIVE YOU PIXEL
services.fuzzy_name_finder("pijsel")
patterns and practices
Useful tools
- Django Debug Tool
- Django Extensions
Additional tips
- Think in terms of Data, SQL and then Python
- Always Analyze
what's not covered
- Advanced F Object
- More Window Functions
- Custom Lookups
thank you
Query Better with Django ORM
By Mafinar Khan
Query Better with Django ORM
In this workshop, we will learn about intermediate and advanced features of Django ORM. We will see how we can run complex queries with Django ORM efficiently and staying Pythonic all the while.
- 1,245