Django ORM
&
queries optimization
Name: Ivaylo Donchev
Age: 23
Full Stack Developer
( not graduated yet.. ;( )
Bulgaria 💕
Django ORM
ORM
vs
Raw SQL
from psycopg2 import sql
cur.execute(
sql.SQL("insert into {} values (%s, %s)")
.format(sql.Identifier('my_table')),
[10, 20])
It does the perfect job but it's
hard to keep it maintainable...
class MyTable(Model):
a = IntegerField()
b = IntegerField()
# ...
MyTable.objects.create(a=10, b=20)
Django ORM
DB
Django
ORM
Django ORM
DB
Django
ORM
python
objects
methods
SQL queries
DB rows
/
query result
ORM
DB
Django
BACKEND (adapter)
QuerySet - Manager
ORM
DB
Django
BACKEND (adapter)
QuerySet - Manager
SQL compiler
User model
class User(Model):
name = models.CharField(max_length=64)
age = models.IntegerField()
id | name | age |
---|---|---|
Programmer API:
QuerySet / Manager
Django ORM
Python Expression |
User.objects.all() (.values()/.values_list()) |
---|---|
SQL query | SELECT * FROM USERS; |
TEMPLATE | SELECT (fields=*) FROM User._meta.db_table |
The python variable | <Queryset: User<>, ... > |
Django ORM
Python Expression | User.objects.filter(age__gte=20) |
---|---|
SQL query | SELECT * FROM USERS WHERE AGE >= 20; |
TEMPLATE | SELECT (fields=*) FROM User._meta.db_table WHERE (<some_field> AND/OR ...); |
def get_all_users_with_age_gte_20():
users_qs = User.objects.all() # 1
users_qs_2 = users_qs.all() # 2
result_1 = [user for user in users_qs if user.age >= 20] # 3
result_2 = (user for user in users_qs_2 if user.age >= 20) # 4
result_3 = users_qs.filter(age__gte=20) # 5
result_4 = [user for user in result_3] # 6
print(result_1) # 7
print(len(result_2)) # 8
print(result_3) # 9
print(result_4) # 10
return result_1 # 11
Django ORM
QuerySet
Manager
Model
The implementation
class QuerySet:
"""
Represent a lazy database lookup
for a set of objects."""
# SELECTs
def get(self, *args, **kwargs):
def first(self):
def last(self):
def order_by(self, **kwargs):
# Doing actions
def create(self, **kwargs):
def update(self, **kwargs):
# Make JOINs
def prefetch_related(self, *lookups):
def select_related(self, *fields):
# Choose python data structure
def values(self, *fields, **expressions):
def values_list(self, *fields, flat=False, named=False):
def all(self):
def in_bulk(self):
# Rename fields
def annotate(self, *args, **kwargs):
# ...
class Query:
"""A single SQL query."""
class WhereNode(tree.Node):
"""
An SQL WHERE clause.
"""
It's an oriented graph
QuerySet
Query
iterable_class
WhereNode
class QuerySet:
# ...
def values_list(self, *fields, flat=False, named=False):
# Some validations
clone = self._values(*_fields, **expressions)
clone._iterable_class = FlatValuesListIterable if flat else ValuesListIterable
return clone
User.objects.values_list('id')
# [[1, ], [2, ]]
User.objects.values_list('id', flat=True)
# [1, 2]
class FlatValuesListIterable(BaseIterable):
def __iter__(self):
queryset = self.queryset
compiler = queryset.query.get_compiler(queryset.db)
for row in compiler.results_iter(chunked_fetch=self.chunked_fetch, chunk_size=self.chunk_size):
yield row[0]
User.objects.values_list('id')
# [[1, ], [2, ]]
User.objects.values_list('id', flat=True)
# [1, 2]
class CustomIterable(BaseIterable):
def __iter__(self):
queryset = self.queryset
compiler = queryset.query.get_compiler(queryset.db)
counter = 0
for row in compiler.results_iter(chunked_fetch=self.chunked_fetch, chunk_size=self.chunk_size):
if counter % 2 == 0:
yield {
'specific_value_name': row[3],
'counter': counter
}
class QuerySet:
# ...
def custom_all(self):
clone = self._values(*_fields, **expressions)
clone._iterable_class = CustomIterable
return clone
User.objects.custom_all()
"""
[
{'specific_value_name: 'some_value_1', counter: 0},
{'specific_value_name: 'some_value_2', counter: 2}
]
"""
Some basic ORM principles:
- QuerySet are IMMUTABLE
- `__iter__()` === SQL query
- iterable_class defines the final data structure
def get_all_users_with_age_gte_20():
users_qs = User.objects.all() # 1
users_qs_2 = users_qs.all() # 2
result_1 = [user for user in users_qs if user.age >= 20] # 3
result_2 = (user for user in users_qs_2 if user.age >= 20) # 4
result_3 = users_qs.filter(age__gte=20) # 5
result_4 = [user for user in result_3] # 6
print(result_1) # 7
print(len(result_2)) # 8
print(result_3) # 9
print(result_4) # 10
return result_1 # 11
def get_all_users_with_age_gte_20():
users_qs = User.objects.all() # 1 - no
users_qs_2 = users_qs.all() # 2 - no
result_1 = [user for user in users_qs if user.age >= 20] # 3 - yes
result_2 = (user for user in users_qs_2 if user.age >= 20) # 4 - yes
result_3 = users_qs.filter(age__gte=20) # 5 - no
result_4 = [user for user in result_3] # 6 - yes
print(result_1) # 7 - no (because it's already executed)
print(len(result_2)) # 8 - no
print(len(result_3)) # 9 - yes (`len` requires iterable)
print(result_4) # 10 - no (already executed)
return result_1 # 11 - no
# => 5 queries
Some tips
- If you have large database tables:
- only('field_i_need'), defer('field_that_i_dont_need')
- use lighter data structure (tuple/list/dict)
- If you have slow python logic over the objects (filtration..):
- Sometimes an extra query is faster than a `for loop`
- It's good idea to use `order_by()` ! ('id' ordering is default just for `first/last()`)
Queries Optimization
Insert/Update
ivo = User(name='Ivo')
milica = User(name='Milica')
User.objects.bulk_create([ivo, milica])
User.objects\
.filter(name__icontains='iv')\
.update(name='Ivaylo')
DELETE
User.objects\
.filter(name__icontains='iv')\
.delete()
3 types of problems:
#1 Too much queries
#2 Too much data
#3 Too much data & queries
SELECT
#1 Too much queries
.prefetch_related()
.select_related()
&
class SharedAccount(Model):
pass
class User(Model):
shared_account = ForeignKey(
SharedAccount,
related_name='users'
)
SELECT *
FROM "user"
ORM expression
User.objects.all()
SQL query
SELECT *
FROM "user"
INNER JOIN "shared_account"
ON ("user"."shared_account_id" = "shared_account"."id")
ORM expression
User.objects.select_related('shared_account')
SQL query
class SharedAccount(Model):
pass
class User(Model):
shared_account = ForeignKey(
SharedAccount,
related_name='users'
)
SELECT *
FROM "shared_account"
ORM expression
SharedAccount.objects.all()
SQL query
SELECT *
FROM "shared_account"
ORM expression
SharedAccount.objects.prefetch_related('users')
SQL query
SELECT *
FROM "users"
WHERE "user"."shared_account_id" IN (...);
FOLLOWED BY....
Tip 1:
you can use
.select_related() and .prefetch_related()
for simple JOIN-s
#2 Too much data
class User(Model):
name = CharField(max_length=255)
class Playlist(Model):
name = CharField(max_length=255)
user = ForeignKey(User, related_name='playlists')
class Song(Model):
title = CharField(max_length=255)
length = PositiveIntegerField()
playlist = ForeignKey(Playlist, related_name='songs')
Music Service Web Application
User
Playlist
Playlist
Playlist
Song
Song
Song
class User(Model):
name = # ...
class Playlist(Model):
user = ForeignKey(User, related_name='playlists')
name = # ...
@property
def songs_total_length(self):
total_length = 0
for song in self.songs.all():
total_length += song.length
return total_length
class Song(Model):
playlist = ForeignKey(Playlist, related_name='songs')
title = # ...
length = # ...
class User(Model):
name = # ...
class Playlist(Model):
user = ForeignKey(User, related_name='playlists')
name = # ...
@property
def songs_total_length(self):
songs_length = [s.length for s in self.songs.all()]
return sum(songs_length)
class Song(Model):
playlist = ForeignKey(Playlist, related_name='songs')
title = # ...
length = # ...
class User(Model):
name = # ...
@property
def playlists_total_length(self):
return sum([p.songs_total_length for p in self.playlists.all()])
class Playlist(Model):
user = ForeignKey(User, related_name='playlists')
name = # ...
@property
def songs_total_length(self):
return sum([song.length for song in self.songs.all()])
class Song(Model):
playlist = ForeignKey(Playlist, related_name='songs')
title = # ...
length = # ...
Task: Get all users' total length of their playlists
- 10 Users
- 10 Playlists per user
- 10 songs per playlist
Task: Get all users' total length of their playlists
Total objects to fetch = 10 (Users) +\
10 * 10 (Playlists) +\
10 * 10 * 10 (Songs)
1110 DB rows in 3 different tables
for just 10 users
GROUP BY with aggregation(SUM)
class User(Model):
name = # ...
@property
def playlists_total_length(self):
return sum([p.songs_total_length for p in self.playlists.all()])
class Playlist(Model):
user = ForeignKey(User, related_name='playlists')
name = # ...
@property
def songs_total_length(self):
return sum([song.length for song in self.songs.all()])
class Song(Model):
playlist = ForeignKey(Playlist, related_name='songs')
title = # ...
length = # ...
Playlists with total songs length
SELECT "playlist"."id",
"playlist"."user_id",
"playlist"."name",
(
SELECT SUM("song"."length")
FROM "song"
WHERE "song"."playlist_id" = ("playlist"."id")
GROUP BY "song"."playlist_id" LIMIT 1
) AS "songs_total_length"
FROM "playlist";
class Playlist(Model):
user = ForeignKey(User, related_name='playlists')
name = CharField(max_length=255)
@property
def songs_total_length(self):
return sum([song.length for song in self.songs.all()])
class PlaylistQuerySet(QuerySet):
@classmethod
def songs_total_length(cls):
return <sum aggregation over songs grouped by the playlist ???>
def collect(self):
return self.annotate(_songs_total_length=self.songs_total_length())
class Playlist(Model):
objects = PlaylistQuerySet.as_manager()
user = ForeignKey(User, related_name='playlists')
name = CharField(max_length=255)
@property
def songs_total_length(self):
if hasattr(self, 'songs_total_length'):
return self._songs_total_length
return sum([song.length for song in self.songs.all()])
class PlaylistQuerySet(QuerySet):
@classmethod
def songs_total_length(cls):
return Subquery(
query=...,
output_field=...
)
def collect(self):
return self.annotate(_songs_total_length=self.songs_total_length())
class Playlist(Model):
objects = PlaylistQuerySet.as_manager()
user = ForeignKey(User, related_name='playlists')
name = CharField(max_length=255)
@property
def songs_total_length(self):
if hasattr(self, 'songs_total_length'):
return self._songs_total_length
return sum([song.length for song in self.songs.all()])
class PlaylistQuerySet(QuerySet):
@classmethod
def songs_total_length(cls):
return Subquery(
queryset=...,
output_field=models.IntegerField()
)
def collect(self):
return self.annotate(_songs_total_length=self.songs_total_length())
class Playlist(Model):
objects = PlaylistQuerySet.as_manager()
user = ForeignKey(User, related_name='playlists')
name = CharField(max_length=255)
@property
def songs_total_length(self):
if hasattr(self, 'songs_total_length'):
return self._songs_total_length
return sum([song.length for song in self.songs.all()])
class PlaylistQuerySet(QuerySet):
@classmethod
def songs_total_length(cls):
queryset = Song.objects \
.values('playlist__id') \ # This is the GROUP BY field
. # ¯\_(ツ)_/¯
return Subquery(
queryset=queryset,
output_field=models.IntegerField()
)
def collect(self):
return self.annotate(_songs_total_length=self.songs_total_length())
class Playlist(Model):
objects = PlaylistQuerySet.as_manager()
user = ForeignKey(User, related_name='playlists')
name = CharField(max_length=255)
@property
def songs_total_length(self):
if hasattr(self, 'songs_total_length'):
return self._songs_total_length
return sum([song.length for song in self.songs.all()])
class PlaylistQuerySet(QuerySet):
@classmethod
def songs_total_length(cls):
queryset = Song.objects \
.values('playlist__id') \ # This is the GROUP BY field
.values_list(Sum('length')) \ # The SUM aggregation
. # ¯\_(ツ)_/¯
return Subquery(
queryset=queryset,
output_field=models.IntegerField()
)
def collect(self):
return self.annotate(_songs_total_length=self.songs_total_length())
class Playlist(Model):
objects = PlaylistQuerySet.as_manager()
user = ForeignKey(User, related_name='playlists')
name = CharField(max_length=255)
@property
def songs_total_length(self):
if hasattr(self, 'songs_total_length'):
return self._songs_total_length
return sum([song.length for song in self.songs.all()])
class PlaylistQuerySet(QuerySet):
@classmethod
def songs_total_length(cls):
queryset = Song.objects \
.values('playlist__id') \ # This is the GROUP BY field
.values_list(Sum('length')) \ # The SUM aggregation
.filter(playlist__id=OuterRef('id')) # !!!
return Subquery(
queryset=queryset,
output_field=models.IntegerField()
)
def collect(self):
return self.annotate(_songs_total_length=self.songs_total_length())
class Playlist(Model):
objects = PlaylistQuerySet.as_manager()
user = ForeignKey(User, related_name='playlists')
name = CharField(max_length=255)
@property
def songs_total_length(self):
if hasattr(self, 'songs_total_length'):
return self._songs_total_length
return sum([song.length for song in self.songs.all()])
Playlist.objects.collect()
SELECT "playlist"."id",
"playlist"."user_id",
"playlist"."name",
(
SELECT SUM("song"."length")
FROM "song"
WHERE "song"."playlist_id" = ("playlist"."id")
GROUP BY "song"."playlist_id" LIMIT 1
) AS "songs_total_length"
FROM "playlist";
...will be the Django ORM equivalent for
class PlaylistQuerySet(QuerySet):
@classmethod
def playlists_total_length(cls):
pass
def collect(self):
return self.annotate(_playlists_total_length=self.playlists_total_length())
class User(Model):
objects = UsersQuerySet.as_manager()
name = CharField(max_length=255)
@property
def playlists_total_length(self):
if hasattr(self, '_playlists_total_length'):
return self._playlists_total_length
playlists_length = [
playlist.songs_total_length
for playlist in self.playlists.all()
]
return sum(playlist_length)
class UserQuerySet(QuerySet):
@classmethod
def playlists_total_length(cls):
playlist_annotation = PlaylistQueryset.songs_total_length()
queryset = Playlist.objects \
.values('user__id') \
.filter(user__id=OuterRef('id')) \
.values_list(Sum(playlist_annotation))
return Subquery(
queryset=queryset,
output_field=IntegerField()
)
def collect(self):
return self.annotate(_playlists_total_length=self.playlists_total_length())
class User(Model):
objects = UserQuerySet.as_manager()
name = CharField(max_length=255)
@property
def playlists_total_length(self):
if hasattr(self, '_playlists_total_length'):
return self._playlists_total_length
return sum([p.songs_total_length for p in self.playlists.all()])
SELECT "user"."id",
"user"."name",
(SELECT SUM((
SELECT SUM(U0."length")
FROM "song" U0
WHERE U0."playlist_id" = (V0."id")
GROUP BY U0."playlist_id"
LIMIT 1
))
FROM "playlist" V0
WHERE V0."user_id" = ("user"."id")
GROUP BY V0."user_id") AS "_playlists_total_length"
FROM "user"
Tip 2:
Subquery + OuterRef for GROUP BY
#2 Too much
data & queries
class User(Model):
objects = UserQuerySet.as_manager()
first_name = CharField(max_length=255)
second_name = CharField(max_length=255)
@property
def verbose_name(self):
return f'Name: {self.first_name} {self.last_name}.'
class UserQuerySet(QuerySet):
@classmethod
def verbose_name(cls):
return ExpressionWrapper(
output_field=f"Name: {F('first_name')} {F('second_name')}.",
output_field=CharField()
)
def collect(self):
return self.annotate(_verbose_name=verbose_name)
class User(Model):
objects = UserQuerySet.as_manager()
first_name = CharField(max_length=255)
second_name = CharField(max_length=255)
@property
def verbose_name(self):
if hasattr(self, '_verbose_name'):
return self._verbose_name
return f'Name: {self.first_name} {self.last_name}.'
Tip 3:
Use django.db.models.ExpressionWrapper
to avoid the need of loading heavy objects.
(You can use the `versbose_name` with values/values_list)
Tip 4:
https://github.com/jazzband/django-debug-toolbar
Post problem:
Does it really work?
The original tests
# test_models.py
class SongTests(TestCase):
def test_song_length(self):
length = 120
real_length = 0.8 * length
song = Song.objects.create(title='Europython', length=120)
self.assertEqual(real_length, song.real_length)
The modified tests
# test_models.py
class SongTests(TestCase):
def test_song_length(self):
length = 120
real_length = 0.8 * length
song = Song.objects.create(title='Europython', length=120)
self.assertEqual(real_length, song.real_length)
song_collected = Song.objects.collect().get(id=song.id)
self.assertEqual(real_length, song_collected.real_length)
|
---|
Thank you!!!
https://github.com/Ivo-Donchev
https://twitter.com/DonchevIvaylo
https://www.facebook.com/ivodonchev267
i.donchev@hacksoft.io
Django ORM and queries optimization
By Ivaylo Donchev
Django ORM and queries optimization
- 848