Django ORM

&

queries optimization

Name: Ivaylo Donchev

 

Age: 23

 

Full Stack Developer

( not graduated yet.. ;( )

Bulgaria 💕

Django ORM

ORM

vs

Raw SQL

from psycopg2 import sql

cur.execute(
    sql.SQL("insert into {} values (%s, %s)")
        .format(sql.Identifier('my_table')),
    [10, 20])

It does the perfect job but it's

hard to keep it maintainable...

class MyTable(Model):
    a = IntegerField()
    b = IntegerField()


# ...


MyTable.objects.create(a=10, b=20)

Django ORM

DB

Django

ORM

Django ORM

DB

Django

ORM

python

objects

methods

SQL queries

DB rows

/

query result

ORM

DB

Django

BACKEND (adapter)

QuerySet - Manager

ORM

DB

Django

BACKEND (adapter)

QuerySet - Manager

SQL compiler

User model

class User(Model):
    name = models.CharField(max_length=64)
    age = models.IntegerField()
id name age

Programmer API:

QuerySet / Manager

Django ORM

Python Expression
 
User.objects.all() (.values()/.values_list())
SQL query SELECT * FROM USERS;
 
TEMPLATE SELECT (fields=*)
  FROM User._meta.db_table
 
The python variable <Queryset: User<>, ... >
 

Django ORM

Python Expression User.objects.filter(age__gte=20)
 
SQL query SELECT *
  FROM USERS
  WHERE AGE >= 20;
 
TEMPLATE SELECT (fields=*)
  FROM User._meta.db_table
  WHERE (<some_field> AND/OR ...);
 
def get_all_users_with_age_gte_20():
    users_qs = User.objects.all() # 1

    users_qs_2 = users_qs.all() # 2

    result_1 = [user for user in users_qs if user.age >= 20] # 3

    result_2 = (user for user in users_qs_2 if user.age >= 20)  # 4

    result_3 = users_qs.filter(age__gte=20) # 5

    result_4 = [user for user in result_3] # 6

    print(result_1) # 7

    print(len(result_2)) # 8

    print(result_3) # 9

    print(result_4) # 10

    return result_1 # 11

Django ORM

QuerySet

Manager

Model

The implementation

class QuerySet:
    """
    Represent a lazy database lookup
    for a set of objects."""

    # SELECTs 
    def get(self, *args, **kwargs):
    def first(self):
    def last(self):
    def order_by(self, **kwargs):

    # Doing actions
    def create(self, **kwargs):
    def update(self, **kwargs):

    # Make JOINs
    def prefetch_related(self, *lookups):
    def select_related(self, *fields):

    # Choose python data structure
    def values(self, *fields, **expressions):
    def values_list(self, *fields, flat=False, named=False):
    def all(self):
    def in_bulk(self):
    
    # Rename fields
    def annotate(self, *args, **kwargs):
    # ...
class Query:
    """A single SQL query."""
class WhereNode(tree.Node):
    """
    An SQL WHERE clause.
    """

It's an oriented graph

QuerySet

Query

iterable_class

WhereNode

class QuerySet:
    # ...
    def values_list(self, *fields, flat=False, named=False):
        # Some validations
        clone = self._values(*_fields, **expressions)
        clone._iterable_class = FlatValuesListIterable if flat else ValuesListIterable

        return clone


User.objects.values_list('id')
# [[1, ], [2, ]]

User.objects.values_list('id', flat=True)
# [1, 2]
class FlatValuesListIterable(BaseIterable):
    def __iter__(self):
        queryset = self.queryset
        compiler = queryset.query.get_compiler(queryset.db)

        for row in compiler.results_iter(chunked_fetch=self.chunked_fetch, chunk_size=self.chunk_size):
            yield row[0]


User.objects.values_list('id')
# [[1, ], [2, ]]

User.objects.values_list('id', flat=True)
# [1, 2]
class CustomIterable(BaseIterable):
    def __iter__(self):
        queryset = self.queryset
        compiler = queryset.query.get_compiler(queryset.db)

        counter = 0
        for row in compiler.results_iter(chunked_fetch=self.chunked_fetch, chunk_size=self.chunk_size):
            if counter % 2 == 0:
                yield {
                    'specific_value_name': row[3],
                    'counter': counter
                }


class QuerySet:
    # ...
    def custom_all(self):
        clone = self._values(*_fields, **expressions)
        clone._iterable_class = CustomIterable

        return clone


User.objects.custom_all()

"""
[
    {'specific_value_name: 'some_value_1', counter: 0},
    {'specific_value_name: 'some_value_2', counter: 2}
]
"""

Some basic ORM principles:

  • QuerySet are IMMUTABLE
  • `__iter__()` === SQL query
  • iterable_class defines the final data structure
def get_all_users_with_age_gte_20():
    users_qs = User.objects.all() # 1

    users_qs_2 = users_qs.all() # 2

    result_1 = [user for user in users_qs if user.age >= 20] # 3

    result_2 = (user for user in users_qs_2 if user.age >= 20)  # 4

    result_3 = users_qs.filter(age__gte=20) # 5

    result_4 = [user for user in result_3] # 6

    print(result_1) # 7

    print(len(result_2)) # 8

    print(result_3) # 9

    print(result_4) # 10

    return result_1 # 11
def get_all_users_with_age_gte_20():
    users_qs = User.objects.all() # 1 - no

    users_qs_2 = users_qs.all() # 2 - no

    result_1 = [user for user in users_qs if user.age >= 20] # 3 - yes

    result_2 = (user for user in users_qs_2 if user.age >= 20)  # 4 - yes

    result_3 = users_qs.filter(age__gte=20) # 5 - no

    result_4 = [user for user in result_3] # 6 - yes

    print(result_1) # 7 - no (because it's already executed)

    print(len(result_2)) # 8 - no

    print(len(result_3)) # 9 - yes (`len` requires iterable)

    print(result_4) # 10 - no (already executed)

    return result_1 # 11 - no


# => 5 queries

Some tips

  • If you have large database tables:
    • only('field_i_need'), defer('field_that_i_dont_need')
    • use lighter data structure (tuple/list/dict)

 

  • If you have slow python logic over the objects (filtration..):
    • Sometimes an extra query is faster than a `for loop`

 

  • It's good idea to use `order_by()` ! ('id' ordering is default just for `first/last()`)

Queries Optimization

Insert/Update

ivo = User(name='Ivo')

milica = User(name='Milica')

User.objects.bulk_create([ivo, milica])

User.objects\
    .filter(name__icontains='iv')\
    .update(name='Ivaylo')

DELETE

User.objects\
    .filter(name__icontains='iv')\
    .delete()

3 types of problems:

#1 Too much queries

#2 Too much data

#3 Too much data & queries

SELECT

#1 Too much queries

.prefetch_related()

.select_related()

&

class SharedAccount(Model):
    pass



class User(Model):
    shared_account = ForeignKey(
        SharedAccount,
        related_name='users'
    )
SELECT *
    FROM "user"

ORM expression

User.objects.all()

SQL query

SELECT *
    FROM "user"
    INNER JOIN "shared_account"
        ON ("user"."shared_account_id" = "shared_account"."id")

ORM expression

User.objects.select_related('shared_account')

SQL query

class SharedAccount(Model):
    pass



class User(Model):
    shared_account = ForeignKey(
        SharedAccount,
        related_name='users'
    )
SELECT *
    FROM "shared_account"

ORM expression

SharedAccount.objects.all()

SQL query

SELECT *
    FROM "shared_account"

ORM expression

SharedAccount.objects.prefetch_related('users')

SQL query

SELECT *
    FROM "users"
    WHERE "user"."shared_account_id" IN (...);

FOLLOWED BY....

Tip 1:

you can use

.select_related() and .prefetch_related()

for simple JOIN-s

#2 Too much data

class User(Model):
    name = CharField(max_length=255)


class Playlist(Model):
    name = CharField(max_length=255)

    user = ForeignKey(User, related_name='playlists')


class Song(Model):
    title = CharField(max_length=255)
    length = PositiveIntegerField()

    playlist = ForeignKey(Playlist, related_name='songs')

Music Service Web Application

User

Playlist

Playlist

Playlist

Song

Song

Song

class User(Model):
    name = # ...


class Playlist(Model):
    user = ForeignKey(User, related_name='playlists')
    name = # ...

    @property
    def songs_total_length(self):
        total_length = 0
        
        for song in self.songs.all():
            total_length += song.length
   
        return total_length


class Song(Model):
    playlist = ForeignKey(Playlist, related_name='songs')
    title =  # ...
    length =  # ...
class User(Model):
    name = # ...


class Playlist(Model):
    user = ForeignKey(User, related_name='playlists')
    name = # ...

    @property
    def songs_total_length(self):
        songs_length = [s.length for s in self.songs.all()]
        return sum(songs_length)


class Song(Model):
    playlist = ForeignKey(Playlist, related_name='songs')
    title = # ...
    length = # ...
class User(Model):
    name = # ...

    @property
    def playlists_total_length(self):
        return sum([p.songs_total_length for p in self.playlists.all()])


class Playlist(Model):
    user = ForeignKey(User, related_name='playlists')
    name = # ...

    @property
    def songs_total_length(self):
        return sum([song.length for song in self.songs.all()])


class Song(Model):
    playlist = ForeignKey(Playlist, related_name='songs')
    title = # ...
    length = # ...

Task: Get all users' total length of their playlists

  • 10 Users
  • 10 Playlists per user
  • 10 songs per playlist

Task: Get all users' total length of their playlists

Total objects to fetch = 10 (Users) +\

                                          10 * 10 (Playlists) +\

                                          10 * 10 * 10 (Songs)

1110 DB rows in 3 different tables

for just 10 users

GROUP BY with aggregation(SUM)

class User(Model):
    name = # ...

    @property
    def playlists_total_length(self):
        return sum([p.songs_total_length for p in self.playlists.all()])


class Playlist(Model):
    user = ForeignKey(User, related_name='playlists')
    name = # ...

    @property
    def songs_total_length(self):
        return sum([song.length for song in self.songs.all()])


class Song(Model):
    playlist = ForeignKey(Playlist, related_name='songs')
    title = # ...
    length = # ...

Playlists with total songs length 

SELECT "playlist"."id",
       "playlist"."user_id",
       "playlist"."name",
       (
           SELECT SUM("song"."length")
               FROM "song"
               WHERE "song"."playlist_id" = ("playlist"."id")
               GROUP BY "song"."playlist_id" LIMIT 1
        ) AS "songs_total_length"
    FROM "playlist";
class Playlist(Model):
    user = ForeignKey(User, related_name='playlists')

    name = CharField(max_length=255)

    @property
    def songs_total_length(self):
        return sum([song.length for song in self.songs.all()])
class PlaylistQuerySet(QuerySet):
    @classmethod
    def songs_total_length(cls):
        return <sum aggregation over songs grouped by the playlist ???>

    def collect(self):
        return self.annotate(_songs_total_length=self.songs_total_length())


class Playlist(Model):
    objects = PlaylistQuerySet.as_manager()
    user = ForeignKey(User, related_name='playlists')
    name = CharField(max_length=255)

    @property
    def songs_total_length(self):
        if hasattr(self, 'songs_total_length'):
            return self._songs_total_length

        return sum([song.length for song in self.songs.all()])
class PlaylistQuerySet(QuerySet):
    @classmethod
    def songs_total_length(cls):
        return Subquery(
            query=...,
            output_field=...
        )

    def collect(self):
        return self.annotate(_songs_total_length=self.songs_total_length())


class Playlist(Model):
    objects = PlaylistQuerySet.as_manager()
    user = ForeignKey(User, related_name='playlists')
    name = CharField(max_length=255)

    @property
    def songs_total_length(self):
        if hasattr(self, 'songs_total_length'):
            return self._songs_total_length

        return sum([song.length for song in self.songs.all()])
class PlaylistQuerySet(QuerySet):
    @classmethod
    def songs_total_length(cls):
        return Subquery(
            queryset=...,
            output_field=models.IntegerField()
        )

    def collect(self):
        return self.annotate(_songs_total_length=self.songs_total_length())


class Playlist(Model):
    objects = PlaylistQuerySet.as_manager()
    user = ForeignKey(User, related_name='playlists')
    name = CharField(max_length=255)

    @property
    def songs_total_length(self):
        if hasattr(self, 'songs_total_length'):
            return self._songs_total_length

        return sum([song.length for song in self.songs.all()])
class PlaylistQuerySet(QuerySet):
    @classmethod
    def songs_total_length(cls):
        queryset = Song.objects \
            .values('playlist__id') \  # This is the GROUP BY field
            .  # ¯\_(ツ)_/¯

        return Subquery(
            queryset=queryset,
            output_field=models.IntegerField()
        )

    def collect(self):
        return self.annotate(_songs_total_length=self.songs_total_length())


class Playlist(Model):
    objects = PlaylistQuerySet.as_manager()
    user = ForeignKey(User, related_name='playlists')
    name = CharField(max_length=255)

    @property
    def songs_total_length(self):
        if hasattr(self, 'songs_total_length'):
            return self._songs_total_length

        return sum([song.length for song in self.songs.all()])
class PlaylistQuerySet(QuerySet):
    @classmethod
    def songs_total_length(cls):
        queryset = Song.objects \
            .values('playlist__id') \  # This is the GROUP BY field
            .values_list(Sum('length')) \  # The SUM aggregation
            .  # ¯\_(ツ)_/¯

        return Subquery(
            queryset=queryset,
            output_field=models.IntegerField()
        )

    def collect(self):
        return self.annotate(_songs_total_length=self.songs_total_length())


class Playlist(Model):
    objects = PlaylistQuerySet.as_manager()
    user = ForeignKey(User, related_name='playlists')
    name = CharField(max_length=255)

    @property
    def songs_total_length(self):
        if hasattr(self, 'songs_total_length'):
            return self._songs_total_length

        return sum([song.length for song in self.songs.all()])
class PlaylistQuerySet(QuerySet):
    @classmethod
    def songs_total_length(cls):
        queryset = Song.objects \
            .values('playlist__id') \  # This is the GROUP BY field
            .values_list(Sum('length'))  \ # The SUM aggregation
            .filter(playlist__id=OuterRef('id')) # !!!

        return Subquery(
            queryset=queryset,
            output_field=models.IntegerField()
        )

    def collect(self):
        return self.annotate(_songs_total_length=self.songs_total_length())


class Playlist(Model):
    objects = PlaylistQuerySet.as_manager()
    user = ForeignKey(User, related_name='playlists')
    name = CharField(max_length=255)

    @property
    def songs_total_length(self):
        if hasattr(self, 'songs_total_length'):
            return self._songs_total_length

        return sum([song.length for song in self.songs.all()])
Playlist.objects.collect()

SELECT "playlist"."id",
       "playlist"."user_id",
       "playlist"."name",
       (
           SELECT SUM("song"."length")
               FROM "song"
               WHERE "song"."playlist_id" = ("playlist"."id")
               GROUP BY "song"."playlist_id" LIMIT 1
        ) AS "songs_total_length"
    FROM "playlist";

...will be the Django ORM equivalent for

class PlaylistQuerySet(QuerySet):
    @classmethod
    def playlists_total_length(cls):
        pass

    def collect(self):
        return self.annotate(_playlists_total_length=self.playlists_total_length())


class User(Model):
    objects = UsersQuerySet.as_manager()

    name = CharField(max_length=255)

    @property
    def playlists_total_length(self):
        if hasattr(self, '_playlists_total_length'):
            return self._playlists_total_length

        playlists_length = [
            playlist.songs_total_length
            for playlist in self.playlists.all()
        ]

        return sum(playlist_length)
class UserQuerySet(QuerySet):
    @classmethod
    def playlists_total_length(cls):
        playlist_annotation = PlaylistQueryset.songs_total_length()

        queryset = Playlist.objects \
            .values('user__id') \
            .filter(user__id=OuterRef('id')) \
            .values_list(Sum(playlist_annotation))

        return Subquery(
            queryset=queryset,
            output_field=IntegerField()
        )

    def collect(self):
        return self.annotate(_playlists_total_length=self.playlists_total_length())


class User(Model):
    objects = UserQuerySet.as_manager()

    name = CharField(max_length=255)

    @property
    def playlists_total_length(self):
        if hasattr(self, '_playlists_total_length'):
            return self._playlists_total_length

        return sum([p.songs_total_length for p in self.playlists.all()])
SELECT "user"."id",
       "user"."name",
       (SELECT SUM((
            SELECT SUM(U0."length")
                FROM "song" U0
                WHERE U0."playlist_id" = (V0."id")
                GROUP BY U0."playlist_id"
                LIMIT 1
       ))
           FROM "playlist" V0
           WHERE V0."user_id" = ("user"."id")
           GROUP BY V0."user_id") AS "_playlists_total_length"
FROM "user"

Tip 2:

Subquery + OuterRef for GROUP BY

#2 Too much

data & queries

class User(Model):
    objects = UserQuerySet.as_manager()

    first_name = CharField(max_length=255)
    second_name = CharField(max_length=255)

    @property
    def verbose_name(self):
        return f'Name: {self.first_name} {self.last_name}.'
class UserQuerySet(QuerySet):
    @classmethod
    def verbose_name(cls):
        return ExpressionWrapper(
            output_field=f"Name: {F('first_name')} {F('second_name')}.",
            output_field=CharField()
        )

    def collect(self):
        return self.annotate(_verbose_name=verbose_name)


class User(Model):
    objects = UserQuerySet.as_manager()

    first_name = CharField(max_length=255)
    second_name = CharField(max_length=255)

    @property
    def verbose_name(self):
        if hasattr(self, '_verbose_name'):
            return self._verbose_name

        return f'Name: {self.first_name} {self.last_name}.'

Tip 3:

Use django.db.models.ExpressionWrapper

to avoid the need of loading heavy objects.

 

(You can use the `versbose_name` with values/values_list)

Tip 4:

https://github.com/jazzband/django-debug-toolbar

Post problem:

Does it really work?

The original tests

# test_models.py

class SongTests(TestCase):
    def test_song_length(self):
        length = 120
        real_length = 0.8 * length
        song = Song.objects.create(title='Europython', length=120)
        
        self.assertEqual(real_length, song.real_length)

The modified tests

# test_models.py

class SongTests(TestCase):
    def test_song_length(self):
        length = 120
        real_length = 0.8 * length
        song = Song.objects.create(title='Europython', length=120)
        
        self.assertEqual(real_length, song.real_length)

        song_collected = Song.objects.collect().get(id=song.id)

        self.assertEqual(real_length, song_collected.real_length)

 

Thank you!!!

https://github.com/Ivo-Donchev

https://twitter.com/DonchevIvaylo

https://www.facebook.com/ivodonchev267

i.donchev@hacksoft.io

Django ORM and queries optimization

By Ivaylo Donchev

Django ORM and queries optimization

  • 765