Celery basics

Celery

  • Async task queue & task runner

  • Distributed message passing

  • Implemented in Python

  • Plays well with Django

  • Pluggable brokers & storages

Basic Concepts

Concept #1: Broker

  • Creation of task queues
  • Dispatching tasks to task queues
  • Delivering tasks to workers

Concept #2: Consumer

One or more Celery workers that execute the tasks

Concept #3: Result Backend

Stores task results so they can be retrieved later

Keep in mind...

Rule #1:

Adding a task to a queue should be faster than performing the task itself.

Rule #2:

You should consume tasks faster than you produce them.

If not --> add more workers!
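
A minimal sketch of scaling out, assuming the tasks app from the
"First steps" slides: start more workers on the same broker, raising
--concurrency (the number of worker processes) as needed.

$ celery -A tasks worker --loglevel=info --concurrency=10 -n worker1@%h
$ celery -A tasks worker --loglevel=info --concurrency=10 -n worker2@%h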

StyleSage meets Celery

Use Case #1: Product Loader

Use Case #2: Images

http://www.zappos.com/images/z/3...

http://images.neimanmarcus.com/ca/1/pro...

Use Case #3: Mappings

Use Case #4: Scores

First steps

Application


from celery import Celery

app = Celery('tasks',
             broker='pyamqp://guest@localhost//',
             backend='redis://localhost')

@app.task
def add(x, y):
    return x + y

Running the Celery worker server


$ celery -A tasks worker --loglevel=info

Calling the task

from tasks import add

### apply_async(args[, kwargs[, …]])
# Sends a task message; args must be a tuple.
add.apply_async((4, 4))


### delay(*args, **kwargs)
# Shortcut to send a task message, 
# but doesn’t support execution options.
add.delay(4, 4)


### calling (__call__)
# The task is executed in the current process,
# not by a worker.
add(4, 4)
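
apply_async is the one to use when you need execution options;
a small sketch (the queue name is illustrative):

# Run no earlier than 10 seconds from now, on a specific
# queue, and drop the task if not executed within a minute.
add.apply_async((4, 4), countdown=10, queue='math', expires=60)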

Keeping Results


>>> result = add.delay(4, 4)

>>> result.ready()
False

>>> result.get(timeout=1)
8

Best practices

#1 Passing Big Objects to Tasks => Memory Leak


from celery import task

# Bad: the whole file content travels through the broker.
@task()
def process_file(file_content):
    print(file_content)


def run():
    for i in range(1000):
        with open('big_file_%s.text' % i, 'r') as f:
            process_file.delay(f.read())

#1 Passing Big Objects to Tasks => Memory Leak


from celery import task

# Good: pass only the file name; the worker reads the file itself.
@task()
def process_file(file_name):
    with open(file_name, 'r') as f:
        print(f.read())


def run():
    for i in range(1000):
        process_file.delay('big_file_%s.text' % i)

#2 Passing Database/ORM Objects => Race Condition

from celery import task

@task()
def update_user_picture(user_object, picture):
    """Update user profile picture in background, since the
       uploading process takes a while to complete."""

    new_profile_picture_url = upload_picture(picture)
    # Bad: user_object was loaded before the slow upload, so
    # saving it here overwrites any concurrent changes.
    user_object.profile_picture = new_profile_picture_url
    user_object.save()

def update_username(request, user_id, username):
    user_object = db.user.get(user_id=user_id)
    user_object.username = username
    user_object.save()

def update_user_picture_view(request, user_id, picture):
    user_object = db.user.get(user_id=user_id)
    update_user_picture.delay(user_object, picture)

#2 Passing Database/ORM Objects => Race Condition

@task()
def update_user_picture(user_id, picture):

    # Uploading takes time, make sure to get a fresh
    # user_object before updating/saving it.
    new_profile_picture_url = upload_picture(picture)

    user_object = db.user.get(user_id=user_id)
    user_object.profile_picture = new_profile_picture_url
    user_object.save()

def update_username(user_id, username):
    user_object = db.user.get(user_id=user_id)
    user_object.username = username
    user_object.save()

def update_user_picture_view(request, user_id, picture):
    update_user_picture.delay(user_id, picture)

#3 Route Tasks to Their Own Queues

from celery import task

@task()
def log_event():
    pass

@task()
def update_username():
    pass

@task()
def update_password():
    pass

@task()
def update_user_picture():
    pass

def run():
    log_event.delay()
    update_username.delay()
    update_password.delay()
    update_user_picture.delay()

# Route each task to its own queue (e.g. in the Celery settings):
CELERY_ROUTES = {
    'default.log_event': {
        'queue': 'log_event',
    },
    'default.update_username': {
        'queue': 'update_user_profile',
    },
    'default.update_password': {
        'queue': 'update_user_profile',
    },
    'default.update_user_picture': {
        'queue': 'update_user_profile',
    },
}
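
Dedicated workers can then consume just their own queues with -Q;
a minimal sketch (the proj app name is illustrative):

$ celery -A proj worker -Q log_event --concurrency=2
$ celery -A proj worker -Q update_user_profile --concurrency=4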

#4 Retry & Idempotence

from celery import task

@task()
def log_event(event):
    """
        Simply save the event object to database.
        Retrying will cause duplicate objects saved.
    """
    db.save(event)
  • Celery Tasks may fail or be interrupted.
  • Never assume the current state of the system when a task begins.
  • Change as little external state as possible.

#4 Retry & Idempotence

from celery import task

@task(default_retry_delay=10, max_retries=3)
def log_event(event):
    """
        Save the event object to database, only if it's 
        not been created yet.
    """
    if not db.event.find_one(ip=event.ip, user_agent=event.user_agent):
        db.save(event)
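
Retries can also be triggered explicitly with the task's retry()
method when the task is bound; a minimal sketch (ConnectionError
stands in for whatever your database driver actually raises):

from celery import task

@task(bind=True, default_retry_delay=10, max_retries=3)
def log_event(self, event):
    try:
        db.save(event)
    except ConnectionError as exc:
        # Re-queue the task; Celery waits default_retry_delay seconds
        # between attempts and gives up after max_retries.
        raise self.retry(exc=exc)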

#5 Property Caching

from celery import Task

class DatabaseTask(Task):
    abstract = True
    _db = None

    @property
    def db(self):
        """Cache the Database connection for reuse."""
        if self._db is None:
            self._db = Database.connect()
        return self._db

    def run(self, user_id, username):
        user = self.db.user.find_one(user_id=user_id)
        user["username"] = username
        self.db.user.save(user)
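
Tasks opt into the cached connection via the decorator's base
option; a short sketch (update_username_task is an illustrative name):

from celery import task

@task(base=DatabaseTask, bind=True)
def update_username_task(self, user_id, username):
    # self.db reuses one cached connection per worker process.
    user = self.db.user.find_one(user_id=user_id)
    user["username"] = username
    self.db.user.save(user)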

#6 Class Method as Task

from celery import task


class UserProfileUpdater(object):

    @staticmethod
    @task()
    def update_user_picture(user_id, picture):
        new_profile_picture_url = upload_picture(picture)
        user = db.user.find_one(user_id=user_id)
        user.profile_picture = new_profile_picture_url
        user.save()
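
Dispatching then works like any module-level task:

UserProfileUpdater.update_user_picture.delay(user_id, picture)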

Designing Workflows

6 tools

#1 Partials


# Any arguments added will be prepended
# to the args in the signature:
>>> partial = add.s(2)  # incomplete signature
>>> partial.delay(4)  # 4 + 2
>>> partial.apply_async((4,))  # same

# Any keyword arguments added will be merged
# with the kwargs in the signature
>>> s = add.s(2, 2)
>>> s.delay(debug=True)  # -> add(2, 2, debug=True)
>>> s.apply_async(kwargs={'debug': True})  # same
#2 Chains

>>> from celery import chain

>>> # 2 + 2 + 4 + 8
>>> res = chain(add.s(2, 2),
        add.s(4), add.s(8))()
>>> res.get()
16

# This can also be written using pipes:
(add.s(2, 2) | add.s(4) | add.s(8))().get()

#3 Immutable signatures

# Two ways to do it
>>> add.signature((2, 2), immutable=True)
>>> add.si(2, 2)

# A chain of independent tasks
>>> res = (add.si(2, 2) | add.si(4, 4) | add.si(8, 8))()
>>> res.get()
16

>>> res.parent.get()
8

>>> res.parent.parent.get()
4

#4 Groups


# Group of tasks to execute in parallel

>>> from celery import group

>>> res = group(add.s(i, i)
            for i in range(10))()

>>> res.get(timeout=1)
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

#5 Chord (group + chain)


>>> from celery import chord

>>> res = chord((add.s(i, i) for i in range(10)),
                 xsum.s())()
>>> res.get()
90

#6 Chunks

>>> from proj.tasks import add

>>> res = add.chunks(zip(range(100), range(100)), 10)()
>>> res.get()
[[0, 2, 4, 6, 8, 10, 12, 14, 16, 18],
 [20, 22, 24, 26, 28, 30, 32, 34, 36, 38],
 [40, 42, 44, 46, 48, 50, 52, 54, 56, 58],
 [60, 62, 64, 66, 68, 70, 72, 74, 76, 78],
 [80, 82, 84, 86, 88, 90, 92, 94, 96, 98],
 [100, 102, 104, 106, 108, 110, 112, 114, 116, 118],
 [120, 122, 124, 126, 128, 130, 132, 134, 136, 138],
 [140, 142, 144, 146, 148, 150, 152, 154, 156, 158],
 [160, 162, 164, 166, 168, 170, 172, 174, 176, 178],
 [180, 182, 184, 186, 188, 190, 192, 194, 196, 198]]


Periodic tasks

from celery import Celery

app = Celery()

@app.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
    # Calls test('hello') every 10 seconds.
    sender.add_periodic_task(10.0, test.s('hello'))

    # Calls test('world') every 30 seconds
    sender.add_periodic_task(30.0, test.s('world'))

@app.task
def test(arg):
    print(arg)
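
The same schedule can also be declared in configuration (Celery 4+);
a minimal sketch, assuming the test task lives in a module named tasks:

app.conf.beat_schedule = {
    'say-hello-every-10-seconds': {
        'task': 'tasks.test',
        'schedule': 10.0,
        'args': ('hello',),
    },
}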

Crontab schedules

from celery import Celery
from celery.schedules import crontab

app = Celery()

@app.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
    # Executes every Monday morning at 7:30 a.m.
    sender.add_periodic_task(
        crontab(hour=7, minute=30, day_of_week=1),
        test.s('Happy Mondays!'),
    )

@app.task
def test(arg):
    print(arg)

Solar schedules

from celery import Celery
from celery.schedules import solar

app = Celery()

@app.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
    # Executes at sunset in Melbourne
    sender.add_periodic_task(
        solar('sunset', -37.81753, 144.96715),
        test.s('Good night Melbourne!'),
    )

@app.task
def test(arg):
    print(arg)

Starting the Scheduler


$ celery -A proj beat
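
For development, beat can be embedded inside a worker with -B
(in production, run exactly one separate beat instance):

$ celery -A proj worker -B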

Django testing

"""
* No need for running RabbitMQ
* Don't do async (always eager)
* Propagate exceptions
"""
CELERY_BROKER_BACKEND = 'memory'
CELERY_ALWAYS_EAGER = True
CELERY_EAGER_PROPAGATES_EXCEPTIONS = True
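
With these settings .delay() runs in-process, so ordinary Django
tests can call tasks directly; a minimal sketch reusing the add
task from the first steps:

from django.test import TestCase

from tasks import add

class AddTaskTest(TestCase):
    def test_add_runs_eagerly(self):
        # ALWAYS_EAGER executes the task synchronously,
        # so the result is available immediately.
        result = add.delay(4, 4)
        self.assertTrue(result.ready())
        self.assertEqual(result.get(), 8)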


Monitoring

Flower
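
Flower is a web-based tool for monitoring and administering Celery;
a typical invocation (app name illustrative), serving a dashboard
on http://localhost:5555:

$ celery -A proj flower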

Thank you!
