Speedy protocol parsing in Python and Cython

Taras Voinarovskyi

https://github.com/tvoinarovskyi

Taras Voinarovskyi

https://github.com/tvoinarovskyi

Collaborator:

Member of aio-libs team

Where to start?

Read the protocol docs
Determine an API for Parser and Builder
Write some simple parse, build and serde tests.
Implement Parser and Builder
Connect to actual code, write integration tests etc.

Read the docs

RFC / http, smtp
Specification / amqp
Docs page / Postgres, MySQL, Kafka
Bad docs? Read the code =(

For example

Binary frame format:

Primitive types:
int8, int16, int32, int64 - Signed integers with the given precision (in bits)
                            stored in big endian order.

Dynamic types:
* string = Size<int32> + body
* array = Size<int16> + body

Protocol is a request-response type over a TCP connection. Request pipelining is
supported. Both Request and Response are Size<int32> delimited.

For example

Request Response types

Header:
* api_key <int8>
* sequence <int8>

Echo request:
* timestamp <int64>
* subject <string>
* metadata <array<int16>>
* body <string>

Echo response:
* id <int32>
* timestamp <int64>
* subject <string>
* body <string>

Determine the API


class Parser:
    """API sketch for the reading side: bytes go in, Response objects come out."""

    def feed(self, data: bytes) -> None:
        """Feed data that was read from the socket.

        Args:
            data: Bytes exactly as they were received from the socket.
        """

    def parse(self) -> List[Response]:
        """Return the parsed responses.

        Returns:
            List of Response subclass objects.
        """


class Builder:
    """API sketch for the writing side: requests go in, one bytes buffer comes out."""

    def add(self, req: Request) -> None:
        """Add a request object to the output buffer.

        Args:
            req: The request to append.
        """

    def get_buffer(self) -> bytes:
        """Get the buffer of appended requests.

        Returns:
            The bytes of the requests added so far.
        """

Write tests

# Reference wire image used by the tests. The first four bytes hold 43, which
# is the number of bytes that follow the size prefix.
# Both segments need the ``b`` prefix: Python rejects implicit concatenation
# of a bytes literal with a str literal.
verify_data = (
    b"\x00\x00\x00+\x00\x00\x00\x00\x00\x00\x00\x00'\x0ftest\x00\x01"
    b"\x00\x02\x00\x03We have a lot in common"
)


def test_parser():
    """Feed the reference bytes to a Parser and check the one parsed message."""
    expected_fields = (
        ("subject", b"test"),
        ("timestamp", 9999),
        ("metadata", [1, 2, 3]),
        ("body", b"We have a lot in common"),
    )

    parser = Parser()
    parser.feed(verify_data)
    parsed = parser.parse()

    assert len(parsed) == 1
    message = parsed[0]
    for field_name, expected in expected_fields:
        assert getattr(message, field_name) == expected

Write tests

# Reference wire image used by the tests. The first four bytes hold 43, which
# is the number of bytes that follow the size prefix.
# Both segments need the ``b`` prefix: Python rejects implicit concatenation
# of a bytes literal with a str literal.
verify_data = (
    b"\x00\x00\x00+\x00\x00\x00\x00\x00\x00\x00\x00'\x0ftest\x00\x01"
    b"\x00\x02\x00\x03We have a lot in common"
)


def test_reader():
    """Build one message and compare the produced bytes with the reference image."""
    builder = Builder()
    message_fields = {
        "subject": b"test",
        "timestamp": 9999,
        "metadata": [1, 2, 3],
        "body": b"We have a lot in common",
    }
    builder.add(**message_fields)

    produced = builder.get_buffer()
    assert produced == verify_data

Write tests

def test_serde():
    """Round trip: bytes produced by Builder must parse back to the same fields."""
    builder = Builder()
    builder.add(
        subject=b"test",
        timestamp=9999,
        metadata=[1, 2, 3],
        body=b"We have a lot in common")

    parser = Parser()
    # The builder exposes its bytes through ``get_buffer``. ``feed`` is
    # declared to return None, so the parsed objects are collected with
    # ``parse``.
    parser.feed(builder.get_buffer())
    responses = parser.parse()

    assert len(responses) == 1
    resp = responses[0]
    assert resp.subject == b"test"
    assert resp.timestamp == 9999
    assert resp.metadata == [1, 2, 3]
    assert resp.body == b"We have a lot in common"

Parser

Framing - split the incoming TCP stream into blocks
Parsing - turn the byte representation into typed values
Object representation - returning a user friendly object like Message or Response

Builder

Space allocation - either dynamic or preallocate
Serialization - turn typed values into their byte representation

Framing

import socket

def read_frame(sock: socket.socket):
    """Read exactly one Size<int32>-prefixed frame from a blocking socket.

    This is the naive, one-frame-at-a-time approach: it issues at least two
    ``recv`` calls per frame and ties framing to the socket.

    Args:
        sock: Connected socket to read from.

    Returns:
        The frame body as ``bytes``, without the size prefix.

    Raises:
        ConnectionError: The peer closed the connection before a whole frame
            arrived.
        ValueError: The size prefix is negative.
    """

    def recv_exactly(count):
        # ``recv`` may hand back fewer bytes than requested, so keep asking
        # until ``count`` bytes arrived. An empty chunk means the peer closed.
        received = bytearray()
        while len(received) < count:
            chunk = sock.recv(count - len(received))
            if not chunk:
                raise ConnectionError("socket closed in the middle of a frame")
            received += chunk
        return bytes(received)

    # ``struct.unpack`` always returns a tuple, so unpack the single value.
    size, = struct.unpack(">i", recv_exactly(4))  # Int32 size
    if size < 0:
        raise ValueError("negative frame size: %d" % size)
    return recv_exactly(size)

Never do this!

Framing

import socket
from .parser import Parser

MAX_READ_SIZE = 2 * 1024


def read(sock: socket.socket, parser: Parser):
    """Do one bounded read from the socket and hand the bytes to the parser.

    Args:
        sock: Socket to read from; at most MAX_READ_SIZE bytes are requested.
        parser: Object whose ``feed`` receives the bytes that were read.

    Returns:
        Whatever ``parser.feed`` returns for the received bytes.

    Raises:
        Disconnected: ``recv`` returned no data.
    """
    chunk = sock.recv(MAX_READ_SIZE)
    if chunk:
        return parser.feed(chunk)
    raise Disconnected

Better!

SIZE_LENGTH = 4  # Int32


class Parser:
    """Split an incoming byte stream into Size<int32>-prefixed frames.

    Bytes are accumulated in an internal buffer, so frames may arrive split
    across any number of ``feed`` calls.
    """

    def __init__(self):
        self._buffer = bytearray()  # <-- A quite good choice for compatibility
        # End offset of the frame currently being awaited, or None while the
        # size prefix of the next frame has not been read yet.
        self._next_pos = None

    def feed(self, data: bytes):
        """Append ``data`` and return every frame that is now complete.

        Args:
            data: Bytes read from the socket; may hold partial frames.

        Returns:
            List of frame bodies (``bytearray``, size prefix stripped), in
            arrival order. Empty when no frame is complete yet.

        Raises:
            ValueError: A frame announces a negative size.
        """
        self._buffer.extend(data)

        frames = []
        while True:
            frame = self._split_frame()
            if frame is None:
                break
            frames.append(frame)
        return frames

    def _split_frame(self):
        """Cut one complete frame off the buffer, or return None if incomplete."""
        if self._next_pos is None:
            if len(self._buffer) < SIZE_LENGTH:
                return None
            # ``unpack_from`` returns a tuple; take its single element.
            size, = struct.unpack_from(">i", self._buffer)
            if size < 0:
                raise ValueError("negative frame size: %d" % size)
            self._next_pos = size + SIZE_LENGTH

        end = self._next_pos
        if len(self._buffer) < end:
            return None

        frame = self._buffer[SIZE_LENGTH:end]
        del self._buffer[:end]  # drop consumed bytes in place
        # Forget the boundary so the next call reads a fresh size prefix.
        self._next_pos = None
        return frame

Bytearray like types

>>> big_data_buffer = bytes(10 * 1024 * 1024)
>>> big_data_buffer[10: 15]  # <-- Slices are cheap only for small slices
b'\x00\x00\x00\x00\x00'

>>> data_view = memoryview(big_data_buffer)
>>> data_view[10: 15]  # <-- Will always be cheap
<memory at 0x101fafac8>
>>> data_view[10: 15].tobytes()
b'\x00\x00\x00\x00\x00'

>>> out_buffer = bytearray(10 * 1024)  # <-- Use writable bytearray for outgoing buffers
>>> out_buffer[10: 15] = data_view[10: 15]  # <-- Will do memcpy on C, fast

>>> out_view = memoryview(out_buffer)  # <-- We can do views on bytearray too
>>> out_view[10: 15] = data_view[10: 15]  # <-- Slicing will be the same
>>> out_view.release()  # <-- Unblock writing to bytearray

Avoid strange cases!

>>> out_buffer[:10] = out_buffer[10:]  # <-- Moving data is Ok, but not big chunks
>>> out_buffer = out_buffer[10:] + bytearray(10)  # <-- Easier and predictable

Peek inside a bit

/* Header for variable-size objects: the base object plus an item count. */
typedef struct {
    PyObject ob_base;
    Py_ssize_t ob_size; /* Number of items in variable part */
} PyVarObject;


/* bytes object: variable-size header, a cached hash, and the data itself
 * held in the trailing ob_sval array. */
typedef struct {
    PyObject_VAR_HEAD
    Py_hash_t ob_shash;
    char ob_sval[1];

    /* Invariants:
     *     ob_sval contains space for 'ob_size+1' elements.
     *     ob_sval[ob_size] == 0.
     *     ob_shash is the hash of the string or -1 if not computed yet.
     */
} PyBytesObject;

/* bytearray object: the data sits in a buffer referenced through ob_bytes,
 * with ob_start marking where the logical content begins inside it; the
 * struct also counts outstanding buffer exports. */
typedef struct {
    PyObject_VAR_HEAD
    Py_ssize_t ob_alloc; /* How many bytes allocated in ob_bytes */
    char *ob_bytes;      /* Physical backing buffer */
    char *ob_start;      /* Logical start inside ob_bytes */
    /* XXX(nnorwitz): should ob_exports be Py_ssize_t? */
    int ob_exports;      /* How many buffer exports */
} PyByteArrayObject;

/* Managed buffer: keeps the Py_buffer snapshot taken from the original
 * exporter together with a count of the memoryviews created on top of it. */
typedef struct {
    PyObject_HEAD
    int flags;          /* state flags */
    Py_ssize_t exports; /* number of direct memoryview exports */
    Py_buffer master; /* snapshot buffer obtained from the original exporter */
} _PyManagedBufferObject;



/* memoryview object: points at a managed buffer and carries its own private
 * copy of the view, so it describes the data without owning a copy of it. */
typedef struct {
    PyObject_VAR_HEAD
    _PyManagedBufferObject *mbuf; /* managed buffer */
    Py_hash_t hash;               /* hash value for read-only views */
    int flags;                    /* state flags */
    Py_ssize_t exports;           /* number of buffer re-exports */
    Py_buffer view;               /* private copy of the exporter's view */
    PyObject *weakreflist;
    Py_ssize_t ob_array[1];       /* shape, strides, suboffsets */
} PyMemoryViewObject;

Py2

str
unicode
bytearray
memoryview

Py3

bytes
str
bytearray
memoryview

Python2 has some differences

>>> memoryview(b"123")[0]
'1'
>>> bytearray(b"123")[0]
49
>>> bytes(b"123")[0]
'1'
>>> bytes(10)
'10'
>>> bytearray(10)
bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')

Parsing values

import struct

# Frame header, big-endian, two fields in this order:
#   Api Key  => Int8
#   Sequence => Int8
HEADER_STRUCT = struct.Struct(">bb")

class Parser:
    """Turns the bytes of one frame into a typed request object."""

    def parse_frame(self, frame: bytearray):
        """Decode a single echo-request frame.

        Layout after the header: timestamp <int64>, subject <string>,
        metadata <array<int16>>, body <string>. Strings carry an int32
        length and arrays an int16 item count.

        Args:
            frame: Frame body without the leading size prefix.

        Returns:
            ``EchoRequest(timestamp, subject, metadata, body)``, with
            ``metadata`` as a list of ints.
        """
        api_key, sequence = HEADER_STRUCT.unpack_from(frame)
        pos = HEADER_STRUCT.size
        assert api_key == 0

        timestamp, subj_len = struct.unpack_from(">qi", frame, pos)
        pos += struct.calcsize(">qi")
        subject = bytes(frame[pos: pos + subj_len])
        pos += subj_len
        metadata_len, = struct.unpack_from(">h", frame, pos)
        pos += struct.calcsize(">h")

        # <-- We use dynamic format string: ``metadata_len`` int16 items
        # followed by the int32 length of the body. Star-unpacking keeps this
        # correct for any item count, including zero.
        fmt = ">%dhi" % metadata_len
        *metadata, body_len = struct.unpack_from(fmt, frame, pos)
        pos += struct.calcsize(fmt)
        body = bytes(frame[pos: pos + body_len])

        return EchoRequest(timestamp, subject, metadata, body)

Object presentation

tuple
namedtuple
attrs library (http://www.attrs.org)
etc.

@attr.s
class EchoRequest:
    """Value object for an echo request, declared with attrs.

    Every field names a converter, so values are normalised to the listed
    type when an instance is created.
    """
    timestamp = attr.ib(converter=int)  # converted with int()
    subject = attr.ib(converter=bytes)  # converted with bytes()
    metadata = attr.ib(converter=list)  # converted with list()
    body = attr.ib(converter=bytes)  # converted with bytes()

That's good and all, but you will still do it on raw classes for Cython =)

class Builder:
    """Serializes echo responses into Size<int32>-prefixed frames."""

    def __init__(self):
        self._buffers = []  # one bytearray per added message
        self._size = 0  # total bytes across all buffers

    def add(self, id, timestamp, subject, body):
        """Serialize one response and queue it for sending.

        Args:
            id: Response id, written as int32.
            timestamp: Written as int64.
            subject: Bytes-like subject, written as int32 length + data.
            body: Bytes-like body, written as int32 length + data.

        Returns:
            Number of bytes the message occupies, size prefix included.
        """
        subj_len = len(subject)
        body_len = len(body)

        # header + id<int32> timestamp<int64> subj_len<int32> body_len<int32>
        # + the two variable-length payloads
        payload_size = (
            HEADER_STRUCT.size + struct.calcsize(">iqii") +
            subj_len + body_len
        )
        # <-- Many protocols will require size as first few bytes
        size = SIZE_LENGTH + payload_size

        msg_buf = bytearray(size)  # <-- We need a writeable structure

        pos = 0
        # The prefix counts only the bytes that follow it.
        struct.pack_into(">i", msg_buf, pos, payload_size)
        pos += SIZE_LENGTH
        HEADER_STRUCT.pack_into(msg_buf, pos, 0, 0)
        pos += HEADER_STRUCT.size
        struct.pack_into(">iqi", msg_buf, pos, id, timestamp, subj_len)
        pos += struct.calcsize(">iqi")

        msg_buf[pos: pos + subj_len] = subject
        # Step over the subject so the body length does not overwrite it.
        pos += subj_len
        struct.pack_into(">i", msg_buf, pos, body_len)
        pos += struct.calcsize(">i")
        msg_buf[pos: pos + body_len] = body

        self._size += size
        self._buffers.append(msg_buf)
        return size

    def get_buffer(self):
        """Return all queued messages joined into a single ``bytes`` object."""
        return b"".join(self._buffers)

Builder

Before you speed up

Profile (vmprof, cProfile)
Benchmark (perf)
100% Coverage
Always do functional/integration testing!

Cython problems

No struct module in C API!!!
Be aware of integer endianness

# Declare the byte-order conversion helpers from the platform's socket header:
# winsock2.h when UNAME_SYSNAME is "Windows", arpa/inet.h otherwise. Both
# branches declare the same four functions.
# NOTE(review): compile-time IF/ELSE is reported as deprecated in Cython 3 -
# confirm against the Cython version in use.
IF UNAME_SYSNAME == "Windows":
    cdef extern from "winsock2.h":
        uint32_t htonl(uint32_t hostlong)
        uint16_t htons(uint16_t hostshort)
        uint32_t ntohl(uint32_t netlong)
        uint16_t ntohs(uint16_t netshort)
ELSE:
    cdef extern from "arpa/inet.h":
        uint32_t htonl(uint32_t hostlong)
        uint16_t htons(uint16_t hostshort)
        uint32_t ntohl(uint32_t netlong)
        uint16_t ntohs(uint16_t netshort)

cdef inline void pack_int64(char* buf, int64_t x):
    # Write x into buf[0:8] as two 32-bit words, each passed through htonl:
    # the upper 32 bits go to buf[0:4], the lower 32 bits to buf[4:8].
    # The shift is done on the unsigned 64-bit reinterpretation of x.
    (<uint32_t*>buf)[0] = htonl(<uint32_t>(<uint64_t>(x) >> 32))
    (<uint32_t*>&buf[4])[0] = htonl(<uint32_t>(x))


cdef inline int64_t unpack_int64(const char* buf):
    # Rebuild a 64-bit value from two 32-bit reads: the word at buf becomes
    # the upper half, the word at buf + 4 the lower half.
    # unpack_int32 is not visible in this excerpt - presumably it reads a
    # signed big-endian int32; TODO confirm.
    cdef int64_t hh = unpack_int32(buf)
    # The lower half is held as unsigned before being OR-ed in.
    cdef uint32_t hl = <uint32_t>unpack_int32(&buf[4])

    return (hh << 32) | hl

Cython problems

Be aware of bytes types C API and Buffer Protocol!

from cpython cimport (
    PyObject_GetBuffer, PyBuffer_Release, 
    PyBUF_SIMPLE, PyBUF_WRITABLE, Py_buffer,
    PyBytes_FromStringAndSize
)

# Hand-written declarations of bytearray C-API calls taken from Python.h.
cdef extern from "Python.h":   # <-- Not everything is defined in Cython yet
    ssize_t PyByteArray_GET_SIZE(object)
    char* PyByteArray_AS_STRING(bytearray ba)
    # `except -1` tells Cython that a -1 return signals a raised exception.
    int PyByteArray_Resize(object, ssize_t) except -1  # <-- Don't forget exceptions

from libc.string cimport memcpy  # <-- When in C do it the C way

cdef handle(object data, char* out_buf, Py_ssize_t pos):
    # Copy the bytes that `data` exposes through the buffer protocol into
    # out_buf, starting at offset pos.
    # NOTE(review): nothing here checks that out_buf has room for buf.len
    # bytes past pos - presumably the caller guarantees it; confirm.
    cdef:
        Py_buffer buf

    # Acquire a simple (contiguous, read-only) view of the object's memory.
    PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
    memcpy(&out_buf[pos], <char*>buf.buf, <size_t>buf.len)
    # Every acquired buffer has to be released again.
    PyBuffer_Release(&buf)

Cython problems

Little of Python is visible from C API

# Each of these lines goes through the Python object layer, which is what
# makes it expensive when called from Cython code.
import time

timestamp = int(time.time() * 1000)  # BuBu!!

# crc32 is provided by zlib; the gzip module has no crc32 attribute.
import zlib

crc = zlib.crc32(some_big_data)  # BuBu!!

data_view = memoryview(some_big_data)
data_view = data_view[start: end]  # BuBu!!

Object presentation

Use Cython's fast initialization

# Extension type holding one request; all four attributes are readable from
# Python but can only be assigned from Cython code.
# NOTE(review): `no_gc_clear` is documented as switching off only the
# tp_clear slot; the decorator that takes the type out of cyclic GC entirely
# is `@cython.no_gc` - confirm which one is intended here.
@cython.no_gc_clear  # <-- Disable GC, as no circular dependencies
@cython.final        # <-- No need for subclasses
@cython.freelist(REQUEST_FREELIST_SIZE)  # <-- Avoid reallocation
cdef class Request:

    cdef:
        readonly int64_t timestamp
        readonly object subject
        readonly object body
        readonly tuple metadata


    # Regular Python-level constructor: stores the four values as given.
    def __init__(self, int64_t timestamp, subject, body, metadata):
        self.timestamp = timestamp
        self.subject = subject
        self.body = body
        self.metadata = metadata

    # ...

Not fast enough? Ehmm, no choice but C =(

    # ...

    @staticmethod
    cdef inline Request new(
            int64_t timestamp, subject, body, metadata):
        """ Fast constructor to initialize from C.

        Allocates the instance with ``__new__`` and assigns the four
        attributes directly, so ``__init__`` is not invoked.

        Returns: the newly created Request.
        """
        cdef:
            Request req

        req = Request.__new__(Request)  # <-- We don't call Python API for this
        req.timestamp = timestamp
        req.subject = subject
        req.body = body
        req.metadata = metadata
        # Return the instance built above (`record` was never defined).
        return req

Speedy protocol parsing in Python and Cython

Where to start?

Read the docs

For example

For example

Determine the API

Write tests

Write tests

Write tests

Parser

Builder

Framing

Framing

Bytearray like types

Peek inside a bit

Py2

Py3

Python2 has some differences

Parsing values

Object presentation

Builder

Before you speed up

Cython problems

Cython problems

Cython problems

Object presentation

Questions?