Speedy protocol parsing in Python and Cython

Taras Voinarovskyi

Taras Voinarovskyi

Member of aio-libs team

Where to start?

  • Read the protocol docs
  • Determine an API for Parser and Builder
  • Write some simple parse, build and serde tests.
  • Implement Parser and Builder
  • Connect to actual code, write integration tests etc.

Read the docs

  • RFC / http, smtp
  • Specification / amqp
  • Docs page / Postgres, MySQL, Kafka
  • Bad docs? Read the code =(

For example

Binary frame format:

Primitive types:
int8, int16, int32, int64 - Signed integers with the given precision (in bits)
                            stored in big endian order.

Dynamic types:
* string = Size<int32> + body
* array = Size<int16> + body

Protocol is a request-response type over a TCP connection. Request pipelining is
supported. Both Request and Response are Size<int32> delimited.

For example

Request Response types

Header:
* api_key <int8>
* sequence <int8>

Echo request:
* timestamp <int64>
* subject <string>
* metadata <array<int16>>
* body <string>

Echo response:
* id <int32>
* timestamp <int64>
* subject <string>
* body <string>

Determine the API


class Parser:
    """API sketch for the receiving side: bytes go in via feed(), objects come out via parse()."""

    def feed(self, data: bytes) -> None:
        """Feed data that was read from socket.

        Args:
            data: Raw bytes exactly as they were received.
        """

    def parse(self) -> List[Response]:
        """Return the responses parsed so far.

        Returns:
            List of Response subclass objects.
        """


class Builder:
    """API sketch for the sending side: requests go in via add(), bytes come out via get_buffer()."""

    def add(self, req: Request) -> None:
        """Add request object to output buffer.

        Args:
            req: The request to append.
        """

    def get_buffer(self) -> bytes:
        """Get the buffer of appended requests.

        Returns:
            The accumulated output as bytes.
        """

Write tests

# Reference frame used by the tests: a 4-byte big-endian size prefix
# (0x2b == 43) followed by the 43 payload bytes.
# Both literals must carry the ``b`` prefix: Python 3 refuses to concatenate
# a bytes literal with a str literal.
verify_data = (
    b"\x00\x00\x00+\x00\x00\x00\x00\x00\x00\x00\x00'\x0ftest\x00\x01"
    b"\x00\x02\x00\x03We have a lot in common"
)


def test_parser():
    """Feed the reference frame to a Parser and check each decoded field."""
    parser = Parser()
    parser.feed(verify_data)
    parsed = parser.parse()

    assert len(parsed) == 1
    first = parsed[0]

    # Checked in this order so the first mismatch reported stays the same.
    expected_fields = (
        ("subject", b"test"),
        ("timestamp", 9999),
        ("metadata", [1, 2, 3]),
        ("body", b"We have a lot in common"),
    )
    for field_name, expected in expected_fields:
        assert getattr(first, field_name) == expected

Write tests

# Reference frame used by the tests: a 4-byte big-endian size prefix
# (0x2b == 43) followed by the 43 payload bytes.
# Both literals must carry the ``b`` prefix: Python 3 refuses to concatenate
# a bytes literal with a str literal.
verify_data = (
    b"\x00\x00\x00+\x00\x00\x00\x00\x00\x00\x00\x00'\x0ftest\x00\x01"
    b"\x00\x02\x00\x03We have a lot in common"
)


def test_reader():
    """Serialize one message with Builder and compare it to the reference frame."""
    builder = Builder()
    message_fields = dict(
        subject=b"test",
        timestamp=9999,
        metadata=[1, 2, 3],
        body=b"We have a lot in common",
    )
    builder.add(**message_fields)

    produced = builder.get_buffer()
    assert produced == verify_data

Write tests

def test_serde():
    """Round-trip test: what Builder serializes, Parser must decode unchanged."""
    builder = Builder()
    builder.add(
        subject=b"test",
        timestamp=9999,
        metadata=[1, 2, 3],
        body=b"We have a lot in common")

    parser = Parser()
    # Builder exposes get_buffer() (there is no get()), and Parser.feed() is
    # declared to return None, so the decoded objects come from parse().
    parser.feed(builder.get_buffer())
    responses = parser.parse()

    assert len(responses) == 1
    resp = responses[0]
    assert resp.subject == b"test"
    assert resp.timestamp == 9999
    assert resp.metadata == [1, 2, 3]
    assert resp.body == b"We have a lot in common"

Parser

  • Framing - split the incoming TCP stream into blocks
  • Parsing - turn byte representation into typed values
  • Object representation - returning a user friendly object like Message or Response

Builder

  • Space allocation - either dynamic or preallocate
  • Serialization - turn typed values into byte representation

Framing

import socket

def _recv_exact(sock, count):
    """Read exactly ``count`` bytes from ``sock``, looping over short reads."""
    chunks = []
    remaining = count
    while remaining > 0:
        chunk = sock.recv(remaining)
        if not chunk:
            # recv() returning b"" means the peer closed the connection.
            raise ConnectionError("socket closed before a full frame was read")
        chunks.append(chunk)
        remaining -= len(chunk)
    return b"".join(chunks)


def read_frame(sock: socket.socket):
    """Read one Size<int32>-prefixed frame from a blocking socket.

    This is the naive per-frame approach: the size prefix and the body are
    fetched with separate recv() calls, and the caller blocks until the
    whole frame has arrived.

    Args:
        sock: Connected socket to read from.

    Returns:
        The frame body as bytes, without the size prefix.

    Raises:
        ConnectionError: The socket was closed in the middle of a frame.
        ValueError: The size prefix is negative.
    """
    # struct.unpack always returns a tuple, hence the one-element unpacking.
    size, = struct.unpack(">i", _recv_exact(sock, 4))
    if size < 0:
        raise ValueError("negative frame size: %d" % size)
    return _recv_exact(sock, size)

Never do this!

Framing

import socket
from .parser import Parser

MAX_READ_SIZE = 2 * 1024  # upper bound for a single recv() call


def read(sock: socket.socket, parser: Parser):
    """Do one recv() of at most MAX_READ_SIZE bytes and pass it to the parser.

    Returns whatever ``parser.feed`` returns for the received bytes.
    Raises Disconnected when recv() yields no data.
    """
    chunk = sock.recv(MAX_READ_SIZE)
    if chunk:
        return parser.feed(chunk)
    raise Disconnected

Better!

SIZE_LENGTH = 4  # Int32


class Parser:
    """Split an incoming byte stream into Size<int32>-prefixed frames.

    Bytes are accumulated in an internal bytearray; every call to feed()
    returns the frames that became complete, without their size prefix.
    """

    def __init__(self):
        self._buffer = bytearray()  # <-- A quite good choice for compatibility
        # End offset of the frame currently being assembled, or None while
        # its size prefix has not been read yet.
        self._next_pos = None

    def feed(self, data: bytes):
        """Append ``data`` to the buffer and return all frames now complete.

        Args:
            data: Bytes as read from the socket; may hold a partial frame,
                exactly one frame, or several frames.

        Returns:
            List of frame bodies (bytearray), possibly empty.

        Raises:
            ValueError: A size prefix is negative.
        """
        self._buffer.extend(data)

        frames = []
        while True:
            frame = self._split_frame()
            if frame is None:
                break
            frames.append(frame)
        return frames

    def _split_frame(self):
        """Cut one complete frame off the front of the buffer, or return None."""
        if self._next_pos is None:
            if len(self._buffer) < SIZE_LENGTH:
                return None
            # unpack_from returns a tuple even for a single value.
            size, = struct.unpack_from(">i", self._buffer)
            if size < 0:
                raise ValueError("negative frame size: %d" % size)
            self._next_pos = size + SIZE_LENGTH

        if len(self._buffer) < self._next_pos:
            return None

        end = self._next_pos
        frame = self._buffer[SIZE_LENGTH:end]
        del self._buffer[:end]
        # Forget the offset so the next frame's own size prefix is read.
        self._next_pos = None
        return frame

Bytearray like types

>>> big_data_buffer = bytes(10 * 1024 * 1024)
>>> big_data_buffer[10: 15]  # <-- Slices are cheap only for small slices
b'\x00\x00\x00\x00\x00'

>>> data_view = memoryview(big_data_buffer)
>>> data_view[10: 15]  # <-- Will always be cheap
<memory at 0x101fafac8>
>>> data_view[10: 15].tobytes()
b'\x00\x00\x00\x00\x00'

>>> out_buffer = bytearray(10 * 1024)  # <-- Use writable bytearray for outgoing buffers
>>> out_buffer[10: 15] = data_view[10: 15]  # <-- Will do memcpy on C, fast

>>> out_view = memoryview(out_buffer)  # <-- We can do views on bytearray too
>>> out_view[10: 15] = data_view[10: 15]  # <-- Slicing will be the same
>>> out_view.release()  # <-- Release the view so the bytearray can be resized again

Avoid strange cases!

>>> out_buffer[:10] = out_buffer[10:]  # <-- Moving data is Ok, but not big chunks
>>> out_buffer = out_buffer[10:] + bytearray(10)  # <-- Easier and predictable

Peek inside a bit

/* Header shared by variable-size objects: the base object followed by the
 * number of items in the variable part. */
typedef struct {
    PyObject ob_base;
    Py_ssize_t ob_size; /* Number of items in variable part */
} PyVarObject;


/* Layout of a bytes object: variable-size header, cached hash, and the
 * character data stored inline at the end of the struct. */
typedef struct {
    PyObject_VAR_HEAD
    Py_hash_t ob_shash;  /* cached hash, see invariants below */
    char ob_sval[1];     /* first byte of the inline data */

    /* Invariants:
     *     ob_sval contains space for 'ob_size+1' elements.
     *     ob_sval[ob_size] == 0.
     *     ob_shash is the hash of the string or -1 if not computed yet.
     */
} PyBytesObject;

/* Layout of a bytearray object: unlike PyBytesObject the data is not inline;
 * it lives in a separate backing buffer reached through pointers, and the
 * object tracks how many buffer exports are outstanding. */
typedef struct {
    PyObject_VAR_HEAD
    Py_ssize_t ob_alloc; /* How many bytes allocated in ob_bytes */
    char *ob_bytes;      /* Physical backing buffer */
    char *ob_start;      /* Logical start inside ob_bytes */
    /* XXX(nnorwitz): should ob_exports be Py_ssize_t? */
    int ob_exports;      /* How many buffer exports */
} PyByteArrayObject;
/* Holder for the buffer obtained from the original exporter, together with
 * a count of the memoryviews exported directly from it. */
typedef struct {
    PyObject_HEAD
    int flags;          /* state flags */
    Py_ssize_t exports; /* number of direct memoryview exports */
    Py_buffer master; /* snapshot buffer obtained from the original exporter */
} _PyManagedBufferObject;



/* Layout of a memoryview object: it points at a managed buffer and keeps its
 * own private copy of the exporter's view, followed by a variable-size array
 * for shape, strides and suboffsets. */
typedef struct {
    PyObject_VAR_HEAD
    _PyManagedBufferObject *mbuf; /* managed buffer */
    Py_hash_t hash;               /* hash value for read-only views */
    int flags;                    /* state flags */
    Py_ssize_t exports;           /* number of buffer re-exports */
    Py_buffer view;               /* private copy of the exporter's view */
    PyObject *weakreflist;
    Py_ssize_t ob_array[1];       /* shape, strides, suboffsets */
} PyMemoryViewObject;

Py2

  • str
  • unicode
  • bytearray
  • memoryview

Py3

  • bytes
  • str
  • bytearray
  • memoryview

Python2 has some differences

>>> memoryview(b"123")[0]
'1'
>>> bytearray(b"123")[0]
49
>>> bytes(b"123")[0]
'1'
>>> bytes(10)
'10'
>>> bytearray(10)
bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')

Parsing values

import struct

# Precompiled frame header, big-endian:
#   byte 0: Api Key  => Int8
#   byte 1: Sequence => Int8
HEADER_STRUCT = struct.Struct(">bb")

class Parser:
    """Decodes one already-framed Echo request into an EchoRequest object."""

    def parse_frame(self, frame: bytearray):
        """Parse a single frame body (size prefix already stripped).

        Layout read from ``frame``: header, timestamp<int64>,
        subject<string>, metadata<array<int16>>, body<string>, where a
        string is an int32 length plus bytes and the array is an int16 count
        plus that many int16 items.

        Args:
            frame: Buffer holding exactly one frame.

        Returns:
            EchoRequest built from the decoded fields.

        Raises:
            struct.error: The frame is shorter than its fixed-size fields.
        """
        api_key, _sequence = HEADER_STRUCT.unpack_from(frame)
        pos = HEADER_STRUCT.size
        assert api_key == 0

        timestamp, subj_len = struct.unpack_from(">qi", frame, pos)
        pos += struct.calcsize(">qi")
        subject = bytes(frame[pos: pos + subj_len])
        pos += subj_len
        metadata_len, = struct.unpack_from(">h", frame, pos)
        pos += struct.calcsize(">h")

        # Dynamic format string: N int16 metadata items followed by the int32
        # body length, so both are decoded with a single unpack call.
        fmt = ">%dhi" % metadata_len
        *metadata, body_len = struct.unpack_from(fmt, frame, pos)
        pos += struct.calcsize(fmt)
        body = bytes(frame[pos: pos + body_len])

        return EchoRequest(timestamp, subject, metadata, body)

Object representation

@attr.s
class EchoRequest:
    """Value object for an Echo request; every field is run through a converter."""
    # Each attribute is passed through the callable given as ``converter``.
    timestamp = attr.ib(converter=int)
    subject = attr.ib(converter=bytes)
    metadata = attr.ib(converter=list)
    body = attr.ib(converter=bytes)

That's good and all, but you will still do it on raw classes for Cython =)

class Builder:
    """Serializes Echo responses into size-prefixed frames.

    Each add() call produces one independent bytearray; get_buffer() joins
    them into the bytes to send.
    """

    def __init__(self):
        self._buffers = []
        self._size = 0

    def add(self, id, timestamp, subject, body):
        """Serialize one message and append it to the output.

        Args:
            id: Value written as int32.
            timestamp: Value written as int64.
            subject: Bytes-like subject, written as int32 length + data.
            body: Bytes-like body, written as int32 length + data.

        Returns:
            Total number of bytes the message occupies, size prefix included.
        """
        subj_len = len(subject)
        body_len = len(body)

        # size<int32> header id<int32> timestamp<int64>
        # subj_len<int32> subject body_len<int32> body
        size = (  # <-- Many protocols will require size as first few bytes
            SIZE_LENGTH + HEADER_STRUCT.size + struct.calcsize(">iqii") +
            subj_len + body_len
        )

        msg_buf = bytearray(size)  # <-- We need a writeable structure

        pos = 0
        # The prefix counts only the bytes that follow it, which is what the
        # frame splitter expects (it adds SIZE_LENGTH to the decoded value).
        struct.pack_into(">i", msg_buf, pos, size - SIZE_LENGTH)
        pos += SIZE_LENGTH
        HEADER_STRUCT.pack_into(msg_buf, pos, 0, 0)
        pos += HEADER_STRUCT.size
        struct.pack_into(">iqi", msg_buf, pos, id, timestamp, subj_len)
        pos += struct.calcsize(">iqi")

        msg_buf[pos: pos + subj_len] = subject
        # Step over the subject, otherwise body_len would overwrite it.
        pos += subj_len
        struct.pack_into(">i", msg_buf, pos, body_len)
        pos += struct.calcsize(">i")
        msg_buf[pos: pos + body_len] = body

        self._size += size
        self._buffers.append(msg_buf)
        return size

    def get_buffer(self):
        """Return all serialized messages concatenated into one bytes object."""
        return b"".join(self._buffers)

Builder

Before you speed up

  • Profile (vmprof, cProfile)

  • Benchmark (perf)

  • 100% Coverage

  • Always do functional/integration testing!

Cython problems

No struct module in C API!!!
Be aware of integer endianness

# Declare the host <-> network byte-order helpers for 32- and 16-bit
# integers. The same four functions are declared in both branches; only the
# header they are taken from depends on the platform (compile-time IF).
IF UNAME_SYSNAME == "Windows":
    cdef extern from "winsock2.h":
        uint32_t htonl(uint32_t hostlong)
        uint16_t htons(uint16_t hostshort)
        uint32_t ntohl(uint32_t netlong)
        uint16_t ntohs(uint16_t netshort)
ELSE:
    cdef extern from "arpa/inet.h":
        uint32_t htonl(uint32_t hostlong)
        uint16_t htons(uint16_t hostshort)
        uint32_t ntohl(uint32_t netlong)
        uint16_t ntohs(uint16_t netshort)
# Write the 64-bit integer x into buf as two 32-bit words, each converted
# with htonl: the upper 32 bits go to buf[0:4], the lower 32 bits to buf[4:8].
# x is cast to uint64_t before shifting, so the upper word is taken from the
# unsigned bit pattern.
cdef inline void pack_int64(char* buf, int64_t x):
    (<uint32_t*>buf)[0] = htonl(<uint32_t>(<uint64_t>(x) >> 32))
    (<uint32_t*>&buf[4])[0] = htonl(<uint32_t>(x))


# Rebuild a 64-bit integer from two 32-bit words read at buf[0] and buf[4].
# NOTE(review): unpack_int32 is defined outside this snippet; presumably it
# reads one network-order int32 - confirm.
cdef inline int64_t unpack_int64(const char* buf):
    # Upper word, stored in an int64_t so it can be shifted left by 32.
    cdef int64_t hh = unpack_int32(buf)
    # Lower word, cast to uint32_t before being OR-ed into the result.
    cdef uint32_t hl = <uint32_t>unpack_int32(&buf[4])

    return (hh << 32) | hl

Cython problems

Be aware of bytes types C API and Buffer Protocol!

from cpython cimport (
    PyObject_GetBuffer, PyBuffer_Release, 
    PyBUF_SIMPLE, PyBUF_WRITABLE, Py_buffer,
    PyBytes_FromStringAndSize
)

cdef extern from "Python.h":   # <-- Not everything is defined in Cython yet
    ssize_t PyByteArray_GET_SIZE(object)
    char* PyByteArray_AS_STRING(bytearray ba)
    int PyByteArray_Resize(object, ssize_t) except -1  # <-- Don't forget exceptions

from libc.string cimport memcpy  # <-- When in C do it the C way
cdef handle(object data, char* out_buf, Py_ssize_t pos):
    # Copy the bytes that `data` exposes through the buffer protocol into
    # `out_buf`, starting at offset `pos`. The view is requested with
    # PyBUF_SIMPLE and released right after the memcpy.
    cdef Py_buffer view

    PyObject_GetBuffer(data, &view, PyBUF_SIMPLE)
    memcpy(out_buf + pos, <char*>view.buf, <size_t>view.len)
    PyBuffer_Release(&view)

Cython problems

Little of Python is visible from the C API

# Three everyday operations that are shown here as examples of work that is
# done through Python-level calls.
import time

timestamp = int(time.time() * 1000)  # BuBu!!

# crc32 is provided by zlib; the gzip module does not expose it.
import zlib

crc = zlib.crc32(some_big_data)  # BuBu!!

data_view = memoryview(some_big_data)
data_view = data_view[start: end]  # BuBu!!

Object representation

Use Cython's fast initialization

# NOTE(review): no_gc_clear only suppresses the type's tp_clear slot; the
# directive that opts a type out of cyclic GC entirely is @cython.no_gc.
# Confirm which one was intended.
@cython.no_gc_clear  # <-- Do not clear attributes during GC, as no circular dependencies
@cython.final        # <-- No need for subclasses
@cython.freelist(REQUEST_FREELIST_SIZE)  # <-- Avoid reallocation
cdef class Request:
    # Plain record with four attributes, all exposed read-only to Python.

    cdef:
        readonly int64_t timestamp
        readonly object subject
        readonly object body
        readonly tuple metadata


    def __init__(self, int64_t timestamp, subject, body, metadata):
        # Python-level constructor: stores the four values as given.
        self.timestamp = timestamp
        self.subject = subject
        self.body = body
        self.metadata = metadata

    # ...

Not fast enough? Ehmm, no choice but C =(

    # ...

    @staticmethod
    cdef inline Request new(
            int64_t timestamp, subject, body, metadata):
        """ Fast constructor to initialize from C.

        Allocates the instance with ``__new__`` and assigns the four
        attributes directly, so ``__init__`` is not invoked.
        Returns the populated Request.
        """
        cdef:
            Request req

        req = Request.__new__(Request)  # <-- We don't call Python API for this
        req.timestamp = timestamp
        req.subject = subject
        req.body = body
        req.metadata = metadata
        # Return the instance built above; no ``record`` name exists here.
        return req

Questions?

Speedy protocol parsing in Python and Cython

By Taras Voinarovskyi

Speedy protocol parsing in Python and Cython

How to parse a binary protocol in Python. Tricks to make it not suck too bad. What to use as a bytebuffer in Python2 and Python3? And how to make it fast with Cython.

  • 2,124