Speedy protocol parsing in Python and Cython
Taras Voinarovskyi
Member of aio-libs team
Where to start?
- Read the protocol docs
- Determine an API for Parser and Builder
- Write some simple parse, build and serde tests.
- Implement Parser and Builder
- Connect it to the actual I/O code, write integration tests, etc.
Read the docs
- RFC: HTTP, SMTP
- Specification: AMQP
- Docs page: Postgres, MySQL, Kafka
- Bad docs? Read the code =(
For example
Binary frame format:
Primitive types:
* int8, int16, int32, int64 - signed integers of the given width (in bits), stored in big-endian order.
Dynamic types:
* string = Size<int32> + body
* array = Size<int16> + body
The protocol is request-response over a TCP connection, and request pipelining is supported. Both Requests and Responses are Size<int32> delimited on the wire (a small encoding sketch follows below).
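As a quick illustration of these encoding rules, the helpers below (hypothetical names, not part of the slides' final Builder) show how the two dynamic types and the outer frame could be produced with struct:

import struct

def encode_string(value: bytes) -> bytes:
    # string = Size<int32> + body, big-endian length prefix
    return struct.pack(">i", len(value)) + value

def encode_int16_array(values) -> bytes:
    # array = Size<int16> + body (here: a sequence of int16 values)
    return struct.pack(">h%dh" % len(values), len(values), *values)

def encode_frame(payload: bytes) -> bytes:
    # Requests and Responses are Size<int32> delimited on the wire
    return struct.pack(">i", len(payload)) + payload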
For example
Request / Response types
Header:
* api_key <int8>
* sequence <int8>
Echo request:
* timestamp <int64>
* subject <string>
* metadata <array<int16>>
* body <string>
Echo response:
* id <int32>
* timestamp <int64>
* subject <string>
* body <string>
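For orientation only (a sketch, not the slides' parser), the fixed-width parts of these messages map onto precompiled struct formats like this; the length-prefixed fields are handled separately:

import struct

HEADER = struct.Struct(">bb")           # api_key<int8>, sequence<int8>
ECHO_REQ_FIXED = struct.Struct(">q")    # timestamp<int64>; subject, metadata, body are length-prefixed
ECHO_RESP_FIXED = struct.Struct(">iq")  # id<int32>, timestamp<int64>; subject and body are length-prefixed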
Determine the API
from typing import List


class Parser:

    def feed(self, data: bytes) -> None:
        """Feed data that was read from the socket."""

    def parse(self) -> List[Response]:
        """Return a list of Response subclass objects."""


class Builder:

    def add(self, req: Request) -> None:
        """Add a request object to the output buffer."""

    def get_buffer(self) -> bytes:
        """Get the buffer of appended requests."""
Write tests
verify_data = (
    b"\x00\x00\x00+\x00\x00\x00\x00\x00\x00\x00\x00'\x0ftest\x00\x01"
    b"\x00\x02\x00\x03We have a lot in common"
)


def test_parser():
    parser = Parser()
    parser.feed(verify_data)
    responses = parser.parse()
    assert len(responses) == 1
    resp = responses[0]
    assert resp.subject == b"test"
    assert resp.timestamp == 9999
    assert resp.metadata == [1, 2, 3]
    assert resp.body == b"We have a lot in common"
Write tests
def test_builder():
    builder = Builder()
    builder.add(
        subject=b"test",
        timestamp=9999,
        metadata=[1, 2, 3],
        body=b"We have a lot in common")
    assert builder.get_buffer() == verify_data
Write tests
def test_serde():
    builder = Builder()
    builder.add(
        subject=b"test",
        timestamp=9999,
        metadata=[1, 2, 3],
        body=b"We have a lot in common")

    parser = Parser()
    parser.feed(builder.get_buffer())
    responses = parser.parse()
    assert len(responses) == 1
    resp = responses[0]
    assert resp.subject == b"test"
    assert resp.timestamp == 9999
    assert resp.metadata == [1, 2, 3]
    assert resp.body == b"We have a lot in common"
Parser
- Framing - split the incoming TCP stream into frames
- Parsing - turn the byte representation into typed values
- Object representation - return a user-friendly object like Message or Response
Builder
- Space allocation - either dynamic or preallocated
- Serialization - turn typed values into their byte representation
Framing
import socket
import struct


def read_frame(sock: socket.socket):
    size_bytes = sock.recv(4)                 # Int32 size prefix
    size, = struct.unpack(">i", size_bytes)   # recv(4) is not even guaranteed to return 4 bytes!
    frame = b""
    while len(frame) < size:
        frame += sock.recv(size - len(frame))
    return frame
Never do this!
Framing
import socket

from .parser import Parser

MAX_READ_SIZE = 2 * 1024


def read(sock: socket.socket, parser: Parser):
    data = sock.recv(MAX_READ_SIZE)   # read whatever the socket has for us
    if not data:
        raise Disconnected            # application-specific exception
    frames = parser.feed(data)
    return frames
Better!
import struct

SIZE_LENGTH = 4  # Int32


class Parser:

    def __init__(self):
        self._buffer = bytearray()  # <-- A quite good choice for compatibility
        self._next_pos = None

    def feed(self, data: bytes):
        self._buffer.extend(data)
        frames = []
        while True:
            frame = self._split_frame()
            if frame is None:
                break
            frames.append(frame)
        return frames

    def _split_frame(self):
        if len(self._buffer) < SIZE_LENGTH:
            return None
        if self._next_pos is None:
            size, = struct.unpack_from(">i", self._buffer)
            self._next_pos = size + SIZE_LENGTH
        if len(self._buffer) < self._next_pos:
            return None
        frame = self._buffer[SIZE_LENGTH: self._next_pos]
        self._buffer = self._buffer[self._next_pos:]
        self._next_pos = None  # <-- reset, the next frame carries its own size
        return frame
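A small usage sketch of why the buffering matters (assuming the verify_data blob from the test slides: 47 bytes total, a 43-byte frame). Frames can arrive split across reads, and the parser only releases complete ones:

parser = Parser()
assert parser.feed(verify_data[:10]) == []   # incomplete frame stays buffered
frames = parser.feed(verify_data[10:])       # the rest arrives
assert len(frames) == 1
assert len(frames[0]) == 43                  # size prefix already stripped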
Bytearray-like types
>>> big_data_buffer = bytes(10 * 1024 * 1024)
>>> big_data_buffer[10: 15] # <-- Slices are cheap only for small slices
b'\x00\x00\x00\x00\x00'
>>> data_view = memoryview(big_data_buffer)
>>> data_view[10: 15] # <-- Will always be cheap
<memory at 0x101fafac8>
>>> data_view[10: 15].tobytes()
b'\x00\x00\x00\x00\x00'
>>> out_buffer = bytearray(10 * 1024) # <-- Use writable bytearray for outgoing buffers
>>> out_buffer[10: 15] = data_view[10: 15] # <-- Will do memcpy on C, fast
>>> out_view = memoryview(out_buffer) # <-- We can do views on bytearray too
>>> out_view[10: 15] = data_view[10: 15] # <-- Slicing will be the same
>>> out_view.release()  # <-- Release the view so the bytearray can be resized again
Avoid strange cases!
>>> out_buffer[:10] = out_buffer[10:]  # <-- Moving data in place is OK, but avoid big chunks
>>> out_buffer = out_buffer[10:] + bytearray(10)  # <-- Easier and more predictable
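To see why the memoryview slices matter, a rough stdlib-only timing sketch (numbers vary by machine; the point is that the bytes slice copies ~10 MB per call while the view slice has constant cost):

import timeit

setup = "data = bytes(10 * 1024 * 1024); view = memoryview(data)"
print(timeit.timeit("data[1024:-1024]", setup, number=100))  # copies the whole slice every time
print(timeit.timeit("view[1024:-1024]", setup, number=100))  # only creates a small view object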
Peek inside a bit
typedef struct {
    PyObject ob_base;
    Py_ssize_t ob_size; /* Number of items in variable part */
} PyVarObject;

typedef struct {
    PyObject_VAR_HEAD
    Py_hash_t ob_shash;
    char ob_sval[1];
    /* Invariants:
     * ob_sval contains space for 'ob_size+1' elements.
     * ob_sval[ob_size] == 0.
     * ob_shash is the hash of the string or -1 if not computed yet.
     */
} PyBytesObject;

typedef struct {
    PyObject_VAR_HEAD
    Py_ssize_t ob_alloc;   /* How many bytes allocated in ob_bytes */
    char *ob_bytes;        /* Physical backing buffer */
    char *ob_start;        /* Logical start inside ob_bytes */
    /* XXX(nnorwitz): should ob_exports be Py_ssize_t? */
    int ob_exports;        /* How many buffer exports */
} PyByteArrayObject;

typedef struct {
    PyObject_HEAD
    int flags;             /* state flags */
    Py_ssize_t exports;    /* number of direct memoryview exports */
    Py_buffer master;      /* snapshot buffer obtained from the original exporter */
} _PyManagedBufferObject;

typedef struct {
    PyObject_VAR_HEAD
    _PyManagedBufferObject *mbuf; /* managed buffer */
    Py_hash_t hash;               /* hash value for read-only views */
    int flags;                    /* state flags */
    Py_ssize_t exports;           /* number of buffer re-exports */
    Py_buffer view;               /* private copy of the exporter's view */
    PyObject *weakreflist;
    Py_ssize_t ob_array[1];       /* shape, strides, suboffsets */
} PyMemoryViewObject;
Py2
- str
- unicode
- bytearray
- memoryview
Py3
- bytes
- str
- bytearray
- memoryview
Python2 has some differences
>>> memoryview(b"123")[0]
'1'
>>> bytearray(b"123")[0]
49
>>> bytes(b"123")[0]
'1'
>>> bytes(10)
'10'
>>> bytearray(10)
bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
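If the same parsing code has to run on both versions, one way to dodge the indexing differences (a sketch with a hypothetical helper name) is to always go through struct, which returns ints everywhere:

import struct

def byte_at(buf, pos):
    # Returns an int for bytes and bytearray inputs on both Python 2 and 3,
    # instead of a length-1 str on Python 2.
    return struct.unpack_from(">B", buf, pos)[0]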
Parsing values
import struct

HEADER_STRUCT = struct.Struct(
    ">b"  # Api Key => Int8
    "b"   # Sequence => Int8
)


class Parser:

    def parse_frame(self, frame: bytearray):
        api_key, sequence = HEADER_STRUCT.unpack_from(frame)
        pos = HEADER_STRUCT.size
        assert api_key == 0

        timestamp, subj_len = struct.unpack_from(">qi", frame, pos)
        pos += struct.calcsize(">qi")
        subject = bytes(frame[pos: pos + subj_len])
        pos += subj_len

        metadata_len, = struct.unpack_from(">h", frame, pos)
        pos += struct.calcsize(">h")
        fmt = ">%dh" % metadata_len  # <-- We use a dynamic format string
        metadata = list(struct.unpack_from(fmt, frame, pos))
        pos += struct.calcsize(fmt)

        body_len, = struct.unpack_from(">i", frame, pos)
        pos += struct.calcsize(">i")
        body = bytes(frame[pos: pos + body_len])
        return EchoRequest(timestamp, subject, metadata, body)
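A cheap follow-up optimisation in the spirit of HEADER_STRUCT above (a sketch, with hypothetical constant names): precompile the fixed formats once instead of re-parsing the format string on every frame.

TIMESTAMP_SUBJLEN = struct.Struct(">qi")
ARRAY_LEN = struct.Struct(">h")
BODY_LEN = struct.Struct(">i")

# Inside parse_frame the calls then become, e.g.:
#   timestamp, subj_len = TIMESTAMP_SUBJLEN.unpack_from(frame, pos)
#   pos += TIMESTAMP_SUBJLEN.size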
Object presentation
- tuple
- namedtuple (see the sketch after the attrs example)
- attrs library (http://www.attrs.org)
- etc.
import attr


@attr.s
class EchoRequest:
    timestamp = attr.ib(converter=int)
    subject = attr.ib(converter=bytes)
    metadata = attr.ib(converter=list)
    body = attr.ib(converter=bytes)

That's all well and good, but for Cython you will still end up doing it with plain classes =)
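For completeness, the namedtuple option from the list above could look like this (field names follow the Echo response slide):

from collections import namedtuple

# Cheap to create, immutable, and still gives attribute access.
EchoResponse = namedtuple(
    "EchoResponse", ["id", "timestamp", "subject", "body"])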
Builder
class Builder:

    def __init__(self):
        self._buffers = []
        self._size = 0

    def add(self, id, timestamp, subject, body):
        subj_len = len(subject)
        body_len = len(body)
        # size<int32> header<int8 int8> id<int32> timestamp<int64>
        # subj_len<int32> subject body_len<int32> body
        size = (  # <-- Many protocols require the size in the first few bytes
            SIZE_LENGTH + HEADER_STRUCT.size + struct.calcsize(">iqii") +
            subj_len + body_len
        )
        msg_buf = bytearray(size)  # <-- We need a writable structure

        pos = 0
        struct.pack_into(">i", msg_buf, pos, size - SIZE_LENGTH)  # size excludes the prefix itself
        pos += SIZE_LENGTH
        HEADER_STRUCT.pack_into(msg_buf, pos, 0, 0)
        pos += HEADER_STRUCT.size
        struct.pack_into(">iqi", msg_buf, pos, id, timestamp, subj_len)
        pos += struct.calcsize(">iqi")
        msg_buf[pos: pos + subj_len] = subject
        pos += subj_len
        struct.pack_into(">i", msg_buf, pos, body_len)
        pos += struct.calcsize(">i")
        msg_buf[pos: pos + body_len] = body

        self._size += size
        self._buffers.append(msg_buf)
        return size

    def get_buffer(self):
        return b"".join(self._buffers)
Before you speed up
- Profile (vmprof, cProfile); see the sketch below
- Benchmark (perf)
- 100% test coverage
- Always do functional/integration testing!
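A minimal, stdlib-only way to get both a profile and a repeatable number for the pure-Python parser (vmprof and perf, as recommended above, give better data; verify_data is the test blob from earlier):

import cProfile
import timeit

payload = verify_data * 1000             # something big enough to measure

prof = cProfile.Profile()
prof.runcall(Parser().feed, payload)     # where does the time actually go?
prof.print_stats("cumulative")

print(timeit.timeit(
    "Parser().feed(payload)",
    globals={"Parser": Parser, "payload": payload},
    number=100))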
Cython problems
No struct module in the C API!!!
Be aware of integer endianness
IF UNAME_SYSNAME == "Windows":
    cdef extern from "winsock2.h":
        uint32_t htonl(uint32_t hostlong)
        uint16_t htons(uint16_t hostshort)
        uint32_t ntohl(uint32_t netlong)
        uint16_t ntohs(uint16_t netshort)
ELSE:
    cdef extern from "arpa/inet.h":
        uint32_t htonl(uint32_t hostlong)
        uint16_t htons(uint16_t hostshort)
        uint32_t ntohl(uint32_t netlong)
        uint16_t ntohs(uint16_t netshort)


cdef inline void pack_int64(char* buf, int64_t x):
    (<uint32_t*>buf)[0] = htonl(<uint32_t>(<uint64_t>(x) >> 32))
    (<uint32_t*>&buf[4])[0] = htonl(<uint32_t>(x))


cdef inline int64_t unpack_int64(const char* buf):
    # unpack_int32 is the matching int32 helper (not shown on this slide)
    cdef int64_t hh = unpack_int32(buf)
    cdef uint32_t hl = <uint32_t>unpack_int32(&buf[4])
    return (hh << 32) | hl
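A cheap sanity check for such helpers from the Python side is to compare them against struct with an explicit big-endian format (a sketch; wrapped_pack_int64 stands for however the Cython function is exposed to tests and is not part of the slides):

import struct

def check_pack_int64(wrapped_pack_int64, x=-1234567890123456789):
    # The Cython helper and ">q" must produce identical big-endian bytes.
    assert wrapped_pack_int64(x) == struct.pack(">q", x)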
Cython problems
Be aware of the bytes types' C API and the Buffer Protocol!
from cpython cimport (
    PyObject_GetBuffer, PyBuffer_Release,
    PyBUF_SIMPLE, PyBUF_WRITABLE, Py_buffer,
    PyBytes_FromStringAndSize
)

cdef extern from "Python.h":  # <-- Not everything is defined in Cython yet
    ssize_t PyByteArray_GET_SIZE(object)
    char* PyByteArray_AS_STRING(bytearray ba)
    int PyByteArray_Resize(object, ssize_t) except -1  # <-- Don't forget exceptions

from libc.string cimport memcpy  # <-- When in C, do it the C way


cdef handle(object data, char* out_buf, Py_ssize_t pos):
    cdef:
        Py_buffer buf

    PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
    memcpy(&out_buf[pos], <char*>buf.buf, <size_t>buf.len)
    PyBuffer_Release(&buf)
Cython problems
Little of Python is visible from the C API

import time
timestamp = int(time.time() * 1000)  # BuBu!! A Python-level call on every message

import zlib
crc = zlib.crc32(some_big_data)      # BuBu!! Same here

data_view = memoryview(some_big_data)
data_view = data_view[start: end]    # BuBu!! Slicing a memoryview goes through Python too
Object presentation
Use Cython's fast initialization
@cython.no_gc_clear                       # <-- Skip GC clear, as there are no reference cycles
@cython.final                             # <-- No need for subclasses
@cython.freelist(REQUEST_FREELIST_SIZE)   # <-- Avoid reallocation
cdef class Request:

    cdef:
        readonly int64_t timestamp
        readonly object subject
        readonly object body
        readonly tuple metadata

    def __init__(self, int64_t timestamp, subject, body, metadata):
        self.timestamp = timestamp
        self.subject = subject
        self.body = body
        self.metadata = metadata

    # ...
Not fast enough? Ehmm, no choice but C =(
    # ...

    @staticmethod
    cdef inline Request new(
            int64_t timestamp, subject, body, metadata):
        """Fast constructor to initialize from C."""
        cdef:
            Request req
        req = Request.__new__(Request)  # <-- No Python-level __init__ call here
        req.timestamp = timestamp
        req.subject = subject
        req.body = body
        req.metadata = metadata
        return req
Questions?