Binary data in Python with a bit of C spice on top
Taras Voinarovskyi
Taras Voinarovskyi

Member of aio-libs team

What's on today's menu?
  • binary data types
  • modules for working with binary data
  • binary IO and sockets
  • Binary types memory layout
  • The Buffer protocol
  • Tips on writing C extensions
We had issues before...
>>> user_name = "Alex Smith"
>>> unicode(user_name)
u'Alex Smith'
>>> 
>>> unicode_name = u"Mark Smith"
>>> str(unicode_name)
'Mark Smith'
>>> ...
>>> user_name = "Тарас"
>>> user_name
'\xd0\xa2\xd0\xb0\xd1\x80\xd0\xb0\xd1\x81'
>>> unicode(user_name)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'ascii' codec can't decode byte \
    0xd0 in position 0: ordinal not in range(128)
>>> 
>>> user_name.decode("utf-8")
u'\u0422\u0430\u0440\u0430\u0441'
As of Python3 we have str and bytes
>>> byte_data = b"Super data"
>>> str_data = "Супер дані"
>>> byte_data[0]   # Bytes return int
83
>>> str_data[0]  # Str returns a substring
'С'
>>> len(byte_data)  # Counts bytes
10
>>> len(str_data)  # Counts characters
10
Python 2 returned string data for binary files
>>> open("test_file", "r").read()
'\xd0\xa1\xd1\x83\xd0\xbf\xd0\xb5\xd1\x80\n'
>>> open("test_file", "rb").read()
'\xd0\xa1\xd1\x83\xd0\xbf\xd0\xb5\xd1\x80\n'
>>> x = open("test_file", "rb").read()
>>> y = open("test_file", "r").read()
>>> x == y
True
>>> open("test_file", "r").read()
'Супер\n'
>>> open("test_file", "rb").read()
b'\xd0\xa1\xd1\x83\xd0\xbf\xd0\xb5\xd1\x80\n'
>>> x = open("test_file", "rb").read()
>>> y = open("test_file", "r").read()    
>>> x == y
False
As of Python3 binary files actually return binary data
Other binary data types
>>> buf = bytearray(7)
bytearray(b'\x00\x00\x00\x00\x00\x00\x00')
>>> buf.append(1)
>>> buf
bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x01')
>>> buf[0:5] = b"12345"
>>> buf
bytearray(b'12345\x00\x00\x01')
>>> 
>>> mview = memoryview(buf)
>>> mview
<memory at 0x103cafdc8>
>>> mview[0:5] = b"54321"
>>> buf
bytearray(b'54321\x00\x00\x01')
Python2 has some differences
>>> memoryview(b"123")[0]
'1'
>>> bytearray(b"123")[0]
49
>>> bytes(b"123")[0]
'1'
>>> bytes(6)
'6'
>>> bytearray(6)
bytearray(b'\x00\x00\x00\x00\x00\x00')
What do you see, Neo?
Count number of messages
ID => Int64
Length => Int32
CRC => UInt32
Version => Int8
Body => Bytes
some_messages.dump
Structs are our allies
>>> import struct
>>> packed = struct.pack(">qiIb", 100, 73, 333, 1)
>>> packed[:12]
b'\x00\x00\x00\x00\x00\x00\x00d\x00\x00\x00I'
>>> 
>>> struct.unpack(">qiIb", packed)
(100, 73, 333, 1)
>>> 
struct Message
{
    long long message_id;  /* q */
    int length;            /* i */
    unsigned int crc;      /* I */
    signed char version;   /* b */
};
Be aware of native alignment and size
>>> struct.pack("I", 5)
b'\x05\x00\x00\x00'
>>> struct.pack(">I", 5)
b'\x00\x00\x00\x05'
>>> 
>>> struct.calcsize("bI")
8
>>> struct.calcsize(">bI")
5
>>> struct.pack("bI", 1, 2)
b'\x01\x00\x00\x00\x02\x00\x00\x00'
>>> struct.pack(">bI", 1, 2)
b'\x01\x00\x00\x00\x02'
Binary file IO
import io

# One reusable buffer sized for the fixed header: ID, length, CRC, version.
buf = bytearray(struct.calcsize(">qiIb"))

with open("x_file", "rb", buffering=0) as file:
    # The file is opened unbuffered and wrapped explicitly so the buffer
    # size is chosen here rather than left to the default.
    reader = io.BufferedReader(file, buffer_size=10 * 1024)
    while True:
        # Fill the preallocated buffer in place; no new bytes object per read.
        readlen = reader.readinto(buf)
        if not readlen:
            # Clean end of file: every message has been visited.
            break
        if readlen != len(buf):
            # An explicit error instead of `assert`, which is stripped
            # under -O and would leave the loop spinning on stale data.
            raise EOFError("truncated message header")

        msg = struct.unpack(">qiIb", buf)
        message_len = msg[1]

        # Skip over the body; assumes the length field is the body size
        # in bytes — TODO confirm against the file format.
        reader.seek(message_len, io.SEEK_CUR)
Efficient socket framing
import socket
from .parser import Parser

MAX_READ_SIZE = 2 * 1024


def read(sock: socket.socket, parser: Parser):
    """Receive one chunk from *sock* and return what *parser* makes of it.

    At most ``MAX_READ_SIZE`` bytes are received per call.

    Raises:
        Disconnected: ``recv`` returned no data.
    """
    chunk = sock.recv(MAX_READ_SIZE)
    if chunk:
        # Hand the raw chunk to the parser and return its result unchanged.
        return parser.feed(chunk)
    raise Disconnected()
import socket

def read_frame(sock: socket.socket):
    """Read one length-prefixed frame from *sock* and return its payload.

    The wire format is a 4-byte big-endian signed size followed by that
    many payload bytes. The payload is returned as ``bytes``.

    Raises:
        ConnectionError: the peer closed the socket before the size prefix
            or the payload was fully received.
        ValueError: the size prefix is negative.
    """

    def recv_exactly(count):
        # recv may deliver fewer bytes than requested, so keep reading into
        # one preallocated buffer (no repeated `+=` copies) until it is full.
        buf = bytearray(count)
        view = memoryview(buf)
        filled = 0
        while filled < count:
            received = sock.recv_into(view[filled:])
            if not received:
                # Zero bytes means the peer closed the connection; without
                # this check the loop would never terminate.
                raise ConnectionError("socket closed before frame completed")
            filled += received
        return bytes(buf)

    # struct.unpack always returns a tuple, even for a single field.
    (size,) = struct.unpack(">i", recv_exactly(4))
    if size < 0:
        raise ValueError("negative frame size: %d" % size)
    return recv_exactly(size)
Let's talk a bit about buffers
Any bytes-like can be a buffer
# Each object passed to write() below is handed over as-is, with no explicit
# conversion to bytes first.
with open("x_file_index", "wb+") as file:

    file.write(b"1234")  # bytes literal
    file.write(array.array("Q", [1, 2, 3]))  # array.array with type code "Q"
    file.write(bytearray(b"1234"))  # mutable bytearray
    file.write(np.array([1, 2, 3, 4]))  # presumably a NumPy array (np is not imported in view) — TODO confirm
memoryview works on bytes-like objects
>>> arr = array.array("H", [1, 2, 3, 4])
>>> view = memoryview(arr)
>>> view.shape
(4,)
>>> view.format
'H'
>>> view[0:2].tolist()
[1, 2]
>>> buf = b"\x05\x00\x00\x00"
>>> view = memoryview(buf).cast("i")
>>> view.tolist()
[5]
>>> view = memoryview(buf).cast("h", (2, 1))
>>> view.tolist()
[[5], [0]]
Use memoryview if the API does not support offset and size
# One large preallocated buffer that reads are placed into.
big_buffer = bytearray(1000000)

with open("some_file", "rb") as file:
    # readinto() is given no offset/size here; the memoryview slice selects
    # the first 15 bytes of big_buffer as the destination.
    view = memoryview(big_buffer)[:15]
    readlen = file.readinto(view)
    assert readlen == 15
  • Use struct module to parse/pack simple types (int, float, fixed-size strings...)
  • Use Binary IO when working with files ("rb" and "wb" modes)
  • Proper buffer management is the key to your program's efficiency
  • Use memoryview to avoid unneeded copies of data
So, let's sum it up
Diving deeper: Memory layout
How Python manages objects
Heap
  • All objects are stored in Heap

  • All objects have PyObject_HEAD containing ref_count and type info

  • Variable length objects have PyObject_VAR_HEAD containing size info

  • Container objects have a PyGC_Head
0x1044afa08
0x10457e530
0x107f4e820
/* Header carried by every object: reference count and type info. */
typedef struct {
    Py_ssize_t ob_refcnt;           /* reference count */
    struct _typeobject *ob_type;    /* pointer to the object's type */
} PyObject;

/* Header for variable-length objects: the PyObject header plus size info. */
typedef struct {
    PyObject ob_base;       /* embedded PyObject header */
    Py_ssize_t ob_size;     /* size info for the variable-length part */
} PyVarObject;
How Python manages objects
Bytes
Bytearray
>>> x = b"Super data"
>>> import sys
>>> sys.getsizeof(x)
43
refcount
type
Super data

\x00 byte

ob_sval
ob_shash
ob_size

0

8

16

24

32

43

Memoryview
Bytes
Bytearray
typedef struct {
    PyObject_VAR_HEAD
    Py_hash_t ob_shash;
    char ob_sval[1];

    /* Invariants:
     *     ob_sval contains space for 
           'ob_size+1' elements.
     *     ob_sval[ob_size] == 0.
     *     ob_shash is the hash of the
           string or -1 if not computed yet.
     */
} PyBytesObject;
Memoryview
Bytes
Bytearray
>>> x = bytearray(b"Super data")
>>> sys.getsizeof(x)
67
ob_bytes
ob_alloc

0

8

16

24

32

Super data
ob_start

40

48

56

ob_exports

0

11

Memoryview
Bytes
Bytearray
typedef struct {
    PyObject_VAR_HEAD
    Py_ssize_t ob_alloc;
        /* How many bytes allocated */
    char *ob_bytes;     
        /* Physical backing buffer */
    char *ob_start;     
        /* Logical start inside ob_bytes */
    int ob_exports;     
        /* How many buffer exports */
} PyByteArrayObject;
Memoryview
>>> b = bytearray(1000)
>>> size = sys.getsizeof(b)
>>> 
>>> del b[:10]
>>> del b[-10:]
>>> 
>>> sys.getsizeof(b) == size
True
Bytearray is optimized for deleting slices from both ends
Buffer protocol
  • Export a `void*` buffer in C
  • Any object can support it
  • n-dimensional structure support
  • memoryview on Python level
Buffer protocol
Super data
Bytes
Py_buffer
void* buf
PyObject *obj
Py_ssize_t len
...
Buffer protocol
/* buffer interface */
typedef struct bufferinfo {
    void *buf;
    PyObject *obj; /* owned reference */
    Py_ssize_t len;
    Py_ssize_t itemsize;
    int readonly;

    // ...
} Py_buffer;
/* buffer interface */
typedef struct bufferinfo {
    // ...

    int ndim;
    char *format;
    Py_ssize_t *shape;
    Py_ssize_t *strides;
    Py_ssize_t *suboffsets;
    void *internal;
} Py_buffer;
/*
 * Acquire the buffer exported by np_array, run long C processing with the
 * GIL released, then release the buffer.
 *
 * Returns 0 on success, or -1 if the buffer could not be acquired.
 */
static int
process_data(PyObject *np_array)
{
    Py_buffer buf;
    /* Ask the object for its memory through the buffer protocol. */
    if (PyObject_GetBuffer(
            np_array, &buf, PyBUF_SIMPLE) != 0)
        return -1;
    
    /* Release the GIL while the C-only work runs; no Python API calls
       may be made between these two macros. */
    Py_BEGIN_ALLOW_THREADS
    // Long complex processing in C
    Py_END_ALLOW_THREADS
    
    /* Pair the successful PyObject_GetBuffer with a release. */
    PyBuffer_Release(&buf);
    return 0;
}
Buffer protocol
Bytes
Bytearray
>>> x = memoryview(b"Super data")
>>> sys.getsizeof(x)
192
Memoryview
_PyManagedBufferObject
PyMemoryViewObject
Py_buffer
Py_buffer
hash
flags
exports
flags
exports
Bytes
Bytearray
Memoryview
typedef struct {
    PyObject_HEAD
    int flags;
        /* state flags */
    Py_ssize_t exports;
        /* number of direct memoryview exports */
    Py_buffer master;
} _PyManagedBufferObject;
typedef struct {
    PyObject_VAR_HEAD
    _PyManagedBufferObject *mbuf;
        /* managed buffer */
    Py_hash_t hash;
        /* hash value for read-only views */
    int flags; 
        /* state flags */
    Py_ssize_t exports;
        /* number of buffer re-exports */
    Py_buffer view;
        /* private copy of the exporter's view */
    PyObject *weakreflist;
    Py_ssize_t ob_array[1];
        /* shape, strides, suboffsets */
} PyMemoryViewObject;

Questions?

  • Profile (vmprof, cProfile)
  • Benchmark (perf)
  • 100% Coverage
  • Always do functional/integration testing!
Before you speed it up
What is different in C?
  • No struct module we can use
  • No memoryview, only direct Buffer protocol.
  • Very little of Python is available in C API (No time, os, sys, etc.)
Made with Slides.com