Binary data in Python with a bit of C spice on top

Taras Voinarovskyi

https://github.com/tvoinarovskyi

Taras Voinarovskyi

https://github.com/tvoinarovskyi

Collaborator:

Member of aio-libs team

What's on today's menu?

binary data types
modules for working with binary data
binary IO and sockets

Binary types memory layout
The Buffer protocol
Tips on writing C extensions

We had issues before...

>>> user_name = "Alex Smith"
>>> unicode(user_name)
u'Alex Smith'
>>> 
>>> unicode_name = u"Mark Smith"
>>> str(unicode_name)
'Mark Smith'
>>> ...

>>> user_name = "Тарас"
>>> user_name
'\xd0\xa2\xd0\xb0\xd1\x80\xd0\xb0\xd1\x81'
>>> unicode(user_name)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'ascii' codec can't decode byte \
    0xd0 in position 0: ordinal not in range(128)
>>> 
>>> user_name.decode("utf-8")
u'\u0422\u0430\u0440\u0430\u0441'

As of Python3 we have str and bytes

>>> byte_data = b"Super data"
>>> str_data = "Супер дані"
>>> byte_data[0]   # Bytes return int
83
>>> str_data[0]  # Str returns a substring
'С'
>>> len(byte_data)  # Counts bytes
10
>>> len(str_data)  # Counts characters
10

Python 2 returned string data for binary files

>>> open("test_file", "r").read()
'\xd0\xa1\xd1\x83\xd0\xbf\xd0\xb5\xd1\x80\n'
>>> open("test_file", "rb").read()
'\xd0\xa1\xd1\x83\xd0\xbf\xd0\xb5\xd1\x80\n'
>>> x = open("test_file", "rb").read()
>>> y = open("test_file", "r").read()
>>> x == y
True

>>> open("test_file", "r").read()
'Супер\n'
>>> open("test_file", "rb").read()
b'\xd0\xa1\xd1\x83\xd0\xbf\xd0\xb5\xd1\x80\n'
>>> x = open("test_file", "rb").read()
>>> y = open("test_file", "r").read()    
>>> x == y
False

As of Python3 binary files actually return binary data

Other binary data types

>>> buf = bytearray(7)
>>> buf
bytearray(b'\x00\x00\x00\x00\x00\x00\x00')
>>> buf.append(1)
>>> buf
bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x01')
>>> buf[0:5] = b"12345"
>>> buf
bytearray(b'12345\x00\x00\x01')
>>>

>>> mview = memoryview(buf)
>>> mview
<memory at 0x103cafdc8>
>>> mview[0:5] = b"54321"
>>> buf
bytearray(b'54321\x00\x00\x01')

Python2 has some differences

>>> memoryview(b"123")[0]
'1'
>>> bytearray(b"123")[0]
49
>>> bytes(b"123")[0]
'1'
>>> bytes(6)
'6'
>>> bytearray(6)
bytearray(b'\x00\x00\x00\x00\x00\x00')

What do you see, Neo?

Count number of messages

ID => Int64
Length => Int32
CRC => UInt32
Version => Int8
Body => Bytes

some_messages.dump

Structs are our allies

>>> import struct
>>> packed = struct.pack(">qiIb", 100, 73, 333, 1)
>>> packed[:12]
b'\x00\x00\x00\x00\x00\x00\x00d\x00\x00\x00I'
>>> 
>>> struct.unpack(">qiIb", packed)
(100, 73, 333, 1)
>>>

/* C view of the fixed-size message header listed above (ID, Length, CRC,
 * Version). The trailing comment on each field is the matching struct-module
 * format character, as used in the ">qiIb" format string. The Body field of
 * the message is not part of this struct. */
struct Message
{
    long long message_id;  /* q */
    int length;            /* i */
    unsigned int crc;      /* I */
    signed char version;   /* b */
};

Be aware of native alignment and size

>>> struct.pack("I", 5)
b'\x05\x00\x00\x00'
>>> struct.pack(">I", 5)
b'\x00\x00\x00\x05'
>>> 
>>> struct.calcsize("bI")
8
>>> struct.calcsize(">bI")
5
>>> struct.pack("bI", 1, 2)
b'\x01\x00\x00\x00\x02\x00\x00\x00'
>>> struct.pack(">bI", 1, 2)
b'\x01\x00\x00\x00\x02'

Binary file IO

import io

# Fixed-size message header: ID (q), Length (i), CRC (I), Version (b),
# big-endian and therefore packed without native alignment padding.
# Compiling the format once avoids re-parsing the string for every message.
header_struct = struct.Struct(">qiIb")

# One reusable buffer for every header; readinto() fills it in place.
buf = bytearray(header_struct.size)

with open("x_file", "rb", buffering=0) as file:
    # The file is opened unbuffered and wrapped explicitly so the size of the
    # read buffer is chosen here.
    reader = io.BufferedReader(file, buffer_size=10 * 1024)
    while True:
        readlen = reader.readinto(buf)
        if readlen == 0:
            # Clean end of file: the previous message was the last one.
            break
        if readlen != len(buf):
            # The file ends in the middle of a header.  Raise explicitly
            # instead of asserting, since asserts are stripped under -O.
            raise EOFError(
                "truncated message header: got %d of %d bytes"
                % (readlen, len(buf))
            )

        msg = header_struct.unpack(buf)
        message_len = msg[1]
        if message_len < 0:
            # A negative length would seek backwards and re-read old data.
            raise ValueError("negative message length: %d" % message_len)

        # Skip the body; only the headers are needed here.
        reader.seek(message_len, io.SEEK_CUR)

Efficient socket framing

import socket
from .parser import Parser

MAX_READ_SIZE = 2 * 1024


def read(sock: socket.socket, parser: Parser):
    """Receive one chunk from *sock* and hand it to *parser*.

    At most MAX_READ_SIZE bytes are requested from the socket.  The result
    of ``parser.feed()`` for that chunk is returned unchanged.

    Raises:
        Disconnected: if ``recv()`` returns no data.
    """
    chunk = sock.recv(MAX_READ_SIZE)
    if chunk:
        return parser.feed(chunk)
    raise Disconnected()

import socket

def _recv_exact(sock, size):
    """Receive exactly *size* bytes from *sock* and return them as a bytearray.

    A single recv() may return fewer bytes than requested, so the data is
    collected with recv_into() on a memoryview of one preallocated buffer;
    this avoids building a new bytes object for every partial read.

    Raises:
        ConnectionError: if the peer closes the connection before *size*
            bytes have arrived.
    """
    buf = bytearray(size)
    view = memoryview(buf)
    received = 0
    while received < size:
        count = sock.recv_into(view[received:])
        if count == 0:
            # recv_into() returning 0 means the other side closed the socket.
            raise ConnectionError(
                "connection closed after %d of %d bytes" % (received, size)
            )
        received += count
    return buf


def read_frame(sock: socket.socket):
    """Read one length-prefixed frame from *sock* and return its body.

    A frame is a big-endian Int32 size followed by that many bytes of body.

    Returns:
        The frame body as ``bytes`` (empty for a zero-length frame).

    Raises:
        ConnectionError: if the connection is closed before the whole frame
            has been received.
        ValueError: if the size prefix is negative.
    """
    # struct.unpack() always returns a tuple, so unpack the single element.
    (size,) = struct.unpack(">i", _recv_exact(sock, 4))  # Int32 size
    if size < 0:
        raise ValueError("negative frame size: %d" % size)
    return bytes(_recv_exact(sock, size))

Let's talk a bit about buffers

Any bytes-like can be a buffer

# Every write() below is handed a different kind of object; per the slide
# title, any bytes-like object can serve as the buffer.
# NOTE(review): `array` and `np` are not imported in this snippet --
# presumably `import array` and `import numpy as np`; confirm.
with open("x_file_index", "wb+") as file:

    file.write(b"1234")                      # bytes literal
    file.write(array.array("Q", [1, 2, 3]))  # array.array with typecode "Q"
    file.write(bytearray(b"1234"))           # bytearray
    file.write(np.array([1, 2, 3, 4]))       # presumably a NumPy array -- confirm

memoryview works on bytes-like objects

>>> arr = array.array("H", [1, 2, 3, 4])
>>> view = memoryview(arr)
>>> view.shape
(4,)
>>> view.format
'H'
>>> view[0:2].tolist()
[1, 2]

>>> buf = b"\x05\x00\x00\x00"
>>> view = memoryview(buf).cast("i")
>>> view.tolist()
[5]
>>> view = memoryview(buf).cast("h", (2, 1))
>>> view.tolist()
[[5], [0]]

Use memoryview if the API does not support offset and size

# One large preallocated buffer; only a small window of it is filled below.
big_buffer = bytearray(10 ** 6)

with open("some_file", "rb") as file:
    # readinto() is given no offset or size here, so a memoryview slice
    # selects the first 15 bytes of big_buffer as the target.
    view = memoryview(big_buffer)[:15]
    readlen = file.readinto(view)

    assert readlen == 15

Use struct module to parse/pack simple types (int, float, fixed-size strings...)

Use Binary IO when working with files ("rb" and "wb" modes)

Proper buffer management is the key to your program's efficiency

Use memoryview to avoid unneeded copies of data

So, let's sum it up

Diving deeper: Memory layout

How Python manages objects

Heap

All objects are stored in Heap
All objects have PyObject_HEAD containing ref_count and type info
Variable length objects have PyObject_VAR_HEAD containing size info
Container objects have a PyGC_Head

0x1044afa08

0x10457e530

0x107f4e820

/* Header shared by all objects: a reference count plus type info. */
typedef struct {
    Py_ssize_t ob_refcnt;         /* reference count */
    struct _typeobject *ob_type;  /* type info */
} PyObject;

/* Header of variable-length objects: the PyObject header followed by
 * size info. */
typedef struct {
    PyObject ob_base;    /* common object header */
    Py_ssize_t ob_size;  /* size info for the variable-length part */
} PyVarObject;

How Python manages objects

Bytes

Bytearray

>>> x = b"Super data"
>>> import sys
>>> sys.getsizeof(x)
43

refcount

type

Super data

\x00 byte

ob_sval

ob_shash

ob_size

Memoryview

Bytes

Bytearray

/* Layout of a bytes object: the variable-size header, the hash, and the
 * data itself, which starts at ob_sval inside the same struct. */
typedef struct {
    PyObject_VAR_HEAD
    Py_hash_t ob_shash;
    char ob_sval[1];

    /* Invariants:
     *     ob_sval contains space for 
           'ob_size+1' elements.
     *     ob_sval[ob_size] == 0.
     *     ob_shash is the hash of the
           string or -1 if not computed yet.
     */
} PyBytesObject;

Memoryview

Bytes

Bytearray

>>> x = bytearray(b"Super data")
>>> sys.getsizeof(x)
67

ob_bytes

ob_alloc

Super data

ob_start

ob_exports

Memoryview

Bytes

Bytearray

/* Layout of a bytearray object: the data is reached through the ob_bytes
 * pointer rather than stored in the struct, and ob_start marks where the
 * logical content begins inside that backing buffer. */
typedef struct {
    PyObject_VAR_HEAD
    Py_ssize_t ob_alloc;
        /* How many bytes allocated */
    char *ob_bytes;     
        /* Physical backing buffer */
    char *ob_start;     
        /* Logical start inside ob_bytes */
    int ob_exports;     
        /* How many buffer exports */
} PyByteArrayObject;

Memoryview

>>> b = bytearray(1000)
>>> size = sys.getsizeof(b)
>>> 
>>> del b[:10]
>>> del b[-10:]
>>> 
>>> sys.getsizeof(b) == size
True

Bytearray is optimized for deleting slices from both ends

PEP 3118 -- Revising the buffer protocol

Buffer protocol

```
Export a `void*` buffer in C
```
```
Any object can support it
```
```
n-dimensional structure support
```
```
memoryview on Python level
```

Buffer protocol

Super data

Bytes

Py_buffer

void* buf

PyObject *obj

Py_ssize_t len

...

Buffer protocol

/* buffer interface */
/* First part of Py_buffer; the "// ..." at the end stands for the remaining
 * fields, which are not shown in this fragment. */
typedef struct bufferinfo {
    void *buf;      /* the exported void* buffer */
    PyObject *obj; /* owned reference */
    Py_ssize_t len;
    Py_ssize_t itemsize;
    int readonly;

    // ...
} Py_buffer;

/* buffer interface */
/* Second part of Py_buffer; the leading "// ..." stands for fields not shown
 * in this fragment. NOTE(review): these presumably describe the
 * n-dimensional structure of the exported data -- confirm against the
 * C API documentation. */
typedef struct bufferinfo {
    // ...

    int ndim;
    char *format;
    Py_ssize_t *shape;
    Py_ssize_t *strides;
    Py_ssize_t *suboffsets;
    void *internal;
} Py_buffer;

/* Obtain a PyBUF_SIMPLE buffer view of np_array, run the long C processing
 * between Py_BEGIN_ALLOW_THREADS and Py_END_ALLOW_THREADS, then release the
 * view. Returns 0 on success, or -1 if the buffer could not be obtained. */
static int
process_data(PyObject *np_array)
{
    Py_buffer view;
    int rc = PyObject_GetBuffer(np_array, &view, PyBUF_SIMPLE);

    if (rc != 0) {
        return -1;
    }

    Py_BEGIN_ALLOW_THREADS
    /* Long complex processing in C */
    Py_END_ALLOW_THREADS

    /* Every successful PyObject_GetBuffer() is paired with a release. */
    PyBuffer_Release(&view);
    return 0;
}

Buffer protocol

Bytes

Bytearray

>>> x = memoryview(b"Super data")
>>> sys.getsizeof(x)
192

Memoryview

_PyManagedBufferObject

PyMemoryViewObject

Py_buffer

Py_buffer

hash

flags

exports

flags

exports

Bytes

Bytearray

Memoryview

/* Managed buffer object: holds a Py_buffer (master) together with the number
 * of memoryviews directly exported from it. */
typedef struct {
    PyObject_HEAD
    int flags;
        /* state flags */
    Py_ssize_t exports;
        /* number of direct memoryview exports */
    Py_buffer master;
} _PyManagedBufferObject;

/* memoryview object: points at its managed buffer (mbuf) and keeps its own
 * Py_buffer copy (view). ob_array is the variable-length tail that holds
 * shape, strides and suboffsets. */
typedef struct {
    PyObject_VAR_HEAD
    _PyManagedBufferObject *mbuf;
        /* managed buffer */
    Py_hash_t hash;
        /* hash value for read-only views */
    int flags; 
        /* state flags */
    Py_ssize_t exports;
        /* number of buffer re-exports */
    Py_buffer view;
        /* private copy of the exporter's view */
    PyObject *weakreflist;
    Py_ssize_t ob_array[1];
        /* shape, strides, suboffsets */
} PyMemoryViewObject;

Questions?

```
Profile (vmprof, cProfile)
```
```
Benchmark (perf)
```
```
100% Coverage
```

Always do functional/integration testing!

Before you speed it up

What is different in C?

```
No struct module we can use
```

No memoryview, only direct Buffer protocol.

Very little of Python is available in the C API (No time, os, sys, etc.)