Binary data in Python with a bit of C spice on top
Taras Voinarovskyi
Taras Voinarovskyi

What's on today's menu?
- binary data types
- modules for working with binary data
- binary IO and sockets
- Binary types memory layout
- The Buffer protocol
- Tips on writing C extensions
We had issues before...
>>> user_name = "Alex Smith"
>>> unicode(user_name)
u'Alex Smith'
>>>
>>> unicode_name = u"Mark Smith"
>>> str(unicode_name)
'Mark Smith'
>>> ...
>>> user_name = "Тарас"
>>> user_name
'\xd0\xa2\xd0\xb0\xd1\x80\xd0\xb0\xd1\x81'
>>> unicode(user_name)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'ascii' codec can't decode byte \
0xd0 in position 0: ordinal not in range(128)
>>>
>>> user_name.decode("utf-8")
u'\u0422\u0430\u0440\u0430\u0441'
As of Python3 we have str and bytes
>>> byte_data = b"Super data"
>>> str_data = "Супер дані"
>>> byte_data[0] # Bytes return int
83
>>> str_data[0] # Str returns a substring
'С'
>>> len(byte_data) # Counts bytes
10
>>> len(str_data) # Counts characters
10
Python 2 returned string data for binary files
>>> open("test_file", "r").read()
'\xd0\xa1\xd1\x83\xd0\xbf\xd0\xb5\xd1\x80\n'
>>> open("test_file", "rb").read()
'\xd0\xa1\xd1\x83\xd0\xbf\xd0\xb5\xd1\x80\n'
>>> x = open("test_file", "rb").read()
>>> y = open("test_file", "r").read()
>>> x == y
True
>>> open("test_file", "r").read()
'Супер\n'
>>> open("test_file", "rb").read()
b'\xd0\xa1\xd1\x83\xd0\xbf\xd0\xb5\xd1\x80\n'
>>> x = open("test_file", "rb").read()
>>> y = open("test_file", "r").read()
>>> x == y
False
As of Python3 binary files actually return binary data
Other binary data types
>>> buf = bytearray(7)
>>> buf
bytearray(b'\x00\x00\x00\x00\x00\x00\x00')
>>> buf.append(1)
>>> buf
bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x01')
>>> buf[0:5] = b"12345"
>>> buf
bytearray(b'12345\x00\x00\x01')
>>>
>>> mview = memoryview(buf)
>>> mview
<memory at 0x103cafdc8>
>>> mview[0:5] = b"54321"
>>> buf
bytearray(b'54321\x00\x00\x01')
Python2 has some differences
>>> memoryview(b"123")[0]
'1'
>>> bytearray(b"123")[0]
49
>>> bytes(b"123")[0]
'1'
>>> bytes(6)
'6'
>>> bytearray(6)
bytearray(b'\x00\x00\x00\x00\x00\x00')
What do you see, Neo?

Count number of messages
ID => Int64
Length => Int32
CRC => UInt32
Version => Int8
Body => Bytes
some_messages.dump
Structs are our allies
>>> import struct
>>> packed = struct.pack(">qiIb", 100, 73, 333, 1)
>>> packed[:12]
b'\x00\x00\x00\x00\x00\x00\x00d\x00\x00\x00I'
>>>
>>> struct.unpack(">qiIb", packed)
(100, 73, 333, 1)
>>>
struct Message
{
long long message_id; /* q */
int length; /* i */
unsigned int crc; /* I */
signed char version; /* b */
};

Be aware of native alignment and size
>>> struct.pack("I", 5)
b'\x05\x00\x00\x00'
>>> struct.pack(">I", 5)
b'\x00\x00\x00\x05'
>>>
>>> struct.calcsize("bI")
8
>>> struct.calcsize(">bI")
5
>>> struct.pack("bI", 1, 2)
b'\x01\x00\x00\x00\x02\x00\x00\x00'
>>> struct.pack(">bI", 1, 2)
b'\x01\x00\x00\x00\x02'

Binary file IO
import io
buf = bytearray(struct.calcsize(">qiIb"))
with open("x_file", "rb", buffering=0) as file:
reader = io.BufferedReader(file, buffer_size=10 * 1024)
while True:
readlen = reader.readinto(buf)
assert readlen == len(buf)
msg = struct.unpack(">qiIb", buf)
message_len = msg[1]
reader.seek(message_len, io.SEEK_CUR)
Efficient socket framing
import socket
from .parser import Parser
MAX_READ_SIZE = 2 * 1024
def read(sock: socket.socket, parser: Parser):
data = sock.recv(MAX_READ_SIZE)
if not data:
raise Disconnected()
frames = parser.feed(data)
return frames
import socket
def read_frame(sock: socket.socket):
size = sock.recv(4) # Int32 size
(size,) = struct.unpack(">i", size)
frame = b""
while len(frame) < size:
frame += sock.recv(size - len(frame))
return frame
Let's talk a bit about buffers
Any bytes-like can be a buffer
with open("x_file_index", "wb+") as file:
file.write(b"1234")
file.write(array.array("Q", [1, 2, 3]))
file.write(bytearray(b"1234"))
file.write(np.array([1, 2, 3, 4]))
memoryview works on bytes-like objects
>>> arr = array.array("H", [1, 2, 3, 4])
>>> view = memoryview(arr)
>>> view.shape
(4,)
>>> view.format
'H'
>>> view[0:2].tolist()
[1, 2]
>>> buf = b"\x05\x00\x00\x00"
>>> view = memoryview(buf).cast("i")
>>> view.tolist()
[5]
>>> view = memoryview(buf).cast("h", (2, 1))
>>> view.tolist()
[[5], [0]]
Use memoryview if the API does not support offset and size
big_buffer = bytearray(1000000)
with open("some_file", "rb") as file:
view = memoryview(big_buffer)[0:15]
readlen = file.readinto(view)
assert readlen == 15
-
Use struct module to parse/pack simple types (int, float, fixed-size strings...)
-
Use Binary IO when working with files ("rb" and "wb" modes)
-
Proper buffer management is the key to your program's efficiency
-
Use memoryview to avoid unneeded copies of data
So, let's sum it up
Diving deeper: Memory layout

How Python manages objects
Heap
-
All objects are stored in Heap
-
All objects have PyObject_HEAD containing ref_count and type info
-
Variable length objects have PyObject_VAR_HEAD containing size info
- Container objects have a PyGC_Head
0x1044afa08
0x10457e530
0x107f4e820
typedef struct {
Py_ssize_t ob_refcnt;
struct _typeobject *ob_type;
} PyObject;
typedef struct {
PyObject ob_base;
Py_ssize_t ob_size;
} PyVarObject;
How Python manages objects
Bytes
Bytearray
>>> x = b"Super data"
>>> import sys
>>> sys.getsizeof(x)
43
refcount
type
Super data
\x00 byte
ob_sval
ob_shash
ob_size
0
8
16
24
32
43
Memoryview
Bytes
Bytearray
typedef struct {
PyObject_VAR_HEAD
Py_hash_t ob_shash;
char ob_sval[1];
/* Invariants:
* ob_sval contains space for
'ob_size+1' elements.
* ob_sval[ob_size] == 0.
* ob_shash is the hash of the
string or -1 if not computed yet.
*/
} PyBytesObject;
Memoryview
Bytes
Bytearray
>>> x = bytearray(b"Super data")
>>> sys.getsizeof(x)
67
ob_bytes
ob_alloc
0
8
16
24
32
Super data
ob_start
40
48
56
ob_exports
0
11
Memoryview
Bytes
Bytearray
typedef struct {
PyObject_VAR_HEAD
Py_ssize_t ob_alloc;
/* How many bytes allocated */
char *ob_bytes;
/* Physical backing buffer */
char *ob_start;
/* Logical start inside ob_bytes */
int ob_exports;
/* How many buffer exports */
} PyByteArrayObject;
Memoryview
>>> b = bytearray(1000)
>>> size = sys.getsizeof(b)
>>>
>>> del b[:10]
>>> del b[-10:]
>>>
>>> sys.getsizeof(b) == size
True
Bytearray is optimized for deleting slices from either end
Buffer protocol
-
Export a `void*` buffer in C
-
Any object can support it
-
n-dimensional structure support
-
memoryview on Python level
Buffer protocol
Super data
Bytes
Py_buffer
void* buf
PyObject *obj
Py_ssize_t len
...
Buffer protocol
/* buffer interface */
typedef struct bufferinfo {
void *buf;
PyObject *obj; /* owned reference */
Py_ssize_t len;
Py_ssize_t itemsize;
int readonly;
// ...
} Py_buffer;
/* buffer interface */
typedef struct bufferinfo {
// ...
int ndim;
char *format;
Py_ssize_t *shape;
Py_ssize_t *strides;
Py_ssize_t *suboffsets;
void *internal;
} Py_buffer;
static int
process_data(PyObject *np_array)
{
Py_buffer buf;
if (PyObject_GetBuffer(
np_array, &buf, PyBUF_SIMPLE) != 0)
return -1;
Py_BEGIN_ALLOW_THREADS
// Long complex processing in C
Py_END_ALLOW_THREADS
PyBuffer_Release(&buf);
return 0;
}
Buffer protocol
Bytes
Bytearray
>>> x = memoryview(b"Super data")
>>> sys.getsizeof(x)
192
Memoryview
_PyManagedBufferObject
PyMemoryViewObject
Py_buffer
Py_buffer
hash
flags
exports
flags
exports
Bytes
Bytearray
Memoryview
typedef struct {
PyObject_HEAD
int flags;
/* state flags */
Py_ssize_t exports;
/* number of direct memoryview exports */
Py_buffer master;
} _PyManagedBufferObject;
typedef struct {
PyObject_VAR_HEAD
_PyManagedBufferObject *mbuf;
/* managed buffer */
Py_hash_t hash;
/* hash value for read-only views */
int flags;
/* state flags */
Py_ssize_t exports;
/* number of buffer re-exports */
Py_buffer view;
/* private copy of the exporter's view */
PyObject *weakreflist;
Py_ssize_t ob_array[1];
/* shape, strides, suboffsets */
} PyMemoryViewObject;
Questions?
-
Profile (vmprof, cProfile)
-
Benchmark (perf)
-
100% Coverage
-
Always do functional/integration testing!
Before you speed it up
What is different in C?
-
No struct module we can use
-
No memoryview, only direct Buffer protocol.
-
Very little of Python is available in the C API (No time, os, sys, etc.)
Binary data in Python with a bit of C spice on top
By Taras Voinarovskyi
Binary data in Python with a bit of C spice on top
You have that annoying C++ client that refuses to send you JSON data no matter what? Well, that's not a big problem, as long as we keep a calm mind and use the correct data types, modules, and tricks in Python. We can even dive a bit deeper into the C world ourselves to get the secret spice of efficiency.
- 1,574