Binary data in Python with a bit of C spice on top
Taras Voinarovskyi
Taras Voinarovskyi
What's on today's menu?
We had issues before...
>>> user_name = "Alex Smith"
>>> unicode(user_name)
u'Alex Smith'
>>>
>>> unicode_name = u"Mark Smith"
>>> str(unicode_name)
'Mark Smith'
>>> ...
>>> user_name = "Тарас"
>>> user_name
'\xd0\xa2\xd0\xb0\xd1\x80\xd0\xb0\xd1\x81'
>>> unicode(user_name)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'ascii' codec can't decode byte \
0xd0 in position 0: ordinal not in range(128)
>>>
>>> user_name.decode("utf-8")
u'\u0422\u0430\u0440\u0430\u0441'
As of Python3 we have str and bytes
>>> byte_data = b"Super data"
>>> str_data = "Супер дані"
>>> byte_data[0] # Bytes return int
83
>>> str_data[0] # Str returns a substring
'С'
>>> len(byte_data) # Counts bytes
10
>>> len(str_data) # Counts characters
10
Python 2 returned string data for binary files
>>> open("test_file", "r").read()
'\xd0\xa1\xd1\x83\xd0\xbf\xd0\xb5\xd1\x80\n'
>>> open("test_file", "rb").read()
'\xd0\xa1\xd1\x83\xd0\xbf\xd0\xb5\xd1\x80\n'
>>> x = open("test_file", "rb").read()
>>> y = open("test_file", "r").read()
>>> x == y
True
>>> open("test_file", "r").read()
'Супер\n'
>>> open("test_file", "rb").read()
b'\xd0\xa1\xd1\x83\xd0\xbf\xd0\xb5\xd1\x80\n'
>>> x = open("test_file", "rb").read()
>>> y = open("test_file", "r").read()
>>> x == y
False
As of Python3 binary files actually return binary data
Other binary data types
>>> buf = bytearray(7)
>>> buf
bytearray(b'\x00\x00\x00\x00\x00\x00\x00')
>>> buf.append(1)
>>> buf
bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x01')
>>> buf[0:5] = b"12345"
>>> buf
bytearray(b'12345\x00\x00\x01')
>>>
>>> mview = memoryview(buf)
>>> mview
<memory at 0x103cafdc8>
>>> mview[0:5] = b"54321"
>>> buf
bytearray(b'54321\x00\x00\x01')
Python2 has some differences
>>> memoryview(b"123")[0]
'1'
>>> bytearray(b"123")[0]
49
>>> bytes(b"123")[0]
'1'
>>> bytes(6)
'6'
>>> bytearray(6)
bytearray(b'\x00\x00\x00\x00\x00\x00')
What do you see, Neo?
Count number of messages
ID => Int64
Length => Int32
CRC => UInt32
Version => Int8
Body => Bytes
some_messages.dump
Structs are our allies
>>> import struct
>>> packed = struct.pack(">qiIb", 100, 73, 333, 1)
>>> packed[:12]
b'\x00\x00\x00\x00\x00\x00\x00d\x00\x00\x00I'
>>>
>>> struct.unpack(">qiIb", packed)
(100, 73, 333, 1)
>>>
struct Message
{
long long message_id; /* q */
int length; /* i */
unsigned int crc; /* I */
signed char version; /* b */
};
Be aware of native alignment and size
>>> struct.pack("I", 5)
b'\x05\x00\x00\x00'
>>> struct.pack(">I", 5)
b'\x00\x00\x00\x05'
>>>
>>> struct.calcsize("bI")
8
>>> struct.calcsize(">bI")
5
>>> struct.pack("bI", 1, 2)
b'\x01\x00\x00\x00\x02\x00\x00\x00'
>>> struct.pack(">bI", 1, 2)
b'\x01\x00\x00\x00\x02'
Binary file IO
import io
# Reusable header buffer. The ">qiIb" layout is ID (Int64), Length (Int32),
# CRC (UInt32), Version (Int8); ">" means big-endian with no padding.
buf = bytearray(struct.calcsize(">qiIb"))
with open("x_file", "rb", buffering=0) as file:
    # The raw file is opened unbuffered and wrapped explicitly so the
    # buffer size is under our control.
    reader = io.BufferedReader(file, buffer_size=10 * 1024)
    while True:
        readlen = reader.readinto(buf)
        if readlen == 0:
            # Clean end of file: the previous message was the last one.
            break
        if readlen != len(buf):
            # Explicit error instead of `assert`, which is stripped under -O.
            raise EOFError(
                "truncated message header: got %d of %d bytes"
                % (readlen, len(buf))
            )
        msg = struct.unpack(">qiIb", buf)
        message_len = msg[1]
        # Skip over the body; only the headers are inspected here.
        reader.seek(message_len, io.SEEK_CUR)
Efficient socket framing
import socket
from .parser import Parser
MAX_READ_SIZE = 2 * 1024
def read(sock: socket.socket, parser: Parser):
    """Feed one received chunk to *parser* and return what ``feed`` returns.

    A single ``recv`` call requests at most ``MAX_READ_SIZE`` bytes.

    Raises:
        Disconnected: if ``recv`` returns no data.
    """
    chunk = sock.recv(MAX_READ_SIZE)
    if chunk:
        return parser.feed(chunk)
    raise Disconnected()
import socket
def read_frame(sock: socket.socket):
    """Read one length-prefixed frame from *sock* and return its payload.

    The frame starts with a big-endian Int32 payload size, followed by
    exactly that many payload bytes.

    Returns:
        The payload as ``bytes`` (empty for a zero-length frame).

    Raises:
        ConnectionError: if the peer closes the connection before the
            whole frame has arrived.
        ValueError: if the size prefix is negative.
    """
    header = _recv_exactly(sock, 4)  # Int32 size
    # struct.unpack always returns a tuple, even for a single field.
    (size,) = struct.unpack(">i", header)
    if size < 0:
        raise ValueError("negative frame size: %d" % size)
    return bytes(_recv_exactly(sock, size))


def _recv_exactly(sock, nbytes):
    """Receive exactly *nbytes* bytes from *sock* into a new bytearray."""
    buf = bytearray(nbytes)
    # Writing through a memoryview fills `buf` in place, so partial reads
    # do not re-copy what has already been received.
    view = memoryview(buf)
    received = 0
    while received < nbytes:
        chunk_len = sock.recv_into(view[received:])
        if chunk_len == 0:
            # recv_into() returning 0 means the peer closed the connection.
            raise ConnectionError(
                "connection closed after %d of %d bytes" % (received, nbytes)
            )
        received += chunk_len
    return buf
Let's talk a bit about buffers
Any bytes-like can be a buffer
# Each write below is handed a different kind of object; all of them are
# accepted by the binary file's write().
with open("x_file_index", "wb+") as file:
    file.write(b"1234")                      # bytes literal
    file.write(array.array("Q", [1, 2, 3]))  # array.array, typecode "Q"
    file.write(bytearray(b"1234"))           # bytearray
    file.write(np.array([1, 2, 3, 4]))       # array built with np.array
memoryview works on bytes-like objects
>>> arr = array.array("H", [1, 2, 3, 4])
>>> view = memoryview(arr)
>>> view.shape
(4,)
>>> view.format
'H'
>>> view[0:2].tolist()
[1, 2]
>>> buf = b"\x05\x00\x00\x00"
>>> view = memoryview(buf).cast("i")
>>> view.tolist()
[5]
>>> view = memoryview(buf).cast("h", (2, 1))
>>> view.tolist()
[[5], [0]]
Use memoryview if API does not support offset and size
# Large pre-allocated destination; only its first 15 bytes are filled here.
big_buffer = bytearray(10 ** 6)
with open("some_file", "rb") as file:
    # readinto() takes no offset/size arguments, so a memoryview slice
    # selects the window of big_buffer to fill (no copy is made).
    view = memoryview(big_buffer)[:15]
    readlen = file.readinto(view)
    assert readlen == 15
Use struct module to parse/pack simple types (int, float, fixed-size strings...)
Use Binary IO when working with files ("rb" and "wb" modes)
Proper buffer management is the key to your program's efficiency
Use memoryview to avoid unneeded copies of data
So, let's sum it up
Diving deeper: Memory layout
How Python manages objects
Heap
All objects are stored in Heap
All objects have PyObject_HEAD containing ref_count and type info
Variable length objects have PyObject_VAR_HEAD containing size info
0x1044afa08
0x10457e530
0x107f4e820
typedef struct {
Py_ssize_t ob_refcnt;
struct _typeobject *ob_type;
} PyObject;
typedef struct {
PyObject ob_base;
Py_ssize_t ob_size;
} PyVarObject;
How Python manages objects
Bytes
Bytearray
>>> x = b"Super data"
>>> import sys
>>> sys.getsizeof(x)
43
refcount
type
Super data
\x00 byte
ob_sval
ob_shash
ob_size
0
8
16
24
32
43
Memoryview
Bytes
Bytearray
typedef struct {
PyObject_VAR_HEAD
Py_hash_t ob_shash;
char ob_sval[1];
/* Invariants:
* ob_sval contains space for
'ob_size+1' elements.
* ob_sval[ob_size] == 0.
* ob_shash is the hash of the
string or -1 if not computed yet.
*/
} PyBytesObject;
Memoryview
Bytes
Bytearray
>>> x = bytearray(b"Super data")
>>> sys.getsizeof(x)
67
ob_bytes
ob_alloc
0
8
16
24
32
Super data
ob_start
40
48
56
ob_exports
0
11
Memoryview
Bytes
Bytearray
typedef struct {
PyObject_VAR_HEAD
Py_ssize_t ob_alloc;
/* How many bytes allocated */
char *ob_bytes;
/* Physical backing buffer */
char *ob_start;
/* Logical start inside ob_bytes */
int ob_exports;
/* How many buffer exports */
} PyByteArrayObject;
Memoryview
>>> b = bytearray(1000)
>>> size = sys.getsizeof(b)
>>>
>>> del b[:10]
>>> del b[-10:]
>>>
>>> sys.getsizeof(b) == size
True
Bytearray is optimized for removing slices from either end
Buffer protocol
Export a `void*` buffer in C
Any object can support it
n-dimensional structure support
memoryview on Python level
Buffer protocol
Super data
Bytes
Py_buffer
void* buf
PyObject *obj
Py_ssize_t len
...
Buffer protocol
/* buffer interface */
typedef struct bufferinfo {
void *buf;
PyObject *obj; /* owned reference */
Py_ssize_t len;
Py_ssize_t itemsize;
int readonly;
// ...
} Py_buffer;
/* buffer interface */
typedef struct bufferinfo {
// ...
int ndim;
char *format;
Py_ssize_t *shape;
Py_ssize_t *strides;
Py_ssize_t *suboffsets;
void *internal;
} Py_buffer;
/*
 * Acquire a PyBUF_SIMPLE buffer view of np_array, run the processing
 * section between Py_BEGIN_ALLOW_THREADS / Py_END_ALLOW_THREADS, then
 * release the view.
 *
 * Returns 0 on success, or -1 if PyObject_GetBuffer() fails.
 */
static int
process_data(PyObject *np_array)
{
    Py_buffer view;

    if (PyObject_GetBuffer(np_array, &view, PyBUF_SIMPLE) != 0) {
        return -1;
    }

    Py_BEGIN_ALLOW_THREADS
    // Long complex processing in C
    Py_END_ALLOW_THREADS

    /* Every successful PyObject_GetBuffer() is paired with a release. */
    PyBuffer_Release(&view);
    return 0;
}
Buffer protocol
Bytes
Bytearray
>>> x = memoryview(b"Super data")
>>> sys.getsizeof(x)
192
Memoryview
_PyManagedBufferObject
PyMemoryViewObject
Py_buffer
Py_buffer
hash
flags
exports
flags
exports
Bytes
Bytearray
Memoryview
typedef struct {
PyObject_HEAD
int flags;
/* state flags */
Py_ssize_t exports;
/* number of direct memoryview exports */
Py_buffer master;
} _PyManagedBufferObject;
typedef struct {
PyObject_VAR_HEAD
_PyManagedBufferObject *mbuf;
/* managed buffer */
Py_hash_t hash;
/* hash value for read-only views */
int flags;
/* state flags */
Py_ssize_t exports;
/* number of buffer re-exports */
Py_buffer view;
/* private copy of the exporter's view */
PyObject *weakreflist;
Py_ssize_t ob_array[1];
/* shape, strides, suboffsets */
} PyMemoryViewObject;
Profile (vmprof, cProfile)
Benchmark (perf)
100% Coverage
Always do functional/integration testing!
Before you speed it up
What is different in C?
No struct module we can use
No memoryview, only direct Buffer protocol.
Very little of Python is available in C API (No time, os, sys, etc.)