>>> from blaze import Symbol, discover, compute
>>> import pandas as pd
>>> df = pd.DataFrame({'name': ['Alice', 'Bob', 'Forrest', 'Bubba'],
... 'amount': [10, 20, 30, 40]})
...
>>> t = Symbol('t', discover(df))
>>> t.amount.sum()
sum(t.amount)
>>> compute(t.amount.sum(), df)
100
>>> compute(t.amount.sum(), df.values.tolist())
100
>>> compute(t.amount.sum(), df.to_records(index=False))
100
>>> from blaze import into, drop
>>> import numpy as np
>>> into(list, df)
[(10, 'Alice'), (20, 'Bob'), (30, 'Forrest'), (40, 'Bubba')]
>>> into(np.ndarray, df)
rec.array([(10, 'Alice'), (20, 'Bob'), (30, 'Forrest'), (40, 'Bubba')],
dtype=[('amount', '<i8'), ('name', 'O')])
>>> into('sqlite:///db.db::t', df)
<blaze.data.sql.SQL at 0x108278fd0>
>>> drop(_) # remove the database for the next example
>>> result = into(pd.DataFrame, into('sqlite:///db.db::t', df))
>>> result
amount name
0 10 Alice
1 20 Bob
2 30 Forrest
3 40 Bubba
>>> type(result)
pandas.core.frame.DataFrame
>>> from blaze import Data
>>> d = Data(df)
>>> d.amount.dshape
dshape("4 * int64")
>>> d.amount.
d.amount.count d.amount.max d.amount.shape
d.amount.count_values d.amount.mean d.amount.sort
d.amount.distinct d.amount.min d.amount.std
d.amount.dshape d.amount.ndim d.amount.sum
d.amount.fields d.amount.nelements d.amount.truncate
d.amount.head d.amount.nrows d.amount.utcfromtimestamp
d.amount.isidentical d.amount.nunique d.amount.var
d.amount.label d.amount.relabel
d.amount.map d.amount.schema
>>> d.name.dshape
dshape("4 * string")
>>> d.name.
d.name.count d.name.head d.name.max d.name.nunique
d.name.count_values d.name.isidentical d.name.min d.name.relabel
d.name.distinct d.name.label d.name.ndim d.name.schema
d.name.dshape d.name.like d.name.nelements d.name.shape
d.name.fields d.name.map d.name.nrows d.name.sort
q)x:"racecar"
q)n:count x
q)all{[x;n;i]x[i]=x[n-i+1]}[x;n]each til _:[n%2]+1
1b
Check if a string is a palindrome
q)-1 x
racecar
-1
q)1 x
racecar1
Print to stdout, with and without a newline
Um, integers are callable?
1 divided by cat
q)1 % "cat"
0.01010101 0.01030928 0.00862069
so....
Why KDB+/Q?
*It's a little more nuanced than that
What is kdbpy?
≈16 GB (trip dataset only)
partitioned in KDB+ on date (year.month.day)
vs
blaze
group by on
passenger count
medallion
hack license
sum on
trip time
trip distance
cartesian product of the above
Storage
Compute
Parallelization
>>> from blaze import by, compute, Data
>>> from multiprocessing import Pool
>>> data = Data('trip.bcolz')
>>> pool = Pool() # defaults to the number of logical cores
>>> expr = by(data.passenger_count, avg=data.trip_time_in_secs.mean())
>>> result = compute(expr, map=pool.map) # chunksize defaults to 2 ** 20
>>> result
passenger_count avg
0 0 122.071500
1 1 806.607092
2 2 852.223353
3 3 850.614843
4 4 885.621065
5 5 763.933618
6 6 760.465655
7 7 428.485714
8 8 527.920000
9 9 506.230769
10 129 240.000000
11 208 55.384615
12 255 990.000000
>>> timeit compute(expr, map=pool.map)
1 loops, best of 3: 5.67 s per loop
chunk = symbol('chunk', chunksize * leaf.schema)
(chunk, chunk_expr), (agg, agg_expr) = split(leaf, expr, chunk=chunk)
data_parts = partitions(data, chunksize=(chunksize,))
parts = list(map(curry(compute_chunk, data, chunk, chunk_expr),
data_parts))
# ... concatenate parts into `intermediate` (code omitted) ...
result = compute(agg_expr, {agg: intermediate})