>>> from blaze import Symbol, discover, compute
>>> import pandas as pd
>>> df = pd.DataFrame({'name': ['Alice', 'Bob', 'Forrest', 'Bubba'],
...                    'amount': [10, 20, 30, 40]})
...
>>> t = Symbol('t', discover(df))
>>> t.amount.sum()
sum(t.amount)
>>> compute(t.amount.sum(), df)
100
>>> compute(t.amount.sum(), df.values.tolist())
100
>>> compute(t.amount.sum(), df.to_records(index=False))
100
>>> from blaze import into, drop
>>> import numpy as np
>>> into(list, df)
[(10, 'Alice'), (20, 'Bob'), (30, 'Forrest'), (40, 'Bubba')]
>>> into(np.ndarray, df)
rec.array([(10, 'Alice'), (20, 'Bob'), (30, 'Forrest'), (40, 'Bubba')],
      dtype=[('amount', '<i8'), ('name', 'O')])
>>> into('sqlite:///db.db::t', df)
<blaze.data.sql.SQL at 0x108278fd0>
>>> drop(_)  # remove the database for the next example
>>> result = into(pd.DataFrame, into('sqlite:///db.db::t', df))
>>> result
   amount     name
0      10    Alice
1      20      Bob
2      30  Forrest
3      40    Bubba
>>> type(result)
pandas.core.frame.DataFrame
>>> from blaze import Data
>>> d = Data(df)
>>> d.amount.dshape
dshape("4 * int64")
>>> d.amount.
d.amount.count             d.amount.max               d.amount.shape
d.amount.count_values      d.amount.mean              d.amount.sort
d.amount.distinct          d.amount.min               d.amount.std
d.amount.dshape            d.amount.ndim              d.amount.sum
d.amount.fields            d.amount.nelements         d.amount.truncate
d.amount.head              d.amount.nrows             d.amount.utcfromtimestamp
d.amount.isidentical       d.amount.nunique           d.amount.var
d.amount.label             d.amount.relabel
d.amount.map               d.amount.schema
>>> d.name.dshape
dshape("4 * string")
>>> d.name.
d.name.count         d.name.head          d.name.max           d.name.nunique
d.name.count_values  d.name.isidentical   d.name.min           d.name.relabel
d.name.distinct      d.name.label         d.name.ndim          d.name.schema
d.name.dshape        d.name.like          d.name.nelements     d.name.shape
d.name.fields        d.name.map           d.name.nrows         d.name.sort
q)x:"racecar"
q)n:count x
q)all{[x;n;i]x[i]=x[n-i+1]}[x;n]each til _:[n%2]+1
1b
Check if a string is a palindrome
q)-1 x
racecar
-1
q)1 x
racecar1
Print to stdout, with and without a newline
Um, integers are callable?
1 divided by cat

q)1 % "cat"
0.01010101 0.01030928 0.00862069
so....
Why KDB+/Q?
*It's a little more nuanced than that
What is kdbpy?
≈16 GB (trip dataset only)
partitioned in KDB+ on date (year.month.day)
vs
blaze
group by on
passenger count
medallion
hack license
sum on
trip time
trip distance
cartesian product of the above


Storage
Compute
Parallelization
>>> from blaze import by, compute, Data
>>> from multiprocessing import Pool
>>> data = Data('trip.bcolz')
>>> pool = Pool()  # default to the number of logical cores
>>> expr = by(data.passenger_count, avg=data.trip_time_in_secs.mean())
>>> result = compute(expr, map=pool.map)  # chunksize defaults to 2 ** 20
>>> result
    passenger_count         avg
0                 0  122.071500
1                 1  806.607092
2                 2  852.223353
3                 3  850.614843
4                 4  885.621065
5                 5  763.933618
6                 6  760.465655
7                 7  428.485714
8                 8  527.920000
9                 9  506.230769
10              129  240.000000
11              208   55.384615
12              255  990.000000
>>> timeit compute(expr, map=pool.map)
1 loops, best of 3: 5.67 s per loop
chunk = symbol('chunk', chunksize * leaf.schema)
(chunk, chunk_expr), (agg, agg_expr) = split(leaf, expr, chunk=chunk)
data_parts = partitions(data, chunksize=(chunksize,))
parts = list(map(curry(compute_chunk, data, chunk, chunk_expr),
                       data_parts))
# concatenate parts into intermediate
result = compute(agg_expr, {agg: intermediate})