select avg(amount) from df group by name
df.groupby('name').amount.mean()
df.groupBy(df.name).agg({'amount': 'mean'})
To the notebook!
Blaze expressions describe our data. They consist of symbols and operations on those symbols.
>>> from blaze import symbol
>>> t = symbol('t', '1000000 * {name: string, amount: float64}')
The first argument is the symbol's name; the second is its datashape: shape + type info.
"t is a one-million-row table with a string column called 'name' and a float64 column called 'amount'"
>>> from blaze import by
>>> by(t.name, avg=t.amount.mean(), sum=t.amount.sum())
Split-apply-combine
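The same expression computes against real backends; a minimal sketch with pandas (toy data, with t redefined here with a variable-length datashape to match it):

>>> from blaze import compute
>>> import pandas as pd
>>> t = symbol('t', 'var * {name: string, amount: float64}')
>>> df = pd.DataFrame({'name': ['Alice', 'Bob', 'Alice'],
...                    'amount': [100.0, 200.0, 300.0]})
>>> compute(by(t.name, avg=t.amount.mean(), sum=t.amount.sum()), df)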
Join
>>> from blaze import join
>>> join(s, t, on_left='name', on_right='alias')
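(Here s is a second table; the t defined earlier has no 'alias' column, so this sketch assumes hypothetical schemas that make the join line up:)

>>> s = symbol('s', 'var * {name: string, balance: float64}')
>>> t = symbol('t', 'var * {alias: string, amount: float64}')
>>> join(s, t, on_left='name', on_right='alias')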
Many more...
resource
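resource turns a URI into a concrete data object that expressions can be computed against; a sketch, with a hypothetical SQLite database:

>>> from blaze import resource
>>> accounts = resource('sqlite:///accounts.db::accounts')  # hypothetical file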
x = np.array([5, 3, 1, ... <one trillion numbers>, ... 12, 5, 10])
A trillion numbers
How do we compute the sum?
x.sum()
Define the problem in Blaze
>>> from blaze import symbol
>>> x = symbol('x', '1000000000000 * int')
>>> x.sum()
size = 1000000
for i in range(len(x) // size):
    chunk = x[size * i:size * (i + 1)]
    aggregate[i] = chunk.sum()
aggregate.sum()
>>> from blaze.expr.split import split
>>> split(x, x.sum())
((chunk, sum(chunk)),           # sum of each chunk
 (aggregate, sum(aggregate)))   # sum of the aggregated results
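The same recipe by hand in NumPy, with toy sizes standing in for the trillion elements (a sketch, not Blaze code):

import numpy as np

x = np.arange(10_000_000)                # stand-in for the huge array
size = 1_000_000                         # elements per chunk
aggregate = np.empty(len(x) // size)     # one partial result per chunk

for i in range(len(x) // size):
    chunk = x[size * i:size * (i + 1)]   # pull one chunk into memory
    aggregate[i] = chunk.sum()           # sum of each chunk

total = aggregate.sum()                  # sum of the aggregated results
assert total == x.sum()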
size = 1000000
for i in range(len(x) // size):
    chunk = x[size * i:size * (i + 1)]
    aggregate[i] = chunk.count()
aggregate.sum()
>>> from blaze.expr.split import split
>>> split(x, x.count())
((chunk, count(chunk)),         # count of each chunk
 (aggregate, sum(aggregate)))   # sum of the aggregated results
size = 1000000
for i in range(len(x) // size):
    chunk = x[size * i:size * (i + 1)]
    aggregate.total[i] = chunk.sum()
    aggregate.count[i] = chunk.count()
aggregate.total.sum() / aggregate.count.sum()
>>> from blaze.expr.split import split
>>> split(x, x.mean())
((chunk, summary(count=count(chunk), total=sum(chunk))),    # sum and count of each chunk
 (aggregate, sum(aggregate.total) / sum(aggregate.count)))  # sum the totals and counts, then divide
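By hand in NumPy, carrying both accumulators per chunk (toy sizes; a sketch):

import numpy as np

x = np.arange(10_000_000, dtype='float64')
size = 1_000_000
n_chunks = len(x) // size
total = np.empty(n_chunks)               # per-chunk sums
count = np.empty(n_chunks)               # per-chunk counts

for i in range(n_chunks):
    chunk = x[size * i:size * (i + 1)]
    total[i] = chunk.sum()
    count[i] = len(chunk)

mean = total.sum() / count.sum()         # equals x.mean()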
size = 1000000
for i in range(len(x) // size):
    chunk = x[size * i:size * (i + 1)]
    aggregate[i] = by(chunk, freq=chunk.count())
by(aggregate, freq=aggregate.freq.sum())
>>> from blaze.expr.split import split
>>> split(x, by(x, freq=x.count()))
((chunk, by(chunk, freq=count(chunk))),                       # split-apply-combine on each chunk
 (aggregate, by(aggregate.chunk, freq=sum(aggregate.freq))))  # split-apply-combine on the concatenation of results
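By hand, with pandas doing the per-chunk and final group-bys (toy data; a sketch):

import numpy as np
import pandas as pd

x = np.random.randint(0, 10, size=10_000_000)
size = 1_000_000

# split-apply-combine on each chunk
parts = [pd.Series(x[i:i + size]).value_counts()
         for i in range(0, len(x), size)]

# split-apply-combine on the concatenation of the results
freq = pd.concat(parts).groupby(level=0).sum()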
Variance of x + y
Data: a 10000 by 10000 by 10000 array of (x, y) coordinates
Chunk: a cube of a billion elements

>>> points = symbol('points', '10000 * 10000 * 10000 * {x: int, y: int}')
>>> chunk = symbol('chunk', '1000 * 1000 * 1000 * {x: int, y: int}')
>>> expr = (points.x + points.y).var(axis=0)
>>> split(points, expr, chunk=chunk)
((chunk,
  summary(n  = count(chunk.x + chunk.y),
          x  = sum(chunk.x + chunk.y),
          x2 = sum((chunk.x + chunk.y) ** 2))),
 (aggregate,
  sum(aggregate.x2) / sum(aggregate.n)
  - (sum(aggregate.x) / sum(aggregate.n)) ** 2))
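A quick NumPy check of the aggregate formula on toy data, using the identity var = E[x**2] - E[x]**2:

import numpy as np

data = np.random.randn(4, 1000)                # four toy "chunks"
n = np.array([c.size for c in data])           # count per chunk
s = np.array([c.sum() for c in data])          # sum per chunk
s2 = np.array([(c ** 2).sum() for c in data])  # sum of squares per chunk

var = s2.sum() / n.sum() - (s.sum() / n.sum()) ** 2
assert np.isclose(var, data.var())             # population variance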
...except sorts and joins
-- manipulate the data before execution
pre_compute :: Expr, Data -> Data
-- manipulate the expression before execution
optimize :: Expr, Data -> Expr
-- do something with the entire expression before calling compute_up
compute_down :: Expr, Data -> Data
-- compute a single node in our expression tree
compute_up :: Expr, Data -> Data
-- do something after we've traversed the tree
post_compute :: Expr, Data -> Data
-- run the interpreter
compute :: Expr, Data -> Data
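A self-contained toy of how these hooks compose (a sketch, not Blaze's source; compute_down omitted, expressions modeled as nested tuples):

def pre_compute(expr, data):            # default hooks are no-ops
    return data

def optimize(expr, data):
    return expr

def post_compute(expr, data):
    return data

def compute_up(expr, data):             # compute a single node
    op = expr[0]
    if op == 'field':
        return data[expr[1]]
    if op == 'head':
        return data[:expr[1]]
    raise NotImplementedError(op)

def walk(expr, data):                   # bottom-up traversal
    if isinstance(expr[-1], tuple):     # compute the child first
        data = walk(expr[-1], data)
    return compute_up(expr, data)

def compute(expr, data):                # run the interpreter
    data = pre_compute(expr, data)
    expr = optimize(expr, data)
    return post_compute(expr, walk(expr, data))

compute(('head', 2, ('field', 'amount', 'leaf')),
        {'amount': [100, -200, 300, -400]})      # -> [100, -200]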
>>> import tables as tb
>>> from blaze.dispatch import dispatch
>>> from blaze.expr import Selection, Head, Field

>>> @dispatch(Selection, tb.Table)
... def compute_up(expr, data):
...     s = eval_str(expr.predicate)  # produce a string like 'amount < 0'
...     return data.read_where(s)     # use PyTables' read_where method

>>> @dispatch(Head, tb.Table)
... def compute_up(expr, data):
...     return data[:expr.n]          # PyTables supports standard indexing

>>> @dispatch(Field, tb.Table)
... def compute_up(expr, data):
...     return data.col(expr._name)   # use the PyTables .col method
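These handlers in action (a sketch; assumes a hypothetical HDF5 file accounts.h5 whose table matches t's schema):

>>> from blaze import compute
>>> t = symbol('t', 'var * {name: string, amount: float64}')
>>> f = tb.open_file('accounts.h5')              # hypothetical file
>>> compute(t[t.amount < 0].head(5), f.root.accounts)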