The same query in three systems:

SQL:     select avg(amount) from df group by name
Pandas:  df.groupby('name').amount.mean()
Spark:   df.groupBy(df.name).agg({'amount': 'mean'})

To the notebook!
Blaze expressions describe our data. They consist of symbols and operations on those symbols.
>>> from blaze import symbol
>>> t = symbol('t', '1000000 * {name: string, amount: float64}')

Here 't' is the symbol name, and '1000000 * {name: string, amount: float64}' is the datashape: shape + type info.
"t is a one million row table with a string column called 'name' and a float64 column called 'amount'"
Split-apply-combine:

>>> by(t.name, avg=t.amount.mean(), sum=t.amount.sum())
Join:

>>> join(s, t, on_left='name', on_right='alias')

Many more...
resource
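resource turns a URI into something Blaze can compute against. A minimal sketch (the file and table names here are hypothetical):

>>> from blaze import resource
>>> csv = resource('accounts.csv')                 # a local CSV file
>>> db  = resource('sqlite:///data.db::accounts')  # the 'accounts' table in a SQLite database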
A trillion numbers:

x = np.array([5, 3, 1, ... <one trillion numbers>, ... 12, 5, 10])
How do we compute the sum?
x.sum()

Define the problem in Blaze:
>>> from blaze import symbol
>>> x = symbol('x', '1000000000 * int')
>>> x.sum()

By hand, we would chunk the computation:

size = 1000000
chunk = x[size * i:size * (i + 1)]
aggregate[i] = chunk.sum()
aggregate.sum()

Blaze's split derives both pieces from the expression:

>>> from blaze.expr.split import split
>>> split(x, x.sum())
((chunk,     sum(chunk)),        # sum of each chunk
 (aggregate, sum(aggregate)))    # sum of the aggregated results
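To see what split gives us, here is a hand-rolled driver over NumPy chunks. This is only a sketch: compute(expr, {leaf: data}) binds a symbol to concrete data, and the sizes are toy values.

>>> import numpy as np
>>> from blaze import compute
>>> (chunk, chunk_expr), (agg, agg_expr) = split(x, x.sum())
>>> data = np.arange(10)                      # stand-in for the full array
>>> parts = [compute(chunk_expr, {chunk: c}) for c in np.split(data, 2)]
>>> compute(agg_expr, {agg: np.array(parts)})
45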
The same machinery handles count; per-chunk counts combine by summation:

size = 1000000
chunk = x[size * i:size * (i + 1)]
aggregate[i] = chunk.count()
aggregate.sum()

>>> split(x, x.count())
((chunk,     count(chunk)),      # count of each chunk
 (aggregate, sum(aggregate)))    # sum of the aggregated results
For mean, each chunk must carry two values, a running total and a count:

size = 1000000
chunk = x[size * i:size * (i + 1)]
aggregate.total[i] = chunk.sum()
aggregate.n[i] = chunk.count()
aggregate.total.sum() / aggregate.n.sum()

>>> split(x, x.mean())
((chunk,     summary(count=count(chunk), total=sum(chunk))),   # sum and count of each chunk
 (aggregate, sum(aggregate.total) / sum(aggregate.count)))     # sum the totals and counts, then divide
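A quick pure-NumPy sanity check of that recurrence, with toy sizes:

>>> import numpy as np
>>> v = np.arange(10.0)
>>> chunks = np.split(v, 5)
>>> total = sum(c.sum() for c in chunks)   # per-chunk sums, summed
>>> n = sum(c.size for c in chunks)        # per-chunk counts, summed
>>> total / n == v.mean()
True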
Split-apply-combine splits too: apply it per chunk, then again on the concatenated results:

size = 1000000
chunk = x[size * i:size * (i + 1)]
by(chunk, freq=chunk.count())
by(aggregate, freq=aggregate.freq.sum())

>>> split(x, by(x, freq=x.count()))
((chunk,     by(chunk, freq=count(chunk))),                    # split-apply-combine on each chunk
 (aggregate, by(aggregate.chunk, freq=sum(aggregate.freq))))   # split-apply-combine on the concatenation of results
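The combine step really is just another split-apply-combine over the concatenated per-chunk tables. A pandas sketch with made-up frequencies:

>>> import pandas as pd
>>> part1 = pd.DataFrame({'x': [1, 2], 'freq': [3, 4]})   # frequencies from chunk 1
>>> part2 = pd.DataFrame({'x': [2, 3], 'freq': [1, 5]})   # frequencies from chunk 2
>>> pd.concat([part1, part2]).groupby('x').freq.sum()
x
1    3
2    5
3    5
Name: freq, dtype: int64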
Data: a 10000 by 10000 by 10000 array of (x,y) coordinates
Chunk: a cube of a billion elements

>>> points = symbol('points', '10000 * 10000 * 10000 * {x: int, y: int}')
>>> chunk = symbol('chunk', '1000 * 1000 * 1000 * {x: int, y: int}')

Variance of x + y:

>>> expr = (points.x + points.y).var(axis=0)
>>> split(points, expr, chunk=chunk)
((chunk,
  summary(n  = count( chunk.x + chunk.y ),
          x  =   sum( chunk.x + chunk.y ),
          x2 =   sum((chunk.x + chunk.y) ** 2))),
 (aggregate,
    (sum(aggregate.x2) / sum(aggregate.n))
  - (sum(aggregate.x)  / sum(aggregate.n)) ** 2))
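A NumPy sanity check of the variance recurrence, with toy sizes (np.var computes the population variance, matching the formula above):

>>> import numpy as np
>>> v = np.arange(12.0)
>>> chunks = np.split(v, 3)
>>> n  = sum(c.size for c in chunks)
>>> s  = sum(c.sum() for c in chunks)
>>> s2 = sum((c ** 2).sum() for c in chunks)
>>> np.isclose(s2 / n - (s / n) ** 2, v.var())
True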
This chunking strategy covers most expressions... except sort and joins.
-- manipulate the data before execution
pre_compute :: Expr, Data -> Data
-- manipulate the expression before execution
optimize :: Expr, Data -> Expr
-- do something with the entire expression before calling compute_up
compute_down :: Expr, Data -> Data
-- compute a single node in our expression tree
compute_up :: Expr, Data -> Data
-- do something after we've traversed the tree
post_compute :: Expr, Data -> Data
-- run the interpreter
compute :: Expr, Data -> Data

Backends hook in by overloading these functions; for example, compute_up for PyTables:

>>> import tables as tb
>>> from blaze.dispatch import dispatch
>>> from blaze.expr import Selection, Head, Field

>>> @dispatch(Selection, tb.Table)
... def compute_up(expr, data):
...     s = eval_str(expr.predicate)  # Produce string like 'amount < 0'
...     return data.read_where(s)     # Use PyTables read_where method

>>> @dispatch(Head, tb.Table)
... def compute_up(expr, data):
...     return data[:expr.n]          # PyTables supports standard indexing

>>> @dispatch(Field, tb.Table)
... def compute_up(expr, data):
...     return data.col(expr._name)  # Use the PyTables .col method
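
Putting the backend to work, a sketch of the full pipeline (the HDF5 file and table names are hypothetical):

>>> import tables as tb
>>> from blaze import symbol, compute
>>> t = symbol('t', 'var * {name: string, amount: float64}')
>>> h5 = tb.open_file('accounts.h5')   # assumes an existing HDF5 file
>>> accounts = h5.root.accounts        # ...containing an 'accounts' Table
>>> compute(t[t.amount < 0].name, accounts)  # compute traverses the tree, dispatching compute_up by type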