Jacek Królikowski - @nietaki
Workflow orchestrator for parallel data processing
"Small" data processing
Demand-driven, with back-pressure: only keep as much data in flight (in memory) as you need right now
GenStage is a specification for exchanging events between producers and consumers.
Enum is eager, Stream is lazy, Flow is concurrent
Flow.from_enumerable(...)
|> Flow.map(...)
|> Flow.flat_map(...)
|> Flow.partition(...)
|> Flow.map(...)
|> Flow.reduce(...)
Diagram stolen from José: http://www.lambdadays.org/lambdadays2017/jose-valim
/** Scrapes pages 1 through `lastPageNo` and appends every film found to the
  * CSV file named by `getCurrentFilename()`.
  *
  * Films are numbered with a running id that starts at 0 and keeps increasing
  * across pages. Each page number and each film is also printed to stdout.
  * The CSV writer is closed when scraping ends, whether it finishes normally
  * or a page throws.
  */
def scrape() = {
  val writer = CSVWriter.open(getCurrentFilename(), append = true)
  var curId = 0
  try {
    (1 to lastPageNo).foreach { pageNo =>
      println()
      println(pageNo)
      println()
      val url = constructUrl(pageNo)
      val films = scrapeUrl(url).map { f =>
        val ret = f.copy(idOption = Some(curId))
        curId += 1 // dirty, I know
        ret
      }
      films.foreach(println(_))
      writer.writeAll(films.map(_.row))
    }
  } finally {
    // Release the file handle even if a page fails part-way through.
    writer.close()
  }
}
# Word count with Flow: split each line of input.txt into words, tally the
# words in a map, and set up IO.inspect/1 for every {word, count} pair.
"input.txt"
|> File.stream!()
|> Flow.from_enumerable()
|> Flow.flat_map(fn line -> String.split(line) end)
|> Flow.partition()
|> Flow.reduce(fn -> %{} end, fn word, tally ->
  Map.update(tally, word, 1, fn seen -> seen + 1 end)
end)
|> Flow.each(fn pair -> IO.inspect(pair) end)
# The same word count written with Stream and Enum only, without Flow.
"input.txt"
|> File.stream!()
# (placeholder: the Flow version calls Flow.from_enumerable/1 here)
|> Stream.flat_map(fn line -> String.split(line) end)
# (placeholder: the Flow version calls Flow.partition/1 here)
|> Enum.reduce(%{}, fn word, tally ->
  Map.update(tally, word, 1, fn seen -> seen + 1 end)
end)
|> Enum.each(fn pair -> IO.inspect(pair) end)
{"are", 2}
{"blue", 1}
{"red", 1}
{"roses", 1}
{"violets", 1}
# Word count with Flow, ending in Enum.to_list/1 so the flow is enumerated
# and every {word, count} pair is inspected.
"input.txt"
|> File.stream!()
|> Flow.from_enumerable()
|> Flow.flat_map(fn line -> String.split(line) end)
|> Flow.partition()
|> Flow.reduce(fn -> %{} end, fn word, tally ->
  Map.update(tally, word, 1, fn seen -> seen + 1 end)
end)
|> Flow.each(fn pair -> IO.inspect(pair) end)
|> Enum.to_list() # or Flow.run()
{"are", 2}
{"blue", 1}
{"red", 1}
{"roses", 1}
{"violets", 1}
# Tally three million cycled atoms with Flow. There is no Flow.partition/1
# step before the reduce; the emitted {atom, count} pairs are collected
# straight into a single map and printed.
[:a, :b, :c]
|> Stream.cycle()
|> Stream.take(3_000_000)
|> Flow.from_enumerable(max_demand: 256)
|> Flow.reduce(fn -> %{} end, fn atom, tally ->
  Map.update(tally, atom, 1, fn seen -> seen + 1 end)
end)
|> Enum.into(%{})
|> IO.inspect()
%{a: 124768, b: 124783, c: 124785}
# Tally three million cycled atoms with Flow (no partitioning), then merge
# the emitted {atom, count} pairs by summing the counts per atom.
[:a, :b, :c]
|> Stream.cycle()
|> Stream.take(3_000_000)
|> Flow.from_enumerable(max_demand: 256)
|> Flow.reduce(fn -> %{} end, fn atom, tally ->
  Map.update(tally, atom, 1, fn seen -> seen + 1 end)
end)
|> Enum.to_list()
|> Enum.reduce(%{}, fn {atom, count}, totals ->
  # First sighting stores the partial count; later ones add to it.
  Map.update(totals, atom, count, fn sum -> sum + count end)
end)
|> IO.inspect()
%{a: 1000000, b: 1000000, c: 1000000}
%{a: 1000000, b: 1000000, c: 1000000}
# Tally three million cycled atoms with Flow, this time with a
# Flow.partition/1 step in front of the reduce, and print the resulting map.
[:a, :b, :c]
|> Stream.cycle()
|> Stream.take(3_000_000)
|> Flow.from_enumerable(max_demand: 256)
|> Flow.partition()
|> Flow.reduce(fn -> %{} end, fn atom, tally ->
  Map.update(tally, atom, 1, fn seen -> seen + 1 end)
end)
|> Enum.into(%{})
|> IO.inspect()
GenStage
Flow
* mostly stateless, depending on what you mean by stateless
* as far as I know
# Crawler flow: events from url_manager_stage go through a fetch step in one
# partition, then parse, URI-extraction and data-extraction steps in another.
# (Step descriptions follow the helper function names.)
url_manager_stage
|> Flow.from_stage(options)
|> Flow.partition(Keyword.get(options, :fetch_phase))
|> Flow.flat_map(fn event -> fetch_operation(event, options, url_stage) end)
|> Flow.partition(Keyword.get(options, :process_phase))
|> Flow.flat_map(fn event ->
  parse_operation(event, options, parser_logic, url_stage)
end)
|> Flow.each(fn event ->
  extract_uris_operation(event, options, parser_logic, url_stage)
end)
|> Flow.flat_map(fn event ->
  extract_data_operation(event, options, parser_logic)
end)
@doc """
Parses the retrieved page into user-defined data.

Returns `{:ok, parsed}`, `{:error, term}`, `:skip` or `{:skip, reason}`.
"""
@callback parse(Response.t(), options :: Keyword.t()) ::
            {:ok, parsed()} | {:error, term()} | :skip | {:skip, reason :: atom()}

@doc """
Extracts the URIs to be crawled subsequently.

Returns a list whose elements are `URI` structs or strings.
"""
@callback extract_uris(Response.t(), parsed(), options :: Keyword.t()) ::
            [URI.t() | String.t()]

@doc """
Extracts the final data from the parsed page.

Returns a list of results.
"""
@callback extract_data(Response.t(), parsed(), options :: Keyword.t()) ::
            [result()]
...from a GenStage producer.
GenStage.async_notify(self(), {:producer, :done})
As of 0.12.0, just stop the producer stage:
{:stop, :normal, state}
# Feed a ten-element flow into a NosyStage consumer, then wait up to one
# second for the consumer to send :nosy_stage_done back to this process.
flow = Flow.from_enumerable(1..10)
{:ok, consumer} = GenStage.start_link(NosyStage, self())
{:ok, _coordinator_process} = Flow.into_stages(flow, [{consumer, [max_demand: 8]}])

receive do
  :nosy_stage_done -> IO.puts("RESULT: nosy stage done")
after
  1_000 -> IO.puts("RESULT: nosy stage never finished")
end
defmodule NosyStage do
  use GenStage

  # Handles the {:producer, :done} notification: logs it, tells the creator
  # process that this stage is finished, then stops normally.
  def handle_info({from, {:producer, :done}}, creator_pid) do
    IO.puts("Nosy got {:producer, :done} info from #{inspect(from)}")
    send(creator_pid, :nosy_stage_done)
    {:stop, :normal, creator_pid}
  end

  # (...) other callbacks
end
# Feed a ten-element flow into a NosyStage consumer, then wait up to one
# second for the consumer to send :nosy_stage_done back to this process.
flow = Flow.from_enumerable(1..10)
{:ok, consumer} = GenStage.start_link(NosyStage, self())
{:ok, _coordinator_process} = Flow.into_stages(flow, [{consumer, [max_demand: 8]}])

receive do
  :nosy_stage_done -> IO.puts("RESULT: nosy stage done")
after
  1_000 -> IO.puts("RESULT: nosy stage never finished")
end
defmodule NosyStage do
  # A chatty GenStage consumer: it logs each subscription, each batch of
  # events and the {:producer, :done} notification, and messages the process
  # whose pid it was started with once it is done.
  use GenStage

  # The init argument becomes the stage state; the stage is a consumer.
  def init(creator_pid), do: {:consumer, creator_pid}

  # Logs the incoming batch and emits no events of its own.
  def handle_events(events, from, creator_pid) do
    IO.puts("Nosy handles incoming events: #{inspect(events)} from #{inspect(from)}")
    {:noreply, [], creator_pid}
  end

  # Logs the subscription to a producer and keeps demand automatic.
  def handle_subscribe(:producer, _options, from, creator_pid) do
    IO.puts("Nosy subscribed to #{inspect(from)}")
    # Alternative kept for reference: {:manual, creator_pid}
    {:automatic, creator_pid}
  end

  # Logs the {:producer, :done} notification, reports completion to the
  # creator process and stops normally.
  def handle_info({from, {:producer, :done}}, creator_pid) do
    IO.puts("Nosy got {:producer, :done} info from #{inspect(from)}")
    send(creator_pid, :nosy_stage_done)
    {:stop, :normal, creator_pid}
  end
end
Nosy subscribed to {#PID<0.128.0>, #Reference<0.0.7.161>}
Nosy handles incoming events: [1, 2, 3, 4] from {#PID<0.128.0>, #Reference<0.0.7.161>}
Nosy handles incoming events: [5, 6, 7, 8] from {#PID<0.128.0>, #Reference<0.0.7.161>}
Nosy handles incoming events: '\t\n' from {#PID<0.128.0>, #Reference<0.0.7.161>}
Nosy got {:producer, :done} info from {#PID<0.128.0>, #Reference<0.0.7.161>}
RESULT: nosy stage done
Main GenStage callbacks:
# Adds the newly requested demand to the demand still pending in the state,
# then tries to serve the combined amount.
def handle_demand(demand, %State{} = state) do
  accumulated = state.pending_demand + demand
  do_handle_demand(%State{state | pending_demand: accumulated})
end
# Takes up to `pending_demand` pages from the state, emits them as events and
# records how much demand is still unmet.
defp do_handle_demand(state) do
  wanted = state.pending_demand
  {state_after_take, pages} = State.take_pages(state, wanted)
  still_pending = wanted - Enum.count(pages)

  {:noreply, pages, %State{state_after_take | pending_demand: still_pending}}
end