Crawlie

Lessons learned about GenStage and Flow

Hi, my name is...

Jacek Królikowski - @nietaki

GenStage & Flow

Value proposition

  • Workflow orchestrator for parallel data processing

  • "Small" data processing

  • Demand-driven, with back pressure - keep only as much data in flight / in memory as you need right now

GenStage - event processing behaviour

GenStage is a specification for exchanging events between producers and consumers.

Flow - abstraction built on GenStage

Enum is eager, Stream is lazy, Flow is concurrent

Flow.from_enumerable(...)
|> Flow.map(...)
|> Flow.flat_map(...)
|> Flow.partition(...)
|> Flow.map(...)
|> Flow.reduce(...)

Crawlie

A simple Elixir library for creating decently-performing crawlers with minimum effort.

Filmweb scraper


  /** Scrapes pages 1 to `lastPageNo` and appends every film found to the CSV
    * file named by `getCurrentFilename()`.
    *
    * Each film gets a consecutive id (starting at 0 and continuing across
    * pages) before it is printed and written. The page number and every
    * film are also echoed to stdout.
    */
  def scrape() = {

    val writer = CSVWriter.open(getCurrentFilename(), append = true)
    var curId = 0
    // try/finally guarantees the writer is closed (and its output flushed)
    // even when fetching or parsing one of the pages throws.
    try {
      (1 to lastPageNo).foreach { pageNo =>
        println()
        println(pageNo)
        println()
        val url = constructUrl(pageNo)
        val films = scrapeUrl(url).map { f =>
          val ret = f.copy(idOption = Some(curId))
          curId += 1 // dirty, I know
          ret
        }

        films.foreach(println(_))
        writer.writeAll(films.map(_.row))
      }
    } finally {
      writer.close()
    }
  }

How would you use it?

Lessons learned

Flow is simple (to use)

File.stream!("input.txt")
|> Flow.from_enumerable()
|> Flow.flat_map(&String.split/1)
|> Flow.partition()
|> Flow.reduce(fn -> %{} end, 
  fn word, map ->
    Map.update(map, word, 1, &(&1 + 1))
  end)
|> Flow.each(&IO.inspect/1)
File.stream!("input.txt")
#
|> Stream.flat_map(&String.split/1)
#
|> Enum.reduce(%{}, fn word, map ->
  Map.update(map, word, 1, &(&1 + 1))
end)
|> Enum.each(&IO.inspect/1)
{"are", 2}
{"blue", 1}
{"red", 1}
{"roses", 1}
{"violets", 1}
  
File.stream!("input.txt")
|> Flow.from_enumerable()
|> Flow.flat_map(&String.split/1)
|> Flow.partition()
|> Flow.reduce(fn -> %{} end, 
  fn word, map ->
    Map.update(map, word, 1, &(&1 + 1))
  end)
|> Flow.each(&IO.inspect/1)
|> Enum.to_list() # or Flow.run()
{"are", 2}
{"blue", 1}
{"red", 1}
{"roses", 1}
{"violets", 1}

Brain teaser

Stream.cycle([:a, :b, :c])
|> Stream.take(3_000_000)
|> Flow.from_enumerable(max_demand: 256)
|> Flow.reduce(fn -> %{} end, fn atom, map ->
  Map.update(map, atom, 1, &(&1 + 1))
end)
|> Enum.into(%{})
|> IO.inspect()
%{a: 124768, b: 124783, c: 124785}
Stream.cycle([:a, :b, :c])
|> Stream.take(3_000_000)
|> Flow.from_enumerable(max_demand: 256)
|> Flow.reduce(fn -> %{} end, fn atom, map ->
  Map.update(map, atom, 1, &(&1 + 1))
end)
|> Enum.to_list()
|> Enum.reduce(%{}, fn {atom, count}, map ->
  Map.update(map, atom, count, &(&1 + count))
end)
|> IO.inspect
%{a: 1000000, b: 1000000, c: 1000000}
%{a: 1000000, b: 1000000, c: 1000000}
Stream.cycle([:a, :b, :c])
|> Stream.take(3_000_000)
|> Flow.from_enumerable(max_demand: 256)
|> Flow.partition()
|> Flow.reduce(fn -> %{} end, fn atom, map ->
  Map.update(map, atom, 1, &(&1 + 1))
end)
|> Enum.into(%{})
|> IO.inspect

GenStage vs Flow

GenStage

  • low level
  • potentially stateful
  • fits within a supervision tree

Flow

  • high level
  • stateless*
  • fails all at once*

* mostly stateless, depending on what you mean by stateless

* as far as I know

GenStage  Flow

GenStage → Flow

+ Statistics

GenStage → Flow



    url_manager_stage
      |> Flow.from_stage(options)
      |> Flow.partition(Keyword.get(options, :fetch_phase))
      |> Flow.flat_map(&fetch_operation(&1, options, url_stage))
      |> Flow.partition(Keyword.get(options, :process_phase))
      |> Flow.flat_map(&parse_operation(&1, options, parser_logic, url_stage))
      |> Flow.each(&extract_uris_operation(&1, options, parser_logic, url_stage))
      |> Flow.flat_map(&extract_data_operation(&1, options, parser_logic))

@doc """
Parses the retrieved page to user-defined data.

Returns `{:ok, parsed}`, `{:error, term}`, `:skip` or `{:skip, reason}`.
"""
@callback parse(Response.t, options :: Keyword.t) 
:: {:ok, parsed} | {:error, term} | :skip | {:skip, reason :: atom}

@doc """
Extracts the URIs to be crawled subsequently.

Receives the response together with the parsed data and returns a list
whose elements are `URI.t` structs or strings.
"""
@callback extract_uris(Response.t, parsed, options :: Keyword.t) 
:: [URI.t | String.t]

@doc """
Extracts the final data from the parsed page.

Receives the response together with the parsed data and returns a list of
results.
"""
@callback extract_data(Response.t, parsed, options :: Keyword.t) 
:: [result]

Terminating Flow...

...from a GenStage producer.

GenStage.async_notify(self(), {:producer, :done})

As of 0.12.0, just stop the producer stage:

{:stop, :normal, state}

Flow → Stage

f = Flow.from_enumerable(1..10)
{:ok, consumer} = GenStage.start_link(NosyStage, self())
{:ok, _coordinator_process} = Flow.into_stages(f, [{consumer, [max_demand: 8]}])

receive do
  :nosy_stage_done -> IO.puts "RESULT: nosy stage done"
after
  1000 -> IO.puts "RESULT: nosy stage never finished"
end

# Excerpt of a GenStage module; only the `handle_info/2` clause that reacts to
# the `{:producer, :done}` message is shown here.
defmodule NosyStage do
  use GenStage

  # Matches a `{from, {:producer, :done}}` message. The stage state is the pid
  # of the creator process: it is sent `:nosy_stage_done` and the stage then
  # stops with reason `:normal`.
  def handle_info({from, {:producer, :done}}, creator_pid) do
    IO.puts "Nosy got {:producer, :done} info from #{inspect(from)}"
    send creator_pid, :nosy_stage_done
    {:stop, :normal, creator_pid}
  end

  # (...) other callbacks
end
f = Flow.from_enumerable(1..10)
{:ok, consumer} = GenStage.start_link(NosyStage, self())
{:ok, _coordinator_process} = Flow.into_stages(f, [{consumer, [max_demand: 8]}])

receive do
  :nosy_stage_done -> IO.puts "RESULT: nosy stage done"
after
  1000 -> IO.puts "RESULT: nosy stage never finished"
end

defmodule NosyStage do
  @moduledoc """
  A GenStage consumer that prints everything it observes (subscriptions,
  incoming events, the `{:producer, :done}` message) and notifies the process
  that created it once that message arrives.

  The stage state is the pid passed to `init/1`; it receives
  `:nosy_stage_done` right before the stage stops.
  """
  use GenStage

  @impl true
  def init(creator_pid) do
    {:consumer, creator_pid}
  end

  @impl true
  def handle_events(events, from, creator_pid) do
    IO.puts "Nosy handles incoming events: #{inspect(events)} from #{inspect(from)}"
    {:noreply, [], creator_pid}
  end

  @impl true
  def handle_subscribe(:producer, _options, from, creator_pid) do
    IO.puts "Nosy subscribed to #{inspect(from)}"
    # {:manual, creator_pid}
    {:automatic, creator_pid}
  end

  @impl true
  def handle_info({from, {:producer, :done}}, creator_pid) do
    IO.puts "Nosy got {:producer, :done} info from #{inspect(from)}"
    send creator_pid, :nosy_stage_done
    {:stop, :normal, creator_pid}
  end

  # Defining handle_info/2 replaces the default clause injected by
  # `use GenStage`; without this catch-all any other message would crash the
  # stage with a FunctionClauseError.
  def handle_info(_other, creator_pid) do
    {:noreply, [], creator_pid}
  end
end
Nosy subscribed to {#PID<0.128.0>, #Reference<0.0.7.161>}
Nosy handles incoming events: [1, 2, 3, 4] from {#PID<0.128.0>, #Reference<0.0.7.161>}
Nosy handles incoming events: [5, 6, 7, 8] from {#PID<0.128.0>, #Reference<0.0.7.161>}
Nosy handles incoming events: '\t\n' from {#PID<0.128.0>, #Reference<0.0.7.161>}
Nosy got {:producer, :done} info from {#PID<0.128.0>, #Reference<0.0.7.161>}
RESULT: nosy stage done

Handling Demand

Main GenStage callbacks:

  • init/1
  • handle_events/3
  • handle_demand/2
# Adds the newly received demand to the demand that is still pending in the
# state, then delegates to do_handle_demand/1 to produce the reply.
def handle_demand(demand, %State{} = state) do
  total_demand = state.pending_demand + demand
  do_handle_demand(%State{state | pending_demand: total_demand})
end

# Asks State.take_pages/2 for as many pages as are currently demanded, emits
# whatever it returned and stores the part of the demand that is still unmet.
defp do_handle_demand(state) do
  requested = state.pending_demand
  {updated_state, pages} = State.take_pages(state, requested)
  unmet = requested - Enum.count(pages)

  {:noreply, pages, %State{updated_state | pending_demand: unmet}}
end

Optimizing Flows

  • You don't really have to
  • use Flow.partition/2 to set different settings for sections of the flow
    • :stages ~ flow parallelism 
    • :min_demand, :max_demand
    • which parts are IO bound, which are CPU bound?
  • Think about how much data is being sent between stages
  • Application-specific optimizations (e.g. depth-first URL traversal)

There's more!

  • Flow.Window (windows and triggers)
    • late events
  • GenStage.*Dispatcher
  • GenStage processes in supervision trees
  • ...

Links

Live demo

(time permitting)

Crawlie

By Jacek Królikowski

Crawlie

Lessons learned about GenStage and Flow

  • 3,217