Build a fast web scraper and work queue with node.js and async

Building a fast web scraper and parallelized work queue with node.js and async

CVJS - June 2014

Michael Holroyd, Ph.D.

What is this stuff?

node.js

server-side event-driven javascript

async

utility lib with common async patterns

cheerio

server-side implementation of jQuery

beanstalkd

fast simple work-queue

async

https://github.com/caolan/async

async.parallel([
    function(){ ... },
    function(){ ... }
], callback);

async.series([
    function(){ ... },
    function(){ ... }
]);

async.map(['file1','file2','file3'], fs.stat, function(err, results){
    // results is now an array of stats for each file
});

async

// Each task receives a completion function and reports (error, value) through it.
var tasks = [
    // completes immediately
    function(done){
        done(null, 'one');
    },
    // completes once a 100 ms timer fires
    function(done){
        var finish = function(){
            done(null, 'two');
        };
        setTimeout(finish, 100);
    }
];

// the final callback is optional
async.parallel(tasks, function(err, results){
    // results = ['one','two']
});

Each function gets passed a callback to run when it completes. async handles the book-keeping, and calls your optional callback when all the work is done.

.parallelLimit() will run up to N functions at a time, which is not so easy to get right by hand (with correct error handling, etc.).

Callback Hell / Pyramid of Doom

async.auto (salvation from callback hell)

async.auto({
    // No dependencies declared: starts right away.
    get_data: function(done){
        console.log('in get_data');
        // the asynchronous data fetch would go here
        done(null, 'data', 'converted to array');
    },
    // Also dependency-free, so it runs alongside get_data.
    make_folder: function(done){
        console.log('in make_folder');
        // the asynchronous directory creation would go here
        done(null, 'folder');
    },
    // Declares get_data and make_folder as prerequisites.
    write_file: ['get_data', 'make_folder', function(done, soFar){
        console.log('in write_file', JSON.stringify(soFar));
        // with the data fetched and the directory in place,
        // write the data to a file inside that directory
        done(null, 'filename');
    }],
    // Declares write_file as its prerequisite.
    email_link: ['write_file', function(done, soFar){
        console.log('in email_link', JSON.stringify(soFar));
        // soFar.write_file holds the filename reported by write_file;
        // this is where a link to that file would be emailed
        var message = {'file':soFar.write_file, 'email':'user@example.com'};
        done(null, message);
    }]
}, function(err, results) {
    console.log('results = ', results);
});

async.retry

// Bind both API methods up front so they keep `api` as their receiver.
var fetchUsers = api.getUsers.bind(api);
// Wrap the payments call with async.retry, passing 3 as the attempt count.
var fetchPayments = async.retry(3, api.getPayments.bind(api));

async.auto({
    users: fetchUsers,
    payments: fetchPayments
}, function(err, results) {
  // do something with the results
});

I use async.auto in every project, even if I don't need complex asynchronous actions, because:

it makes data dependencies readable and explicit
sane default error handling behavior (bail out on any error)
fewer curly braces and no long lists of named functions
easy to switch between parallel and series for testing/debugging

cheerio

var cheerio = require('cheerio');

// Parse a small HTML fragment into a jQuery-style handle.
var markup = '<h2 class="title">Hello!</h2>';
var $ = cheerio.load(markup);

// Replace the heading text, then tag every h2 with an extra class.
var heading = $('h2.title');
heading.text('Hello there!');
$('h2').addClass('welcome');

// Serialize the modified document back to an HTML string.
$.html();

cheerio is a re-implementation of jQuery for server-side code

uses a "forgiving" HTML parser: many real websites don't have valid HTML...
does not try to parse the whole site immediately, but waits for you to call specific selectors

cheerioexample.js

var request = require('request');
var cheerio = require('cheerio');

// Download the CVJS meetup page and print the href and text of every
// link matching ".event-item h3 a".
request("http://www.meetup.com/CVJS/",function(err,r,body){
  // Without a body there is nothing to parse, so report the failure and stop.
  if (err) {
    console.error("request failed:", err);
    return;
  }

  // Declared locally: a bare `$ = ...` would leak an implicit global.
  var $ = cheerio.load(body);

  // Collect one plain {text, href} object per matched anchor.
  var links = $(".event-item h3 a").map(function(idx,elem){
    var anchor = $(elem);
    return {
      text: anchor.text().trim(),
      href: anchor.attr("href")
    };
  }).toArray();

  links.forEach(function(l){
    console.log(l.href, l.text);
  });
});

cheerio

// scrape.js
async.each(_.range(30),function(page,indexcb){
  request.get(domain+"/course-list/allcourses?page="+page,function(err,r,body){
    var $ = cheerio.load(body);
    var links = _.uniq($(".course-title a").map(function(i,elem){
return elem.attribs.href;
}));
    async.each(links, function(link,cb){
      request.get(link,function(err,r,body){
        var $ = cheerio.load(body);
        ...

What if we want to follow every link to the detail page?

series.js and scrape_callbackcounter.js

Compare with running each request in series, ~10x speed-up

Note: This is really about event-driven processing and node.js, not about async; async just makes it easier.

knollop.com (soon to be learnstream.com)

knollop.com

We build scraping infrastructure for educational institutions.
Also maintain a website indexing popular online education resources along with course reviews and feedback.
Run ~50 scrapers daily (or on demand).
Started with python+scrapy, then moved to node.js+cheerio and saw 20x speedup.
Backend API that the scrapers hit is Scala + Play2.0

Job Queues

There are at least 100 different solutions for job queues

beanstalkd is one of them

  put with delay               release with delay
  ----------------> [DELAYED] <------------.
                        |                   |
                        | (time passes)     |
                        |                   |
   put                  v     reserve       |       delete
  -----------------> [READY] ---------> [RESERVED] --------> *poof*
                       ^  ^                |  |
                       |   \  release      |  |
                       |    `-------------'   |
                       |                      |
                       | kick                 |
                       |                      |
                       |       bury           |
                    [BURIED] <---------------'
                       |
                       |  delete
                        `--------> *poof*

beanstalkd

Lightweight, fast, uses almost no memory, blah blah blah
Runs ASCII over TCP (No HTTP overhead)
Very low-level. Jobs are text, you define your own rules.

(Personally we use JSON)

Very easy to set up, many clients in many languages
Persistent, if you want.

We've been running one instance since September 2013 and it has never stopped (despite full disks, network downtime, etc.)

Kind of like memcached for work-queues

beanstalkd

We use named "tubes" for each type of job

list-tubes
OK 105
---
- default
- copySpin
- createSpin
- editSpin
- exportVideo
- pdfsupply_export
- updateMetadata
- udaverea_export

beanstalkd

each tube keeps a list of jobs in each state
typical flow for a job is ready -> reserved -> *poof*

stats-tube editSpin
OK 273
---
name: editSpin
current-jobs-urgent: 0
current-jobs-ready: 1
current-jobs-reserved: 4
current-jobs-delayed: 0
current-jobs-buried: 0
total-jobs: 55137
current-using: 3
current-watching: 1
current-waiting: 1
cmd-delete: 35061
cmd-pause-tube: 0
pause: 0
pause-time-left: 0

beanstalkd

Each job has a timeout. If the worker that reserved the job does not check in with a touch command before the timeout, the job is automatically moved back to the ready queue.
Jobs can also be buried (for example if a job is repeatedly failing), to be manually examined later.
Jobs can also be added to the queue with a delay (good for "backing off" a congested service)

arqspin.com

Easy interactive 360 product photography

Users take video of object with their phone or upload it to our website, we create an interactive 360 widget for them to embed on their website.

beanstalkd+node.js

There are four node.js clients. I like fivebeans.

/**
 * Worker loop: reserve one job from beanstalkd, process its JSON payload,
 * put the result on the "editor" tube, delete the original job, and then
 * call itself again to pick up the next job.
 *
 * Relies on `async` and a connected `beanstalkd` client being in scope.
 * Any error from a step is handed to that step's callback so the remaining
 * steps are skipped; the loop then restarts. A job that failed before being
 * destroyed stays reserved and returns to the ready queue once its timeout
 * expires.
 */
function reserveWork(){
  async.auto({
    // Ask beanstalkd for the next job and keep its id alongside the payload.
    reservation: function(cb){
      beanstalkd.reserve(function(err, jobid, payload){
        if (err) { return cb(err); }
        cb(null,{ payload: payload, jobid: jobid });
      });
    },
    // Decode the job and produce the payload for the next stage.
    work: ['reservation',function(cb,r){
      var payload;
      try {
        payload = JSON.parse(r.reservation.payload);
      } catch (parseErr) {
        // Malformed job text: report it instead of throwing out of the callback.
        return cb(parseErr);
      }
      // work work work...
      // Placeholder: the real work goes here; for now the decoded payload
      // is re-serialized and passed along unchanged.
      var new_payload = JSON.stringify(payload);
      cb(null, new_payload);
    }],
    // Queue the result on the next tube, then delete the job we reserved.
    done: ['work',function(cb,r){
      beanstalkd.use("editor",function(err,tube){// next tube
        if (err) { return cb(err); }
        beanstalkd.put(1024, 0, 300, r.work, function(err,new_jobid){
          if (err) { return cb(err); }
          // The reserved job's id lives in the reservation result;
          // it is not a variable in this scope.
          beanstalkd.destroy(r.reservation.jobid, cb);
        });
      });
    }]
  }, reserveWork);
}

forever

a simple CLI for making sure your workers run forever

restarts jobs when they die
has sane default behavior, easy to configure however you want
can also be used programmatically from inside node.js

logging (ELK stack)

https://m.arq.io/logs/#/dashboard/elasticsearch/CVJS%20Kibana

Thanks

async: https://github.com/caolan/async

cheerio: https://github.com/cheeriojs/cheerio

beanstalkd: http://kr.github.io/beanstalkd/

forever: https://github.com/nodejitsu/forever

Arqball (computer vision kung-fu):

http://arqspin.com

Knollop (online education search):

http://knollop.com

Michael Holroyd

http://meekohi.com