is really fun!
// Node-style callback: on success `articles` holds the parsed result.
parseSiteIntoArticles(spec, (err, articles) => {
  // an array of articles
});
// Node-style callback: on success `articles` holds the parsed result.
parseRssIntoArticles(spec, (err, articles) => {
  // an array of articles
});
// Example source spec. "type" is either "site" or "feed"; "template" lists
// containers (matched by "selector"), each with named elements ("url",
// "title", "image") described by one or more selector items, optionally
// reading an "attribute" instead of the element itself.
// NOTE(review): the key is spelled "occurence" throughout (the standard
// spelling is "occurrence") — confirm which spelling the parser reads
// before renaming it.
{
"type": "site", // or "feed"
"name": "New Yorker",
"url": "http://www.newyorker.com/",
"template": {
"containers": [{
"selector": "article",
"elements": [{
"name": "url",
"type": "url",
"occurence": "first",
"required": true,
"items": [{
"selector": "section h2 a",
"attribute": "href"
}]
}, {
"name": "title",
"required": true,
"occurence": "first",
"items": [{
"selector": "section h2 a"
}]
}, {
"name": "image",
"type": "url",
"occurence": "first",
"fallback": null,
"items": [{
"selector": "figure a img",
"attribute": "src"
}, {
"selector": "figure a img",
"attribute": "data-lazy-src"
}]
}]
}]
}
}
[{
origin: 'http://www.newyorker.com/',
url: 'http://www.newyorker.com/news/daily-comment/the-plot-against-trains',
author: 'Adam Gopnik',
title: 'The Plot Against Trains',
description: 'The will to abandon American infrastructure projects is not some omission of shortsighted politicians. It is part of a coherent ideological project.',
image: 'http://www.newyorker.com/wp-content/uploads/2015/05/Gopnik-Plot-Against-Trains2-290-150-14182024.jpg',
page_position: 0
}, {
origin: 'http://www.newyorker.com/',
url: 'http://www.newyorker.com/news/john-cassidy/obamas-cognitive-dissonance-on-trade',
author: 'John Cassidy',
title: 'Cognitive Dissonance on Trade',
description: 'A trade deal remains a huge issue for American workers even as President Obama seeks the power from Congress to complete the Trans-Pacific Partnership.',
image: 'http://www.newyorker.com/wp-content/uploads/2015/05/Cassidy-Obamas-Cognitive-Dissonance-on-Trade-320-240-14154710.jpg',
page_position: 1
}, {
origin: 'http://www.newyorker.com/',
url: 'http://www.newyorker.com/magazine/2015/05/18/distant-emotions',
author: 'Anthony Lane',
title: 'Ethan Hawke, Drone Pilot',
description: 'Viewers of “Good Kill” will end up like its protagonist: sad, stunned, lonesome, and boxed in.',
image: 'http://www.newyorker.com/wp-content/uploads/2015/05/150518_r26521-320-240-06151517.jpg',
page_position: 2
}, ...]
// Node-style callback: on success `content` holds the article content.
getContentFromUrl(url, (err, content) => {
  // Article content
});
content = "<p>One morning, my grandmother’s brother,
Avraham, decided to stop being religious.
He shaved his beard, cut off his side
curls, shed his yarmulke, packed his
things and resolved to leave his hometown
of Baranovichi and begin a new life..</p>"
// Both helpers take a url and a Node-style callback; `result` is the count.
getSharesFromFacebook(url, (err, result) => {
  // facebook shares count
});

getSharesFromTwitter(url, (err, result) => {
  // twitter shares count
});
shares = 243
// Example: combine an article with extra fields into a new object.
const article = {
  url: 'http://www.newyorker.com/news/daily-comment/the-plot-against-trains',
  title: 'The Plot Against Trains',
};

const extension = {
  content: '<p>Trains, trains, trains..</p>',
};

// Object.assign mutates and returns its first argument, so copy into a fresh
// `{}` target: neither input is modified. Listing `article` first gives the
// key order shown below (url, title, content).
const extendedArticle = Object.assign({}, article, extension);
// {
//   url: 'http://www.newyorker.com/news/daily-comment/the-plot-against-trains',
//   title: 'The Plot Against Trains',
//   content: '<p>Trains, trains, trains..</p>'
// }
Just some methods for declaratively extending objects
// Declarative extension example. Each key of `extensions` names the property
// to add; its value pairs a source property name with a callback-style helper
// (presumably the helper is called with that property's value — confirm
// against extendWith).
let article = {
  url: 'http://www.newyorker.com/magazine/2015/05/18/art-census',
  title: 'A Census at the Met'
};

let extensions = {
  content: ['url', getContentFromUrl]
};

let extendWithContent = extendWith(extensions);

extendWithContent(article, (err, result) => {
  // result = {
  //   url: 'http://www.newyorker.com/magazine/2015/05/18/art-census',
  //   title: 'A Census at the Met',
  //   content: '<p>The content of the article</p>'
  // }
});
But
(also helpful)
// Stream built from an array of specs. Each `{...}` is a placeholder for one
// spec object (illustrative only — not valid JavaScript as written).
// {...} , {...} , {...}
const specStream = highland([{...}, {...}, {...}]);
// Predicate over a spec: compares its `type` with 'site'
// (assumes curried, lodash/fp-style `isEqual` and `result`).
const isSite = lodash.compose(
  lodash.isEqual('site'),
  lodash.result('type')
);

// Fork of the spec stream restricted to site specs, each mapped through
// parseArticlesFromHtml; errors are logged to the console.
const articlesFromHtmlStream = specStream
  .fork()
  .filter(isSite)
  .map(parseArticlesFromHtml)
  .parallel(5)
  .errors((err) => {
    console.log(err);
  });
// Predicate over a spec: compares its `type` with 'feed'
// (assumes curried, lodash/fp-style `isEqual` and `result`).
const isRssFeed = lodash.compose(
lodash.isEqual('feed'),
lodash.result('type')
);
// Fork of the spec stream restricted to feed specs, each mapped through
// parseArticlesFromRss; `.parallel(5)` presumably caps concurrent parses at 5
// (Highland semantics — confirm). Errors are passed to handleError.
const articlesFromRssStream = specStream
.fork()
.filter(isRssFeed)
.map(parseArticlesFromRss).parallel(5)
.errors(handleError)
// Combines the HTML and RSS article streams into one stream: the two streams
// are wrapped in a stream, merged, then flattened (presumably so each parsed
// array of articles becomes individual article values — confirm against
// Highland's `flatten`). Errors are passed to handleError.
const articleStream = highland([
articlesFromHtmlStream,
articlesFromRssStream
])
.merge()
.flatten()
.errors(handleError)
// Extension that adds a nested `shares` object with `facebook` and `twitter`
// entries, each declared as ['url', helper] (presumably the helper receives
// the article's `url` — confirm against extendWith).
const addSocialDataFromUrl = extendWith({
shares: {
facebook: ['url', getSharesFromFacebook],
twitter: ['url', getSharesFromTwitter],
}
});
// Extension that adds `content`, declared as ['url', helper].
// The helper is referenced as getContentFromUrl, the spelling used at its
// other call sites in this file; the previous `getContentFromURL` differed
// only in case and would not resolve to the same function.
const addContentFromUrl = extendWith({
  content: ['url', getContentFromUrl],
});
// Fork of articleStream in which each article is mapped through
// addSocialDataFromUrl and then addContentFromUrl; `.parallel(10)` presumably
// caps each stage at 10 concurrent items (Highland semantics — confirm).
// Errors are passed to handleError.
const extendedArticleStream = articleStream
.fork()
.map(addSocialDataFromUrl).parallel(10)
.map(addContentFromUrl).parallel(10)
.errors(handleError)
{
origin: 'http://www.newyorker.com/',
url: 'http://www.newyorker.com/magazine/2015/05/18/art-census',
title: 'A Census at the Met',
content: '<p>The content of the article</p>',
shares: {
facebook: 245,
twitter: 350,
linkedin: 470
}
}
(With persistence to a database)
// Spec stream sourced from getSpecsFromDatabase instead of a literal array.
// `.ratelimit(1, 30000)` presumably lets one value through per 30000 ms
// (Highland `ratelimit(num, ms)` — confirm); `.flatten()` then presumably
// expands each batch of specs into individual values. Errors go to handleError.
const specStream = highland(getSpecsFromDatabase)
.ratelimit(1, 30000)
.flatten()
.errors(handleError)
// Fork of the spec stream restricted by isRssFeed, each spec mapped through
// parseArticlesFromRss with `.parallel(5)`; errors are passed to handleError.
const articlesFromRssStream = specStream
.fork()
.filter(isRssFeed)
.map(parseArticlesFromRss).parallel(5)
.errors(handleError)
// Fork of the spec stream restricted by isSite, each spec mapped through
// parseArticlesFromHtml with `.parallel(5)`; errors are passed to handleError.
const articlesFromHtmlStream = specStream
.fork()
.filter(isSite)
.map(parseArticlesFromHtml).parallel(5)
.errors(handleError)
// Combines the HTML and RSS article streams into one stream: wrapped in a
// stream, merged, then flattened. Errors are passed to handleError.
const articleStream = highland([
articlesFromHtmlStream,
articlesFromRssStream
])
.merge()
.flatten()
.errors(handleError)
// Fork of articleStream restricted by doesNotExistInDatabase; each article is
// mapped through addContentFromUrl and then saveToDatabase.
// Errors go to handleError, the handler name used by the upstream streams in
// this file (this block previously referenced `handleErrors`).
const newArticleStream = articleStream
  .fork()
  .filter(doesNotExistInDatabase)
  .map(addContentFromUrl)
  .map(saveToDatabase)
  .errors(handleError);
// Fork of articleStream restricted by existsInDatabase; each article is
// mapped through addSocialDataFromUrl and then updateInDatabase.
// Errors go to handleError, the handler name used by the upstream streams in
// this file (this block previously referenced `handleErrors`).
const updatedArticleStream = articleStream
  .fork()
  .filter(existsInDatabase)
  .map(addSocialDataFromUrl)
  .map(updateInDatabase)
  .errors(handleError);
// Start both pipelines: `.doto(console.log)` logs each value as it passes
// and `.resume()` presumably starts the stream flowing without another
// consumer (Highland semantics — confirm).
newArticleStream
.doto(console.log)
.resume()
updatedArticleStream
.doto(console.log)
.resume()