Thank you
Data Geek User Group Vancouver WA
michael@powma.com
I like to do tech for fun.
The Power To Create
As a freelancer, I've specialized in moving small/medium sized projects from idea to beta.
"What's your next major business milestone?"
Hiring? I can help, while you find the perfect employee.
michael@powma.com
Letting other people do the work.
By example!
Get a card and a pen...
michael@powma.com
michael@powma.com
1) Write an animal (5-10 letters) on the top of card.
This is our INPUT DATA SET.
Unicorn
<front>
michael@powma.com
2) On the back, "map" a summary.
This is our MAP function: we transform a collection 1-to-1.
MAP is a DATA TRANSFORMATION
First: u
Length: 7
Vowels: 3
<back>
3) Pass cards to the right.
If you're on the aisle, you should have a pile of cards.
4) We are now a distributed system,
processing a MapReduce.
unicorn
pony
cat
michael@powma.com
DATA -> MAP -> SHUFFLE/SORT -> REDUCE
What is the longest word?
What is the last word alphabetically?
What word has the highest vowel/length ratio?
What do you want to know about this data?
First: u
Length: 7
Vowels: 3
michael@powma.com
// Quiz slide: a hand-rolled for() loop whose intent is hard to see at a glance.
let foo = ['kitty', 'puppy', 'pony'];
let bar = [];
// `let` keeps the counter scoped to the loop (`var i` would leak it into the enclosing scope).
for (let i = 0; i < foo.length; i++) {
  // indexOf() returns the FIRST position of a value, so this test is only
  // true the first time each value is encountered.
  if (foo.indexOf(foo[i]) === i) {
    bar.push(foo[i]);
  }
}
// What is this code doing?
// (Answer: it copies foo into bar, skipping repeated values.)
Map and Reduce can be used separately.
They make your code more expressive than for() loops
MapReduce works on huge document sets instead of arrays
Let's do an example with MongoDB
(browser and Node.js)
Copy an array,
with processing.
Select items from an array, with processing.
Aggregate or "reduce" an array/set of items
to a "total" value.
The total can be: a value, object, or arbitrary data structure.
michael@powma.com
Let's start with some stretches.
// Array.prototype.map(): build a NEW array by transforming every item.
// Everything is declared explicitly; assigning to undeclared names creates
// implicit globals and throws a ReferenceError in strict mode / ES modules.
const foo = ['kitty', 'puppy', 'pony'];
const bar = foo.map((item) => {
  return item.toUpperCase();
});
// ["KITTY", "PUPPY", "PONY"]

// ES6: an arrow function with an expression body returns that expression implicitly
const baz = foo.map((item) => item.toUpperCase());
// ["KITTY", "PUPPY", "PONY"]
// Convert Fahrenheit readings to Celsius, rounded to the nearest whole degree.
// The input and both results are declared so nothing becomes an implicit
// global and the mapped arrays are not silently thrown away.
const fahrenheit = [0, 32, 45, 50, 75, 80, 99, 120];
const celsius = fahrenheit.map((elem) => {
  return Math.round((elem - 32) * 5 / 9);
});
// [-18, 0, 7, 10, 24, 27, 37, 49]

// ES6: same conversion with an implicit-return arrow function
const celsiusEs6 = fahrenheit.map((elem) => Math.round((elem - 32) * 5 / 9));
// [-18, 0, 7, 10, 24, 27, 37, 49]
A quick warm up
// map(), filter(), and reduce()
// are methods of the Array prototype,
// so they can be called directly on an array literal.

// filter() keeps only the items for which the callback returns true.
// The trailing semicolon matters: without it, the array literal that starts
// the next statement is parsed as a property access on this call's result.
['kitty', 'puppy', 'pony'].filter((item) => {
  return item.length === 5;
});
// ["kitty", "puppy"]

// Same predicate as a concise arrow function
['kitty', 'puppy', 'pony'].filter((item) => item.length === 5);
// ["kitty", "puppy"]
michael@powma.com
The workout.
const strings = ['kitty', 'puppy', 'pony'];

// reduce() takes two arguments:
// a reducer function, and an initial value.
// The reduced value is captured in totalLength instead of being discarded.
const totalLength = strings.reduce(
  function reducer(total, item) {
    // `total` is whatever the previous call returned
    // (the initial value on the very first call).
    console.log(total, item);
    return total + item.length;
  },
  0 // Initial value for total
);
// Logged by the reducer:
// 0 "kitty"
// 5 "puppy"
// 10 "pony"
// Final result (totalLength):
// 14
The workout's other 90%.
const strings = ['kitty', 'puppy', 'pony'];

// reduce() takes two arguments:
// a reducer function, and an initial value.
// Imagine the possibilities! The "total" does not have to be a number:
// here it is an object mapping each word to its length.
const lengthsByWord = strings.reduce(
  (total, item) => { // reducer function
    // The accumulator is the object literal passed below, so adding a key
    // to it in place does not touch any caller-owned data.
    total[item] = item.length;
    return total;
  },
  {} // Initial value for total
);
// {kitty: 5, puppy: 5, pony: 4}
// Three ways to pluck one property out of every object in an array.
const users = [
  { name: 'barney', age: 36 },
  { name: 'fred', age: 40 },
];
// We want names = ['barney', 'fred']

// [].map() from ES5
// `ret` is declared once and reused, rather than created as an implicit global.
let ret = users.map(function (user) { return user.name; });
// ['barney', 'fred']

// LoDash + ES6 (assumes lodash is loaded as `_`)
ret = _.map(users, (user) => user.name);
// ['barney', 'fred']

// LoDash sugar dot-string feature: a property name stands in for the callback
ret = _.map(users, 'name');
// ['barney', 'fred']
michael@powma.com
https://en.wikipedia.org/wiki/Wikipedia:Database_download#English-language_Wikipedia
#!/bin/bash
# Download the English Wikipedia article titles and load them into MongoDB.
# Source: https://en.wikipedia.org/wiki/Wikipedia:Database_download#English-language_Wikipedia

# Stop at the first failed step instead of importing a missing or partial file.
set -euo pipefail

TITLES=enwiki-latest-all-titles-in-ns0

# Download and unpack the article titles.
wget "https://dumps.wikimedia.org/enwiki/latest/${TITLES}.gz"
gzip -d "${TITLES}.gz"

# Start mongo in the background, keeping its data in the current directory.
mongod --dbpath . &
MONGOD_PID=$!

# mongod needs a moment before it accepts connections; wait for it so the
# import below does not race the server start-up.
until mongo --quiet --eval 'db.runCommand({ ping: 1 })' > /dev/null 2>&1; do
  # Give up if the server process died rather than waiting forever.
  if ! kill -0 "${MONGOD_PID}" 2> /dev/null; then
    echo "mongod exited before accepting connections" >&2
    exit 1
  fi
  sleep 1
done

# Import the data into wikipedia.titles; --headerline takes the field names
# from the first line of the file.
mongoimport -d wikipedia -c titles \
  --type tsv --headerline \
  --file "${TITLES}"

# Start the mongo console
mongo
MongoDB does the work
We'll run this command from MongoDB's console.
Reads/Writes sharded collections.
One expressive statement handles many details.
The map function calls `emit(key, value)` instead of returning a value.
// Schematic comparison for the slide: the two bare `function` expressions are
// fragments to read side by side, not statements meant to be run as-is.
// JS map: the callback receives each item and RETURNS the mapped value.
function(item) { return item.length; };
// MongoDB map: the function reads the current document through `this` and
// calls emit(key, value) instead of returning. Here the key is this._id and
// the value is the length of this.data.
function() { emit(this._id, this.data.length); };
// General shape of the call; `map`, `reduce`, `query` and `out` are placeholders
// (presumably the two functions plus an options object; confirm against the MongoDB docs).
db.someCollection.mapReduce( map, reduce, { query, out });
use wikipedia
// Describe it
mapFun = function() {
if (this.page_title && this.page_title.replace) {
var noPunctuation = this.page_title.replace(/[^\w]/g, "_");
var words = noPunctuation.split("_");
words.forEach(function(word) {
if(word) emit(word.toLowerCase(),1);
});
}
};
reduceFun = function(someKey, someValues) {
return Array.sum(someValues);
}; // Why is this Array.sum() instead of someValues.length?
// Do it.
db.titles.mapReduce(mapFun, reduceFun, { out: {replace:"wordCounts"} });
// Trim the result document set
db.wordCounts.remove({value: {$lt:50}});
use wikipedia
// Mapper
mapFun = function() {
if (this.page_title && this.page_title.replace) {
// Convert title to alpha numberic by replacing punct with _
var noPunctuation = this.page_title.replace(/[^\w]/g, "_");
// Split title into words
var words = noPunctuation.split("_");
// For each word, emit a lower case version with a count of 1
words.forEach(function(word) {
if(word) emit(word.toLowerCase(),1);
});
}
};
// Reducer
reduceFun = function(someKey, someValues) {
return Array.sum(someValues);
}; // Why is this Array.sum() instead of someValues.length?
// Entry point
db.titles.mapReduce(mapFun, reduceFun, { out: {replace:"wordCounts"} });
// Trim the result document set
db.wordCounts.remove({value: {$lt:50}});
Thank you
Data Geek User Group Vancouver WA
michael@powma.com