Spring 2021
Instructors Roz Cyrus and Jerry Cain
PDF
<word> 1
for every alphabetic token in that file.
import sys
import re
pattern = re.compile(r"^[a-z]+$")  # matches purely alphabetic words


def map_words(lines):
    """Yield the lowercased form of every purely alphabetic token in *lines*.

    Each line is split on whitespace; a token is kept only when, after
    lowercasing, it consists solely of the letters a-z.  Tokens carrying
    punctuation, digits, hyphens or apostrophes are therefore dropped.

    Args:
        lines: iterable of text lines (e.g. an open file or sys.stdin).

    Yields:
        One lowercase word per accepted token, in input order.
    """
    for line in lines:
        # split() with no argument already ignores leading/trailing
        # whitespace, so a separate strip() is unnecessary.
        for token in line.split():
            lowercaseword = token.lower()
            if pattern.match(lowercaseword):
                yield lowercaseword


def main():
    """Print "<word> 1" for every alphabetic token read from stdin."""
    for word in map_words(sys.stdin):
        # Parenthesized single-argument form prints identically under
        # Python 2 (print statement) and Python 3 (print function).
        print('%s 1' % word)


if __name__ == "__main__":
    main()
myth61:$ cat anna-karenina.txt | ./word-count-mapper.py
happy 1
families 1
are 1
... // some 340000 words omitted for brevity
to 1
put 1
into 1
group-by-key contributes to all MapReduce pipelines, not just this one. Our group-by-key.py
executable—presented on the next slide—assumes the mapper's output has been sorted so multiple instances of the same key are more easily grouped together, as with:
myth61:$ cat anna-karenina.txt | ./word-count-mapper.py | sort
a 1
a 1
a 1
a 1
a 1 // plus 6064 additional copies of this same line
...
zigzag 1
zoological 1
zoological 1
zoology 1
zu 1
myth61:$ cat anna-karenina.txt | ./word-count-mapper.py | sort | ./group-by-key.py
a 1 1 1 1 1 // plus 6064 more 1's on this same line
...
zeal 1 1 1
zealously 1
zest 1
zhivahov 1
zigzag 1
zoological 1 1
zoology 1
zu 1
from itertools import groupby
from operator import itemgetter
import sys
def read_mapper_output(file):
    """Yield each non-blank line of *file* as a list of space-separated fields.

    Surrounding whitespace (including the trailing newline) is stripped
    before the line is split on single spaces.  Blank lines are skipped:
    they would otherwise come out as [''], which cannot be unpacked into
    a (key, value) pair by the grouping step.

    Args:
        file: iterable of text lines (e.g. sys.stdin).

    Yields:
        A list of the line's fields; the first field is the key.
    """
    for line in file:
        stripped = line.strip()
        if stripped:
            yield stripped.split(' ')
def group_by_key(pairs):
    """Collapse consecutive records that share a key into one (key, values) pair.

    Only adjacent records are merged (itertools.groupby semantics), so the
    input must already be sorted by key for each key to appear once.

    Args:
        pairs: iterable of two-element [key, value] sequences.

    Yields:
        (key, values) tuples, where values is the sorted list of the
        value strings seen for that key.

    Raises:
        ValueError: if a record does not have exactly two elements.
    """
    for key, keygroup in groupby(pairs, itemgetter(0)):
        yield key, sorted(value for _, value in keygroup)


def main():
    """Read sorted "<key> <value>" lines from stdin; print one line per key."""
    for key, values in group_by_key(read_mapper_output(sys.stdin)):
        # Parenthesized single-argument form prints identically under
        # Python 2 (print statement) and Python 3 (print function).
        print("%s %s" % (key, ' '.join(values)))


if __name__ == "__main__":
    main()
./group-by-key.py
script.
import sys
def read_mapper_output(file):
    """Yield each non-blank line of *file* as a list of space-separated fields.

    Surrounding whitespace (including the trailing newline) is stripped
    before the line is split on single spaces.  Blank lines are skipped:
    they would otherwise come out as [''] and be reported downstream as
    an empty word with a count of 0.

    Args:
        file: iterable of text lines (e.g. sys.stdin).

    Yields:
        A list of the line's fields; the first field is the key and the
        remaining fields are its values.
    """
    for line in file:
        stripped = line.strip()
        if stripped:
            yield stripped.split(' ')
def reduce_counts(records):
    """Sum the numeric values attached to each key.

    Args:
        records: iterable of sequences whose first element is the word and
            whose remaining elements are integer strings.

    Yields:
        (word, count) tuples, where count is the sum of the record's
        values (0 when the record carries no values).

    Raises:
        ValueError: if a value cannot be parsed by int().
    """
    for vec in records:
        yield vec[0], sum(int(number) for number in vec[1:])


def main():
    """Read "<word> <n> <n> ..." lines from stdin; print "<word> <total>"."""
    for word, count in reduce_counts(read_mapper_output(sys.stdin)):
        # Parenthesized single-argument form prints identically under
        # Python 2 (print statement) and Python 3 (print function).
        print("%s %d" % (word, count))


if __name__ == "__main__":
    main()
myth61:$ cat anna-karenina.txt | ./word-count-mapper.py | sort \
| ./group-by-key.py | ./word-count-reducer.py
a 6069
abandon 6
abandoned 9
abandonment 1
...
zoological 2
zoology 1
zu 1