This is the Unix philosophy, as summarized by Doug McIlroy:
Write programs that do one thing and do it well. Write programs to work together. Write programs to handle text streams, because that is a universal interface.
ls | cat | grep | find | wc | awk | sed | tr | sort | uniq
# count the distinct values of field 5 in svn log lines mentioning JohannesBoyne
# (note: uniq only collapses adjacent duplicates, so this relies on the log's ordering)
$ cat svn.log | egrep 'JohannesBoyne' | awk '{print $5}' | uniq | wc -l
# extract the "name" fields from a Manta directory listing (one JSON record per line)
$ curl https://us-east.manta.joyent.com/manta/public/examples/shakespeare/ | egrep -o '"name":.*?[^\\]",' | sed 's/"name":/ /' | awk '{print $1}' | tr -d '"|,'
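Scraping JSON with egrep and sed is brittle. If jq is available (it is not part of the original examples), the extraction shrinks to one robust step, since jq happily consumes a stream of JSON objects:
$ curl -s https://us-east.manta.joyent.com/manta/public/examples/shakespeare/ | jq -r '.name'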
# total size of all .txt files beneath the current directory, in MB (field 7 of -ls is bytes)
$ find . -name "*.txt" -ls | awk '{total += $7} END {print total/1000000 " MB"}'
# top 10 most frequent words across all .txt files
$ cat *.txt | tr -cs A-Za-z '\n' | tr A-Z a-z | sort | uniq -c | awk '{ x[$2] += $1 } END { for (w in x) { print x[w] " " w } }' | sort -rn | head -10
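Read stage by stage (same pipeline; a trailing | lets the shell continue on the next line):
cat *.txt |                 # concatenate every text file
tr -cs A-Za-z '\n' |        # turn each run of non-letters into a newline: one word per line
tr A-Z a-z |                # lowercase
sort | uniq -c |            # group identical words, prefix each group with its count
awk '{ x[$2] += $1 } END { for (w in x) print x[w] " " w }' |  # re-emit as "count word"
sort -rn | head -10         # numerically descending, keep the top 10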
# total number of lines containing "ruby" (case-insensitive) across all .txt files
$ grep -i -c ruby *.txt | sed 's/.*:/ /' | awk '{total += $1} END {print total}'
# shorter, not faster: -h suppresses the "filename:" prefix, so the sed step disappears
$ grep -i -c -h 'john' *.txt | awk '{total += $1} END {print total}'
# index each file with indexone.sh (the "map" step), merge with indexmerge.awk (the "reduce" step)
$ find . -name "*.txt" | while read fn; do ./indexone.sh "$fn"; done | awk -f indexmerge.awk | sort | head -10
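The while loop feeds indexone.sh one file at a time. With GNU xargs the same "map" phase can fan out across cores; a sketch, assuming GNU findutils and that indexone.sh writes whole output lines atomically (-P4 runs four indexers in parallel):
$ find . -name "*.txt" -print0 | xargs -0 -n1 -P4 ./indexone.sh | awk -f indexmerge.awk | sort | head -10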
Disclaimer: Many of the examples are borrowed from www.joyent.com/products/manta/job-examples
# the word count again: lowercase first, then split; the awk step sums the per-run
# counts from uniq -c, so no global sort is needed before counting
$ cat *.txt | tr '[:upper:]' '[:lower:]' | tr -cs a-z '\n' | uniq -c | awk '{ x[$2] += $1 } END { for (w in x) { print x[w] " " w } }' | sort -rn | head -10
For contrast, here is the same word count as a classic Hadoop MapReduce job (WordCount v1.0 from the Hadoop tutorial).
// Source: http://hadoop.apache.org/docs/r1.2.1/mapred_tutorial.html#Example%3A+WordCount+v1.0
package org.myorg;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordCount {

  // emits (word, 1) for every token in its input split
  public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
      String line = value.toString();
      StringTokenizer tokenizer = new StringTokenizer(line);
      while (tokenizer.hasMoreTokens()) {
        word.set(tokenizer.nextToken());
        output.collect(word, one);
      }
    }
  }

  // sums the counts collected for each word
  public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
    public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
      int sum = 0;
      while (values.hasNext()) {
        sum += values.next().get();
      }
      output.collect(key, new IntWritable(sum));
    }
  }

  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);  // the reducer doubles as a local combiner
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
  }
}
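Building and launching the job is not a one-liner either. Roughly, following the same tutorial (the jar name, classpath, and HDFS paths here are illustrative):
$ javac -classpath ${HADOOP_HOME}/hadoop-core-1.2.1.jar -d wordcount_classes WordCount.java
$ jar -cvf wordcount.jar -C wordcount_classes/ .
$ hadoop jar wordcount.jar org.myorg.WordCount /user/joe/input /user/joe/output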