- Wikipedia
- "Big data is the term for a collection of data sets so large and complex that it becomes difficult to process using on-hand database management tools or traditional data processing applications. The challenges include capture, curation, storage, search, sharing, transfer, analysis, and visualization."Â
"Whenever multiple machines are used in cooperation with one another, the probability of failures rises. In a single-machine environment, failure is not something that program designers explicitly worry about very often: if the machine has crashed, then there is no way for the program to recover anyway."
"Some are born great, some achieve greatness, and some have greatness thrust upon them."
- William Shakespeare
("some", 1)
("are", 1)
("born", 1)
("some", 1) ("are", 1) ("born", 1) ("great", 1) ("some", 1) ("achieve", 1) ("greatness", 1) ("and", 1) ("some", 1) ("have", 1) ("greatness", 1)
("thrust", 1) ("upon", 1) ("them", 1)
("achieve", 1)
("and", 1) ("are", 1) ("born", 1) ("great", 1) ("greatness", 1) ("greatness", 1) ("have", 1) ("some", 1) ("some", 1) ("some", 1) ("them", 1) ("thrust", 1) ("upon", 1)
("achieve", 1)
("and", 1) ("are", 1) ("born", 1) ("great", 1) ("greatness", 2) ("have", 1) ("some", 3) ("them", 1) ("thrust", 1) ("upon", 1)
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
public class WordCountMapper extends MapReduceBase
implements Mapper<LongWritable, Text, Text, IntWritable> {
private final IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(WritableComparable key, Writable value,
OutputCollector output, Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer itr = new StringTokenizer(line.toLowerCase());
while(itr.hasMoreTokens()) {
word.set(itr.nextToken());
output.collect(word, one);
}
}
}
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
public class WordCountReducer extends MapReduceBase
implements Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator values,
OutputCollector output, Reporter reporter) throws IOException {
int sum = 0;
while (values.hasNext()) {
IntWritable value = (IntWritable) values.next();
sum += value.get(); // process value
}
output.collect(key, new IntWritable(sum));
}
}
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
public class WordCount {
public static void main(String[] args) {
JobClient client = new JobClient();
JobConf conf = new JobConf(WordCount.class);
// specify output types
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
// specify input and output dirs
FileInputPath.addInputPath(conf, new Path("input"));
FileOutputPath.addOutputPath(conf, new Path("output"));
// specify a mapper
conf.setMapperClass(WordCountMapper.class);
// specify a reducer
conf.setReducerClass(WordCountReducer.class);
conf.setCombinerClass(WordCountReducer.class);
client.setConf(conf);
try {
JobClient.runJob(conf);
} catch (Exception e) {
e.printStackTrace();
}
}
}