Databases Reference
In-Depth Information
as Text. We choose to use IntWritable for K2, V2, K3, and V3 because we know those
data must be integers and it's more efficient to use IntWritable.
Based on the data flow and the data types, you'll be able to see the final program
shown in listing 4.2 and understand what it's doing. You can see that it's structurally
similar to the other MapReduce programs we've seen so far. We go into details about
the program after the listing.
Listing 4.2 CitationHistogram.java: count patents cited once, twice, and so on
public class CitationHistogram extends Configured implements Tool {
public static class MapClass extends MapReduceBase
implements Mapper<Text, Text, IntWritable, IntWritable> {
private final static IntWritable uno = new IntWritable(1);
private IntWritable citationCount = new IntWritable();
public void map(Text key, Text value,
OutputCollector<IntWritable, IntWritable> output,
Reporter reporter) throws IOException {
citationCount.set(Integer.parseInt(value.toString()));
output.collect(citationCount, uno);
}
}
public static class Reduce extends MapReduceBase
implements Reducer<IntWritable,IntWritable,IntWritable,IntWritable>
{
public void reduce(IntWritable key, Iterator<IntWritable> values,
OutputCollector<IntWritable, IntWritable>output,
Reporter reporter) throws IOException {
int count = 0;
while (values.hasNext()) {
count += values.next().get();
}
output.collect(key, new IntWritable(count));
}
}
public int run(String[] args) throws Exception {
Configuration conf = getConf();
JobConf job = new JobConf(conf, CitationHistogram.class);
Path in = new Path(args[0]);
Path out = new Path(args[1]);
FileInputFormat.setInputPaths(job, in);
FileOutputFormat.setOutputPath(job, out);
job.setJobName("CitationHistogram");
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
job.setInputFormat(KeyValueTextInputFormat.class);
job.setOutputFormat(TextOutputFormat.class);
 
Search WWH ::




Custom Search