Cookbook - Hadoop in Action

Databases Reference

In-Depth Information

and outputs two data sets. One has chronological information, such as the issue date. The

other data set has geographical information associated with each patent. This, too, is

a map-only program, but you can apply the multiple output collectors to reducers in a

straightforward way.

Listing 7.2 Program to project different columns of input data to different files

public class MultiFile extends Configured implements Tool {

public static class MapClass extends MapReduceBase

implements Mapper<LongWritable, Text, NullWritable, Text> {

private MultipleOutputs mos;

private OutputCollector<NullWritable, Text> collector;

public void configure(JobConf conf) {

mos = new MultipleOutputs(conf);

}

public void map(LongWritable key, Text value,

OutputCollector<NullWritable, Text> output,

Reporter reporter) throws IOException {

String[] arr = value.toString().split(",", -1);

String chrono = arr[0] + "," + arr[1] + "," + arr[2];

String geo = arr[0] + "," + arr[4] + "," + arr[5];

collector = mos.getCollector("chrono", reporter);

collector.collect(NullWritable.get(), new Text(chrono));

collector = mos.getCollector("geo", reporter);

collector.collect(NullWritable.get(), new Text(geo));

}

public void close() throws IOException {

mos.close();

}

public int run(String[] args) throws Exception {

Configuration conf = getConf();

JobConf job = new JobConf(conf, MultiFile.class);

Path in = new Path(args[0]);

Path out = new Path(args[1]);

FileInputFormat.setInputPaths(job, in);

FileOutputFormat.setOutputPath(job, out);

job.setJobName("MultiFile");

job.setMapperClass(MapClass.class);

job.setInputFormat(TextInputFormat.class);

job.setOutputKeyClass(NullWritable.class);

job.setOutputValueClass(Text.class);

job.setNumReduceTasks(0);

MultipleOutputs.addNamedOutput(job,

"chrono",

TextOutputFormat.class,

NullWritable.class,

Text.class);

Hadoop in Action

Search WWH ::

Custom Search

Home