MapReduce的分区-Partitioner
分区操作是shuffle操作中的一个重要过程,作用就是将map的结果按照规则分发到不同的reduce中进行处理,从而按照分区得到多个输出结果。
Partitioner是partitioner的基类,如果需要定制partitioner也需要继承该类。HashPartitioner是MapReduce的默认partitioner,计算方法是:
which reducer=(key.hashCode() & Integer.MAX_VALUE) % numReduceTasks
注:默认情况下reduceTask数量为1,很多时候MR自带的分区规则并不能满足我们的需求,为了实现特定的效果,需要自己来定义分区规则。
案例:根据城市区分,来统计每一个城市中每个人产生的流量
public class FlowMapper extends Mapper<LongWritable, Text, Text, Flow> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] arr = line.split(" ");
Flow f = new Flow();
f.setPhone(arr[0]);
f.setCity(arr[1]);
f.setName(arr[2]);
f.setFlow(Integer.parseInt(arr[3]));
context.write(new Text(f.getPhone()), f);
}
}
/**
 * Custom partitioner that routes each record by the city held in its
 * {@link Flow} value: {@code "bj"} maps to index 0, {@code "sh"} to index 1,
 * and every other city (including a missing one) to index 2.
 */
public class FlowPartitioner extends Partitioner<Text, Flow> {

    /**
     * Chooses the partition for one map output record.
     *
     * @param key           the map output key; not used for routing
     * @param value         the record whose city decides the partition
     * @param numPartitions total number of partitions configured for the job
     * @return a partition index in the range {@code [0, numPartitions)}
     */
    @Override
    public int getPartition(Text key, Flow value, int numPartitions) {
        String city = value.getCity();
        int partition;
        // Literal-first equals keeps a null city from throwing; it falls
        // through to the catch-all partition instead.
        if ("bj".equals(city)) {
            partition = 0;
        } else if ("sh".equals(city)) {
            partition = 1;
        } else {
            partition = 2;
        }
        // Fold the index into the valid range so the job still runs when it
        // is configured with fewer than three partitions. With three or more
        // the result is unchanged.
        return partition % numPartitions;
    }
}
public class FlowReducer extends Reducer<Text, Flow, Text, IntWritable> {
public void reduce(Text key, Iterable<Flow> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (Flow val : values) {
sum += val.getFlow();
}
context.write(key, new IntWritable(sum));
}
}
/**
 * Configures and submits the flow job: {@code FlowMapper} and
 * {@code FlowReducer} with {@code FlowPartitioner} and three reduce tasks.
 */
public class FlowDriver {

    /** Input path used when no command-line argument is supplied. */
    private static final String DEFAULT_INPUT = "hdfs://192.168.60.132:9000/mr/flow.txt";

    /** Output path used when fewer than two command-line arguments are supplied. */
    private static final String DEFAULT_OUTPUT = "hdfs://192.168.60.132:9000/fpresult";

    /**
     * Builds the job, waits for it to finish and exits the JVM with status 0
     * on success or 1 on failure, so calling scripts can detect a failed run.
     *
     * @param args optional overrides: {@code args[0]} is the input path and
     *             {@code args[1]} the output path; the built-in defaults are
     *             used for any that are missing
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "JobName");
        job.setJarByClass(cn.tedu.flow2.FlowDriver.class);

        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Flow.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Use the custom partitioner.
        job.setPartitionerClass(FlowPartitioner.class);
        // One reduce task per partition index the partitioner can return (0, 1, 2).
        job.setNumReduceTasks(3);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Propagate the job result as the process exit status.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}