Getting Started with MapReduce Programming: Hello Word Count
These notes are compiled from Hadoop海量数据处理:技术详解与项目实战 (Hadoop Massive Data Processing: Technical Details and Project Practice) by 范东来 (Fan Donglai).
Note: the input file used here is tiny (only four lines of English text); the job log accordingly shows 1 input split, 1 map task, and 1 reduce task.
Required JARs: hadoop-common-2.9.2.jar and hadoop-mapreduce-client-core-2.9.2.jar
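If the project is built with Maven instead of adding the JARs by hand, the same two libraries are available under these coordinates (matching the versions above):
org.apache.hadoop:hadoop-common:2.9.2
org.apache.hadoop:hadoop-mapreduce-client-core:2.9.2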
1. The Word Count Mapper Class
package com.hadoop.hello;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/*
 * The word count Mapper class.
 * LongWritable: key type of the map input key/value pairs
 * Text: value type of the map input key/value pairs
 * Text: key type of the map output key/value pairs
 * IntWritable: value type of the map output key/value pairs
 */
public class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
//constant 1: every occurrence of a word counts once
private static final IntWritable one = new IntWritable(1);
//reusable Text object holding the current word
private Text word = new Text();
/*
 * @param key: byte offset of the line within the input split (not a line number)
 * @param value: the content of that line
 */
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
//emit the pair: (word, 1)
context.write(word, one);
}
}
}
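To see concretely what the mapper emits, here is a tiny standalone sketch (plain Java, not part of the job; the sample line is taken from the input file built in section 4) that tokenizes one line exactly the way the map method does:
import java.util.StringTokenizer;

public class TokenizeDemo {
    public static void main(String[] args) {
        //the same whitespace tokenization the mapper applies to each input line
        StringTokenizer itr = new StringTokenizer("good better best");
        while (itr.hasMoreTokens()) {
            //the mapper would emit each token paired with the constant 1
            System.out.println(itr.nextToken() + "\t1");
        }
    }
}
It prints good, better, and best, each followed by a tab and a 1: exactly the intermediate pairs the mapper writes to the context.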
2. The Word Count Reducer Class
package com.hadoop.hello;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/*
 * The word count Reducer class.
 * Text: key type of the reduce input key/value pairs
 * IntWritable: value type of the reduce input key/value pairs
 * Text: key type of the reduce output key/value pairs
 * IntWritable: value type of the reduce output key/value pairs
 *
 * This class can also be set as the Combiner: it then pre-aggregates the map output
 * before it is shipped to the reducers, shrinking the volume of intermediate data
 * (e.g. two (good, 1) pairs from one map task become a single (good, 2)).
 * Enabled with: job.setCombinerClass(IntSumReducer.class);
 */
public class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
//reusable IntWritable holding the total count of a word
private IntWritable result = new IntWritable();
/*
 * @param key: the word
 * @param values: the counts collected for that word
 */
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
Iterator<IntWritable> itr = values.iterator();
while (itr.hasNext()) {
sum += itr.next().get();
}
result.set(sum);
//emit the pair: (word, total count)
context.write(key, result);
}
}
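After the shuffle, all counts for a given word arrive at the reducer together. The following plain-Java sketch (not Hadoop code; the key "better" with three 1s is what the sample input in section 4 actually produces) mirrors the summation performed in reduce:
import java.util.Arrays;
import java.util.List;

public class SumDemo {
    public static void main(String[] args) {
        //after the shuffle, the reducer sees the grouped pair ("better", [1, 1, 1])
        List<Integer> values = Arrays.asList(1, 1, 1);
        int sum = 0;
        for (int v : values) {
            sum += v; //the same accumulation the reduce method performs
        }
        System.out.println("better\t" + sum); //prints: better 3
    }
}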
3. The Word Count Driver Class (main function)
package com.hadoop.hello;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/*
 * The word count driver: configures and submits the job.
 */
public class WordCount {
public static void main(String[] args) throws IOException,
ClassNotFoundException, InterruptedException {
//load the Hadoop configuration; the default resources are registered statically
Configuration conf = new Configuration();
if (args.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
//create the job object
//(the old constructor new Job(conf, "wordcount") is deprecated in favor of the factory method below)
Job job = Job.getInstance(conf, "wordcount");
job.setJarByClass(WordCount.class);
//set the Mapper and Reducer classes
job.setMapperClass(TokenizerMapper.class);
job.setReducerClass(IntSumReducer.class);
//set the output key/value types of the reduce function (these also apply to the map output unless set separately)
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//set the input and output paths (directories)
FileInputFormat.addInputPath(job, new Path(args[0]));//the input path must already exist
FileOutputFormat.setOutputPath(job, new Path(args[1]));//the output path must NOT exist yet; the job creates it
//submit the job and wait for it to finish
//passing true to waitForCompletion prints the job's progress
//System.exit(0) signals success; a non-zero status signals failure
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
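As the comment on IntSumReducer notes, the reducer can double as a Combiner. A minimal sketch of the extra driver lines (added to the same job object above, before submission):
//optional: pre-aggregate map output locally before the shuffle;
//safe here because IntSumReducer's input and output types are identical
job.setCombinerClass(IntSumReducer.class);
//only needed when the map output types differ from the final output types:
//job.setMapOutputKeyClass(Text.class);
//job.setMapOutputValueClass(IntWritable.class);
With the combiner enabled, a map task may ship, for example, a single (good, 2) instead of two (good, 1) pairs; the final result is unchanged because addition is associative and commutative.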
4. Running the Program
1. In the IDE, right-click the com.hadoop.hello package and export it as a JAR file named "WordCount.jar".
2. Upload the JAR to the Linux server using Windows cmd or PowerShell (recommended).
Run the following command in the JAR's directory:
> scp WordCount.jar user@remoteIP:~/myJars/mapreduce/
(where user is your login name on the server and remoteIP is the server's IP address)
3. Start Hadoop and create the word input file:
> cd ~/myJars/mapreduce/
> touch words
> vi words
Press "i" to enter insert mode and type the following lines into the words file:
good better best
never it rest
till good is better
and better is best
Press "ESC", then type ":wq!" and press Enter to save and exit.
--check the word file
> cat words
--create the input directory in HDFS
> hadoop fs -mkdir /user/hadoop/wordcountinput
--list the input directory in HDFS
> hadoop fs -ls /user/hadoop/wordcountinput
--copy the local words file into the HDFS input directory (run under "~/myJars/mapreduce/")
> hadoop fs -copyFromLocal words /user/hadoop/wordcountinput/
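Equivalently, the generic upload command does the same job here (-copyFromLocal is simply -put restricted to local file sources):
> hadoop fs -put words /user/hadoop/wordcountinput/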
4. Run the JAR:
Run the following command in the JAR's directory "~/myJars/mapreduce/":
> hadoop jar WordCount.jar com.hadoop.hello.WordCount /user/hadoop/wordcountinput /user/hadoop/wordcountoutput
The job's progress is printed to the screen until it completes.
5. Check the word count result:
After a successful run, the directory "/user/hadoop/wordcountoutput/" contains two files:
/user/hadoop/wordcountoutput/_SUCCESS --empty marker file indicating successful completion
/user/hadoop/wordcountoutput/part-r-00000 --the job's output file
--view the output file
> hadoop fs -cat /user/hadoop/wordcountoutput/part-r-00000
and 1
best 2
better 3
good 2
is 2
it 1
never 1
rest 1
till 1
(This is the word count result: each line holds a word and its count separated by a tab, and the words appear in sorted order because the framework sorts map output keys before the reduce phase.)