Hive: Compiling the Source Code to Support a Custom UDF
#1. Building UDF Support
##1.1 Create a Maven project in IDEA and add the required configuration
The project needs the hadoop-client and hive-exec dependencies; the pom.xml for the Maven project follows:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.ruozedata.hadoop</groupId>
    <artifactId>ruozedata-hadoop</artifactId>
    <version>1.0</version>

    <name>ruozedata-hadoop</name>
    <!-- FIXME change it to the project's website -->
    <url>http://www.example.com</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
        <hive.version>1.1.0-cdh5.7.0</hive.version>
    </properties>

    <!-- Add the CDH repository; the CDH artifacts are not in Maven Central -->
    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <!-- Hadoop dependency -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- Hive dependency (provides the UDF base class) -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
        </dependency>
    </dependencies>

    <build>
        <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
            <plugins>
                <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
                <plugin>
                    <artifactId>maven-clean-plugin</artifactId>
                    <version>3.1.0</version>
                </plugin>
                <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
                <plugin>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.8.0</version>
                </plugin>
                <plugin>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.22.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-install-plugin</artifactId>
                    <version>2.5.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-deploy-plugin</artifactId>
                    <version>2.8.2</version>
                </plugin>
                <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
                <plugin>
                    <artifactId>maven-site-plugin</artifactId>
                    <version>3.7.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-project-info-reports-plugin</artifactId>
                    <version>3.0.0</version>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>
</project>
##1.2 Writing the UDF
Create a new HelloUDF class that extends Hive's UDF base class and define an evaluate() method. Note that evaluate() is not declared on UDF itself; Hive resolves it by reflection at query time, so it is added rather than overridden:
package com.ruozedata.hadoop.udf;

import org.apache.hadoop.hive.ql.exec.UDF;

public class HelloUDF extends UDF {

    // Hive finds evaluate() by reflection; UDF does not declare it.
    public String evaluate(String value) {
        return "hello:" + value;
    }

    // Quick local smoke test, no Hive session required.
    public static void main(String[] args) {
        HelloUDF helloUDF = new HelloUDF();
        String value = helloUDF.evaluate("leo");
        System.out.println(value);
    }
}
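Since the pom already pulls in JUnit, the same check can also be written as a unit test and run with mvn test. A minimal sketch; the HelloUDFTest class name is chosen here for illustration, not part of the original project:

package com.ruozedata.hadoop.udf;

import org.junit.Assert;
import org.junit.Test;

public class HelloUDFTest {

    @Test
    public void testEvaluate() {
        HelloUDF helloUDF = new HelloUDF();
        // evaluate() should prefix its argument with "hello:"
        Assert.assertEquals("hello:leo", helloUDF.evaluate("leo"));
    }
}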
#2. Downloading the Source Code
# Download the source tarball
[[email protected] source]$ wget http://archive.cloudera.com/cdh5/cdh/5/hive-1.1.0-cdh5.7.0-src.tar.gz
[[email protected] source]$ ll
total 14392
drwxrwxr-x 18 hadoop hadoop 4096 Apr 20 09:01 hadoop-2.6.0-cdh5.7.0
drwxrwxr-x 32 hadoop hadoop 4096 May 1 14:02 hive-1.1.0-cdh5.7.0
-rw-rw-r-- 1 hadoop hadoop 14652104 Feb 21 2018 hive-1.1.0-cdh5.7.0-src.tar.gz
# Extract the source
[[email protected] source]$ tar -zxvf hive-1.1.0-cdh5.7.0-src.tar.gz -C ~/source/
#3. Modifying the Source
##3.1 Add HelloUDF.java
# Put HelloUDF.java into the ~/source/hive-1.1.0-cdh5.7.0/ql/src/java/org/apache/hadoop/hive/ql/udf directory
[[email protected] udf]$ cd ~/source/hive-1.1.0-cdh5.7.0/ql/src/java/org/apache/hadoop/hive/ql/udf
[[email protected] udf]$ ll HelloUDF.java
-rw-r--r-- 1 hadoop hadoop 400 May 1 13:56 HelloUDF.java
# Change the package declaration in HelloUDF.java to org.apache.hadoop.hive.ql.udf
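After the move and the package change, the file should look like the sketch below. The @Description annotation is optional; it is what Hive's built-in functions use to feed DESC FUNCTION, and the help text here is our own wording:

package org.apache.hadoop.hive.ql.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;

// Optional annotation: built-in functions use @Description to
// supply the help text printed by DESC FUNCTION.
@Description(name = "helloudf",
        value = "_FUNC_(str) - returns 'hello:' followed by str")
public class HelloUDF extends UDF {

    public String evaluate(String value) {
        return "hello:" + value;
    }
}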
##3.2 Modify FunctionRegistry.java
# Add the import and register the UDF
[[email protected] ~]$ cd ~/source/hive-1.1.0-cdh5.7.0/ql/src/java/org/apache/hadoop/hive/ql/exec
[[email protected] exec]$ vi FunctionRegistry.java
import org.apache.hadoop.hive.ql.udf.HelloUDF;
# Add the registration in the static block; the name registered here is the
# function name you call from HiveQL, and the third argument is the isOperator flag
system.registerUDF("helloudf", HelloUDF.class, false);
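For orientation, the new line simply sits next to the existing registerUDF calls in FunctionRegistry's static initializer. A sketch of the surrounding context; the neighboring entry is shown only to illustrate the pattern, and the exact neighbors differ by version:

static {
    // ... existing built-in registrations, e.g.:
    system.registerUDF("ascii", UDFAscii.class, false);
    // Our addition: expose HelloUDF as "helloudf"
    system.registerUDF("helloudf", HelloUDF.class, false);
    // ... more built-in registrations ...
}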
#4. Compiling the Hive Source
-DskipTests skips the test suites, -Phadoop-2 selects the Hadoop 2 profile, and -Pdist produces the binary distribution tarball under packaging/target.
[[email protected] ~]$ cd ~/source/hive-1.1.0-cdh5.7.0
[[email protected] hive-1.1.0-cdh5.7.0]$ mvn clean package -DskipTests -Phadoop-2 -Pdist
# Check the build output; apache-hive-1.1.0-cdh5.7.0-bin.tar.gz is the package we need
[[email protected] ~]$ cd ~/source/hive-1.1.0-cdh5.7.0/packaging/target/
[[email protected] target]$ ll apache-hive-1.1.0-cdh5.7.0-bin.tar.gz
-rw-rw-r-- 1 hadoop hadoop 105765783 May 1 14:17 apache-hive-1.1.0-cdh5.7.0-bin.tar.gz
[[email protected] target]$
#5. Deploying Hive
Either redeploy Hive from the freshly built tarball, or copy the rebuilt hive-exec-1.1.0-cdh5.7.0.jar into the existing Hive installation; both approaches work.
##5.1 Replacing the jar
# Locate the rebuilt hive-exec-1.1.0-cdh5.7.0.jar
[[email protected] hive-1.1.0-cdh5.7.0]$ cd /home/hadoop/source/hive-1.1.0-cdh5.7.0/packaging/target/apache-hive-1.1.0-cdh5.7.0-bin/apache-hive-1.1.0-cdh5.7.0-bin/lib
[[email protected] lib]$ ll hive-exec-1.1.0-cdh5.7.0.jar
-rw-rw-r-- 1 hadoop hadoop 19274698 May 1 14:17 hive-exec-1.1.0-cdh5.7.0.jar
# Back up the jar in the existing Hive installation
[[email protected] ~]$ cd $HIVE_HOME/lib
[[email protected] lib]$ ll hive-exec-1.1.0-cdh5.7.0.jar
-rw-r--r-- 1 hadoop hadoop 19272159 Mar 24 2016 hive-exec-1.1.0-cdh5.7.0.jar
[[email protected] lib]$ mv hive-exec-1.1.0-cdh5.7.0.jar hive-exec-1.1.0-cdh5.7.0.jar.bak
# Copy the rebuilt hive-exec-1.1.0-cdh5.7.0.jar into the existing Hive deployment
[[email protected] lib]$ cp /home/hadoop/source/hive-1.1.0-cdh5.7.0/packaging/target/apache-hive-1.1.0-cdh5.7.0-bin/apache-hive-1.1.0-cdh5.7.0-bin/lib/hive-exec-1.1.0-cdh5.7.0.jar /home/hadoop/app/hive-1.1.0-cdh5.7.0/lib/
[[email protected] lib]$ ll hive-exec-1.1.0-cdh5.7.0.*
-rw-rw-r-- 1 hadoop hadoop 19274698 May 1 14:27 hive-exec-1.1.0-cdh5.7.0.jar
-rw-r--r-- 1 hadoop hadoop 19272159 Mar 24 2016 hive-exec-1.1.0-cdh5.7.0.jar.bak
#6. Testing the UDF
# List the registered functions
hive> show functions;
# A function named helloudf now appears in the list; the custom function works without adding any jar or creating a temporary function
# Run the function
hive> select helloudf("leo");
OK
hello:leo
Time taken: 0.788 seconds, Fetched: 1 row(s)
hive>