Lucene6.5.0 下中文分词IKAnalyzer编译和使用

前言

Lucene 本身对中文分词有支持，不过支持得不好：其分词方式是机械地将中文逐字切分后存储。例如“成都信息工程大学”最终被切分为：成|都|信|息|工|程|大|学。显然这种分词方式是低效且浪费存储空间的。IK 分词是林良益前辈编写的一个专门针对中文分词的分析器，最新版本为 2012 年针对 Lucene 4.0 的版本，之后未再更新；后续版本 Lucene 的接口发生了改变，导致其无法直接使用，所以需要进行修改。

修改和编译IKAnalyzer

（谷歌不稳定访问不了）国内源码地址：http://git.oschina.net/wltea/IK-Analyzer-2012FF 网盘下载：链接：http://pan.baidu.com/s/1jIt7kGm 密码：hu1g

lucene6.5.0下载地址：https://lucene.apache.org 网盘下载：链接：http://pan.baidu.com/s/1mic8iBe 密码：axca

下载源码之后解压并导入到单独的java project,然后再导入lucene的jar包，如图所示，是我的工程结构

Lucene6.5.0 下中文分词IKAnalyzer编译和使用

导入后修改四个文件：IKAnalyzer和IKTokenizer以及SWMCQueryBuilder、IKQueryExpressionParser，至于demo中的两个文件可直接删除或进行修改，我进行了修改。修改方式很简单，这里贴出修改的原文，以及修改后工程和源码下载。

修改后的工程地址：链接：http://pan.baidu.com/s/1nuALOql 密码：miyq

编译好的IKAnalyzer的jar包下载地址：http://download.****.net/detail/fanpei_moukoy/9796612 ，下载后可直接导入lucene项目进行使用。

IKAnalyzer

[java]view
plain copy

/** 

 * IK 中文分词  版本 6.5.0 

 * IK Analyzer release 6.5.0 

 *  

 * Licensed to the Apache Software Foundation (ASF) under one or more 

 * contributor license agreements.  See the NOTICE file distributed with 

 * this work for additional information regarding copyright ownership. 

 * The ASF licenses this file to You under the Apache License, Version 2.0 

 * (the "License"); you may not use this file except in compliance with 

 * the License.  You may obtain a copy of the License at 

 * 

 *     http://www.apache.org/licenses/LICENSE-2.0 

 * 

 * Unless required by applicable law or agreed to in writing, software 

 * distributed under the License is distributed on an "AS IS" BASIS, 

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

 * See the License for the specific language governing permissions and 

 * limitations under the License. 

 * 

 * provided by Linliangyi and copyright 2012 by Oolong studio 

 *  

 */  

package org.wltea.analyzer.lucene;  

import java.io.Reader;  

import java.io.StringReader;  

import org.apache.lucene.analysis.Analyzer;  

import org.apache.lucene.util.IOUtils;  

/** 

 * IK分词器，Lucene Analyzer接口实现 

 * 兼容Lucene 6.5.0版本 暴走抹茶 2017.3.28 

 */  

public final class IKAnalyzer extends Analyzer{  

    private boolean useSmart;  

    public boolean useSmart() {  

        return useSmart;  

    }  

    public void setUseSmart(boolean useSmart) {  

        this.useSmart = useSmart;  

    }  

    /** 

     * IK分词器Lucene  Analyzer接口实现类 

     *  

     * 默认细粒度切分算法 

     */  

    public IKAnalyzer(){  

        this(false);  

    }  

    /** 

     * IK分词器Lucene Analyzer接口实现类 

     *  

     * @param useSmart 当为true时，分词器进行智能切分 

     */  

    public IKAnalyzer(boolean useSmart){  

        super();  

        this.useSmart = useSmart;  

    }  

    @Override  

    protected TokenStreamComponents createComponents(String fieldName) {  

         Reader reader=null;  

            try{  

                reader=new StringReader(fieldName);  

                IKTokenizer it = new IKTokenizer(reader);  

                return new Analyzer.TokenStreamComponents(it);  

            }finally {  

                IOUtils.closeWhileHandlingException(reader);  

            }  

    }  

}

IKTokenizer

[java]view
plain copy

/** 

 * IK 中文分词  版本 6.5.0 

 * IK Analyzer release 6.5.0 

 *  

 * Licensed to the Apache Software Foundation (ASF) under one or more 

 * contributor license agreements.  See the NOTICE file distributed with 

 * this work for additional information regarding copyright ownership. 

 * The ASF licenses this file to You under the Apache License, Version 2.0 

 * (the "License"); you may not use this file except in compliance with 

 * the License.  You may obtain a copy of the License at 

 * 

 *     http://www.apache.org/licenses/LICENSE-2.0 

 * 

 * Unless required by applicable law or agreed to in writing, software 

 * distributed under the License is distributed on an "AS IS" BASIS, 

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

 * See the License for the specific language governing permissions and 

 * limitations under the License. 

 * 

 * provided by Linliangyi and copyright 2012 by Oolong studio 

 *  

 *  

 */  

package org.wltea.analyzer.lucene;  

import java.io.IOException;  

import java.io.Reader;  

import org.apache.lucene.analysis.Tokenizer;  

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;  

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;  

import org.apache.lucene.analysis.tokenattributes.TypeAttribute;  

import org.wltea.analyzer.core.IKSegmenter;  

import org.wltea.analyzer.core.Lexeme;  

/** 

 * IK分词器 Lucene Tokenizer适配器类 兼容Lucene 6.5.0版本 暴走抹茶 2017.3.28 

 */  

public final class IKTokenizer extends Tokenizer {  

    // IK分词器实现  

    private IKSegmenter _IKImplement;  

    // 词元文本属性  

    private final CharTermAttribute termAtt;  

    // 词元位移属性  

    private final OffsetAttribute offsetAtt;  

    // 词元分类属性（该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量）  

    private final TypeAttribute typeAtt;  

    // 记录最后一个词元的结束位置  

    private int endPosition;  

    public IKTokenizer(Reader in) {  

        this(in, false);  

    }  

    /** 

     * Lucene 6.5.0 Tokenizer适配器类构造函数 

     *  

     * @param in 

     * @param useSmart 

     */  

    public IKTokenizer(Reader in, boolean useSmart) {  

        offsetAtt = addAttribute(OffsetAttribute.class);  

        termAtt = addAttribute(CharTermAttribute.class);  

        typeAtt = addAttribute(TypeAttribute.class);  

        _IKImplement = new IKSegmenter(input, useSmart);  

    }  

    /* 

     * (non-Javadoc) 

     *  

     * @see org.apache.lucene.analysis.TokenStream#incrementToken() 

     */  

    @Override  

    public boolean incrementToken() throws IOException {  

        // 清除所有的词元属性  

        clearAttributes();  

        Lexeme nextLexeme = _IKImplement.next();  

        if (nextLexeme != null) {  

            // 将Lexeme转成Attributes  

            // 设置词元文本  

            termAtt.append(nextLexeme.getLexemeText());  

            // 设置词元长度  

            termAtt.setLength(nextLexeme.getLength());  

            // 设置词元位移  

            offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());  

            // 记录分词的最后位置  

            endPosition = nextLexeme.getEndPosition();  

            // 记录词元分类  

            typeAtt.setType(nextLexeme.getLexemeTypeString());  

            // 返会true告知还有下个词元  

            return true;  

        }  

        // 返会false告知词元输出完毕  

        return false;  

    }  

    /* 

     * (non-Javadoc) 

     *  

     * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) 

     */  

    @Override  

    public void reset() throws IOException {  

        super.reset();  

        _IKImplement.reset(input);  

    }  

    @Override  

    public final void end() {  

        // set final offset  

        int finalOffset = correctOffset(this.endPosition);  

        offsetAtt.setOffset(finalOffset, finalOffset);  

    }  

}

IKQueryExpressionParser

[java]view
plain copy

/** 

 * IK 中文分词  版本 6.5.0 

 * IK Analyzer release 6.5.0 

 *  

 * Licensed to the Apache Software Foundation (ASF) under one or more 

 * contributor license agreements.  See the NOTICE file distributed with 

 * this work for additional information regarding copyright ownership. 

 * The ASF licenses this file to You under the Apache License, Version 2.0 

 * (the "License"); you may not use this file except in compliance with 

 * the License.  You may obtain a copy of the License at 

 * 

 *     http://www.apache.org/licenses/LICENSE-2.0 

 * 

 * Unless required by applicable law or agreed to in writing, software 

 * distributed under the License is distributed on an "AS IS" BASIS, 

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

 * See the License for the specific language governing permissions and 

 * limitations under the License. 

 * 

 * provided by Linliangyi and copyright 2012 by Oolong studio 

 *  

 */  

package org.wltea.analyzer.query;  

import java.util.ArrayList;  

import java.util.LinkedList;  

import java.util.List;  

import java.util.Stack;  

import org.apache.lucene.index.Term;  

import org.apache.lucene.search.BooleanClause;  

import org.apache.lucene.search.BooleanQuery;  

import org.apache.lucene.search.BooleanQuery.Builder;  

import org.apache.lucene.search.Query;  

import org.apache.lucene.search.TermQuery;  

import org.apache.lucene.search.TermRangeQuery;  

import org.apache.lucene.search.BooleanClause.Occur;  

import org.apache.lucene.util.BytesRef;  

/** 

 * IK简易查询表达式解析  

 * 结合SWMCQuery算法  暴走抹茶 2017.3.28 

 *  

 * 表达式例子 ： 

 * (id='1231231' && title:'monkey') || (content:'你好吗'  || ulr='www.ik.com') - name:'helloword' 

 * @author linliangyi 

 * 

 */  

public class IKQueryExpressionParser {  

    //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],";  
    // Elements produced by splitElements(), read by parseSyntax(), cleared at the end of parseExp().
    private List<Element> elements = new ArrayList<Element>();  
    // Stack of sub-queries built so far while parseSyntax() runs.
    private Stack<Query> querys =  new Stack<Query>();  
    // Stack of pending operator and '(' elements while parseSyntax() runs.
    private Stack<Element> operates = new Stack<Element>();  

    /** 

     * 解析查询表达式，生成Lucene Query对象 

     *  

     * @param expression 

     * @param quickMode  

     * @return Lucene query 

     */  

    public Query parseExp(String expression , boolean quickMode){  

        Query lucenceQuery = null;  

        if(expression != null && !"".equals(expression.trim())){  

            try{  

                //文法解析  

                this.splitElements(expression);  

                //语法解析  

                this.parseSyntax(quickMode);  

                if(this.querys.size() == 1){  

                    lucenceQuery = this.querys.pop();  

                }else{  

                    throw new IllegalStateException("表达式异常： 缺少逻辑操作符 或 括号缺失");  

                }  

            }finally{  

                elements.clear();  

                querys.clear();  

                operates.clear();  

            }  

        }  

        return lucenceQuery;  

    }     

    /** 

     * 表达式文法解析 

     * @param expression 

     */  

    private void splitElements(String expression){  

        if(expression == null){  

            return;  

        }  

        Element curretElement = null;  

        char[] expChars = expression.toCharArray();  

        for(int i = 0 ; i < expChars.length ; i++){  

            switch(expChars[i]){  

            case '&' :  

                if(curretElement == null){  

                    curretElement = new Element();  

                    curretElement.type = '&';  

                    curretElement.append(expChars[i]);  

                }else if(curretElement.type == '&'){  

                    curretElement.append(expChars[i]);  

                    this.elements.add(curretElement);  

                    curretElement = null;  

                }else if(curretElement.type == '\''){  

                    curretElement.append(expChars[i]);  

                }else {  

                    this.elements.add(curretElement);  

                    curretElement = new Element();  

                    curretElement.type = '&';  

                    curretElement.append(expChars[i]);  

                }  

                break;  

            case '|' :  

                if(curretElement == null){  

                    curretElement = new Element();  

                    curretElement.type = '|';  

                    curretElement.append(expChars[i]);  

                }else if(curretElement.type == '|'){  

                    curretElement.append(expChars[i]);  

                    this.elements.add(curretElement);  

                    curretElement = null;  

                }else if(curretElement.type == '\''){  

                    curretElement.append(expChars[i]);  

                }else {  

                    this.elements.add(curretElement);  

                    curretElement = new Element();  

                    curretElement.type = '|';  

                    curretElement.append(expChars[i]);  

                }                 

                break;  

            case '-' :  

                if(curretElement != null){  

                    if(curretElement.type == '\''){  

                        curretElement.append(expChars[i]);  

                        continue;  

                    }else{  

                        this.elements.add(curretElement);  

                    }  

                }  

                curretElement = new Element();  

                curretElement.type = '-';  

                curretElement.append(expChars[i]);  

                this.elements.add(curretElement);  

                curretElement = null;             

                break;  

            case '(' :  

                if(curretElement != null){  

                    if(curretElement.type == '\''){  

                        curretElement.append(expChars[i]);  

                        continue;  

                    }else{  

                        this.elements.add(curretElement);  

                    }  

                }  

                curretElement = new Element();  

                curretElement.type = '(';  

                curretElement.append(expChars[i]);  

                this.elements.add(curretElement);  

                curretElement = null;             

                break;                

            case ')' :  

                if(curretElement != null){  

                    if(curretElement.type == '\''){  

                        curretElement.append(expChars[i]);  

                        continue;  

                    }else{  

                        this.elements.add(curretElement);  

                    }  

                }  

                curretElement = new Element();  

                curretElement.type = ')';  

                curretElement.append(expChars[i]);  

                this.elements.add(curretElement);  

                curretElement = null;             

                break;                    

            case ':' :  

                if(curretElement != null){  

                    if(curretElement.type == '\''){  

                        curretElement.append(expChars[i]);  

                        continue;  

                    }else{  

                        this.elements.add(curretElement);  

                    }  

                }  

                curretElement = new Element();  

                curretElement.type = ':';  

                curretElement.append(expChars[i]);  

                this.elements.add(curretElement);  

                curretElement = null;             

                break;    

            case '=' :  

                if(curretElement != null){  

                    if(curretElement.type == '\''){  

                        curretElement.append(expChars[i]);  

                        continue;  

                    }else{  

                        this.elements.add(curretElement);  

                    }  

                }  

                curretElement = new Element();  

                curretElement.type = '=';  

                curretElement.append(expChars[i]);  

                this.elements.add(curretElement);  

                curretElement = null;             

                break;                    

            case ' ' :  

                if(curretElement != null){  

                    if(curretElement.type == '\''){  

                        curretElement.append(expChars[i]);  

                    }else{  

                        this.elements.add(curretElement);  

                        curretElement = null;  

                    }  

                }  

                break;  

            case '\'' :  

                if(curretElement == null){  

                    curretElement = new Element();  

                    curretElement.type = '\'';  

                }else if(curretElement.type == '\''){  

                    this.elements.add(curretElement);  

                    curretElement = null;  

                }else{  

                    this.elements.add(curretElement);  

                    curretElement = new Element();  

                    curretElement.type = '\'';  

                }  

                break;  

            case '[':  

                if(curretElement != null){  

                    if(curretElement.type == '\''){  

                        curretElement.append(expChars[i]);  

                        continue;  

                    }else{  

                        this.elements.add(curretElement);  

                    }  

                }  

                curretElement = new Element();  

                curretElement.type = '[';  

                curretElement.append(expChars[i]);  

                this.elements.add(curretElement);  

                curretElement = null;                     

                break;  

            case ']':  

                if(curretElement != null){  

                    if(curretElement.type == '\''){  

                        curretElement.append(expChars[i]);  

                        continue;  

                    }else{  

                        this.elements.add(curretElement);  

                    }  

                }  

                curretElement = new Element();  

                curretElement.type = ']';  

                curretElement.append(expChars[i]);  

                this.elements.add(curretElement);  

                curretElement = null;  

                break;  

            case '{':  

                if(curretElement != null){  

                    if(curretElement.type == '\''){  

                        curretElement.append(expChars[i]);  

                        continue;  

                    }else{  

                        this.elements.add(curretElement);  

                    }  

                }  

                curretElement = new Element();  

                curretElement.type = '{';  

                curretElement.append(expChars[i]);  

                this.elements.add(curretElement);  

                curretElement = null;                     

                break;  

            case '}':  

                if(curretElement != null){  

                    if(curretElement.type == '\''){  

                        curretElement.append(expChars[i]);  

                        continue;  

                    }else{  

                        this.elements.add(curretElement);  

                    }  

                }  

                curretElement = new Element();  

                curretElement.type = '}';  

                curretElement.append(expChars[i]);  

                this.elements.add(curretElement);  

                curretElement = null;  

                break;  

            case ',':  

                if(curretElement != null){  

                    if(curretElement.type == '\''){  

                        curretElement.append(expChars[i]);  

                        continue;  

                    }else{  

                        this.elements.add(curretElement);  

                    }  

                }  

                curretElement = new Element();  

                curretElement.type = ',';  

                curretElement.append(expChars[i]);  

                this.elements.add(curretElement);  

                curretElement = null;  

                break;  

            default :  

                if(curretElement == null){  

                    curretElement = new Element();  

                    curretElement.type = 'F';  

                    curretElement.append(expChars[i]);  

                }else if(curretElement.type == 'F'){  

                    curretElement.append(expChars[i]);  

                }else if(curretElement.type == '\''){  

                    curretElement.append(expChars[i]);  

                }else{  

                    this.elements.add(curretElement);  

                    curretElement = new Element();  

                    curretElement.type = 'F';  

                    curretElement.append(expChars[i]);  

                }             

            }  

        }  

        if(curretElement != null){  

            this.elements.add(curretElement);  

            curretElement = null;  

        }  

    }  

    /** 

     * 语法解析 

     *  

     */  

    private void parseSyntax(boolean quickMode){  

        for(int i = 0 ; i < this.elements.size() ; i++){  

            Element e = this.elements.get(i);  

            if('F' == e.type){  

                Element e2 = this.elements.get(i + 1);  

                if('=' != e2.type && ':' != e2.type){  

                    throw new IllegalStateException("表达式异常： = 或 ： 号丢失");  

                }  

                Element e3 = this.elements.get(i + 2);  

                //处理 = 和 ： 运算  

                if('\'' == e3.type){  

                    i+=2;  

                    if('=' == e2.type){  

                        TermQuery tQuery = new TermQuery(new Term(e.toString() , e3.toString()));  

                        this.querys.push(tQuery);  

                    }else if(':' == e2.type){  

                        String keyword = e3.toString();  

                        //SWMCQuery Here  

                        Query _SWMCQuery =  SWMCQueryBuilder.create(e.toString(), keyword , quickMode);  

                        this.querys.push(_SWMCQuery);  

                    }  

                }else if('[' == e3.type || '{' == e3.type){  

                    i+=2;  

                    //处理 [] 和 {}  

                    LinkedList<Element> eQueue = new LinkedList<Element>();  

                    eQueue.add(e3);  

                    for( i++ ; i < this.elements.size() ; i++){                            

                        Element eN = this.elements.get(i);  

                        eQueue.add(eN);  

                        if(']' == eN.type || '}' == eN.type){  

                            break;  

                        }  

                    }  

                    //翻译RangeQuery  

                    Query rangeQuery = this.toTermRangeQuery(e , eQueue);  

                    this.querys.push(rangeQuery);  

                }else{  

                    throw new IllegalStateException("表达式异常：匹配值丢失");  

                }  

            }else if('(' == e.type){  

                this.operates.push(e);  

            }else if(')' == e.type){  

                boolean doPop = true;  

                while(doPop && !this.operates.empty()){  

                    Element op = this.operates.pop();  

                    if('(' == op.type){  

                        doPop = false;  

                    }else {  

                        Query q = toBooleanQuery(op);  

                        this.querys.push(q);  

                    }  

                }  

            }else{   

                if(this.operates.isEmpty()){  

                    this.operates.push(e);  

                }else{  

                    boolean doPeek = true;  

                    while(doPeek && !this.operates.isEmpty()){  

                        Element eleOnTop = this.operates.peek();  

                        if('(' == eleOnTop.type){  

                            doPeek = false;  

                            this.operates.push(e);  

                        }else if(compare(e , eleOnTop) == 1){  

                            this.operates.push(e);  

                            doPeek = false;  

                        }else if(compare(e , eleOnTop) == 0){  

                            Query q = toBooleanQuery(eleOnTop);  

                            this.operates.pop();  

                            this.querys.push(q);  

                        }else{  

                            Query q = toBooleanQuery(eleOnTop);  

                            this.operates.pop();  

                            this.querys.push(q);  

                        }  

                    }  

                    if(doPeek && this.operates.empty()){  

                        this.operates.push(e);  

                    }  

                }  

            }             

        }  

        while(!this.operates.isEmpty()){  

            Element eleOnTop = this.operates.pop();  

            Query q = toBooleanQuery(eleOnTop);  

            this.querys.push(q);              

        }         

    }  

    /** 

     * 根据逻辑操作符，生成BooleanQuery 

     * @param op 

     * @return 

     */  

    private Query toBooleanQuery(Element op){  

        if(this.querys.size() == 0){  

            return null;  

        }  

        //BooleanQuery resultQuery = null;  

        Builder builder = new Builder();  

        if(this.querys.size() == 1){  

            return this.querys.get(0);  

        }  

        Query q2 = this.querys.pop();  

        Query q1 = this.querys.pop();  

        if('&' == op.type){  

            if(q1 != null){  

                if(q1 instanceof BooleanQuery){  

                    List<BooleanClause> clauses = ((BooleanQuery)q1).clauses();  

                    if(clauses.size() > 0   

                            && clauses.get(0).getOccur() == Occur.MUST){  

                        for(BooleanClause c : clauses){  

                            builder.add(c);  

                        }                     

                    }else{  

                        builder.add(q1,Occur.MUST);  

                    }  

                }else{  

                    //q1 instanceof TermQuery   

                    //q1 instanceof TermRangeQuery   

                    //q1 instanceof PhraseQuery  

                    //others  

                    builder.add(q1,Occur.MUST);  

                }  

            }  

            if(q2 != null){  

                if(q2 instanceof BooleanQuery){  

                    List<BooleanClause> clauses = ((BooleanQuery)q2).clauses();  

                    if(clauses.size() > 0   

                            && clauses.get(0).getOccur() == Occur.MUST){  

                        for(BooleanClause c : clauses){  

                            builder.add(c);  

                        }                     

                    }else{  

                        builder.add(q2,Occur.MUST);  

                    }  

                }else{  

                    //q1 instanceof TermQuery   

                    //q1 instanceof TermRangeQuery   

                    //q1 instanceof PhraseQuery  

                    //others  

                    builder.add(q2,Occur.MUST);  

                }  

            }  

        }else if('|' == op.type){  

            if(q1 != null){  

                if(q1 instanceof BooleanQuery){  

                    List<BooleanClause> clauses = ((BooleanQuery)q1).clauses();  

                    if(clauses.size() > 0   

                            && clauses.get(0).getOccur() == Occur.SHOULD){  

                        for(BooleanClause c : clauses){  

                            builder.add(c);  

                        }                     

                    }else{  

                        builder.add(q1,Occur.SHOULD);  

                    }  

                }else{  

                    //q1 instanceof TermQuery   

                    //q1 instanceof TermRangeQuery   

                    //q1 instanceof PhraseQuery  

                    //others  

                    builder.add(q1,Occur.SHOULD);  

                }  

            }  

            if(q2 != null){  

                if(q2 instanceof BooleanQuery){  

                    List<BooleanClause> clauses = ((BooleanQuery)q2).clauses();  

                    if(clauses.size() > 0   

                            && clauses.get(0).getOccur() == Occur.SHOULD){  

                        for(BooleanClause c : clauses){  

                            builder.add(c);  

                        }                     

                    }else{  

                        builder.add(q2,Occur.SHOULD);  

                    }  

                }else{  

                    //q2 instanceof TermQuery   

                    //q2 instanceof TermRangeQuery   

                    //q2 instanceof PhraseQuery  

                    //others  

                    builder.add(q2,Occur.SHOULD);  

                }  

            }  

        }else if('-' == op.type){  

            if(q1 == null || q2 == null){  

                throw new IllegalStateException("表达式异常：SubQuery 个数不匹配");  

            }  

            if(q1 instanceof BooleanQuery){  

                List<BooleanClause> clauses = ((BooleanQuery)q1).clauses();  

                if(clauses.size() > 0){  

                    for(BooleanClause c : clauses){  

                        builder.add(c);  

                    }                     

                }else{  

                    builder.add(q1,Occur.MUST);  

                }  

            }else{  

                //q1 instanceof TermQuery   

                //q1 instanceof TermRangeQuery   

                //q1 instanceof PhraseQuery  

                //others  

                builder.add(q1,Occur.MUST);  

            }                 

            builder.add(q2,Occur.MUST_NOT);  

        }  

        return builder.build();  

    }     

    /** 

     * 组装TermRangeQuery 

     * @param elements 

     * @return 

     */  

    private TermRangeQuery toTermRangeQuery(Element fieldNameEle , LinkedList<Element> elements){  

        boolean includeFirst = false;  

        boolean includeLast = false;  

        String firstValue = null;  

        String lastValue = null;  

        //检查第一个元素是否是[或者{  

        Element first = elements.getFirst();  

        if('[' == first.type){  

            includeFirst = true;  

        }else if('{' == first.type){  

            includeFirst = false;  

        }else {  

            throw new IllegalStateException("表达式异常");  

        }  

        //检查最后一个元素是否是]或者}  

        Element last = elements.getLast();  

        if(']' == last.type){  

            includeLast = true;  

        }else if('}' == last.type){  

            includeLast = false;  

        }else {  

            throw new IllegalStateException("表达式异常, RangeQuery缺少结束括号");  

        }  

        if(elements.size() < 4 || elements.size() > 5){  

            throw new IllegalStateException("表达式异常, RangeQuery 错误");  

        }             

        //读出中间部分  

        Element e2 = elements.get(1);  

        if('\'' == e2.type){  

            firstValue = e2.toString();  

            //  

            Element e3 = elements.get(2);  

            if(',' != e3.type){  

                throw new IllegalStateException("表达式异常, RangeQuery缺少逗号分隔");  

            }  

            //  

            Element e4 = elements.get(3);  

            if('\'' == e4.type){  

                lastValue = e4.toString();  

            }else if(e4 != last){  

                throw new IllegalStateException("表达式异常，RangeQuery格式错误");  

            }                 

        }else if(',' == e2.type){  

            firstValue = null;  

            //  

            Element e3 = elements.get(2);  

            if('\'' == e3.type){  

                lastValue = e3.toString();  

            }else{  

                throw new IllegalStateException("表达式异常，RangeQuery格式错误");  

            }  

        }else {  

            throw new IllegalStateException("表达式异常, RangeQuery格式错误");  

        }  

        return new TermRangeQuery(fieldNameEle.toString() , new BytesRef(firstValue) , new BytesRef(lastValue) , includeFirst , includeLast);  

    }     

    /** 

     * 比较操作符优先级 

     * @param e1 

     * @param e2 

     * @return 

     */  

    private int compare(Element e1 , Element e2){  

        if('&' == e1.type){  

            if('&' == e2.type){  

                return 0;  

            }else {  

                return 1;  

            }  

        }else if('|' == e1.type){  

            if('&' == e2.type){  

                return -1;  

            }else if('|' == e2.type){  

                return 0;  

            }else{  

                return 1;  

            }  

        }else{  

            if('-' == e2.type){  

                return 0;  

            }else{  

                return -1;  

            }  

        }  

    }  

    /** 

     * 表达式元素（操作符、FieldName、FieldValue） 

     * @author linliangyi 

     * May 20, 2010 

     */  

    private class Element{  

        char type = 0;  

        StringBuffer eleTextBuff;  

        public Element(){  

            eleTextBuff = new StringBuffer();  

        }  

        public void append(char c){  

            this.eleTextBuff.append(c);  

        }  

        public String toString(){  

            return this.eleTextBuff.toString();  

        }  

    }     

    public static void main(String[] args){  

        IKQueryExpressionParser parser = new IKQueryExpressionParser();  

        //String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";  

        String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18'  || ulr='www.ik.com') - name:'林良益'";  

        Query result = parser.parseExp(ikQueryExp , true);  

        System.out.println(result);  

    }     

}

SWMCQueryBuilder

[java]view
plain copy

/** 

 * IK 中文分词  版本 6.5.0 

 * IK Analyzer release 6.5.0 

 *  

 * Licensed to the Apache Software Foundation (ASF) under one or more 

 * contributor license agreements.  See the NOTICE file distributed with 

 * this work for additional information regarding copyright ownership. 

 * The ASF licenses this file to You under the Apache License, Version 2.0 

 * (the "License"); you may not use this file except in compliance with 

 * the License.  You may obtain a copy of the License at 

 * 

 *     http://www.apache.org/licenses/LICENSE-2.0 

 * 

 * Unless required by applicable law or agreed to in writing, software 

 * distributed under the License is distributed on an "AS IS" BASIS, 

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

 * See the License for the specific language governing permissions and 

 * limitations under the License. 

 * provided by Linliangyi and copyright 2012 by Oolong studio 

 *  

 */  

package org.wltea.analyzer.query;  

import java.io.IOException;  

import java.io.StringReader;  

import java.util.ArrayList;  

import java.util.List;  

import org.apache.lucene.analysis.standard.StandardAnalyzer;  

import org.apache.lucene.queryparser.classic.ParseException;  

import org.apache.lucene.queryparser.classic.QueryParser;  

import org.apache.lucene.search.Query;  

import org.wltea.analyzer.core.IKSegmenter;  

import org.wltea.analyzer.core.Lexeme;  

/** 

 * Single Word Multi Char Query Builder 

 * IK分词算法专用  暴走抹茶 2017.3.28 

 * @author linliangyi 

 * 

 */  

public class SWMCQueryBuilder {  

    /** 

     * 生成SWMCQuery 

     * @param fieldName 

     * @param keywords 

     * @param quickMode 

     * @return Lucene Query 

     */  

    public static Query create(String fieldName ,String keywords , boolean quickMode){  

        if(fieldName == null || keywords == null){  

            throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");  

        }  

        //1.对keywords进行分词处理  

        List<Lexeme> lexemes = doAnalyze(keywords);  

        //2.根据分词结果，生成SWMCQuery  

        Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);  

        return _SWMCQuery;  

    }  

    /** 

     * 分词切分，并返回结链表 

     * @param keywords 

     * @return 

     */  

    private static List<Lexeme> doAnalyze(String keywords){  

        List<Lexeme> lexemes = new ArrayList<Lexeme>();  

        IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);  

        try{  

            Lexeme l = null;  

            while( (l = ikSeg.next()) != null){  

                lexemes.add(l);  

            }  

        }catch(IOException e){  

            e.printStackTrace();  

        }  

        return lexemes;  

    }  

    /** 

     * 根据分词结果生成SWMC搜索 

     * @param fieldName 

     * @param pathOption 

     * @param quickMode 

     * @return 

     */  

    private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){  

        //构造SWMC的查询表达式  

        StringBuffer keywordBuffer = new StringBuffer();  

        //精简的SWMC的查询表达式  

        StringBuffer keywordBuffer_Short = new StringBuffer();  

        //记录最后词元长度  

        int lastLexemeLength = 0;  

        //记录最后词元结束位置  

        int lastLexemeEnd = -1;  

        int shortCount = 0;  

        int totalCount = 0;  

        for(Lexeme l : lexemes){  

            totalCount += l.getLength();  

            //精简表达式  

            if(l.getLength() > 1){  

                keywordBuffer_Short.append(' ').append(l.getLexemeText());  

                shortCount += l.getLength();  

            }  

            if(lastLexemeLength == 0){  

                keywordBuffer.append(l.getLexemeText());                  

            }else if(lastLexemeLength == 1 && l.getLength() == 1  

                    && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻，长度为一，合并)  

                keywordBuffer.append(l.getLexemeText());  

            }else{  

                keywordBuffer.append(' ').append(l.getLexemeText());  

            }  

            lastLexemeLength = l.getLength();  

            lastLexemeEnd = l.getEndPosition();  

        }  

        //借助lucene queryparser 生成SWMC Query  

        QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());  

        qp.setDefaultOperator(QueryParser.AND_OPERATOR);  

        qp.setAutoGeneratePhraseQueries(true);  

        if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){  

            try {  

                //System.out.println(keywordBuffer.toString());  

                Query q = qp.parse(keywordBuffer_Short.toString());  

                return q;  

            } catch (ParseException e) {  

                e.printStackTrace();  

            }  

        }else{  

            if(keywordBuffer.length() > 0){  

                try {  

                    //System.out.println(keywordBuffer.toString());  

                    Query q = qp.parse(keywordBuffer.toString());  

                    return q;  

                } catch (ParseException e) {  

                    e.printStackTrace();  

                }  

            }  

        }  

        return null;  

    }  

}

Lucene6.5.0 下中文分词IKAnalyzer编译和使用

前言

修改和编译IKAnalyzer

相关推荐