Lucene6.5.0 下中文分词IKAnalyzer编译和使用

前言

Lucene本身对中文分词有支持,不过效果不好:其分词方式是机械地将中文逐字切分后存储,例如"成都信息工程大学"最终被切分为:成|都|信|息|工|程|大|学。显然这种分词方式既低效又浪费存储空间。IK分词器是林良益前辈编写的专门针对中文分词的分析器,最新版本为2012年针对Lucene 4.0的版本,之后未再更新;后续版本Lucene的接口发生了变化,导致其无法直接使用,所以需要进行修改。

修改和编译IKAnalyzer

(谷歌不稳定访问不了)国内源码地址:http://git.oschina.net/wltea/IK-Analyzer-2012FF  网盘下载:链接:http://pan.baidu.com/s/1jIt7kGm 密码:hu1g

lucene6.5.0下载地址:https://lucene.apache.org 网盘下载:链接:http://pan.baidu.com/s/1mic8iBe 密码:axca

下载源码之后解压并导入到单独的java project,然后再导入lucene的jar包,如图所示,是我的工程结构

Lucene6.5.0 下中文分词IKAnalyzer编译和使用

导入后需要修改四个文件:IKAnalyzer、IKTokenizer、SWMCQueryBuilder 和 IKQueryExpressionParser;demo中的两个文件可以直接删除,也可以一并修改(我选择了修改)。修改方式很简单,下面贴出修改后的源码,并提供修改后的工程和源码下载。

修改后的工程地址:链接:http://pan.baidu.com/s/1nuALOql 密码:miyq

编译好的IKAnalyzer的jar包下载地址:http://download.****.net/detail/fanpei_moukoy/9796612 ,下载后可直接导入lucene项目进行使用


IKAnalyzer

[java] view plain copy
  1. /** 
  2.  * IK 中文分词  版本 6.5.0 
  3.  * IK Analyzer release 6.5.0 
  4.  *  
  5.  * Licensed to the Apache Software Foundation (ASF) under one or more 
  6.  * contributor license agreements.  See the NOTICE file distributed with 
  7.  * this work for additional information regarding copyright ownership. 
  8.  * The ASF licenses this file to You under the Apache License, Version 2.0 
  9.  * (the "License"); you may not use this file except in compliance with 
  10.  * the License.  You may obtain a copy of the License at 
  11.  * 
  12.  *     http://www.apache.org/licenses/LICENSE-2.0 
  13.  * 
  14.  * Unless required by applicable law or agreed to in writing, software 
  15.  * distributed under the License is distributed on an "AS IS" BASIS, 
  16.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
  17.  * See the License for the specific language governing permissions and 
  18.  * limitations under the License. 
  19.  * 
  20.  * provided by Linliangyi and copyright 2012 by Oolong studio 
  21.  *  
  22.  */  
  23. package org.wltea.analyzer.lucene;  
  24.   
  25. import java.io.Reader;  
  26. import java.io.StringReader;  
  27.   
  28. import org.apache.lucene.analysis.Analyzer;  
  29. import org.apache.lucene.util.IOUtils;  
  30.   
  31.   
  32. /** 
  33.  * IK分词器,Lucene Analyzer接口实现 
  34.  * 兼容Lucene 6.5.0版本 暴走抹茶 2017.3.28 
  35.  */  
  36. public final class IKAnalyzer extends Analyzer{  
  37.       
  38.     private boolean useSmart;  
  39.       
  40.     public boolean useSmart() {  
  41.         return useSmart;  
  42.     }  
  43.   
  44.     public void setUseSmart(boolean useSmart) {  
  45.         this.useSmart = useSmart;  
  46.     }  
  47.   
  48.     /** 
  49.      * IK分词器Lucene  Analyzer接口实现类 
  50.      *  
  51.      * 默认细粒度切分算法 
  52.      */  
  53.     public IKAnalyzer(){  
  54.         this(false);  
  55.     }  
  56.       
  57.     /** 
  58.      * IK分词器Lucene Analyzer接口实现类 
  59.      *  
  60.      * @param useSmart 当为true时,分词器进行智能切分 
  61.      */  
  62.     public IKAnalyzer(boolean useSmart){  
  63.         super();  
  64.         this.useSmart = useSmart;  
  65.     }  
  66.   
  67.   
  68.     @Override  
  69.     protected TokenStreamComponents createComponents(String fieldName) {  
  70.          Reader reader=null;  
  71.             try{  
  72.                 reader=new StringReader(fieldName);  
  73.                 IKTokenizer it = new IKTokenizer(reader);  
  74.                 return new Analyzer.TokenStreamComponents(it);  
  75.             }finally {  
  76.                 IOUtils.closeWhileHandlingException(reader);  
  77.             }  
  78.     }  
  79.   
  80. }  
IKTokenizer

[java] view plain copy
  1. /** 
  2.  * IK 中文分词  版本 6.5.0 
  3.  * IK Analyzer release 6.5.0 
  4.  *  
  5.  * Licensed to the Apache Software Foundation (ASF) under one or more 
  6.  * contributor license agreements.  See the NOTICE file distributed with 
  7.  * this work for additional information regarding copyright ownership. 
  8.  * The ASF licenses this file to You under the Apache License, Version 2.0 
  9.  * (the "License"); you may not use this file except in compliance with 
  10.  * the License.  You may obtain a copy of the License at 
  11.  * 
  12.  *     http://www.apache.org/licenses/LICENSE-2.0 
  13.  * 
  14.  * Unless required by applicable law or agreed to in writing, software 
  15.  * distributed under the License is distributed on an "AS IS" BASIS, 
  16.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
  17.  * See the License for the specific language governing permissions and 
  18.  * limitations under the License. 
  19.  * 
  20.  * provided by Linliangyi and copyright 2012 by Oolong studio 
  21.  *  
  22.  
  23.  *  
  24.  */  
  25. package org.wltea.analyzer.lucene;  
  26.   
  27. import java.io.IOException;  
  28. import java.io.Reader;  
  29.   
  30. import org.apache.lucene.analysis.Tokenizer;  
  31. import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;  
  32. import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;  
  33. import org.apache.lucene.analysis.tokenattributes.TypeAttribute;  
  34.   
  35. import org.wltea.analyzer.core.IKSegmenter;  
  36. import org.wltea.analyzer.core.Lexeme;  
  37.   
  38. /** 
  39.  * IK分词器 Lucene Tokenizer适配器类 兼容Lucene 6.5.0版本 暴走抹茶 2017.3.28 
  40.  */  
  41. public final class IKTokenizer extends Tokenizer {  
  42.   
  43.     // IK分词器实现  
  44.     private IKSegmenter _IKImplement;  
  45.   
  46.     // 词元文本属性  
  47.     private final CharTermAttribute termAtt;  
  48.     // 词元位移属性  
  49.     private final OffsetAttribute offsetAtt;  
  50.     // 词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)  
  51.     private final TypeAttribute typeAtt;  
  52.     // 记录最后一个词元的结束位置  
  53.     private int endPosition;  
  54.   
  55.     public IKTokenizer(Reader in) {  
  56.         this(in, false);  
  57.     }  
  58.   
  59.     /** 
  60.      * Lucene 6.5.0 Tokenizer适配器类构造函数 
  61.      *  
  62.      * @param in 
  63.      * @param useSmart 
  64.      */  
  65.     public IKTokenizer(Reader in, boolean useSmart) {  
  66.         offsetAtt = addAttribute(OffsetAttribute.class);  
  67.         termAtt = addAttribute(CharTermAttribute.class);  
  68.         typeAtt = addAttribute(TypeAttribute.class);  
  69.         _IKImplement = new IKSegmenter(input, useSmart);  
  70.     }  
  71.   
  72.     /* 
  73.      * (non-Javadoc) 
  74.      *  
  75.      * @see org.apache.lucene.analysis.TokenStream#incrementToken() 
  76.      */  
  77.     @Override  
  78.     public boolean incrementToken() throws IOException {  
  79.         // 清除所有的词元属性  
  80.         clearAttributes();  
  81.         Lexeme nextLexeme = _IKImplement.next();  
  82.         if (nextLexeme != null) {  
  83.             // 将Lexeme转成Attributes  
  84.             // 设置词元文本  
  85.             termAtt.append(nextLexeme.getLexemeText());  
  86.             // 设置词元长度  
  87.             termAtt.setLength(nextLexeme.getLength());  
  88.             // 设置词元位移  
  89.             offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());  
  90.             // 记录分词的最后位置  
  91.             endPosition = nextLexeme.getEndPosition();  
  92.             // 记录词元分类  
  93.             typeAtt.setType(nextLexeme.getLexemeTypeString());  
  94.             // 返会true告知还有下个词元  
  95.             return true;  
  96.         }  
  97.         // 返会false告知词元输出完毕  
  98.         return false;  
  99.     }  
  100.   
  101.     /* 
  102.      * (non-Javadoc) 
  103.      *  
  104.      * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) 
  105.      */  
  106.     @Override  
  107.     public void reset() throws IOException {  
  108.         super.reset();  
  109.         _IKImplement.reset(input);  
  110.     }  
  111.   
  112.     @Override  
  113.     public final void end() {  
  114.         // set final offset  
  115.         int finalOffset = correctOffset(this.endPosition);  
  116.         offsetAtt.setOffset(finalOffset, finalOffset);  
  117.     }  
  118. }  
IKQueryExpressionParser

[java] view plain copy
  1. /** 
  2.  * IK 中文分词  版本 6.5.0 
  3.  * IK Analyzer release 6.5.0 
  4.  *  
  5.  * Licensed to the Apache Software Foundation (ASF) under one or more 
  6.  * contributor license agreements.  See the NOTICE file distributed with 
  7.  * this work for additional information regarding copyright ownership. 
  8.  * The ASF licenses this file to You under the Apache License, Version 2.0 
  9.  * (the "License"); you may not use this file except in compliance with 
  10.  * the License.  You may obtain a copy of the License at 
  11.  * 
  12.  *     http://www.apache.org/licenses/LICENSE-2.0 
  13.  * 
  14.  * Unless required by applicable law or agreed to in writing, software 
  15.  * distributed under the License is distributed on an "AS IS" BASIS, 
  16.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
  17.  * See the License for the specific language governing permissions and 
  18.  * limitations under the License. 
  19.  * 
  20.  * provided by Linliangyi and copyright 2012 by Oolong studio 
  21.  *  
  22.  */  
  23. package org.wltea.analyzer.query;  
  24.   
  25. import java.util.ArrayList;  
  26. import java.util.LinkedList;  
  27. import java.util.List;  
  28. import java.util.Stack;  
  29.   
  30. import org.apache.lucene.index.Term;  
  31. import org.apache.lucene.search.BooleanClause;  
  32. import org.apache.lucene.search.BooleanQuery;  
  33. import org.apache.lucene.search.BooleanQuery.Builder;  
  34. import org.apache.lucene.search.Query;  
  35. import org.apache.lucene.search.TermQuery;  
  36. import org.apache.lucene.search.TermRangeQuery;  
  37. import org.apache.lucene.search.BooleanClause.Occur;  
  38. import org.apache.lucene.util.BytesRef;  
  39.   
  40. /** 
  41.  * IK简易查询表达式解析  
  42.  * 结合SWMCQuery算法  暴走抹茶 2017.3.28 
  43.  *  
  44.  * 表达式例子 : 
  45.  * (id='1231231' && title:'monkey') || (content:'你好吗'  || ulr='www.ik.com') - name:'helloword' 
  46.  * @author linliangyi 
  47.  * 
  48.  */  
  49. public class IKQueryExpressionParser {  
  50.       
  51.     //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],";  
  52.       
  53.     private List<Element> elements = new ArrayList<Element>();  
  54.       
  55.     private Stack<Query> querys =  new Stack<Query>();  
  56.       
  57.     private Stack<Element> operates = new Stack<Element>();  
  58.       
  59.     /** 
  60.      * 解析查询表达式,生成Lucene Query对象 
  61.      *  
  62.      * @param expression 
  63.      * @param quickMode  
  64.      * @return Lucene query 
  65.      */  
  66.     public Query parseExp(String expression , boolean quickMode){  
  67.         Query lucenceQuery = null;  
  68.         if(expression != null && !"".equals(expression.trim())){  
  69.             try{  
  70.                 //文法解析  
  71.                 this.splitElements(expression);  
  72.                 //语法解析  
  73.                 this.parseSyntax(quickMode);  
  74.                 if(this.querys.size() == 1){  
  75.                     lucenceQuery = this.querys.pop();  
  76.                 }else{  
  77.                     throw new IllegalStateException("表达式异常: 缺少逻辑操作符 或 括号缺失");  
  78.                 }  
  79.             }finally{  
  80.                 elements.clear();  
  81.                 querys.clear();  
  82.                 operates.clear();  
  83.             }  
  84.         }  
  85.         return lucenceQuery;  
  86.     }     
  87.       
  88.     /** 
  89.      * 表达式文法解析 
  90.      * @param expression 
  91.      */  
  92.     private void splitElements(String expression){  
  93.           
  94.         if(expression == null){  
  95.             return;  
  96.         }  
  97.         Element curretElement = null;  
  98.           
  99.         char[] expChars = expression.toCharArray();  
  100.         for(int i = 0 ; i < expChars.length ; i++){  
  101.             switch(expChars[i]){  
  102.             case '&' :  
  103.                 if(curretElement == null){  
  104.                     curretElement = new Element();  
  105.                     curretElement.type = '&';  
  106.                     curretElement.append(expChars[i]);  
  107.                 }else if(curretElement.type == '&'){  
  108.                     curretElement.append(expChars[i]);  
  109.                     this.elements.add(curretElement);  
  110.                     curretElement = null;  
  111.                 }else if(curretElement.type == '\''){  
  112.                     curretElement.append(expChars[i]);  
  113.                 }else {  
  114.                     this.elements.add(curretElement);  
  115.                     curretElement = new Element();  
  116.                     curretElement.type = '&';  
  117.                     curretElement.append(expChars[i]);  
  118.                 }  
  119.                 break;  
  120.                   
  121.             case '|' :  
  122.                 if(curretElement == null){  
  123.                     curretElement = new Element();  
  124.                     curretElement.type = '|';  
  125.                     curretElement.append(expChars[i]);  
  126.                 }else if(curretElement.type == '|'){  
  127.                     curretElement.append(expChars[i]);  
  128.                     this.elements.add(curretElement);  
  129.                     curretElement = null;  
  130.                 }else if(curretElement.type == '\''){  
  131.                     curretElement.append(expChars[i]);  
  132.                 }else {  
  133.                     this.elements.add(curretElement);  
  134.                     curretElement = new Element();  
  135.                     curretElement.type = '|';  
  136.                     curretElement.append(expChars[i]);  
  137.                 }                 
  138.                 break;  
  139.                   
  140.             case '-' :  
  141.                 if(curretElement != null){  
  142.                     if(curretElement.type == '\''){  
  143.                         curretElement.append(expChars[i]);  
  144.                         continue;  
  145.                     }else{  
  146.                         this.elements.add(curretElement);  
  147.                     }  
  148.                 }  
  149.                 curretElement = new Element();  
  150.                 curretElement.type = '-';  
  151.                 curretElement.append(expChars[i]);  
  152.                 this.elements.add(curretElement);  
  153.                 curretElement = null;             
  154.                 break;  
  155.   
  156.             case '(' :  
  157.                 if(curretElement != null){  
  158.                     if(curretElement.type == '\''){  
  159.                         curretElement.append(expChars[i]);  
  160.                         continue;  
  161.                     }else{  
  162.                         this.elements.add(curretElement);  
  163.                     }  
  164.                 }  
  165.                 curretElement = new Element();  
  166.                 curretElement.type = '(';  
  167.                 curretElement.append(expChars[i]);  
  168.                 this.elements.add(curretElement);  
  169.                 curretElement = null;             
  170.                 break;                
  171.   
  172.             case ')' :  
  173.                 if(curretElement != null){  
  174.                     if(curretElement.type == '\''){  
  175.                         curretElement.append(expChars[i]);  
  176.                         continue;  
  177.                     }else{  
  178.                         this.elements.add(curretElement);  
  179.                     }  
  180.                 }  
  181.                 curretElement = new Element();  
  182.                 curretElement.type = ')';  
  183.                 curretElement.append(expChars[i]);  
  184.                 this.elements.add(curretElement);  
  185.                 curretElement = null;             
  186.                 break;                    
  187.   
  188.             case ':' :  
  189.                 if(curretElement != null){  
  190.                     if(curretElement.type == '\''){  
  191.                         curretElement.append(expChars[i]);  
  192.                         continue;  
  193.                     }else{  
  194.                         this.elements.add(curretElement);  
  195.                     }  
  196.                 }  
  197.                 curretElement = new Element();  
  198.                 curretElement.type = ':';  
  199.                 curretElement.append(expChars[i]);  
  200.                 this.elements.add(curretElement);  
  201.                 curretElement = null;             
  202.                 break;    
  203.               
  204.             case '=' :  
  205.                 if(curretElement != null){  
  206.                     if(curretElement.type == '\''){  
  207.                         curretElement.append(expChars[i]);  
  208.                         continue;  
  209.                     }else{  
  210.                         this.elements.add(curretElement);  
  211.                     }  
  212.                 }  
  213.                 curretElement = new Element();  
  214.                 curretElement.type = '=';  
  215.                 curretElement.append(expChars[i]);  
  216.                 this.elements.add(curretElement);  
  217.                 curretElement = null;             
  218.                 break;                    
  219.   
  220.             case ' ' :  
  221.                 if(curretElement != null){  
  222.                     if(curretElement.type == '\''){  
  223.                         curretElement.append(expChars[i]);  
  224.                     }else{  
  225.                         this.elements.add(curretElement);  
  226.                         curretElement = null;  
  227.                     }  
  228.                 }  
  229.                   
  230.                 break;  
  231.               
  232.             case '\'' :  
  233.                 if(curretElement == null){  
  234.                     curretElement = new Element();  
  235.                     curretElement.type = '\'';  
  236.                       
  237.                 }else if(curretElement.type == '\''){  
  238.                     this.elements.add(curretElement);  
  239.                     curretElement = null;  
  240.                       
  241.                 }else{  
  242.                     this.elements.add(curretElement);  
  243.                     curretElement = new Element();  
  244.                     curretElement.type = '\'';  
  245.                       
  246.                 }  
  247.                 break;  
  248.                   
  249.             case '[':  
  250.                 if(curretElement != null){  
  251.                     if(curretElement.type == '\''){  
  252.                         curretElement.append(expChars[i]);  
  253.                         continue;  
  254.                     }else{  
  255.                         this.elements.add(curretElement);  
  256.                     }  
  257.                 }  
  258.                 curretElement = new Element();  
  259.                 curretElement.type = '[';  
  260.                 curretElement.append(expChars[i]);  
  261.                 this.elements.add(curretElement);  
  262.                 curretElement = null;                     
  263.                 break;  
  264.                   
  265.             case ']':  
  266.                 if(curretElement != null){  
  267.                     if(curretElement.type == '\''){  
  268.                         curretElement.append(expChars[i]);  
  269.                         continue;  
  270.                     }else{  
  271.                         this.elements.add(curretElement);  
  272.                     }  
  273.                 }  
  274.                 curretElement = new Element();  
  275.                 curretElement.type = ']';  
  276.                 curretElement.append(expChars[i]);  
  277.                 this.elements.add(curretElement);  
  278.                 curretElement = null;  
  279.                   
  280.                 break;  
  281.                   
  282.             case '{':  
  283.                 if(curretElement != null){  
  284.                     if(curretElement.type == '\''){  
  285.                         curretElement.append(expChars[i]);  
  286.                         continue;  
  287.                     }else{  
  288.                         this.elements.add(curretElement);  
  289.                     }  
  290.                 }  
  291.                 curretElement = new Element();  
  292.                 curretElement.type = '{';  
  293.                 curretElement.append(expChars[i]);  
  294.                 this.elements.add(curretElement);  
  295.                 curretElement = null;                     
  296.                 break;  
  297.                   
  298.             case '}':  
  299.                 if(curretElement != null){  
  300.                     if(curretElement.type == '\''){  
  301.                         curretElement.append(expChars[i]);  
  302.                         continue;  
  303.                     }else{  
  304.                         this.elements.add(curretElement);  
  305.                     }  
  306.                 }  
  307.                 curretElement = new Element();  
  308.                 curretElement.type = '}';  
  309.                 curretElement.append(expChars[i]);  
  310.                 this.elements.add(curretElement);  
  311.                 curretElement = null;  
  312.                   
  313.                 break;  
  314.             case ',':  
  315.                 if(curretElement != null){  
  316.                     if(curretElement.type == '\''){  
  317.                         curretElement.append(expChars[i]);  
  318.                         continue;  
  319.                     }else{  
  320.                         this.elements.add(curretElement);  
  321.                     }  
  322.                 }  
  323.                 curretElement = new Element();  
  324.                 curretElement.type = ',';  
  325.                 curretElement.append(expChars[i]);  
  326.                 this.elements.add(curretElement);  
  327.                 curretElement = null;  
  328.                   
  329.                 break;  
  330.                   
  331.             default :  
  332.                 if(curretElement == null){  
  333.                     curretElement = new Element();  
  334.                     curretElement.type = 'F';  
  335.                     curretElement.append(expChars[i]);  
  336.                       
  337.                 }else if(curretElement.type == 'F'){  
  338.                     curretElement.append(expChars[i]);  
  339.                       
  340.                 }else if(curretElement.type == '\''){  
  341.                     curretElement.append(expChars[i]);  
  342.   
  343.                 }else{  
  344.                     this.elements.add(curretElement);  
  345.                     curretElement = new Element();  
  346.                     curretElement.type = 'F';  
  347.                     curretElement.append(expChars[i]);  
  348.                 }             
  349.             }  
  350.         }  
  351.           
  352.         if(curretElement != null){  
  353.             this.elements.add(curretElement);  
  354.             curretElement = null;  
  355.         }  
  356.     }  
  357.           
  358.     /** 
  359.      * 语法解析 
  360.      *  
  361.      */  
  362.     private void parseSyntax(boolean quickMode){  
  363.         for(int i = 0 ; i < this.elements.size() ; i++){  
  364.             Element e = this.elements.get(i);  
  365.             if('F' == e.type){  
  366.                 Element e2 = this.elements.get(i + 1);  
  367.                 if('=' != e2.type && ':' != e2.type){  
  368.                     throw new IllegalStateException("表达式异常: = 或 : 号丢失");  
  369.                 }  
  370.                 Element e3 = this.elements.get(i + 2);  
  371.                 //处理 = 和 : 运算  
  372.                 if('\'' == e3.type){  
  373.                     i+=2;  
  374.                     if('=' == e2.type){  
  375.                         TermQuery tQuery = new TermQuery(new Term(e.toString() , e3.toString()));  
  376.                         this.querys.push(tQuery);  
  377.                     }else if(':' == e2.type){  
  378.                         String keyword = e3.toString();  
  379.                         //SWMCQuery Here  
  380.                         Query _SWMCQuery =  SWMCQueryBuilder.create(e.toString(), keyword , quickMode);  
  381.                         this.querys.push(_SWMCQuery);  
  382.                     }  
  383.                       
  384.                 }else if('[' == e3.type || '{' == e3.type){  
  385.                     i+=2;  
  386.                     //处理 [] 和 {}  
  387.                     LinkedList<Element> eQueue = new LinkedList<Element>();  
  388.                     eQueue.add(e3);  
  389.                     for( i++ ; i < this.elements.size() ; i++){                            
  390.                         Element eN = this.elements.get(i);  
  391.                         eQueue.add(eN);  
  392.                         if(']' == eN.type || '}' == eN.type){  
  393.                             break;  
  394.                         }  
  395.                     }  
  396.                     //翻译RangeQuery  
  397.                     Query rangeQuery = this.toTermRangeQuery(e , eQueue);  
  398.                     this.querys.push(rangeQuery);  
  399.                 }else{  
  400.                     throw new IllegalStateException("表达式异常:匹配值丢失");  
  401.                 }  
  402.                   
  403.             }else if('(' == e.type){  
  404.                 this.operates.push(e);  
  405.                   
  406.             }else if(')' == e.type){  
  407.                 boolean doPop = true;  
  408.                 while(doPop && !this.operates.empty()){  
  409.                     Element op = this.operates.pop();  
  410.                     if('(' == op.type){  
  411.                         doPop = false;  
  412.                     }else {  
  413.                         Query q = toBooleanQuery(op);  
  414.                         this.querys.push(q);  
  415.                     }  
  416.                       
  417.                 }  
  418.             }else{   
  419.                   
  420.                 if(this.operates.isEmpty()){  
  421.                     this.operates.push(e);  
  422.                 }else{  
  423.                     boolean doPeek = true;  
  424.                     while(doPeek && !this.operates.isEmpty()){  
  425.                         Element eleOnTop = this.operates.peek();  
  426.                         if('(' == eleOnTop.type){  
  427.                             doPeek = false;  
  428.                             this.operates.push(e);  
  429.                         }else if(compare(e , eleOnTop) == 1){  
  430.                             this.operates.push(e);  
  431.                             doPeek = false;  
  432.                         }else if(compare(e , eleOnTop) == 0){  
  433.                             Query q = toBooleanQuery(eleOnTop);  
  434.                             this.operates.pop();  
  435.                             this.querys.push(q);  
  436.                         }else{  
  437.                             Query q = toBooleanQuery(eleOnTop);  
  438.                             this.operates.pop();  
  439.                             this.querys.push(q);  
  440.                         }  
  441.                     }  
  442.                       
  443.                     if(doPeek && this.operates.empty()){  
  444.                         this.operates.push(e);  
  445.                     }  
  446.                 }  
  447.             }             
  448.         }  
  449.           
  450.         while(!this.operates.isEmpty()){  
  451.             Element eleOnTop = this.operates.pop();  
  452.             Query q = toBooleanQuery(eleOnTop);  
  453.             this.querys.push(q);              
  454.         }         
  455.     }  
  456.   
  457.     /** 
  458.      * 根据逻辑操作符,生成BooleanQuery 
  459.      * @param op 
  460.      * @return 
  461.      */  
  462.     private Query toBooleanQuery(Element op){  
  463.         if(this.querys.size() == 0){  
  464.             return null;  
  465.         }  
  466.           
  467.         //BooleanQuery resultQuery = null;  
  468.         Builder builder = new Builder();  
  469.   
  470.         if(this.querys.size() == 1){  
  471.             return this.querys.get(0);  
  472.         }  
  473.           
  474.         Query q2 = this.querys.pop();  
  475.         Query q1 = this.querys.pop();  
  476.         if('&' == op.type){  
  477.             if(q1 != null){  
  478.                 if(q1 instanceof BooleanQuery){  
  479.                     List<BooleanClause> clauses = ((BooleanQuery)q1).clauses();  
  480.                     if(clauses.size() > 0   
  481.                             && clauses.get(0).getOccur() == Occur.MUST){  
  482.                         for(BooleanClause c : clauses){  
  483.                             builder.add(c);  
  484.                         }                     
  485.                     }else{  
  486.                         builder.add(q1,Occur.MUST);  
  487.                     }  
  488.   
  489.                 }else{  
  490.                     //q1 instanceof TermQuery   
  491.                     //q1 instanceof TermRangeQuery   
  492.                     //q1 instanceof PhraseQuery  
  493.                     //others  
  494.                     builder.add(q1,Occur.MUST);  
  495.                 }  
  496.             }  
  497.               
  498.             if(q2 != null){  
  499.                 if(q2 instanceof BooleanQuery){  
  500.                     List<BooleanClause> clauses = ((BooleanQuery)q2).clauses();  
  501.                     if(clauses.size() > 0   
  502.                             && clauses.get(0).getOccur() == Occur.MUST){  
  503.                         for(BooleanClause c : clauses){  
  504.                             builder.add(c);  
  505.                         }                     
  506.                     }else{  
  507.                         builder.add(q2,Occur.MUST);  
  508.                     }  
  509.                       
  510.                 }else{  
  511.                     //q1 instanceof TermQuery   
  512.                     //q1 instanceof TermRangeQuery   
  513.                     //q1 instanceof PhraseQuery  
  514.                     //others  
  515.                     builder.add(q2,Occur.MUST);  
  516.                 }  
  517.             }  
  518.               
  519.         }else if('|' == op.type){  
  520.             if(q1 != null){  
  521.                 if(q1 instanceof BooleanQuery){  
  522.                     List<BooleanClause> clauses = ((BooleanQuery)q1).clauses();  
  523.                     if(clauses.size() > 0   
  524.                             && clauses.get(0).getOccur() == Occur.SHOULD){  
  525.                         for(BooleanClause c : clauses){  
  526.                             builder.add(c);  
  527.                         }                     
  528.                     }else{  
  529.                         builder.add(q1,Occur.SHOULD);  
  530.                     }  
  531.                       
  532.                 }else{  
  533.                     //q1 instanceof TermQuery   
  534.                     //q1 instanceof TermRangeQuery   
  535.                     //q1 instanceof PhraseQuery  
  536.                     //others  
  537.                     builder.add(q1,Occur.SHOULD);  
  538.                 }  
  539.             }  
  540.               
  541.             if(q2 != null){  
  542.                 if(q2 instanceof BooleanQuery){  
  543.                     List<BooleanClause> clauses = ((BooleanQuery)q2).clauses();  
  544.                     if(clauses.size() > 0   
  545.                             && clauses.get(0).getOccur() == Occur.SHOULD){  
  546.                         for(BooleanClause c : clauses){  
  547.                             builder.add(c);  
  548.                         }                     
  549.                     }else{  
  550.                         builder.add(q2,Occur.SHOULD);  
  551.                     }  
  552.                 }else{  
  553.                     //q2 instanceof TermQuery   
  554.                     //q2 instanceof TermRangeQuery   
  555.                     //q2 instanceof PhraseQuery  
  556.                     //others  
  557.                     builder.add(q2,Occur.SHOULD);  
  558.                       
  559.                 }  
  560.             }  
  561.               
  562.         }else if('-' == op.type){  
  563.             if(q1 == null || q2 == null){  
  564.                 throw new IllegalStateException("表达式异常:SubQuery 个数不匹配");  
  565.             }  
  566.               
  567.             if(q1 instanceof BooleanQuery){  
  568.                 List<BooleanClause> clauses = ((BooleanQuery)q1).clauses();  
  569.                 if(clauses.size() > 0){  
  570.                     for(BooleanClause c : clauses){  
  571.                         builder.add(c);  
  572.                     }                     
  573.                 }else{  
  574.                     builder.add(q1,Occur.MUST);  
  575.                 }  
  576.   
  577.             }else{  
  578.                 //q1 instanceof TermQuery   
  579.                 //q1 instanceof TermRangeQuery   
  580.                 //q1 instanceof PhraseQuery  
  581.                 //others  
  582.                 builder.add(q1,Occur.MUST);  
  583.             }                 
  584.               
  585.             builder.add(q2,Occur.MUST_NOT);  
  586.         }  
  587.         return builder.build();  
  588.     }     
  589.       
  590.     /** 
  591.      * 组装TermRangeQuery 
  592.      * @param elements 
  593.      * @return 
  594.      */  
  595.     private TermRangeQuery toTermRangeQuery(Element fieldNameEle , LinkedList<Element> elements){  
  596.   
  597.         boolean includeFirst = false;  
  598.         boolean includeLast = false;  
  599.         String firstValue = null;  
  600.         String lastValue = null;  
  601.         //检查第一个元素是否是[或者{  
  602.         Element first = elements.getFirst();  
  603.         if('[' == first.type){  
  604.             includeFirst = true;  
  605.         }else if('{' == first.type){  
  606.             includeFirst = false;  
  607.         }else {  
  608.             throw new IllegalStateException("表达式异常");  
  609.         }  
  610.         //检查最后一个元素是否是]或者}  
  611.         Element last = elements.getLast();  
  612.         if(']' == last.type){  
  613.             includeLast = true;  
  614.         }else if('}' == last.type){  
  615.             includeLast = false;  
  616.         }else {  
  617.             throw new IllegalStateException("表达式异常, RangeQuery缺少结束括号");  
  618.         }  
  619.         if(elements.size() < 4 || elements.size() > 5){  
  620.             throw new IllegalStateException("表达式异常, RangeQuery 错误");  
  621.         }             
  622.         //读出中间部分  
  623.         Element e2 = elements.get(1);  
  624.         if('\'' == e2.type){  
  625.             firstValue = e2.toString();  
  626.             //  
  627.             Element e3 = elements.get(2);  
  628.             if(',' != e3.type){  
  629.                 throw new IllegalStateException("表达式异常, RangeQuery缺少逗号分隔");  
  630.             }  
  631.             //  
  632.             Element e4 = elements.get(3);  
  633.             if('\'' == e4.type){  
  634.                 lastValue = e4.toString();  
  635.             }else if(e4 != last){  
  636.                 throw new IllegalStateException("表达式异常,RangeQuery格式错误");  
  637.             }                 
  638.         }else if(',' == e2.type){  
  639.             firstValue = null;  
  640.             //  
  641.             Element e3 = elements.get(2);  
  642.             if('\'' == e3.type){  
  643.                 lastValue = e3.toString();  
  644.             }else{  
  645.                 throw new IllegalStateException("表达式异常,RangeQuery格式错误");  
  646.             }  
  647.               
  648.         }else {  
  649.             throw new IllegalStateException("表达式异常, RangeQuery格式错误");  
  650.         }  
  651.           
  652.         return new TermRangeQuery(fieldNameEle.toString() , new BytesRef(firstValue) , new BytesRef(lastValue) , includeFirst , includeLast);  
  653.     }     
  654.       
  655.     /** 
  656.      * 比较操作符优先级 
  657.      * @param e1 
  658.      * @param e2 
  659.      * @return 
  660.      */  
  661.     private int compare(Element e1 , Element e2){  
  662.         if('&' == e1.type){  
  663.             if('&' == e2.type){  
  664.                 return 0;  
  665.             }else {  
  666.                 return 1;  
  667.             }  
  668.         }else if('|' == e1.type){  
  669.             if('&' == e2.type){  
  670.                 return -1;  
  671.             }else if('|' == e2.type){  
  672.                 return 0;  
  673.             }else{  
  674.                 return 1;  
  675.             }  
  676.         }else{  
  677.             if('-' == e2.type){  
  678.                 return 0;  
  679.             }else{  
  680.                 return -1;  
  681.             }  
  682.         }  
  683.     }  
  684.       
  685.     /** 
  686.      * 表达式元素(操作符、FieldName、FieldValue) 
  687.      * @author linliangyi 
  688.      * May 20, 2010 
  689.      */  
  690.     private class Element{  
  691.         char type = 0;  
  692.         StringBuffer eleTextBuff;  
  693.   
  694.         public Element(){  
  695.             eleTextBuff = new StringBuffer();  
  696.         }  
  697.           
  698.         public void append(char c){  
  699.             this.eleTextBuff.append(c);  
  700.         }  
  701.       
  702.         public String toString(){  
  703.             return this.eleTextBuff.toString();  
  704.         }  
  705.     }     
  706.   
  707.     public static void main(String[] args){  
  708.         IKQueryExpressionParser parser = new IKQueryExpressionParser();  
  709.         //String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";  
  710.         String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18'  || ulr='www.ik.com') - name:'林良益'";  
  711.         Query result = parser.parseExp(ikQueryExp , true);  
  712.         System.out.println(result);  
  713.   
  714.     }     
  715.       
  716. }  
SWMCQueryBuilder
[java] view plain copy
  1. /** 
  2.  * IK 中文分词  版本 6.5.0 
  3.  * IK Analyzer release 6.5.0 
  4.  *  
  5.  * Licensed to the Apache Software Foundation (ASF) under one or more 
  6.  * contributor license agreements.  See the NOTICE file distributed with 
  7.  * this work for additional information regarding copyright ownership. 
  8.  * The ASF licenses this file to You under the Apache License, Version 2.0 
  9.  * (the "License"); you may not use this file except in compliance with 
  10.  * the License.  You may obtain a copy of the License at 
  11.  * 
  12.  *     http://www.apache.org/licenses/LICENSE-2.0 
  13.  * 
  14.  * Unless required by applicable law or agreed to in writing, software 
  15.  * distributed under the License is distributed on an "AS IS" BASIS, 
  16.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
  17.  * See the License for the specific language governing permissions and 
  18.  * limitations under the License. 
  19.  * provided by Linliangyi and copyright 2012 by Oolong studio 
  20.  *  
  21.  */  
  22. package org.wltea.analyzer.query;  
  23.   
  24. import java.io.IOException;  
  25. import java.io.StringReader;  
  26. import java.util.ArrayList;  
  27. import java.util.List;  
  28.   
  29. import org.apache.lucene.analysis.standard.StandardAnalyzer;  
  30. import org.apache.lucene.queryparser.classic.ParseException;  
  31. import org.apache.lucene.queryparser.classic.QueryParser;  
  32. import org.apache.lucene.search.Query;  
  33. import org.wltea.analyzer.core.IKSegmenter;  
  34. import org.wltea.analyzer.core.Lexeme;  
  35.   
  36. /** 
  37.  * Single Word Multi Char Query Builder 
  38.  * IK分词算法专用  暴走抹茶 2017.3.28 
  39.  * @author linliangyi 
  40.  * 
  41.  */  
  42. public class SWMCQueryBuilder {  
  43.   
  44.     /** 
  45.      * 生成SWMCQuery 
  46.      * @param fieldName 
  47.      * @param keywords 
  48.      * @param quickMode 
  49.      * @return Lucene Query 
  50.      */  
  51.     public static Query create(String fieldName ,String keywords , boolean quickMode){  
  52.         if(fieldName == null || keywords == null){  
  53.             throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");  
  54.         }  
  55.         //1.对keywords进行分词处理  
  56.         List<Lexeme> lexemes = doAnalyze(keywords);  
  57.         //2.根据分词结果,生成SWMCQuery  
  58.         Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);  
  59.         return _SWMCQuery;  
  60.     }  
  61.       
  62.     /** 
  63.      * 分词切分,并返回结链表 
  64.      * @param keywords 
  65.      * @return 
  66.      */  
  67.     private static List<Lexeme> doAnalyze(String keywords){  
  68.         List<Lexeme> lexemes = new ArrayList<Lexeme>();  
  69.         IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);  
  70.         try{  
  71.             Lexeme l = null;  
  72.             while( (l = ikSeg.next()) != null){  
  73.                 lexemes.add(l);  
  74.             }  
  75.         }catch(IOException e){  
  76.             e.printStackTrace();  
  77.         }  
  78.         return lexemes;  
  79.     }  
  80.       
  81.       
  82.     /** 
  83.      * 根据分词结果生成SWMC搜索 
  84.      * @param fieldName 
  85.      * @param pathOption 
  86.      * @param quickMode 
  87.      * @return 
  88.      */  
  89.     private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){  
  90.         //构造SWMC的查询表达式  
  91.         StringBuffer keywordBuffer = new StringBuffer();  
  92.         //精简的SWMC的查询表达式  
  93.         StringBuffer keywordBuffer_Short = new StringBuffer();  
  94.         //记录最后词元长度  
  95.         int lastLexemeLength = 0;  
  96.         //记录最后词元结束位置  
  97.         int lastLexemeEnd = -1;  
  98.           
  99.         int shortCount = 0;  
  100.         int totalCount = 0;  
  101.         for(Lexeme l : lexemes){  
  102.             totalCount += l.getLength();  
  103.             //精简表达式  
  104.             if(l.getLength() > 1){  
  105.                 keywordBuffer_Short.append(' ').append(l.getLexemeText());  
  106.                 shortCount += l.getLength();  
  107.             }  
  108.               
  109.             if(lastLexemeLength == 0){  
  110.                 keywordBuffer.append(l.getLexemeText());                  
  111.             }else if(lastLexemeLength == 1 && l.getLength() == 1  
  112.                     && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并)  
  113.                 keywordBuffer.append(l.getLexemeText());  
  114.             }else{  
  115.                 keywordBuffer.append(' ').append(l.getLexemeText());  
  116.                   
  117.             }  
  118.             lastLexemeLength = l.getLength();  
  119.             lastLexemeEnd = l.getEndPosition();  
  120.         }  
  121.   
  122.         //借助lucene queryparser 生成SWMC Query  
  123.         QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());  
  124.         qp.setDefaultOperator(QueryParser.AND_OPERATOR);  
  125.         qp.setAutoGeneratePhraseQueries(true);  
  126.           
  127.         if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){  
  128.             try {  
  129.                 //System.out.println(keywordBuffer.toString());  
  130.                 Query q = qp.parse(keywordBuffer_Short.toString());  
  131.                 return q;  
  132.             } catch (ParseException e) {  
  133.                 e.printStackTrace();  
  134.             }  
  135.               
  136.         }else{  
  137.             if(keywordBuffer.length() > 0){  
  138.                 try {  
  139.                     //System.out.println(keywordBuffer.toString());  
  140.                     Query q = qp.parse(keywordBuffer.toString());  
  141.                     return q;  
  142.                 } catch (ParseException e) {  
  143.                     e.printStackTrace();  
  144.                 }  
  145.             }  
  146.         }  
  147.         return null;  
  148.     }  
  149. }