Lucene6.5.0 下中文分词IKAnalyzer编译和使用
前言
lucene本身对中文分词有支持,不过支持得不好,其分词方式是机械地将中文按单字拆分后存储,例如:成都信息工程大学,最终分为:成|都|信|息|工|程|大|学,显然这种分词方式是低效且浪费存储空间的。IK分词是林良益前辈自行编写的一个专门针对中文分词的分析器,最新版本为2012年针对Lucene 4.0的版本,之后未做更新,后续版本lucene的接口改变使其不再兼容,所以需要进行修改。
修改和编译IKAnalyzer
(谷歌不稳定访问不了)国内源码地址:http://git.oschina.net/wltea/IK-Analyzer-2012FF 网盘下载:链接:http://pan.baidu.com/s/1jIt7kGm 密码:hu1g
lucene6.5.0下载地址:https://lucene.apache.org 网盘下载:链接:http://pan.baidu.com/s/1mic8iBe 密码:axca
下载源码之后解压并导入到单独的java project,然后再导入lucene的jar包,如图所示,是我的工程结构
导入后修改四个文件:IKAnalyzer和IKTokenizer以及SWMCQueryBuilder、IKQueryExpressionParser,至于demo中的两个文件可直接删除或进行修改,我进行了修改。修改方式很简单,这里贴出修改的原文,以及修改后工程和源码下载。
修改后的工程地址:链接:http://pan.baidu.com/s/1nuALOql 密码:miyq
编译好的IKAnalyzer的jar包下载地址:http://download.****.net/detail/fanpei_moukoy/9796612可直接导入lucene项目进行使用
IKAnalyzer
- /**
- * IK 中文分词 版本 6.5.0
- * IK Analyzer release 6.5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
- */
- package org.wltea.analyzer.lucene;
- import java.io.Reader;
- import java.io.StringReader;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.util.IOUtils;
- /**
- * IK分词器,Lucene Analyzer接口实现
- * 兼容Lucene 6.5.0版本 暴走抹茶 2017.3.28
- */
- public final class IKAnalyzer extends Analyzer{
- private boolean useSmart;
- public boolean useSmart() {
- return useSmart;
- }
- public void setUseSmart(boolean useSmart) {
- this.useSmart = useSmart;
- }
- /**
- * IK分词器Lucene Analyzer接口实现类
- *
- * 默认细粒度切分算法
- */
- public IKAnalyzer(){
- this(false);
- }
- /**
- * IK分词器Lucene Analyzer接口实现类
- *
- * @param useSmart 当为true时,分词器进行智能切分
- */
- public IKAnalyzer(boolean useSmart){
- super();
- this.useSmart = useSmart;
- }
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Reader reader=null;
- try{
- reader=new StringReader(fieldName);
- IKTokenizer it = new IKTokenizer(reader);
- return new Analyzer.TokenStreamComponents(it);
- }finally {
- IOUtils.closeWhileHandlingException(reader);
- }
- }
- }
- /**
- * IK 中文分词 版本 6.5.0
- * IK Analyzer release 6.5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
- *
- */
- package org.wltea.analyzer.lucene;
- import java.io.IOException;
- import java.io.Reader;
- import org.apache.lucene.analysis.Tokenizer;
- import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
- import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
- import org.wltea.analyzer.core.IKSegmenter;
- import org.wltea.analyzer.core.Lexeme;
- /**
- * IK分词器 Lucene Tokenizer适配器类 兼容Lucene 6.5.0版本 暴走抹茶 2017.3.28
- */
- public final class IKTokenizer extends Tokenizer {
- // IK分词器实现
- private IKSegmenter _IKImplement;
- // 词元文本属性
- private final CharTermAttribute termAtt;
- // 词元位移属性
- private final OffsetAttribute offsetAtt;
- // 词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
- private final TypeAttribute typeAtt;
- // 记录最后一个词元的结束位置
- private int endPosition;
- public IKTokenizer(Reader in) {
- this(in, false);
- }
- /**
- * Lucene 6.5.0 Tokenizer适配器类构造函数
- *
- * @param in
- * @param useSmart
- */
- public IKTokenizer(Reader in, boolean useSmart) {
- offsetAtt = addAttribute(OffsetAttribute.class);
- termAtt = addAttribute(CharTermAttribute.class);
- typeAtt = addAttribute(TypeAttribute.class);
- _IKImplement = new IKSegmenter(input, useSmart);
- }
- /*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.analysis.TokenStream#incrementToken()
- */
- @Override
- public boolean incrementToken() throws IOException {
- // 清除所有的词元属性
- clearAttributes();
- Lexeme nextLexeme = _IKImplement.next();
- if (nextLexeme != null) {
- // 将Lexeme转成Attributes
- // 设置词元文本
- termAtt.append(nextLexeme.getLexemeText());
- // 设置词元长度
- termAtt.setLength(nextLexeme.getLength());
- // 设置词元位移
- offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
- // 记录分词的最后位置
- endPosition = nextLexeme.getEndPosition();
- // 记录词元分类
- typeAtt.setType(nextLexeme.getLexemeTypeString());
- // 返会true告知还有下个词元
- return true;
- }
- // 返会false告知词元输出完毕
- return false;
- }
- /*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
- */
- @Override
- public void reset() throws IOException {
- super.reset();
- _IKImplement.reset(input);
- }
- @Override
- public final void end() {
- // set final offset
- int finalOffset = correctOffset(this.endPosition);
- offsetAtt.setOffset(finalOffset, finalOffset);
- }
- }
- /**
- * IK 中文分词 版本 6.5.0
- * IK Analyzer release 6.5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
- */
- package org.wltea.analyzer.query;
- import java.util.ArrayList;
- import java.util.LinkedList;
- import java.util.List;
- import java.util.Stack;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.search.BooleanClause;
- import org.apache.lucene.search.BooleanQuery;
- import org.apache.lucene.search.BooleanQuery.Builder;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.TermQuery;
- import org.apache.lucene.search.TermRangeQuery;
- import org.apache.lucene.search.BooleanClause.Occur;
- import org.apache.lucene.util.BytesRef;
- /**
- * IK简易查询表达式解析
- * 结合SWMCQuery算法 暴走抹茶 2017.3.28
- *
- * 表达式例子 :
- * (id='1231231' && title:'monkey') || (content:'你好吗' || ulr='www.ik.com') - name:'helloword'
- * @author linliangyi
- *
- */
- public class IKQueryExpressionParser {
- //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],";
- private List<Element> elements = new ArrayList<Element>();
- private Stack<Query> querys = new Stack<Query>();
- private Stack<Element> operates = new Stack<Element>();
- /**
- * 解析查询表达式,生成Lucene Query对象
- *
- * @param expression
- * @param quickMode
- * @return Lucene query
- */
- public Query parseExp(String expression , boolean quickMode){
- Query lucenceQuery = null;
- if(expression != null && !"".equals(expression.trim())){
- try{
- //文法解析
- this.splitElements(expression);
- //语法解析
- this.parseSyntax(quickMode);
- if(this.querys.size() == 1){
- lucenceQuery = this.querys.pop();
- }else{
- throw new IllegalStateException("表达式异常: 缺少逻辑操作符 或 括号缺失");
- }
- }finally{
- elements.clear();
- querys.clear();
- operates.clear();
- }
- }
- return lucenceQuery;
- }
- /**
- * 表达式文法解析
- * @param expression
- */
- private void splitElements(String expression){
- if(expression == null){
- return;
- }
- Element curretElement = null;
- char[] expChars = expression.toCharArray();
- for(int i = 0 ; i < expChars.length ; i++){
- switch(expChars[i]){
- case '&' :
- if(curretElement == null){
- curretElement = new Element();
- curretElement.type = '&';
- curretElement.append(expChars[i]);
- }else if(curretElement.type == '&'){
- curretElement.append(expChars[i]);
- this.elements.add(curretElement);
- curretElement = null;
- }else if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- }else {
- this.elements.add(curretElement);
- curretElement = new Element();
- curretElement.type = '&';
- curretElement.append(expChars[i]);
- }
- break;
- case '|' :
- if(curretElement == null){
- curretElement = new Element();
- curretElement.type = '|';
- curretElement.append(expChars[i]);
- }else if(curretElement.type == '|'){
- curretElement.append(expChars[i]);
- this.elements.add(curretElement);
- curretElement = null;
- }else if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- }else {
- this.elements.add(curretElement);
- curretElement = new Element();
- curretElement.type = '|';
- curretElement.append(expChars[i]);
- }
- break;
- case '-' :
- if(curretElement != null){
- if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- continue;
- }else{
- this.elements.add(curretElement);
- }
- }
- curretElement = new Element();
- curretElement.type = '-';
- curretElement.append(expChars[i]);
- this.elements.add(curretElement);
- curretElement = null;
- break;
- case '(' :
- if(curretElement != null){
- if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- continue;
- }else{
- this.elements.add(curretElement);
- }
- }
- curretElement = new Element();
- curretElement.type = '(';
- curretElement.append(expChars[i]);
- this.elements.add(curretElement);
- curretElement = null;
- break;
- case ')' :
- if(curretElement != null){
- if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- continue;
- }else{
- this.elements.add(curretElement);
- }
- }
- curretElement = new Element();
- curretElement.type = ')';
- curretElement.append(expChars[i]);
- this.elements.add(curretElement);
- curretElement = null;
- break;
- case ':' :
- if(curretElement != null){
- if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- continue;
- }else{
- this.elements.add(curretElement);
- }
- }
- curretElement = new Element();
- curretElement.type = ':';
- curretElement.append(expChars[i]);
- this.elements.add(curretElement);
- curretElement = null;
- break;
- case '=' :
- if(curretElement != null){
- if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- continue;
- }else{
- this.elements.add(curretElement);
- }
- }
- curretElement = new Element();
- curretElement.type = '=';
- curretElement.append(expChars[i]);
- this.elements.add(curretElement);
- curretElement = null;
- break;
- case ' ' :
- if(curretElement != null){
- if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- }else{
- this.elements.add(curretElement);
- curretElement = null;
- }
- }
- break;
- case '\'' :
- if(curretElement == null){
- curretElement = new Element();
- curretElement.type = '\'';
- }else if(curretElement.type == '\''){
- this.elements.add(curretElement);
- curretElement = null;
- }else{
- this.elements.add(curretElement);
- curretElement = new Element();
- curretElement.type = '\'';
- }
- break;
- case '[':
- if(curretElement != null){
- if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- continue;
- }else{
- this.elements.add(curretElement);
- }
- }
- curretElement = new Element();
- curretElement.type = '[';
- curretElement.append(expChars[i]);
- this.elements.add(curretElement);
- curretElement = null;
- break;
- case ']':
- if(curretElement != null){
- if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- continue;
- }else{
- this.elements.add(curretElement);
- }
- }
- curretElement = new Element();
- curretElement.type = ']';
- curretElement.append(expChars[i]);
- this.elements.add(curretElement);
- curretElement = null;
- break;
- case '{':
- if(curretElement != null){
- if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- continue;
- }else{
- this.elements.add(curretElement);
- }
- }
- curretElement = new Element();
- curretElement.type = '{';
- curretElement.append(expChars[i]);
- this.elements.add(curretElement);
- curretElement = null;
- break;
- case '}':
- if(curretElement != null){
- if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- continue;
- }else{
- this.elements.add(curretElement);
- }
- }
- curretElement = new Element();
- curretElement.type = '}';
- curretElement.append(expChars[i]);
- this.elements.add(curretElement);
- curretElement = null;
- break;
- case ',':
- if(curretElement != null){
- if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- continue;
- }else{
- this.elements.add(curretElement);
- }
- }
- curretElement = new Element();
- curretElement.type = ',';
- curretElement.append(expChars[i]);
- this.elements.add(curretElement);
- curretElement = null;
- break;
- default :
- if(curretElement == null){
- curretElement = new Element();
- curretElement.type = 'F';
- curretElement.append(expChars[i]);
- }else if(curretElement.type == 'F'){
- curretElement.append(expChars[i]);
- }else if(curretElement.type == '\''){
- curretElement.append(expChars[i]);
- }else{
- this.elements.add(curretElement);
- curretElement = new Element();
- curretElement.type = 'F';
- curretElement.append(expChars[i]);
- }
- }
- }
- if(curretElement != null){
- this.elements.add(curretElement);
- curretElement = null;
- }
- }
- /**
- * 语法解析
- *
- */
- private void parseSyntax(boolean quickMode){
- for(int i = 0 ; i < this.elements.size() ; i++){
- Element e = this.elements.get(i);
- if('F' == e.type){
- Element e2 = this.elements.get(i + 1);
- if('=' != e2.type && ':' != e2.type){
- throw new IllegalStateException("表达式异常: = 或 : 号丢失");
- }
- Element e3 = this.elements.get(i + 2);
- //处理 = 和 : 运算
- if('\'' == e3.type){
- i+=2;
- if('=' == e2.type){
- TermQuery tQuery = new TermQuery(new Term(e.toString() , e3.toString()));
- this.querys.push(tQuery);
- }else if(':' == e2.type){
- String keyword = e3.toString();
- //SWMCQuery Here
- Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword , quickMode);
- this.querys.push(_SWMCQuery);
- }
- }else if('[' == e3.type || '{' == e3.type){
- i+=2;
- //处理 [] 和 {}
- LinkedList<Element> eQueue = new LinkedList<Element>();
- eQueue.add(e3);
- for( i++ ; i < this.elements.size() ; i++){
- Element eN = this.elements.get(i);
- eQueue.add(eN);
- if(']' == eN.type || '}' == eN.type){
- break;
- }
- }
- //翻译RangeQuery
- Query rangeQuery = this.toTermRangeQuery(e , eQueue);
- this.querys.push(rangeQuery);
- }else{
- throw new IllegalStateException("表达式异常:匹配值丢失");
- }
- }else if('(' == e.type){
- this.operates.push(e);
- }else if(')' == e.type){
- boolean doPop = true;
- while(doPop && !this.operates.empty()){
- Element op = this.operates.pop();
- if('(' == op.type){
- doPop = false;
- }else {
- Query q = toBooleanQuery(op);
- this.querys.push(q);
- }
- }
- }else{
- if(this.operates.isEmpty()){
- this.operates.push(e);
- }else{
- boolean doPeek = true;
- while(doPeek && !this.operates.isEmpty()){
- Element eleOnTop = this.operates.peek();
- if('(' == eleOnTop.type){
- doPeek = false;
- this.operates.push(e);
- }else if(compare(e , eleOnTop) == 1){
- this.operates.push(e);
- doPeek = false;
- }else if(compare(e , eleOnTop) == 0){
- Query q = toBooleanQuery(eleOnTop);
- this.operates.pop();
- this.querys.push(q);
- }else{
- Query q = toBooleanQuery(eleOnTop);
- this.operates.pop();
- this.querys.push(q);
- }
- }
- if(doPeek && this.operates.empty()){
- this.operates.push(e);
- }
- }
- }
- }
- while(!this.operates.isEmpty()){
- Element eleOnTop = this.operates.pop();
- Query q = toBooleanQuery(eleOnTop);
- this.querys.push(q);
- }
- }
- /**
- * 根据逻辑操作符,生成BooleanQuery
- * @param op
- * @return
- */
- private Query toBooleanQuery(Element op){
- if(this.querys.size() == 0){
- return null;
- }
- //BooleanQuery resultQuery = null;
- Builder builder = new Builder();
- if(this.querys.size() == 1){
- return this.querys.get(0);
- }
- Query q2 = this.querys.pop();
- Query q1 = this.querys.pop();
- if('&' == op.type){
- if(q1 != null){
- if(q1 instanceof BooleanQuery){
- List<BooleanClause> clauses = ((BooleanQuery)q1).clauses();
- if(clauses.size() > 0
- && clauses.get(0).getOccur() == Occur.MUST){
- for(BooleanClause c : clauses){
- builder.add(c);
- }
- }else{
- builder.add(q1,Occur.MUST);
- }
- }else{
- //q1 instanceof TermQuery
- //q1 instanceof TermRangeQuery
- //q1 instanceof PhraseQuery
- //others
- builder.add(q1,Occur.MUST);
- }
- }
- if(q2 != null){
- if(q2 instanceof BooleanQuery){
- List<BooleanClause> clauses = ((BooleanQuery)q2).clauses();
- if(clauses.size() > 0
- && clauses.get(0).getOccur() == Occur.MUST){
- for(BooleanClause c : clauses){
- builder.add(c);
- }
- }else{
- builder.add(q2,Occur.MUST);
- }
- }else{
- //q1 instanceof TermQuery
- //q1 instanceof TermRangeQuery
- //q1 instanceof PhraseQuery
- //others
- builder.add(q2,Occur.MUST);
- }
- }
- }else if('|' == op.type){
- if(q1 != null){
- if(q1 instanceof BooleanQuery){
- List<BooleanClause> clauses = ((BooleanQuery)q1).clauses();
- if(clauses.size() > 0
- && clauses.get(0).getOccur() == Occur.SHOULD){
- for(BooleanClause c : clauses){
- builder.add(c);
- }
- }else{
- builder.add(q1,Occur.SHOULD);
- }
- }else{
- //q1 instanceof TermQuery
- //q1 instanceof TermRangeQuery
- //q1 instanceof PhraseQuery
- //others
- builder.add(q1,Occur.SHOULD);
- }
- }
- if(q2 != null){
- if(q2 instanceof BooleanQuery){
- List<BooleanClause> clauses = ((BooleanQuery)q2).clauses();
- if(clauses.size() > 0
- && clauses.get(0).getOccur() == Occur.SHOULD){
- for(BooleanClause c : clauses){
- builder.add(c);
- }
- }else{
- builder.add(q2,Occur.SHOULD);
- }
- }else{
- //q2 instanceof TermQuery
- //q2 instanceof TermRangeQuery
- //q2 instanceof PhraseQuery
- //others
- builder.add(q2,Occur.SHOULD);
- }
- }
- }else if('-' == op.type){
- if(q1 == null || q2 == null){
- throw new IllegalStateException("表达式异常:SubQuery 个数不匹配");
- }
- if(q1 instanceof BooleanQuery){
- List<BooleanClause> clauses = ((BooleanQuery)q1).clauses();
- if(clauses.size() > 0){
- for(BooleanClause c : clauses){
- builder.add(c);
- }
- }else{
- builder.add(q1,Occur.MUST);
- }
- }else{
- //q1 instanceof TermQuery
- //q1 instanceof TermRangeQuery
- //q1 instanceof PhraseQuery
- //others
- builder.add(q1,Occur.MUST);
- }
- builder.add(q2,Occur.MUST_NOT);
- }
- return builder.build();
- }
- /**
- * 组装TermRangeQuery
- * @param elements
- * @return
- */
- private TermRangeQuery toTermRangeQuery(Element fieldNameEle , LinkedList<Element> elements){
- boolean includeFirst = false;
- boolean includeLast = false;
- String firstValue = null;
- String lastValue = null;
- //检查第一个元素是否是[或者{
- Element first = elements.getFirst();
- if('[' == first.type){
- includeFirst = true;
- }else if('{' == first.type){
- includeFirst = false;
- }else {
- throw new IllegalStateException("表达式异常");
- }
- //检查最后一个元素是否是]或者}
- Element last = elements.getLast();
- if(']' == last.type){
- includeLast = true;
- }else if('}' == last.type){
- includeLast = false;
- }else {
- throw new IllegalStateException("表达式异常, RangeQuery缺少结束括号");
- }
- if(elements.size() < 4 || elements.size() > 5){
- throw new IllegalStateException("表达式异常, RangeQuery 错误");
- }
- //读出中间部分
- Element e2 = elements.get(1);
- if('\'' == e2.type){
- firstValue = e2.toString();
- //
- Element e3 = elements.get(2);
- if(',' != e3.type){
- throw new IllegalStateException("表达式异常, RangeQuery缺少逗号分隔");
- }
- //
- Element e4 = elements.get(3);
- if('\'' == e4.type){
- lastValue = e4.toString();
- }else if(e4 != last){
- throw new IllegalStateException("表达式异常,RangeQuery格式错误");
- }
- }else if(',' == e2.type){
- firstValue = null;
- //
- Element e3 = elements.get(2);
- if('\'' == e3.type){
- lastValue = e3.toString();
- }else{
- throw new IllegalStateException("表达式异常,RangeQuery格式错误");
- }
- }else {
- throw new IllegalStateException("表达式异常, RangeQuery格式错误");
- }
- return new TermRangeQuery(fieldNameEle.toString() , new BytesRef(firstValue) , new BytesRef(lastValue) , includeFirst , includeLast);
- }
- /**
- * 比较操作符优先级
- * @param e1
- * @param e2
- * @return
- */
- private int compare(Element e1 , Element e2){
- if('&' == e1.type){
- if('&' == e2.type){
- return 0;
- }else {
- return 1;
- }
- }else if('|' == e1.type){
- if('&' == e2.type){
- return -1;
- }else if('|' == e2.type){
- return 0;
- }else{
- return 1;
- }
- }else{
- if('-' == e2.type){
- return 0;
- }else{
- return -1;
- }
- }
- }
- /**
- * 表达式元素(操作符、FieldName、FieldValue)
- * @author linliangyi
- * May 20, 2010
- */
- private class Element{
- char type = 0;
- StringBuffer eleTextBuff;
- public Element(){
- eleTextBuff = new StringBuffer();
- }
- public void append(char c){
- this.eleTextBuff.append(c);
- }
- public String toString(){
- return this.eleTextBuff.toString();
- }
- }
- public static void main(String[] args){
- IKQueryExpressionParser parser = new IKQueryExpressionParser();
- //String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";
- String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
- Query result = parser.parseExp(ikQueryExp , true);
- System.out.println(result);
- }
- }
- /**
- * IK 中文分词 版本 6.5.0
- * IK Analyzer release 6.5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
- */
- package org.wltea.analyzer.query;
- import java.io.IOException;
- import java.io.StringReader;
- import java.util.ArrayList;
- import java.util.List;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.queryparser.classic.ParseException;
- import org.apache.lucene.queryparser.classic.QueryParser;
- import org.apache.lucene.search.Query;
- import org.wltea.analyzer.core.IKSegmenter;
- import org.wltea.analyzer.core.Lexeme;
- /**
- * Single Word Multi Char Query Builder
- * IK分词算法专用 暴走抹茶 2017.3.28
- * @author linliangyi
- *
- */
- public class SWMCQueryBuilder {
- /**
- * 生成SWMCQuery
- * @param fieldName
- * @param keywords
- * @param quickMode
- * @return Lucene Query
- */
- public static Query create(String fieldName ,String keywords , boolean quickMode){
- if(fieldName == null || keywords == null){
- throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
- }
- //1.对keywords进行分词处理
- List<Lexeme> lexemes = doAnalyze(keywords);
- //2.根据分词结果,生成SWMCQuery
- Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
- return _SWMCQuery;
- }
- /**
- * 分词切分,并返回结链表
- * @param keywords
- * @return
- */
- private static List<Lexeme> doAnalyze(String keywords){
- List<Lexeme> lexemes = new ArrayList<Lexeme>();
- IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
- try{
- Lexeme l = null;
- while( (l = ikSeg.next()) != null){
- lexemes.add(l);
- }
- }catch(IOException e){
- e.printStackTrace();
- }
- return lexemes;
- }
- /**
- * 根据分词结果生成SWMC搜索
- * @param fieldName
- * @param pathOption
- * @param quickMode
- * @return
- */
- private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
- //构造SWMC的查询表达式
- StringBuffer keywordBuffer = new StringBuffer();
- //精简的SWMC的查询表达式
- StringBuffer keywordBuffer_Short = new StringBuffer();
- //记录最后词元长度
- int lastLexemeLength = 0;
- //记录最后词元结束位置
- int lastLexemeEnd = -1;
- int shortCount = 0;
- int totalCount = 0;
- for(Lexeme l : lexemes){
- totalCount += l.getLength();
- //精简表达式
- if(l.getLength() > 1){
- keywordBuffer_Short.append(' ').append(l.getLexemeText());
- shortCount += l.getLength();
- }
- if(lastLexemeLength == 0){
- keywordBuffer.append(l.getLexemeText());
- }else if(lastLexemeLength == 1 && l.getLength() == 1
- && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并)
- keywordBuffer.append(l.getLexemeText());
- }else{
- keywordBuffer.append(' ').append(l.getLexemeText());
- }
- lastLexemeLength = l.getLength();
- lastLexemeEnd = l.getEndPosition();
- }
- //借助lucene queryparser 生成SWMC Query
- QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer());
- qp.setDefaultOperator(QueryParser.AND_OPERATOR);
- qp.setAutoGeneratePhraseQueries(true);
- if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
- try {
- //System.out.println(keywordBuffer.toString());
- Query q = qp.parse(keywordBuffer_Short.toString());
- return q;
- } catch (ParseException e) {
- e.printStackTrace();
- }
- }else{
- if(keywordBuffer.length() > 0){
- try {
- //System.out.println(keywordBuffer.toString());
- Query q = qp.parse(keywordBuffer.toString());
- return q;
- } catch (ParseException e) {
- e.printStackTrace();
- }
- }
- }
- return null;
- }
- }