Lucene学习笔记(一)
建立索引:
<!--<br><br>Code highlighting produced by Actipro CodeHighlighter (freeware)<br>http://www.CodeHighlighter.com/<br><br>-->importjava.io.File;
importjava.io.FileReader;
importjava.io.IOException;
importjava.util.Date;
importorg.apache.lucene.analysis.standard.StandardAnalyzer;
importorg.apache.lucene.document.Document;
importorg.apache.lucene.document.Field;
importorg.apache.lucene.index.IndexWriter;
publicclassIndexer
{
//建立索引
/**
*@paramargs
*/
publicstaticvoidmain(String[]args)throwsException
{
//TODOAuto-generatedmethodstub
if(args.length!=2)
{
thrownewException("Usage:java"+Indexer.class.getName()+"<indexdir><datadir>");
}
FileindexDir=newFile(args[0]);//存放索引的路径
FiledataDir=newFile(args[1]);//要建立索引的目录路径
longstart=newDate().getTime();
intnumIndexed=index(indexDir,dataDir);//建立索引
longend=newDate().getTime();
System.out.println("Indexing"+numIndexed+"filestook"+(end-start)+"milliseconds");
}
privatestaticintindex(FileindexDir,FiledataDir)throwsIOException
{
if(!dataDir.exists()||!dataDir.isDirectory())
{
thrownewIOException(dataDir+"doesnotexistorisnotadirectory");
}
IndexWriterwriter=newIndexWriter(indexDir,newStandardAnalyzer(),true);
writer.setUseCompoundFile(false);
indexDirectory(writer,dataDir);
intnumIndexed=writer.docCount();
writer.optimize();
writer.close();
returnnumIndexed;
}
privatestaticvoidindexDirectory(IndexWriterwriter,FiledataDir)throwsIOException
{
//TODOAuto-generatedmethodstub
File[]files=dataDir.listFiles();
for(inti=0;i<files.length;i++)
{
Filef=files[i];
if(f.isDirectory())
{
indexDirectory(writer,f);
}
elseif(f.getName().endsWith(".txt"))
{
indexFile(writer,f);
}
}
}
privatestaticvoidindexFile(IndexWriterwriter,Filef)throwsIOException
{
if(f.isHidden()||!f.exists()||!f.canRead())
{
return;
}
System.out.println("Indexing"+f.getCanonicalPath());
Documentdoc=newDocument();
doc.add(newField("contents",newFileReader(f)));
doc.add(newField("filename",f.getCanonicalPath(),Field.Store.YES,Field.Index.TOKENIZED));
writer.addDocument(doc);
}
}
importjava.io.FileReader;
importjava.io.IOException;
importjava.util.Date;
importorg.apache.lucene.analysis.standard.StandardAnalyzer;
importorg.apache.lucene.document.Document;
importorg.apache.lucene.document.Field;
importorg.apache.lucene.index.IndexWriter;
publicclassIndexer
{
//建立索引
/**
*@paramargs
*/
publicstaticvoidmain(String[]args)throwsException
{
//TODOAuto-generatedmethodstub
if(args.length!=2)
{
thrownewException("Usage:java"+Indexer.class.getName()+"<indexdir><datadir>");
}
FileindexDir=newFile(args[0]);//存放索引的路径
FiledataDir=newFile(args[1]);//要建立索引的目录路径
longstart=newDate().getTime();
intnumIndexed=index(indexDir,dataDir);//建立索引
longend=newDate().getTime();
System.out.println("Indexing"+numIndexed+"filestook"+(end-start)+"milliseconds");
}
privatestaticintindex(FileindexDir,FiledataDir)throwsIOException
{
if(!dataDir.exists()||!dataDir.isDirectory())
{
thrownewIOException(dataDir+"doesnotexistorisnotadirectory");
}
IndexWriterwriter=newIndexWriter(indexDir,newStandardAnalyzer(),true);
writer.setUseCompoundFile(false);
indexDirectory(writer,dataDir);
intnumIndexed=writer.docCount();
writer.optimize();
writer.close();
returnnumIndexed;
}
privatestaticvoidindexDirectory(IndexWriterwriter,FiledataDir)throwsIOException
{
//TODOAuto-generatedmethodstub
File[]files=dataDir.listFiles();
for(inti=0;i<files.length;i++)
{
Filef=files[i];
if(f.isDirectory())
{
indexDirectory(writer,f);
}
elseif(f.getName().endsWith(".txt"))
{
indexFile(writer,f);
}
}
}
privatestaticvoidindexFile(IndexWriterwriter,Filef)throwsIOException
{
if(f.isHidden()||!f.exists()||!f.canRead())
{
return;
}
System.out.println("Indexing"+f.getCanonicalPath());
Documentdoc=newDocument();
doc.add(newField("contents",newFileReader(f)));
doc.add(newField("filename",f.getCanonicalPath(),Field.Store.YES,Field.Index.TOKENIZED));
writer.addDocument(doc);
}
}
搜索:
<!--<br><br>Code highlighting produced by Actipro CodeHighlighter (freeware)<br>http://www.CodeHighlighter.com/<br><br>-->importjava.io.File;
importjava.util.Date;
importorg.apache.lucene.analysis.standard.StandardAnalyzer;
importorg.apache.lucene.document.Document;
importorg.apache.lucene.queryParser.QueryParser;
importorg.apache.lucene.search.Hits;
importorg.apache.lucene.search.IndexSearcher;
importorg.apache.lucene.search.Query;
importorg.apache.lucene.store.Directory;
importorg.apache.lucene.store.FSDirectory;
publicclassSearcher
{
/**
*@paramargs
*/
publicstaticvoidmain(String[]args)throwsException
{
if(args.length!=2)
{
thrownewException("Usage:java"+Searcher.class.getName()+"<indexdir><auery>");
}
FileindexDir=newFile(args[0]);//要搜索的索引所在目录
Stringq=args[1];//搜索关键字
if(!indexDir.exists()||!indexDir.isDirectory())
{
thrownewException(indexDir+"doesnotexistorisnotadirectory.");
}
search(indexDir,q);
}
privatestaticvoidsearch(FileindexDir,Stringq)throwsException
{
DirectoryfsDir=FSDirectory.getDirectory(indexDir,false);
IndexSearcheris=newIndexSearcher(fsDir);//打开索引
QueryParserparser=newQueryParser("contents",newStandardAnalyzer());
Queryquery=parser.parse(q);//对文本内容进行分析查询
longstart=newDate().getTime();
Hitshits=is.search(query);//搜索索引
longend=newDate().getTime();
System.err.println("Found"+hits.length()+"document(s)(in"+(end-start)+"milliseconds)thatmatchedquery'"+q+"’:");
for(inti=0;i<hits.length();i++)
{
Documentdoc=hits.doc(i);//得到匹配的文档
System.out.println(doc.get("filename"));
}
}
}
importjava.util.Date;
importorg.apache.lucene.analysis.standard.StandardAnalyzer;
importorg.apache.lucene.document.Document;
importorg.apache.lucene.queryParser.QueryParser;
importorg.apache.lucene.search.Hits;
importorg.apache.lucene.search.IndexSearcher;
importorg.apache.lucene.search.Query;
importorg.apache.lucene.store.Directory;
importorg.apache.lucene.store.FSDirectory;
publicclassSearcher
{
/**
*@paramargs
*/
publicstaticvoidmain(String[]args)throwsException
{
if(args.length!=2)
{
thrownewException("Usage:java"+Searcher.class.getName()+"<indexdir><auery>");
}
FileindexDir=newFile(args[0]);//要搜索的索引所在目录
Stringq=args[1];//搜索关键字
if(!indexDir.exists()||!indexDir.isDirectory())
{
thrownewException(indexDir+"doesnotexistorisnotadirectory.");
}
search(indexDir,q);
}
privatestaticvoidsearch(FileindexDir,Stringq)throwsException
{
DirectoryfsDir=FSDirectory.getDirectory(indexDir,false);
IndexSearcheris=newIndexSearcher(fsDir);//打开索引
QueryParserparser=newQueryParser("contents",newStandardAnalyzer());
Queryquery=parser.parse(q);//对文本内容进行分析查询
longstart=newDate().getTime();
Hitshits=is.search(query);//搜索索引
longend=newDate().getTime();
System.err.println("Found"+hits.length()+"document(s)(in"+(end-start)+"milliseconds)thatmatchedquery'"+q+"’:");
for(inti=0;i<hits.length();i++)
{
Documentdoc=hits.doc(i);//得到匹配的文档
System.out.println(doc.get("filename"));
}
}
}
重点学习的几个方面:
1. Lucene的索引结构
2. 各种异构数据源的解析器(Parser)
3. 分词器(Analyzer)提取索引项(尤其是如何进行中文分词)
注:这里使用的是 Lucene 2.2.0。