书签分享收藏举报版权申诉 / 23

立即下载加入VIP,免费下载

当前位置：首页 > 考试认证 > 司法考试 > lucene代码.docx

lucene代码.docx

文档编号：4725518
上传时间：2022-12-08
格式：DOCX
页数：23
大小：21.71KB

lucene代码.docx

《lucene代码.docx》由会员分享，可在线阅读，更多相关《lucene代码.docx（23页珍藏版）》请在冰豆网上搜索。

lucene代码.docx

lucene代码

Demo.java

packagecom.xxd;

importjava.io.File;

importjava.io.IOException;

importjava.util.ArrayList;

importjava.util.Collections;

importjava.util.HashMap;

importjava.util.Scanner;

importorg.apache.lucene.analysis.Analyzer;

importorg.apache.lucene.analysis.standard.StandardAnalyzer;

importorg.apache.lucene.document.Document;

importorg.apache.lucene.index.DirectoryReader;

importorg.apache.lucene.index.DocsEnum;

importorg.apache.lucene.index.Fields;

importorg.apache.lucene.index.IndexWriter;

importorg.apache.lucene.index.IndexWriterConfig;

importorg.apache.lucene.index.IndexWriterConfig.OpenMode;

importorg.apache.lucene.index.MultiFields;

importorg.apache.lucene.index.Term;

importorg.apache.lucene.index.Terms;

importorg.apache.lucene.index.TermsEnum;

importorg.apache.lucene.queryparser.classic.ParseException;

importorg.apache.lucene.queryparser.classic.QueryParser;

importorg.apache.lucene.search.DocIdSetIterator;

importorg.apache.lucene.search.IndexSearcher;

importorg.apache.lucene.search.Query;

importorg.apache.lucene.search.ScoreDoc;

importorg.apache.lucene.search.TopDocs;

importorg.apache.lucene.store.Directory;

importorg.apache.lucene.store.FSDirectory;

importorg.apache.lucene.util.BytesRef;

importorg.apache.lucene.util.Version;

/*
 * http://lucene.apache.org/core/4_7_0/
 */

publicclassDemo{

//源文件路径

privatestaticStringfilePath="test/";

//要创建索引的文件目录

privatestaticStringindexPath="index/";

//停用词表

privatestaticStringstopwordTxt="stopwords.txt";

//存放统计出来词频的文件

privatestaticStringwordFreqTxt="wordfreq.txt";

//经过词干化的统计

privatestaticStringstemFreqTxt="stemfreq.txt";

//经过词干化的统计

privatestaticStringtermVectorTxt="termVecor.txt";

publicstaticvoidmain（Stringargs[]）{

Demodemo=newDemo（）;

//demo.getWordList（）;

//demo.runLesson1Demo（）;

//demo.runLesson2Demo（）;

//demo.runLesson3Demo（）;

demo.runLesson4Demo（）;

}

//实验一测试

publicvoidrunLesson1Demo（）{

System.out.println（"-------------正在运行实验一-------------"）;

System.out.println（"-------------删除历史索引-------------"）;

deleteIndex（indexPath）;

System.out.println（"-------------重建索引-------------"）;

System.out.println（"-------------分析器：

标准分析器-------------"）;

System.out.println（"-------------过滤器：

标准过滤器-------------"）;

Analyzeranalyzer=newStandardAnalyzer（Version.LUCENE_47）;

//创建索引

createIndex（analyzer）;

//存放词频统计的哈希表

HashMapmap=countWord（）;

System.out.println（"-------------正在将结果写入文件-------------"）;

XxdUtil.writeToFile（map,wordFreqTxt）;

System.out.println（"-------------写入完毕，打开"+wordFreqTxt

+"查看-------------"）;

}

//实验二测试

publicvoidrunLesson2Demo（）{

System.out.println（"-------------正在运行实验二-------------"）;

System.out.println（"-------------删除历史索引-------------"）;

deleteIndex（indexPath）;

System.out.println（"-------------重建索引：

停用词-------------"）;

System.out.println（"-------------分析器：

自建分析器-------------"）;

System.out

.println（"-------------过滤器：

大小写过滤器，停用词过滤器，波特词干过滤器-------------"）;

StopwordPorterStemAnalyzerporterAnalyzer=newStopwordPorterStemAnalyzer（

Version.LUCENE_47,stopwordTxt）;

//创建索引

createIndex（porterAnalyzer）;

//存放词频统计的哈希表

HashMapmap=countWord（）;

System.out.println（"-------------正在将结果写入文件-------------"）;

XxdUtil.writeToFile（map,stemFreqTxt）;

System.out.println（"-------------写入完毕，打开"+stemFreqTxt

+"查看-------------"）;

}

//实验三测试

publicvoidrunLesson3Demo（）{

System.out.println（"-------------正在运行实验三-------------"）;

ArrayListlist=getWordList（）;

XxdUtil.writeToFile（list,termVectorTxt）;

System.out.println（"-------------写入完毕，打开"+termVectorTxt

+"查看-------------"）;

//排序

Collections.sort（list,newXxdSorter（））;

//遍历一次，找出频次最高的

FilefileFile=newFile（filePath）;

//源文件数量

intfileCount=fileFile.list（）.length;

int[]maxCountArray=newint[fileCount];

//遍历源文件

for（inti=0;i

intmaxCount=0;

for（Wordword:

list）{

if（word.getFreqList（）.get（i）>maxCount）

maxCount=word.getFreqList（）.get（i）;

}

maxCountArray[i]=maxCount;

ArrayListtfList=newArrayList（）;

for（Wordword:

list）{

//设置这一篇文档的TF

word.addTFList（（double）word.getFreqList（）.get（i）

/（double）maxCount）;

}

for（Wordword:

list）{

Strings="";

s+="词语:

"+word.getName（）+"\tTF：

";

for（inti=0;i

s+=word.getTFList（）.get（i）+",";

s=s.substring（0,s.length（）-1）;

s+="\tIDF：

"+word.getIDF（）;

System.out.println（s）;

}

//实验四

publicvoidrunLesson4Demo（）{

System.out.println（"-------------正在运行实验四-------------"）;

Scannerscan=newScanner（System.in）;

Strings=scan.nextLine（）;

search（s）;

}

privatevoidsearch（StringqueryString）{

try{//获取索引文件目录字符串

FileindexFile=newFile（indexPath）;

//创建索引文件目录

DirectoryindexDir=FSDirectory.open（indexFile）;

//初始化读取流

DirectoryReaderdirectoryReader=DirectoryReader.open（indexDir）;

directoryReader=DirectoryReader.open（indexDir）;

IndexSearcherindexSearcher=newIndexSearcher（directoryReader）;

Analyzeranalyzer=newQueryAnalyzer（Version.LUCENE_47,

stopwordTxt）;

QueryParserqueryParser=newQueryParser（Version.LUCENE_47,

"content",analyzer）;

Queryquery=queryParser.parse（queryString）;

TopDocstd=indexSearcher.search（query,50）;

Stringstr="DocNo"+'\t'+"DocScore"+'\n';

for（ScoreDocsd:

td.scoreDocs）{

intdocID=sd.doc;

Documentdoc=indexSearcher.doc（docID）;

Stringfilename=doc.getField（"filepath"）.stringValue（）;

str+=filename+"\t"+sd.score+'\n';

}

System.out.println（str）;

}catch（IOExceptionioe）{

//TODO自动生成的catch块

ioe.printStackTrace（）;

}catch（ParseExceptionpe）{

//TODO自动生成的catch块

pe.printStackTrace（）;

}

/*

*function得到词语动态数组

*

*@returnArrayList词语动态数组

*/

privateArrayListgetWordList（）{

ArrayListlist=newArrayList（）;

//TODO自动生成的方法存根

try{

FilefileFile=newFile（filePath）;

intfileCount=fileFile.list（）.length;

//获取索引文件目录字符串

FileindexFile=newFile（indexPath）;

//创建索引文件目录

DirectoryindexDir=FSDirectory.open（indexFile）;

//初始化读取流

DirectoryReaderdirectoryReader=DirectoryReader.open（indexDir）;

//获取索引的域

Fieldsfields=MultiFields.getFields（directoryReader）;

//设置要搜索的域

Stringfieldname="content";

//获取该域下的所有词

Termsterms=fields.terms（fieldname）;

//根据所有词的迭代器迭代到每个具体的词

TermsEnumtermEnum=terms.iterator（null）;

//字节

BytesRefbyteRef=null;

System.out.println（"-------------打印词频统计结果-------------"）;

//遍历ByteRef对象

while（（byteRef=termEnum.next（））!

=null）{

Wordword=newWord（）;

//转换为Term对象，方便用API统计

Termterm=newTerm（fieldname,byteRef）;

intcount=（int）directoryReader.totalTermFreq（term）;

//设置词名

word.setName（term.text（））;

//设置词频

word.setCount（count）;

//设置IDF

word.setIDF（Math.log（directoryReader.numDocs（）

/（double）termEnum.docFreq（）））;

System.out.println（Math.log（directoryReader.numDocs（）

/（double）termEnum.docFreq（）））;

//初始化动态数组用于存放词语在每个文档中的出现次数，默认为0

ArrayListwordFreqList=newArrayList（

fileCount）;

for（inti=0;i

wordFreqList.add（0）;

}

System.out.println（"词语:

"+term.text（）+"\t词频"

+directoryReader.totalTermFreq（term））;

DocsEnumdocs=termEnum.docs（null,null）;

while（docs.nextDoc（）!

=DocIdSetIterator.NO_MORE_DOCS）{

HashMapmap=newHashMap（）;

map.put（docs.docID（）,docs.freq（））;

//输出调试

System.out.println（term.text（）+"在"+"第"

+（1+docs.docID（））+"篇文章中出现了"+docs.freq（）

+"次"）;

wordFreqList.set（docs.docID（）,docs.freq（））;

}

//设置文档向量

word.setFreqList（wordFreqList）;

//加入链表

list.add（word）;

}

}catch（IOExceptione）{

//TODO自动生成的catch块

e.printStackTrace（）;

}

returnlist;

}

/*

*function统计单文档词频

*

*@returnHashMap词频统计哈希表

*/

privateHashMapcountWord（）{

HashMapmap=newHashMap（）;

//TODO自动生成的方法存根

try{

FilefileFile=newFile（filePath）;

intfileCount=fileFile.list（）.length;

//获取索引文件目录字符串

FileindexFile=newFile（indexPath）;

//创建索引文件目录

DirectoryindexDir=FSDirectory.open（indexFile）;

//初始化读取流

DirectoryReaderindexReader=DirectoryReader.open（indexDir）;

//获取索引的域

Fieldsfields=MultiFields.getFields（indexReader）;

//设置要搜索的域

Stringfieldname="content";

//获取该域下的所有词

Termsterms=fields.terms（fieldname）;

//根据所有词的迭代器迭代到每个具体的词

TermsEnumtermEnum=terms.iterator（null）;

//字节

BytesRefbyteRef=null;

System.out.println（"-------------打印词频统计结果-------------"）;

//遍历ByteRef对象

while（（byteRef=termEnum.next（））!

=null）{

//转换为Term对象，方便用API统计

Termterm=newTerm（fieldname,byteRef）;

intcount=（int）indexReader.totalTermFreq（term）;

//存入哈希表

map.put（term.text（）,count）;

System.out.println（term.text（）+"出现了"+count+"次"）;

}

}catch（IOExceptione）{

//TODO自动生成的catch块

e.printStackTrace（）;

}

returnmap;

}

/*

*function删除索引

*

*@paramStringfilepath索引目录

*@returnvoid

*/

publicvoiddeleteIndex（Stringfilepath）{

Filefile=newFile（filepath）;

if（file.isDirectory（））

for（Filesubfile:

file.listFiles（））

deleteIndex（subfile.getPath（））;

else

file.delete（）;

}

/*

*func

文档加载中……请稍候！
如果长时间未打开，您也可以点击刷新试试。

下载文档到电脑，查找使用更方便

下载	加入VIP,免费下载

版权申诉 word格式文档无特别注明外均可编辑修改；预览文档经过压缩，下载后原文更清晰！ 立即下载

配套讲稿：如PPT文件的首页显示word图标，表示该PPT已包含配套word讲稿。双击word图标可打开word文档。
特殊限制：部分文档作品中含有的国旗、国徽等图片，仅作为作品整体效果示例展示，禁止商用。设计者仅对作品中独创性部分享有著作权。
关键词：lucene 代码

冰豆网所有资源均是用户自行上传分享，仅供网友学习交流，未经上传用户书面授权，请勿作他用。

关于本文

本文标题：lucene代码.docx
链接地址：https://www.bdocx.com/doc/4725518.html

lucene代码.docx

热门标签