书签分享收藏举报版权申诉 / 18

立即下载加入VIP,免费下载

当前位置：首页 > 自然科学 > 物理 > 简单搜索引擎设计和 Java 源代码.docx

简单搜索引擎设计和 Java 源代码.docx

文档编号：23675144
上传时间：2023-05-19
格式：DOCX
页数：18
大小：17.78KB

《简单搜索引擎设计和 Java 源代码.docx》由会员分享，可在线阅读，更多相关《简单搜索引擎设计和 Java 源代码.docx（18页珍藏版）》请在冰豆网上搜索。

简单搜索引擎设计和 Java 源代码.docx

简单搜索引擎设计和Java源代码

A simple search engine with the following features:

Included in the package:

source code:

SimpleSearchEngine.java

SimpleSearchEngineImpl.java

SimpleSearchEngineTest.java

readme:

this file

stopWords:

the stop word file

searchFiles/:

a directory that contains a bunch of test files

Usage:

1. SimpleSearchEngineTest.java can be modified to add more documents and add new queries.

2. Compile the code.

3. To run:

java -cp . SimpleSearchEngineTest

Features:

1. Build an inverted index for the terms in documents and store it in an index file. The index will be updated as more documents are added, and the index is loaded into memory during startup.

2. Examine stop words.

3. Simple query by splitting the query string into words and returning the list of the names of documents with one or more of the words in them.

4. Simple ranking of the search result based on the number of search words in the documents.

Preparation:

1. A document folder where all the documents reside, assumed to be searchFiles/ in the test.

2. The path of the index file. An index file has the inverted index of terms mapped to a list of docIds. This index will be updated and the file will be updated as documents are added.

3. The path of a document name index file. This file has the docId to docName mapping. This file will be updated as documents are added.

4. A stop word file with the stop words. An example is given.

SimpleSearchEngine.java

importjava.util.List;

/**
 * A simple search engine
 */

public interface SimpleSearchEngine {

    /**
     * Runs a simple query by splitting the query into search terms and looking
     * them up in the index, ranking results by the number of search terms
     * appearing in a document.
     *
     * @param queryStr the query string
     * @return list of document names
     */
    List<String> query(String queryStr);

    /**
     * Adds a document and updates the index.
     *
     * @param docName document name
     */
    void addDoc(String docName);
}

SimpleSearchEngineImpl.java

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

/**
 * A simple search engine with the following features:
 *
 * 1. Build an inverted index for the terms in documents and store it in an index file.
 *    The index will be updated as more documents are added,
 *    and the index is loaded into memory during startup.
 * 2. Examine stop words.
 * 3. Simple query by splitting the query string into words and returning the list of
 *    the names of documents with one or more of the words in them.
 * 4. Simple ranking of the search result based on the number of search words in the documents.
 *
 * Preparation:
 * 1. A document folder where all the documents reside.
 * 2. Path of the index file. An index file has the inverted index of terms mapped to a
 *    list of docIds. This index will be updated and the file will be updated as
 *    documents are added.
 * 3. Path of a document name index file. This file has the docId to docName mapping.
 *    This file will be updated as documents are added.
 * 4. A stop word file with the stop words.
 *
 * @author dennisli
 */

publicclassSimpleSearchEngineImplimplementsSimpleSearchEngine{

privateStringdocFolderPath=null;

privateStringindexFilePath=null;

privateStringdocNameIndexPath=null;

privateStringstopWordsFilePath=null;

//searchindexmap,mappingwordstoasetofdocumentIds

privateMap>searchIndex=newHashMap>（）;

//docnameindexmap,mappingthedocIdtodocName

privateMapdocNames=newHashMap（）;

//asetofstopwords

privateSetstopWords=newHashSet（）;

/**
 * Creates an engine over the given locations. Nothing is read from disk
 * here; call {@link #init()} to load the index, document names and stop words.
 *
 * @param docFolderPath     folder containing the documents; a trailing "/" is added when missing
 * @param indexFilePath     path of the search index file
 * @param docNameIndexPath  path of the docId-to-docName index file
 * @param stopWordsFilePath path of the stop word file
 * @throws NullPointerException if {@code docFolderPath} is null
 */
public SimpleSearchEngineImpl(String docFolderPath, String indexFilePath, String docNameIndexPath,
        String stopWordsFilePath) {
    if (docFolderPath == null) {
        throw new NullPointerException("docFolderPath");
    }
    // addDoc builds file paths as docFolderPath + docName, so the folder must end
    // with a separator. An empty path is kept as-is (relative to the working
    // directory) instead of failing on charAt(-1) or being turned into "/".
    if (docFolderPath.length() == 0 || docFolderPath.endsWith("/")) {
        this.docFolderPath = docFolderPath;
    } else {
        this.docFolderPath = docFolderPath + "/";
    }
    this.indexFilePath = indexFilePath;
    this.docNameIndexPath = docNameIndexPath;
    this.stopWordsFilePath = stopWordsFilePath;
}

/*

*initialize

*

*loadthesearchindex,docnameindex,stopwordsfromfilesiftheyexist

*

*/

publicvoidinit（）{

loadIndexFile（）;

loadDocNameIndex（）;

loadStopWords（）;

}

/*

*simplequerybysplittingthequeryintosearchtermsandlookinguptheindex,

*rankingresultsbythenumberofsearchtermsappearinginadocument

*

*@returnthelistofdocumentnames

*

*/

publicListquery（StringqueryStr）{

//splitthequerystringintoqueryterms

String[]terms=queryStr.split（"[\\s]+"）;

//lookupthesearchindexandgeneratefildId->countmap

HashMapmap=newHashMap（）;

for（inti=0;i

SetdocIds=searchIndex.get（terms[i]）;

if（docIds!

=null&&docIds.size（）>0）{

for（Integerid:

docIds）{

Integercount=map.get（id）;

if（count==null）

map.put（id,newInteger

（1））;

else

map.put（id,count+1）;

}

//rankthesearchresult,simplybasedonthenumberofquerytermsappearinginadocument.Themorethehighertherank.

ValueComparatorbvc=newValueComparator（map）;

TreeMapsortedMap=newTreeMap（bvc）;

sortedMap.putAll（map）;

StringBuilderbuilder=newStringBuilder（）;

Iteratoriter=sortedMap.keySet（）.iterator（）;

if（iter.hasNext（））

builder.append（docNames.get（iter.next（）））;

while（iter.hasNext（））

builder.append（","+docNames.get（iter.next（）））;

System.out.println（"searchresults:

"+builder.toString（））;

returnnull;

}

/**

*addadocumentandupdatetheindex

*

*@paramdocName

*/

publicvoidaddDoc（StringdocName）{

BufferedReaderbr=null;

try{

IntegerfileId=docNames.size（）;

//findthenextavailablefileId

while（docNames.containsKey（fileId））

fileId++;

docNames.put（fileId,docName）;

Stringline;

br=newBufferedReader（newFileReader（docFolderPath+docName））;

while（（line=br.readLine（））!

=null）{

line=line.toLowerCase（）;

String[]terms=line.split（"[^a-z]+"）;

for（inti=0;i

//checkstopword

if（terms[i].length（）<=1||stopWords.contains（terms[i]））

continue;

SetdocIds=searchIndex.get（terms[i]）;

//createdocIdslistandaddtothesearchindex

if（docIds==null）{

docIds=newTreeSet（）;

docIds.add（fileId）;

searchIndex.put（terms[i],docIds）;

}

else

docIds.add（fileId）;

}

//printSearchIndex（）;

}

catch（IOExceptionex）{

System.err.println（"erroraccessingdoc:

"+ex）;

}

finally{

if（br!

=null）{

try{

br.close（）;

}

catch（IOExceptionex）{

System.err.println（"errorclosingdoc:

"+ex）;

}

//printDocNameIndexFile（）;

}

/**
 * load the search index from file.
 *
 * The format of each line of the index file is as follows:
 * word: docId1,docId2,docId3,...
 *
 * Example:
 * will: 0,1,2,3
 * wise: 2
 */

publicvoidloadIndexFile（）{

BufferedReaderbr=null;

try{

Filefile=newFile（indexFilePath）;

//iffiledoesntexists,thencreateit

if（!

file.exists（））{

file.createNewFile（）;

return;

}

br=newBufferedReader（newFileReader（file））;

Stringline;

while（（line=br.readLine（））!

=null）{

inti=line.indexOf（':

'）;

Stringkey=line.substring（0,i）;

String[]terms=line.substring（i+1）.trim（）.split（"[,\\s]+"）;

Setset=newTreeSet（）;

for（Stringterm:

terms）{

term=term.trim（）;

if（term.length（）>0）{

try{

set.add（Integer.valueOf（term））;

}

catch（NumberFormatExceptionex）{

System.err.println（"loadIndexFile:

baddocIdinline:

"+line）;

}

searchIndex.put（key,set）;

}

catch（IOExceptionex）{

System.err.println（"erroraccessingfile:

"+ex）;

}

finally{

if（br!

=null）{

try{

br.close（）;

}

catch（IOExceptionex）{

System.err.println（"errorclosingfile:

"+ex）;

}

/**
 * load the document name index from file
 *
 * The format is:
 * docId docName
 *
 * example:
 * 0 braveNewWord
 * 1 weAre
 */

publicvoidloadDocNameIndex（）{

BufferedReaderbr=null;

try{

Filefile=newFile（docNameIndexPath）;

//iffiledoesntexists,thencreateit

if（!

file.exists（））{

file.createNewFile（）;

return;

}

br=newBufferedReader（newFileReader（file））;

Stringline;

while（（line=br.readLine（））!

=null）{

String[]terms=line.split（"[\\s]+"）;

if（terms[0].length（）>0&&terms[1].length（）>0）{

try{

docNames.put（Integer.valueOf（terms[0]）,terms[1]）;

}

catch（NumberFormatExceptionex）{

System.err.println（"loadDocNameIndex:

baddocIdinline:

"+line）;

}

catch（IOExceptionex）{

System.err.println（"erroraccessingfile:

"+ex）;

}

finally{

if（br!

=null）{

try{

br.close（）;

}

catch（IOExceptionex）{

System.err.println（"errorclosingfile:

"+ex）;

}

/**

*loadthestopwordsfromfile

*theformatisonewordperline

*/

publicvoidloadStopWords（）{

BufferedReaderbr=null;

try{

br=newBufferedReader（newFileReader（stopWords

文档加载中……请稍候！
如果长时间未打开，您也可以点击刷新试试。

下载文档到电脑，查找使用更方便

下载	加入VIP,免费下载

版权申诉 word格式文档无特别注明外均可编辑修改；预览文档经过压缩，下载后原文更清晰！ 立即下载

配套讲稿：: 如PPT文件的首页显示word图标，表示该PPT已包含配套word讲稿。双击word图标可打开word文档。
特殊限制：: 部分文档作品中含有的国旗、国徽等图片，仅作为作品整体效果示例展示，禁止商用。设计者仅对作品中独创性部分享有著作权。
关键词：: 简单搜索引擎设计和 Java 源代码简单搜索引擎设计

冰豆网所有资源均是用户自行上传分享，仅供网友学习交流，未经上传用户书面授权，请勿作他用。

关于本文

本文标题：简单搜索引擎设计和 Java 源代码.docx
链接地址：https://www.bdocx.com/doc/23675144.html

简单搜索引擎设计和 Java 源代码.docx

热门标签