简单搜索引擎设计和 Java 源代码.docx
- 文档编号:23675144
- 上传时间:2023-05-19
- 格式:DOCX
- 页数:18
- 大小:17.78KB
简单搜索引擎设计和 Java 源代码.docx
《简单搜索引擎设计和 Java 源代码.docx》由会员分享,可在线阅读,更多相关《简单搜索引擎设计和 Java 源代码.docx(18页珍藏版)》请在冰豆网上搜索。
简单搜索引擎设计和Java源代码
A simple search engine with the following features:
Included in the package:
source code:
SimpleSearchEngine.java
SimpleSearchEngineImpl.java
SimpleSearchEngineTest.java
readme:
this file
stopWords:
the stop word file
searchFiles/:
a directory that contains a bunch of test files
Usage:
1. SimpleSearchEngineTest.java can be modified to add more documents and new queries.
2. Compile the code.
3. To run:
java -cp . SimpleSearchEngineTest
Features:
1. Builds an inverted index for the terms in documents and stores it in an index file. The index is updated as more documents are added, and it is loaded into memory during startup.
2. Examines stop words.
3. Simple query: splits the query string into words and returns the list of names of documents containing one or more of those words.
4. Simple ranking of the search results based on the number of search words found in each document.
Preparation:
1. A document folder where all the documents reside (searchFiles/ is assumed in the test).
2. The path of the index file. The index file holds the inverted index, mapping each term to a list of docIds. Both the in-memory index and the file are updated as documents are added.
3. The path of a document name index file. This file holds the docId-to-docName mapping and is updated as documents are added.
4. A stop word file containing the stop words. An example is given.
SimpleSearchEngine.java
importjava.util.List;
/**
 * A simple search engine.
 */
publicinterfaceSimpleSearchEngine{
/*
 * Simple query: splits the query into search terms and looks them up in the index,
 * ranking results by the number of search terms appearing in a document.
 *
 * @return list of document names
 */
// NOTE(review): the declaration below is cut off in this copy -- the generic type
// argument, the method name and the parameter list are missing. Restore it from the
// original source before compiling.
publicList
/**
 * Adds a document and updates the index.
 *
 * @param docName document name
 */
publicvoidaddDoc(StringdocName);
}
SimpleSearchEngineImpl.java
importjava.io.BufferedReader;
importjava.io.File;
importjava.io.FileReader;
importjava.io.FileWriter;
importjava.io.IOException;
importjava.util.Comparator;
importjava.util.HashMap;
importjava.util.HashSet;
importjava.util.Iterator;
importjava.util.List;
importjava.util.Map;
importjava.util.Set;
importjava.util.TreeMap;
importjava.util.TreeSet;
/**
 * A simple search engine with the following features:
 *
 * 1. Builds an inverted index for the terms in documents and stores it in an index file. The index is updated as more documents are added,
 *    and it is loaded into memory during startup.
 * 2. Examines stop words.
 * 3. Simple query: splits the query string into words and returns the list of names of documents containing one or more of those words.
 * 4. Simple ranking of the search results based on the number of search words found in each document.
 *
 *
 * Preparation:
 * 1. A document folder where all the documents reside.
 * 2. Path of the index file. The index file holds the inverted index, mapping each term to a list of docIds. Both the in-memory index and
 *    the file are updated as documents are added.
 * 3. Path of a document name index file. This file holds the docId-to-docName mapping and is updated as documents are added.
 * 4. A stop word file containing the stop words.
 *
 * @author dennisli
 *
 */
publicclassSimpleSearchEngineImplimplementsSimpleSearchEngine{
// Folder containing the documents; the constructor stores it with a trailing separator.
privateStringdocFolderPath=null;
// Path of the index file (term -> docIds).
privateStringindexFilePath=null;
// Path of the document name index file (docId -> docName).
privateStringdocNameIndexPath=null;
// Path of the stop word file.
privateStringstopWordsFilePath=null;
// Search index map, mapping words to a set of document IDs.
// NOTE(review): declaration truncated in this copy; the methods refer to this field as
// searchIndex -- confirm the type arguments and initializer against the original source.
privateMap
// Document name index map, mapping the docId to the docName.
// NOTE(review): declaration truncated in this copy; the methods refer to this field as
// docNames -- confirm the type arguments and initializer against the original source.
privateMap
// The set of stop words.
// NOTE(review): declaration truncated in this copy; the methods refer to this field as
// stopWords -- confirm the type argument and initializer against the original source.
privateSet
/**
 * Creates a search engine over the given document folder and index files.
 *
 * <p>The folder path is stored with a trailing {@code '/'} so that a document name can be
 * appended to it directly when a document is opened. Nothing is loaded here; the paths
 * are only recorded.
 *
 * @param docFolderPath folder containing the documents; must not be empty
 * @param indexFilePath path of the search index file
 * @param docNameIndexPath path of the document name index file
 * @param stopWordsFilePath path of the stop word file
 * @throws NullPointerException if {@code docFolderPath} is {@code null}
 * @throws IllegalArgumentException if {@code docFolderPath} is empty
 */
public SimpleSearchEngineImpl(String docFolderPath, String indexFilePath,
        String docNameIndexPath, String stopWordsFilePath) {
    if (docFolderPath == null) {
        throw new NullPointerException("docFolderPath must not be null");
    }
    // An empty path used to fail inside charAt() with an opaque
    // StringIndexOutOfBoundsException; reject it with a clear message instead.
    if (docFolderPath.isEmpty()) {
        throw new IllegalArgumentException("docFolderPath must not be empty");
    }
    // Append a single separator when missing (the previous code appended "//").
    if (docFolderPath.endsWith("/")) {
        this.docFolderPath = docFolderPath;
    } else {
        this.docFolderPath = docFolderPath + "/";
    }
    this.indexFilePath = indexFilePath;
    this.docNameIndexPath = docNameIndexPath;
    this.stopWordsFilePath = stopWordsFilePath;
}
/*
*initialize
*
*loadthesearchindex,docnameindex,stopwordsfromfilesiftheyexist
*
*/
publicvoidinit(){
loadIndexFile();
loadDocNameIndex();
loadStopWords();
}
/*
 * Simple query: splits the query into search terms and looks them up in the index,
 * ranking results by the number of search terms appearing in a document.
 *
 * @return the list of document names
 *
 */
publicList
//splitthequerystringintoqueryterms
String[]terms=queryStr.split("[\\s]+");
//lookupthesearchindexandgeneratefildId->countmap
HashMap
for(inti=0;i Set if(docIds! =null&&docIds.size()>0){ for(Integerid: docIds){ Integercount=map.get(id); if(count==null) map.put(id,newInteger (1)); else map.put(id,count+1); } } } //rankthesearchresult,simplybasedonthenumberofquerytermsappearinginadocument.Themorethehighertherank. ValueComparatorbvc=newValueComparator(map); TreeMap sortedMap.putAll(map); StringBuilderbuilder=newStringBuilder(); Iterator if(iter.hasNext()) builder.append(docNames.get(iter.next())); while(iter.hasNext()) builder.append(","+docNames.get(iter.next())); System.out.println("searchresults: "+builder.toString()); returnnull; } /** *addadocumentandupdatetheindex * *@paramdocName */ publicvoidaddDoc(StringdocName){ BufferedReaderbr=null; try{ IntegerfileId=docNames.size(); //findthenextavailablefileId while(docNames.containsKey(fileId)) fileId++; docNames.put(fileId,docName); Stringline; br=newBufferedReader(newFileReader(docFolderPath+docName)); while((line=br.readLine())! =null){ line=line.toLowerCase(); String[]terms=line.split("[^a-z]+"); for(inti=0;i //checkstopword if(terms[i].length()<=1||stopWords.contains(terms[i])) continue; Set //createdocIdslistandaddtothesearchindex if(docIds==null){ docIds=newTreeSet docIds.add(fileId); searchIndex.put(terms[i],docIds); } else docIds.add(fileId); } } //printSearchIndex(); } catch(IOExceptionex){ System.err.println("erroraccessingdoc: "+ex); } finally{ if(br! =null){ try{ br.close(); } catch(IOExceptionex){ System.err.println("errorclosingdoc: "+ex); } } } //printDocNameIndexFile(); } /** *loadthesearchindexfromfile. * *Theformatofeachlineoftheindexfileisasfollows: *word: docId1,docId2,docId3,... * *Example: *will: 0,1,2,3 *wise: 2 */ publicvoidloadIndexFile(){ BufferedReaderbr=null; try{ Filefile=newFile(indexFilePath); //iffiledoesntexists,thencreateit if(! file.exists()){ file.createNewFile(); return; } br=newBufferedReader(newFileReader(file)); Stringline; while((line=br.readLine())! 
=null){ inti=line.indexOf(': '); Stringkey=line.substring(0,i); String[]terms=line.substring(i+1).trim().split("[,\\s]+"); Set for(Stringterm: terms){ term=term.trim(); if(term.length()>0){ try{ set.add(Integer.valueOf(term)); } catch(NumberFormatExceptionex){ System.err.println("loadIndexFile: baddocIdinline: "+line); } } } searchIndex.put(key,set); } } catch(IOExceptionex){ System.err.println("erroraccessingfile: "+ex); } finally{ if(br! =null){ try{ br.close(); } catch(IOExceptionex){ System.err.println("errorclosingfile: "+ex); } } } } /** *loadthedocumentnameindexfromfile * *Theformatis: *docIddocName * *example: *0braveNewWord *1weAre * */ publicvoidloadDocNameIndex(){ BufferedReaderbr=null; try{ Filefile=newFile(docNameIndexPath); //iffiledoesntexists,thencreateit if(! file.exists()){ file.createNewFile(); return; } br=newBufferedReader(newFileReader(file)); Stringline; while((line=br.readLine())! =null){ String[]terms=line.split("[\\s]+"); if(terms[0].length()>0&&terms[1].length()>0){ try{ docNames.put(Integer.valueOf(terms[0]),terms[1]); } catch(NumberFormatExceptionex){ System.err.println("loadDocNameIndex: baddocIdinline: "+line); } } } } catch(IOExceptionex){ System.err.println("erroraccessingfile: "+ex); } finally{ if(br! =null){ try{ br.close(); } catch(IOExceptionex){ System.err.println("errorclosingfile: "+ex); } } } } /** *loadthestopwordsfromfile *theformatisonewordperline */ publicvoidloadStopWords(){ BufferedReaderbr=null; try{ br=newBufferedReader(newFileReader(stopWords
- 配套讲稿:
如PPT文件的首页显示word图标,表示该PPT已包含配套word讲稿。双击word图标可打开word文档。
- 特殊限制:
部分文档作品中含有的国旗、国徽等图片,仅作为作品整体效果示例展示,禁止商用。设计者仅对作品中独创性部分享有著作权。
- 关 键 词:
- 简单搜索引擎设计和 Java 源代码 简单 搜索引擎 设计