TFIDF实现.docx
- 文档编号:8442673
- 上传时间:2023-01-31
- 格式:DOCX
- 页数:6
- 大小:15.38KB
TFIDF实现.docx
《TFIDF实现.docx》由会员分享,可在线阅读,更多相关《TFIDF实现.docx(6页珍藏版)》请在冰豆网上搜索。
TFIDF实现
TFIDF实现
package tfidf;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import jeasy.analysis.MMAnalyzer;
publicclassReadFiles{
privatestaticList<String>fileList=newArrayList<String>();
privatestaticHashMap<String,HashMap<String,Float>>allTheTf=newHashMap<String,HashMap<String,Float>>();
privatestaticHashMap<String,HashMap<String,Integer>>allTheNormalTF=newHashMap<String,HashMap<String,Integer>>();
publicstaticList<String>readDirs(Stringfilepath)throwsFileNotFoundException,IOException{
try{
Filefile=newFile(filepath);
if(!
file.isDirectory()){
System.out.println("输入的参数应该为[文件夹名]");
System.out.println("filepath:
"+file.getAbsolutePath());
}elseif(file.isDirectory()){
String[]filelist=file.list();
for(inti=0;i<filelist.length;i++){
Filereadfile=newFile(filepath+"\\"+filelist[i]);
if(!
readfile.isDirectory()){
//System.out.println("filepath:
"+readfile.getAbsolutePath());
fileList.add(readfile.getAbsolutePath());
}elseif(readfile.isDirectory()){
readDirs(filepath+"\\"+filelist[i]);
}
}
}
}catch(FileNotFoundExceptione){
System.out.println(e.getMessage());
}
returnfileList;
}
publicstaticStringreadFiles(Stringfile)throwsFileNotFoundException,IOException{
StringBuffersb=newStringBuffer();
InputStreamReaderis=newInputStreamReader(newFileInputStream(file),"gbk");
BufferedReaderbr=newBufferedReader(is);
Stringline=br.readLine();
while(line!
=null){
sb.append(line).append("\r\n");
line=br.readLine();
}
br.close();
returnsb.toString();
}
publicstaticString[]cutWord(Stringfile)throwsIOException{
String[]cutWordResult=null;
Stringtext=ReadFiles.readFiles(file);
MMAnalyzeranalyzer=newMMAnalyzer();
//System.out.println("filecontent:
"+text);
//System.out.println("cutWordResult:
"+analyzer.segment(text,""));
StringtempCutWordResult=analyzer.segment(text,"");
cutWordResult=tempCutWordResult.split("");
returncutWordResult;
}
publicstaticHashMap<String,Float>tf(String[]cutWordResult){
HashMap<String,Float>tf=newHashMap<String,Float>();//正规化
intwordNum=cutWordResult.length;
intwordtf=0;
for(inti=0;i<wordNum;i++){
wordtf=0;
for(intj=0;j<wordNum;j++){
if(cutWordResult[i]!
=""&&i!
=j){
if(cutWordResult[i].equals(cutWordResult[j])){
cutWordResult[j]="";
wordtf++;
}
}
}
if(cutWordResult[i]!
=""){
tf.put(cutWordResult[i],(newFloat(++wordtf))/wordNum);
cutWordResult[i]="";
}
}
returntf;
}
publicstaticHashMap<String,Integer>normalTF(String[]cutWordResult){
HashMap<String,Integer>tfNormal=newHashMap<String,Integer>();//没有正规化
intwordNum=cutWordResult.length;
intwordtf=0;
for(inti=0;i<wordNum;i++){
wordtf=0;
if(cutWordResult[i]!
=""){
for(intj=0;j<wordNum;j++){
if(i!
=j){
if(cutWordResult[i].equals(cutWordResult[j])){
cutWordResult[j]="";
wordtf++;
}
}
}
tfNormal.put(cutWordResult[i],++wordtf);
cutWordResult[i]="";
}
}
returntfNormal;
}
publicstaticMap<String,HashMap<String,Float>>tfOfAll(Stringdir)throwsIOException{
List<String>fileList=ReadFiles.readDirs(dir);
for(Stringfile:
fileList){
HashMap<String,Float>dict=newHashMap<String,Float>();
dict=ReadFiles.tf(ReadFiles.cutWord(file));
allTheTf.put(file,dict);
}
returnallTheTf;
}
publicstaticMap<String,HashMap<String,Integer>>NormalTFOfAll(Stringdir)throwsIOException{
List<String>fileList=ReadFiles.readDirs(dir);
for(inti=0;i<fileList.size();i++){
HashMap<String,Integer>dict=newHashMap<String,Integer>();
dict=ReadFiles.normalTF(ReadFiles.cutWord(fileList.get(i)));
allTheNormalTF.put(fileList.get(i),dict);
}
returnallTheNormalTF;
}
publicstaticMap<String,Float>idf(Stringdir)throwsFileNotFoundException,UnsupportedEncodingException,IOException{
//公式IDF=log((1+|D|)/|Dt|),其中|D|表示文档总数,|Dt|表示包含关键词t的文档数量。
Map<String,Float>idf=newHashMap<String,Float>();
List<String>located=newArrayList<String>();
floatDt=1;
floatD=allTheNormalTF.size();//文档总数
List<String>key=fileList;//存储各个文档名的List
Map<String,HashMap<String,Integer>>tfInIdf=allTheNormalTF;//存储各个文档tf的Map
for(inti=0;i<D;i++){
HashMap<String,Integer>temp=tfInIdf.get(key.get(i));
for(Stringword:
temp.keySet()){
Dt=1;
if(!
(located.contains(word))){
for(intk=0;k<D;k++){
if(k!
=i){
HashMap<String,Integer>temp2=tfInIdf.get(key.get(k));
if(temp2.keySet().contains(word)){
located.add(word);
Dt=Dt+1;
continue;
}
}
}
idf.put(word,Log.log((1+D)/Dt,10));
}
}
}
returnidf;
}
publicstaticMap<String,HashMap<String,Float>>tfidf(Stringdir)throwsIOException{
Map<String,Float>idf=ReadFiles.idf(dir);
Map<String,HashMap<String,Float>>tf=ReadFiles.tfOfAll(dir);
for(Stringfile:
tf.keySet()){
Map<String,Float>singelFile=tf.get(file);
for(Stringword:
singelFile.keySet()){
singelFile.put(word,(idf.get(word))*singelFile.get(word));
}
}
returntf;
}
}
- 配套讲稿:
如PPT文件的首页显示word图标,表示该PPT已包含配套word讲稿。双击word图标可打开word文档。
- 特殊限制:
部分文档作品中含有的国旗、国徽等图片,仅作为作品整体效果示例展示,禁止商用。设计者仅对作品中独创性部分享有著作权。
- 关 键 词:
- TFIDF 实现