也谈贝叶斯分类(C#)版本
生活随笔
收集整理的這篇文章主要介紹了
也谈贝叶斯分类(C#)版本
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
???
代碼下載
最近在做一個大作業:搭建一個信息檢索平臺。其中用到了貝葉斯分類,參考了洞庭散人大哥的技術博客:
http://www.cnblogs.com/phinecos/archive/2008/10/21/1316044.html
但是,他的算法運行起來很慢,原因是 IO 操作過於頻繁,而且有些 IO 操作是可以避免的。下面開始介紹我的貝葉斯分類算法實現。
分詞器採用河北理工大學呂震宇老師的 SharpICTCLAS。該分詞器沒有 Lucene 接口,因此自己實現了 Analyzer 和 Tokenizer 類,如下:
?
ICTCLASAnalyzer using?System;using?System.Collections.Generic;
using?System.Text;
using?System.IO;
using?Lucene.Net.Analysis;
using?Lucene.Net.Analysis.Standard;
namespace Bayes
{
    /// <summary>
    /// Lucene analyzer that tokenizes text with ICTCLASTokenizer and then applies
    /// StandardFilter, LowerCaseFilter and a StopFilter built from a stop-word file.
    /// </summary>
    class ICTCLASAnalyzer : Analyzer
    {
        /// <summary>
        /// Shared buffer of stop words. Each constructed analyzer overwrites the leading
        /// slots with the lines it reads; slots beyond what was read are left untouched.
        /// </summary>
        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[400];

        /// <summary>Path of the stop-word file, one word per line.</summary>
        public string NoisePath = Environment.CurrentDirectory + "\\data\\stopwords.txt";

        // Only the stop words this instance actually read, so that unfilled (null)
        // slots of the fixed-size shared buffer are never handed to StopFilter.
        private readonly string[] loadedStopWords;

        /// <summary>
        /// Reads stop words from <see cref="NoisePath"/> until the first empty line,
        /// the end of the file, or the capacity of the shared buffer is reached.
        /// </summary>
        public ICTCLASAnalyzer()
        {
            int loaded = 0;

            // 'using' guarantees the file handle is released even if reading fails.
            using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.Default))
            {
                string noise = reader.ReadLine();
                while (!string.IsNullOrEmpty(noise) && loaded < CHINESE_ENGLISH_STOP_WORDS.Length)
                {
                    CHINESE_ENGLISH_STOP_WORDS[loaded] = noise;
                    noise = reader.ReadLine();
                    loaded++;
                }
            }

            loadedStopWords = new string[loaded];
            Array.Copy(CHINESE_ENGLISH_STOP_WORDS, loadedStopWords, loaded);
        }

        /// <summary>
        /// Builds the token pipeline: ICTCLASTokenizer, then StandardFilter,
        /// LowerCaseFilter and StopFilter, in that order.
        /// </summary>
        /// <param name="fieldName">Field name; not used by this analyzer.</param>
        /// <param name="reader">Source of the text to analyze.</param>
        /// <returns>The filtered token stream.</returns>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ICTCLASTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, loadedStopWords);
            return result;
        }
    }
}
?
?
?
ICTCLASTokenizer using?System;using?System.Collections.Generic;
using?System.Text;
using?Lucene.Net.Analysis;
using?Lucene.Net.Documents;
using?Lucene.Net.Analysis.Standard;
using?System.IO;
using?SharpICTCLAS;
namespace Bayes
{
    /// <summary>
    /// Lucene tokenizer backed by the SharpICTCLAS word segmenter. The whole input is
    /// read and segmented in the constructor; Next() then walks the segmentation result.
    /// </summary>
    class ICTCLASTokenizer : Tokenizer
    {
        // Value passed as the second argument of WordSegment.Segment.
        int nKind = 1;

        // Segmentation output; only the first entry is consumed by Next().
        List<WordResult[]> result;

        // Offsets of the token being emitted, measured in the sentence after "\r\n" removal.
        int startIndex = 0;
        int endIndex = 0;

        // Index of the next word to emit. Iteration starts at 1 and stops before the last
        // entry — presumably those are begin/end markers added by the segmenter (TODO confirm).
        int i = 1;

        /// <summary>The sentence to segment.</summary>
        private string sentence;

        /// <summary>Constructs a tokenizer for this reader and segments its full content.</summary>
        /// <param name="reader">Source of the text to tokenize; read to the end immediately.</param>
        public ICTCLASTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            sentence = input.ReadToEnd();
            sentence = sentence.Replace("\r\n", "");

            // Dictionaries are loaded from "<current directory>/Data/".
            // NOTE(review): the dictionary is initialized for every tokenizer instance,
            // which looks expensive — confirm whether a WordSegment can be shared.
            string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
            WordSegment wordSegment = new WordSegment();
            wordSegment.InitWordSegment(DictPath);
            result = wordSegment.Segment(sentence, nKind);
        }

        /// <summary>
        /// Returns the next token, or null when the stream is exhausted or the
        /// segmenter produced no result.
        /// </summary>
        public override Token Next()
        {
            // Guard: nothing to emit if segmentation returned no result at all.
            if (result == null || result.Count == 0 || result[0] == null)
            {
                return null;
            }

            WordResult[] words = result[0];
            if (i >= words.Length - 1)
            {
                return null;
            }

            string word = words[i].sWord;

            // Lucene's Token end offset is one past the last character of the token,
            // so the next token starts exactly at this token's end offset.
            endIndex = startIndex + word.Length;
            Token token = new Token(word, startIndex, endIndex);
            startIndex = endIndex;
            i++;
            return token;
        }
    }
}
?
?
下面開始介紹我的實現,分為五個類:ChineseSpliter 用於分詞;ClassifyResult 用於儲存結果;MemoryTrainingDataManager 用於管理 IO 操作;FastNaiveBayesClassification 用於實現貝葉斯算法(文中只列出了這四個;代碼裡還引用了 StoreClass,其定義未在文中給出)。和洞庭散人不同之處在於,我把計算先驗概率、條件概率、聯合概率的函數寫在了同一個類裡,而不是多個類,這樣做的目的在於避免不必要的 IO 操作。
?
ClassifyResult using?System;using?System.Collections.Generic;
using?System.Text;
namespace Bayes
{
    /// <summary>
    /// Result of classifying one text: the chosen class name and its score.
    /// </summary>
    class ClassifyResult
    {
        /// <summary>Name of the class; starts out as the empty string.</summary>
        public string className = string.Empty;

        /// <summary>Score assigned to the class; starts out as zero.</summary>
        public float score = 0F;

        /// <summary>Creates a result with an empty class name and a zero score.</summary>
        public ClassifyResult()
        {
        }
    }
}
?
?
?
ChineseSpliter using?System;using?System.Collections.Generic;
using?System.Text;
using?System.IO;
using?Lucene.Net.Analysis;
namespace Bayes
{
    /// <summary>
    /// Splits text into terms using ICTCLASAnalyzer.
    /// </summary>
    class ChineseSpliter
    {
        /// <summary>
        /// Tokenizes <paramref name="text"/> and joins the token texts with
        /// <paramref name="splitToken"/> between consecutive tokens.
        /// </summary>
        /// <param name="text">Text to tokenize.</param>
        /// <param name="splitToken">Separator placed between tokens.</param>
        /// <returns>
        /// The joined tokens with no leading or trailing separator, or the empty
        /// string when the analyzer yields no tokens.
        /// </returns>
        public string Split(string text, string splitToken)
        {
            StringBuilder sb = new StringBuilder();
            Analyzer an = new ICTCLASAnalyzer();
            TokenStream ts = an.TokenStream("", new StringReader(text));
            Lucene.Net.Analysis.Token token;
            bool isFirst = true;

            while ((token = ts.Next()) != null)
            {
                // Write the separator only between tokens. Prefixing every token and then
                // dropping a single character left part of a multi-character separator
                // glued to the first term, and failed outright when there were no tokens.
                if (!isFirst)
                {
                    sb.Append(splitToken);
                }

                sb.Append(token.TermText());
                isFirst = false;
            }

            return sb.ToString();
        }

        /// <summary>
        /// Splits a joined string back into terms, dropping empty entries.
        /// </summary>
        /// <param name="result">String produced by <see cref="Split"/>.</param>
        /// <param name="spliter">Separator that was used to join the terms.</param>
        /// <returns>The non-empty terms, in order.</returns>
        public string[] GetTerms(string result, string spliter)
        {
            string[] terms = result.Split(new string[] { spliter }, StringSplitOptions.RemoveEmptyEntries);
            return terms;
        }
    }
}
?
?
??
MemoryTrainingDataManager using?System;using?System.Collections.Generic;
using?System.Text;
using?System.IO;
namespace Bayes
{
    /// <summary>
    /// Handles file access for the training set: lists the class directories, counts
    /// files, and loads the documents of one class into a StoreClass.
    /// GetClassifications() fills <see cref="txtClassifications"/>;
    /// GetTotalFileCount() fills <see cref="totalFileCount"/>.
    /// </summary>
    class MemoryTrainingDataManager
    {
        /// <summary>Directory paths of the training classes, one per class.</summary>
        public String[] txtClassifications;

        // Root directory whose immediate subdirectories are the classes.
        private static String defaultPath = "F:\\TrainingSet";

        /// <summary>Total number of files across all class directories.</summary>
        public int totalFileCount;

        /// <summary>Stores the subdirectories of the training root in <see cref="txtClassifications"/>.</summary>
        public void GetClassifications()
        {
            this.txtClassifications = Directory.GetDirectories(defaultPath);
        }

        /// <summary>Returns the number of files directly inside a class directory.</summary>
        /// <param name="subclass">Path of the class directory.</param>
        public int GetSubClassFileCount(string subclass)
        {
            return Directory.GetFiles(subclass).Length;
        }

        /// <summary>
        /// Sums the file counts of all class directories into <see cref="totalFileCount"/>.
        /// </summary>
        public void GetTotalFileCount()
        {
            // Load the class list on demand so that calling this method first does not
            // fail with a NullReferenceException.
            if (txtClassifications == null)
            {
                GetClassifications();
            }

            int count = 0;
            for (int i = 0; i < txtClassifications.Length; i++)
            {
                count += GetSubClassFileCount(txtClassifications[i]);
            }

            totalFileCount = count;
        }

        /// <summary>Reads a whole file using the system default encoding.</summary>
        /// <param name="filePath">Path of the file to read.</param>
        /// <returns>The file content.</returns>
        public string GetText(string filePath)
        {
            // 'using' releases the file handle even when ReadToEnd throws.
            using (StreamReader sr = new StreamReader(filePath, Encoding.Default))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>
        /// Fills <paramref name="sc"/> with the class directory path, its file count,
        /// and the full text of every file in that directory.
        /// </summary>
        /// <param name="sc">Structure to populate.</param>
        /// <param name="subclass">Path of the class directory.</param>
        public void SetMainMemoryStructure(ref StoreClass sc, string subclass)
        {
            string[] paths = Directory.GetFiles(subclass);
            sc.classificationName = subclass;
            sc.classificationCount = paths.Length;
            sc.strFileContentList = new string[sc.classificationCount];
            for (int k = 0; k < paths.Length; k++)
            {
                sc.strFileContentList[k] = GetText(paths[k]);
            }
        }

        /// <summary>
        /// Counts the loaded documents of a class whose raw text contains
        /// <paramref name="key"/> as a substring.
        /// </summary>
        /// <param name="key">Term to look for.</param>
        /// <param name="sc">Class data previously filled by SetMainMemoryStructure.</param>
        /// <returns>Number of documents containing the term.</returns>
        public int GetKeyCountOfSubClass(string key, ref StoreClass sc)
        {
            int count = 0;
            for (int i = 0; i < sc.classificationCount; i++)
            {
                if (sc.strFileContentList[i].Contains(key))
                {
                    count++;
                }
            }

            return count;
        }
    }
}
?
?
FastNaiveBayesClassification using?System;using?System.Collections.Generic;
using?System.Text;
namespace Bayes
{
    /// <summary>
    /// Naive Bayes text classifier. Prior, conditional and joint probabilities are
    /// computed in this one class from document counts supplied by
    /// MemoryTrainingDataManager.
    /// </summary>
    class FastNaiveBayesClassification
    {
        /// <summary>Training-data accessor; its class list and total count are loaded in the constructor.</summary>
        public MemoryTrainingDataManager mtdm = new MemoryTrainingDataManager();

        private ChineseSpliter spliter = new ChineseSpliter();

        // Each conditional probability is multiplied by this factor before entering the product.
        private static float ZoomFactor = 10;

        /// <summary>Loads the class directory list and the total file count.</summary>
        public FastNaiveBayesClassification()
        {
            mtdm.GetClassifications();
            mtdm.GetTotalFileCount();
        }

        /// <summary>Computes the prior probability of a class as Nc / N.</summary>
        /// <param name="Nc">Number of documents belonging to class c.</param>
        /// <param name="N">Total number of documents.</param>
        /// <returns>Nc divided by N.</returns>
        public float CalculatePriorProbability(float Nc, float N)
        {
            return Nc / N;
        }

        /// <summary>
        /// Computes the smoothed conditional probability of a term given a class:
        /// (NxC + 1) / (Nc + M + number of classes), with M fixed at 0.
        /// </summary>
        /// <param name="NxC">Number of files in the class in which the term occurs.</param>
        /// <param name="Nc">Total number of files in the class.</param>
        /// <returns>The smoothed conditional probability.</returns>
        public float CalculateConditionalProbability(float NxC, float Nc)
        {
            float M = 0F;
            return (NxC + 1) / (Nc + M + mtdm.txtClassifications.Length);
        }

        /// <summary>
        /// Multiplies the scaled conditional probabilities of all terms and the class prior.
        /// </summary>
        /// <param name="NxC">Per-term document counts within the class.</param>
        /// <param name="Nc">Total number of files in the class.</param>
        /// <param name="N">Total number of files in the training set.</param>
        /// <returns>The scaled joint probability used as the class score.</returns>
        public float CalculateJointProbability(float[] NxC, float Nc, float N)
        {
            // NOTE(review): a plain float product may underflow or overflow for long
            // inputs; ZoomFactor looks like a mitigation for that — confirm.
            float ret = 1;
            for (int i = 0; i < NxC.Length; i++)
            {
                ret *= CalculateConditionalProbability(NxC[i], Nc) * ZoomFactor;
            }

            return ret * CalculatePriorProbability(Nc, N);
        }

        /// <summary>Tokenizes text into terms using "@@@" as the intermediate separator.</summary>
        /// <param name="text">Text to split.</param>
        /// <returns>The terms of the text.</returns>
        public string[] SplitTerms(string text)
        {
            string result = spliter.Split(text, "@@@");
            return spliter.GetTerms(result, "@@@");
        }

        /// <summary>
        /// Scores <paramref name="text"/> against every training class and returns the
        /// class with the highest score; on ties the earliest class wins.
        /// </summary>
        /// <param name="text">Text to classify.</param>
        /// <returns>
        /// The best-scoring class, or an empty ClassifyResult (empty name, zero score)
        /// when there are no training classes.
        /// </returns>
        public ClassifyResult Classify(string text)
        {
            int end = mtdm.txtClassifications.Length;

            // With no class directories there is nothing to rank; return the default
            // result instead of indexing into an empty array.
            if (end == 0)
            {
                return new ClassifyResult();
            }

            string[] terms = SplitTerms(text);
            float N = mtdm.totalFileCount;
            ClassifyResult best = null;

            for (int i = 0; i < end; i++)
            {
                // Load this class's documents from disk (re-read on every call).
                StoreClass sc = new StoreClass();
                mtdm.SetMainMemoryStructure(ref sc, mtdm.txtClassifications[i]);
                float Nc = sc.classificationCount;

                // Nxc[k] = number of documents in this class containing terms[k].
                float[] Nxc = new float[terms.Length];
                for (int k = 0; k < terms.Length; k++)
                {
                    Nxc[k] = mtdm.GetKeyCountOfSubClass(terms[k], ref sc);
                }

                ClassifyResult current = new ClassifyResult();
                current.score = CalculateJointProbability(Nxc, Nc, N);
                current.className = sc.classificationName;

                // NOTE(review): the label text below looks garbled by text extraction;
                // kept as found — confirm against the original source.
                Console.WriteLine("類別{0},分數(shù){1}", current.className, current.score);

                // Single-pass arg-max; the strict '>' keeps the first class on ties,
                // which is what sorting and taking the head produced.
                if (best == null || current.score > best.score)
                {
                    best = current;
                }
            }

            return best;
        }
    }
}
?
總結
以上是生活随笔為你收集整理的也谈贝叶斯分类(C#)版本的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 程序物语(四):苹果是如何落到牛顿头上的
- 下一篇: C# 窗体最小化的托盘/系统通知区域(转