日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 编程语言 > C# >内容正文

C#

也谈贝叶斯分类(C#)版本

發布時間:2025/3/20 C# 21 豆豆
生活随笔 收集整理的這篇文章主要介紹了 也谈贝叶斯分类(C#)版本,小編覺得挺不錯的,現在分享給大家,幫大家做個參考。

???

代碼下載

最近在做一個大作業,搭建一個信息檢索平臺。其中用到了貝葉斯分類,參考了洞庭散人大哥的技術博客:

http://www.cnblogs.com/phinecos/archive/2008/10/21/1316044.html

但是,他的算法運行起來很慢,原因是IO操作過于頻繁,而且有些IO操作是可以避免的。下面開始介紹我的貝葉斯分類算法實現(xiàn)。

采用的分詞器為河北理工大學呂震宇老師的 SharpICTCLAS。該分詞器沒有 Lucene 接口,所以自己實現了 Analyzer 和 Tokenizer 類,如下:

?

ICTCLASAnalyzer using?System;
using?System.Collections.Generic;
using?System.Text;
using?System.IO;
using?Lucene.Net.Analysis;
using?Lucene.Net.Analysis.Standard;

namespace Bayes
{
    /// <summary>
    /// Lucene analyzer that tokenizes text with <see cref="ICTCLASTokenizer"/> and then
    /// passes the tokens through StandardFilter, LowerCaseFilter and a StopFilter built
    /// from the stop-word file.
    /// </summary>
    class ICTCLASAnalyzer : Analyzer
    {
        /// <summary>
        /// Stop-word table shared by every instance. It has a fixed capacity of 400
        /// entries; slots that have never been filled remain null.
        /// </summary>
        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[400];

        /// <summary>
        /// Path of the stop-word file, resolved against the current directory.
        /// The constructor reads it one word per line.
        /// </summary>
        public string NoisePath = Environment.CurrentDirectory + "\\data\\stopwords.txt";

        /// <summary>
        /// Reads the stop-word file and stores up to 400 non-empty lines in
        /// <see cref="CHINESE_ENGLISH_STOP_WORDS"/>, starting at slot 0.
        /// </summary>
        public ICTCLASAnalyzer()
        {
            int count = 0;

            // 'using' guarantees the file handle is released even if reading fails.
            using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.Default))
            {
                string noise;
                while (count < CHINESE_ENGLISH_STOP_WORDS.Length && (noise = reader.ReadLine()) != null)
                {
                    // Skip blank lines instead of treating the first one as end-of-file,
                    // so a stray empty line does not silently truncate the stop list.
                    if (noise.Length == 0)
                    {
                        continue;
                    }

                    CHINESE_ENGLISH_STOP_WORDS[count] = noise;
                    count++;
                }
            }
        }

        /// <summary>
        /// Builds the token stream for a field: an <see cref="ICTCLASTokenizer"/> filtered by
        /// a StandardFilter, a LowerCaseFilter and a StopFilter.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not used here).</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The filtered token stream.</returns>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ICTCLASTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, GetLoadedStopWords());
            return result;
        }

        /// <summary>
        /// Returns only the populated entries of the fixed-size stop-word table.
        /// NOTE(review): whether StopFilter tolerates null entries depends on the
        /// Lucene.Net version in use; filtering them out is safe either way.
        /// </summary>
        private static string[] GetLoadedStopWords()
        {
            List<string> loaded = new List<string>(CHINESE_ENGLISH_STOP_WORDS.Length);
            foreach (string word in CHINESE_ENGLISH_STOP_WORDS)
            {
                if (!string.IsNullOrEmpty(word))
                {
                    loaded.Add(word);
                }
            }

            return loaded.ToArray();
        }
    }
}

?

?

?

ICTCLASTokenizer using?System;
using?System.Collections.Generic;
using?System.Text;
using?Lucene.Net.Analysis;
using?Lucene.Net.Documents;
using?Lucene.Net.Analysis.Standard;
using?System.IO;
using?SharpICTCLAS;


namespace Bayes
{
    /// <summary>
    /// Lucene tokenizer backed by the SharpICTCLAS word segmenter. The whole input is read
    /// and segmented in the constructor; <see cref="Next"/> then hands out one word at a time.
    /// </summary>
    class ICTCLASTokenizer : Tokenizer
    {
        // Second argument passed to WordSegment.Segment.
        // NOTE(review): presumably the number of candidate segmentations requested - confirm.
        int nKind = 1;

        // Segmentation output; only the first entry (result[0]) is consumed by Next().
        List<WordResult[]> result;

        // Offsets of the token being produced, measured in the CR/LF-stripped sentence.
        int startIndex = 0;
        int endIndex = 0;

        // Index of the next word to return. Starts at 1 and Next() stops before the last
        // element, so the first and last entries of result[0] are never emitted.
        // NOTE(review): presumably these are the segmenter's begin/end sentinels - confirm.
        int i = 1;

        /// <summary>
        /// The sentence to be segmented.
        /// </summary>
        private string sentence;

        /// <summary>
        /// Constructs a tokenizer for this reader: reads all of its text, removes "\r\n"
        /// sequences, and segments the result using the dictionaries found in the
        /// "Data" directory under the current directory.
        /// </summary>
        /// <param name="reader">Source of the text to tokenize.</param>
        public ICTCLASTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            sentence = input.ReadToEnd();
            sentence = sentence.Replace("\r\n", "");

            string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;

            WordSegment wordSegment = new WordSegment();
            wordSegment.InitWordSegment(DictPath);
            result = wordSegment.Segment(sentence, nKind);
        }

        /// <summary>
        /// Returns the next token of the segmented sentence, or null when no words remain.
        /// </summary>
        /// <returns>The next <see cref="Token"/>, or null at the end of the stream.</returns>
        public override Token Next()
        {
            // Guard against an empty segmentation result (e.g. empty input text).
            if (result == null || result.Count == 0 || result[0] == null)
            {
                return null;
            }

            WordResult[] words = result[0];
            if (i >= words.Length - 1)
            {
                return null;
            }

            string word = words[i].sWord;

            // Lucene end offsets are exclusive: one past the last character of the token,
            // so that (end - start) equals the token length.
            endIndex = startIndex + word.Length;
            Token token = new Token(word, startIndex, endIndex);

            // The next word begins where this one ended.
            // NOTE(review): assumes the emitted words are contiguous in the sentence - confirm.
            startIndex = endIndex;
            i++;

            return token;
        }
    }
}

?

?

下面開始介紹我的實現,分為五個類:ChineseSpliter 用于分詞;ClassifyResult 用于儲存結果;MemoryTrainingDataManager 用于管理IO操作;FastNaiveBayesClassification 用于實現貝葉斯算法。和洞庭散人不同之處在于,我把計算先驗概率、條件概率、聯合概率的函數寫在了一個類里,而不是多個類,這樣做的目的在于避免不必要的IO操作。

?

ClassifyResult using?System;
using?System.Collections.Generic;
using?System.Text;

namespace Bayes
{
    /// <summary>
    /// Result of scoring a text against one category: the category name and its score.
    /// </summary>
    class ClassifyResult
    {
        /// <summary>Name of the category; empty until assigned.</summary>
        public string className;

        /// <summary>Score computed for the category; zero until assigned.</summary>
        public float score;

        /// <summary>
        /// Creates a result with an empty category name and a score of zero.
        /// </summary>
        public ClassifyResult()
        {
            className = "";
            score = 0;
        }
    }
}

?

?

?

ChineseSpliter using?System;
using?System.Collections.Generic;
using?System.Text;
using?System.IO;
using?Lucene.Net.Analysis;


namespace Bayes
{
    /// <summary>
    /// Splits text into terms using <see cref="ICTCLASAnalyzer"/>.
    /// </summary>
    class ChineseSpliter
    {
        /// <summary>
        /// Tokenizes <paramref name="text"/> and joins the resulting terms with
        /// <paramref name="splitToken"/> between consecutive terms.
        /// </summary>
        /// <param name="text">Text to tokenize.</param>
        /// <param name="splitToken">Separator written between terms; may be longer than one character.</param>
        /// <returns>The joined terms, or an empty string when the analyzer yields no tokens.</returns>
        public string Split(string text, string splitToken)
        {
            StringBuilder sb = new StringBuilder();

            Analyzer an = new ICTCLASAnalyzer();
            TokenStream ts = an.TokenStream("", new StringReader(text));

            // Write the separator only between terms. Prefixing every term and then
            // stripping one leading character breaks for multi-character separators
            // and throws when there are no tokens at all.
            bool first = true;
            Lucene.Net.Analysis.Token token;
            while ((token = ts.Next()) != null)
            {
                if (!first)
                {
                    sb.Append(splitToken);
                }

                sb.Append(token.TermText());
                first = false;
            }

            return sb.ToString();
        }

        /// <summary>
        /// Splits a string produced by <see cref="Split"/> back into its terms,
        /// dropping empty entries.
        /// </summary>
        /// <param name="result">The joined term string.</param>
        /// <param name="spliter">The separator that was used to join the terms.</param>
        /// <returns>The non-empty terms, in order.</returns>
        public string[] GetTerms(string result, string spliter)
        {
            string[] terms = result.Split(new string[] { spliter }, StringSplitOptions.RemoveEmptyEntries);
            return terms;
        }
    }
}

?

?

??

MemoryTrainingDataManager using?System;
using?System.Collections.Generic;
using?System.Text;
using?System.IO;



namespace Bayes
{
    /// <summary>
    /// Reads the training corpus from disk. Each sub-directory of the training-set root
    /// is treated as one category, and each file inside it as one training document.
    /// Call <see cref="GetClassifications"/> to fill <see cref="txtClassifications"/> and
    /// <see cref="GetTotalFileCount"/> to fill <see cref="totalFileCount"/>.
    /// </summary>
    class MemoryTrainingDataManager
    {
        /// <summary>Paths of the category sub-directories; null until GetClassifications runs.</summary>
        public String[] txtClassifications;

        // Root directory used by the parameterless constructor.
        private static String defaultPath = "F:\\TrainingSet";

        /// <summary>Total number of training files over all categories; set by GetTotalFileCount.</summary>
        public int totalFileCount;

        // Root directory this instance reads categories from.
        private readonly string trainingSetPath;

        /// <summary>
        /// Creates a manager that reads from the default training-set directory.
        /// </summary>
        public MemoryTrainingDataManager()
            : this(defaultPath)
        {
        }

        /// <summary>
        /// Creates a manager that reads from the given training-set directory.
        /// </summary>
        /// <param name="trainingSetPath">Directory whose sub-directories are the categories.</param>
        /// <exception cref="ArgumentException">The path is null or empty.</exception>
        public MemoryTrainingDataManager(string trainingSetPath)
        {
            if (string.IsNullOrEmpty(trainingSetPath))
            {
                throw new ArgumentException("Training set path must not be null or empty.", "trainingSetPath");
            }

            this.trainingSetPath = trainingSetPath;
        }

        /// <summary>
        /// Stores the sub-directories of the training-set root in <see cref="txtClassifications"/>.
        /// </summary>
        public void GetClassifications()
        {
            this.txtClassifications = Directory.GetDirectories(trainingSetPath);
        }

        /// <summary>
        /// Counts the files directly inside a category directory.
        /// </summary>
        /// <param name="subclass">Path of the category directory.</param>
        /// <returns>Number of files in that directory.</returns>
        public int GetSubClassFileCount(string subclass)
        {
            string[] paths = Directory.GetFiles(subclass);
            return paths.Length;
        }

        /// <summary>
        /// Sums the file counts of all categories into <see cref="totalFileCount"/>.
        /// </summary>
        public void GetTotalFileCount()
        {
            // Load the category list on demand so call order cannot cause a null dereference.
            if (txtClassifications == null)
            {
                GetClassifications();
            }

            int count = 0;
            foreach (string classification in txtClassifications)
            {
                count += GetSubClassFileCount(classification);
            }

            totalFileCount = count;
        }

        /// <summary>
        /// Reads a whole file using the system default encoding.
        /// </summary>
        /// <param name="filePath">Path of the file to read.</param>
        /// <returns>The file's text.</returns>
        public string GetText(string filePath)
        {
            // 'using' closes the reader even if ReadToEnd throws.
            using (StreamReader sr = new StreamReader(filePath, Encoding.Default))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>
        /// Loads one category into <paramref name="sc"/>: its directory path, its file count,
        /// and the full text of every file in it.
        /// </summary>
        /// <param name="sc">Structure to fill.</param>
        /// <param name="subclass">Path of the category directory.</param>
        public void SetMainMemoryStructure(ref StoreClass sc, string subclass)
        {
            string[] paths = Directory.GetFiles(subclass);

            sc.classificationName = subclass;
            sc.classificationCount = paths.Length;
            sc.strFileContentList = new string[sc.classificationCount];

            for (int k = 0; k < paths.Length; k++)
            {
                sc.strFileContentList[k] = GetText(paths[k]);
            }
        }

        /// <summary>
        /// Counts the documents of a loaded category whose text contains <paramref name="key"/>
        /// as a substring.
        /// </summary>
        /// <param name="key">Term to look for.</param>
        /// <param name="sc">Category previously filled by <see cref="SetMainMemoryStructure"/>.</param>
        /// <returns>Number of documents containing the term.</returns>
        public int GetKeyCountOfSubClass(string key, ref StoreClass sc)
        {
            int count = 0;
            for (int i = 0; i < sc.classificationCount; i++)
            {
                if (sc.strFileContentList[i].Contains(key))
                {
                    count++;
                }
            }

            return count;
        }
    }
}

?

?

FastNaiveBayesClassification using?System;
using?System.Collections.Generic;
using?System.Text;

namespace Bayes
{
    /// <summary>
    /// Naive Bayes text classifier. Prior, conditional and joint probabilities are all
    /// computed in this one class, using document counts supplied by
    /// <see cref="MemoryTrainingDataManager"/>.
    /// </summary>
    class FastNaiveBayesClassification
    {
        /// <summary>Training-data accessor; its category list and total file count are loaded by the constructor.</summary>
        public MemoryTrainingDataManager mtdm = new MemoryTrainingDataManager();

        private ChineseSpliter spliter = new ChineseSpliter();

        // Each conditional probability is multiplied by this factor in the joint product.
        // Every category multiplies in the same number of factors (one per term), so the
        // ranking between categories is unaffected.
        // NOTE(review): presumably intended to delay float underflow; very long texts can
        // still underflow or overflow a float product - confirm acceptable.
        private static readonly float ZoomFactor = 10;

        /// <summary>
        /// Loads the category list and the total training-file count.
        /// </summary>
        public FastNaiveBayesClassification()
        {
            mtdm.GetClassifications();
            mtdm.GetTotalFileCount();
        }

        /// <summary>
        /// Prior probability of a category: Nc / N.
        /// </summary>
        /// <param name="Nc">Number of training documents belonging to the category.</param>
        /// <param name="N">Total number of training documents.</param>
        /// <returns>Nc divided by N.</returns>
        public float CalculatePriorProbability(float Nc, float N)
        {
            return Nc / N;
        }

        /// <summary>
        /// Smoothed conditional probability of a term given a category:
        /// (NxC + 1) / (Nc + number of categories).
        /// </summary>
        /// <param name="NxC">Number of documents in the category that contain the term.</param>
        /// <param name="Nc">Total number of documents in the category.</param>
        /// <returns>The smoothed ratio.</returns>
        public float CalculateConditionalProbability(float NxC, float Nc)
        {
            // NOTE(review): the smoothing denominator adds the number of categories;
            // confirm this is the intended smoothing term.
            return (NxC + 1) / (Nc + mtdm.txtClassifications.Length);
        }

        /// <summary>
        /// Joint score of a category: the product over all terms of
        /// (conditional probability * ZoomFactor), multiplied by the category's prior.
        /// </summary>
        /// <param name="NxC">Per-term counts of documents in the category containing the term.</param>
        /// <param name="Nc">Total number of documents in the category.</param>
        /// <param name="N">Total number of training documents.</param>
        /// <returns>The scaled joint score.</returns>
        public float CalculateJointProbability(float[] NxC, float Nc, float N)
        {
            float ret = 1;
            for (int i = 0; i < NxC.Length; i++)
            {
                ret *= CalculateConditionalProbability(NxC[i], Nc) * ZoomFactor;
            }

            return ret * CalculatePriorProbability(Nc, N);
        }

        /// <summary>
        /// Splits text into terms with the Chinese word splitter.
        /// </summary>
        /// <param name="text">Text to split.</param>
        /// <returns>The terms found in the text.</returns>
        public string[] SplitTerms(string text)
        {
            string result = spliter.Split(text, "@@@");
            return spliter.GetTerms(result, "@@@");
        }

        /// <summary>
        /// Scores <paramref name="text"/> against every category and returns the best one.
        /// Each category's documents are loaded from disk while it is being scored, and the
        /// category name and score are written to the console.
        /// </summary>
        /// <param name="text">Text to classify.</param>
        /// <returns>The highest-scoring category; on ties the earlier category wins.</returns>
        /// <exception cref="InvalidOperationException">No training categories are available.</exception>
        public ClassifyResult Classify(string text)
        {
            int end = mtdm.txtClassifications.Length;
            if (end == 0)
            {
                throw new InvalidOperationException("No training categories are available; cannot classify.");
            }

            string[] terms = SplitTerms(text);
            float N = mtdm.totalFileCount;

            ClassifyResult best = null;
            for (int i = 0; i < end; i++)
            {
                StoreClass sc = new StoreClass();
                mtdm.SetMainMemoryStructure(ref sc, mtdm.txtClassifications[i]);

                float Nc = sc.classificationCount;
                float[] Nxc = new float[terms.Length];
                for (int k = 0; k < terms.Length; k++)
                {
                    Nxc[k] = mtdm.GetKeyCountOfSubClass(terms[k], ref sc);
                }

                ClassifyResult current = new ClassifyResult();
                current.score = CalculateJointProbability(Nxc, Nc, N);
                current.className = sc.classificationName;
                Console.WriteLine("類別{0},分數{1}", current.className, current.score);

                // Only the maximum is needed, so track it in one pass instead of sorting.
                // The strict '>' keeps the earliest category on ties.
                if (best == null || current.score > best.score)
                {
                    best = current;
                }
            }

            return best;
        }
    }
}

?

總結

以上是生活随笔為你收集整理的也谈贝叶斯分类(C#)版本的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。