您好,欢迎光临本网站![请登录][注册会员]  
文件名称: word detection
  所属分类: C#
  开发工具:
  文件大小: 39kb
  下载次数: 0
  上传时间: 2013-04-03
  提 供 者: zlj***
 详细说明: 基于词在大数据文本的出现频率来进行分词 using System; using System.Text.RegularExpressions; using System.Collections.Generic; using System.Linq; using System.Text; using System.IO; namespace WordDetection { class Program { static WordDetector wordDetector = null; static StreamWriter sw = null; private static void PrintResults() { if (sw == null) return; foreach (string word in wordDetector.FinalWords) { sw.WriteLine("{0}\t{1}", word, wordDetector .Freq[word]); } } static void Main(string[] args) { if (args.Length < 2) { Console.WriteLine("Usage: worddetector "); return; } wordDetector = new WordDetector(args[0]); //wordDetector.ProcessOver += PrintResults; var sw = new StreamWriter(args[1]); wordDetector.Process(); PrintResults(); sw.Flush(); sw.Close(); } } public class WordDetector { public Action ProcessOver = null; internal struct CharPos { public char ThisChar; public bool PositionOnRight; public CharPos(char value, bool positionOnRight) { this.ThisChar = value; this.PositionOnRight = positionOnRight; } } public const int MaxWordLength = 5, // 要检测的最长的词组长度 MinFreq = 10; // 词语出现的最小频数 public const double PSvPThreshold = 100, // theta_c EntropyThreshold = 1.3; // theta_f HashSet finalWords = new HashSet(); Dictionary> words = new Dictionary>(); Dictionary freq = new Dictionary(); Dictionary ps = new Dictionary(); Regex regSplit = new Regex(@"\W+|[a-zA-Z0-9]+", RegexOptions.Compiled | RegexOptions.Multiline); StreamReader sr = null; int total = 0; string _filename = ""; public HashSet FinalWords { get { return finalWords; } } public Dictionary Freq { get { return freq; } } public WordDetector (string filename) { _filename = filename; renewStreamReader(); } private void renewStreamReader () { sr = new StreamReader(_filename); } public void StartProcess () { System.Threading.Thread thr = new System.Threading.Thread(new System.Threading.ThreadStart(Process)); thr.Start(); } private void wordInfoEntropy (string word, out double leftEntropy, out double rightEntropy) { leftEntropy = 
rightEntropy = 0; double totalL = 0, totalR = 0; foreach (KeyValuePair pair in words[word]) { if (pair.Key.PositionOnRight) totalR += pair.Value; else totalL += pair.Value; } if (totalL <= 0) leftEntropy = double.MaxValue; if (totalR <= 0) rightEntropy = double.MaxValue; foreach (KeyValuePair pair in words[word]) { double p; if (pair.Key.PositionOnRight) { p = (double)pair.Value / totalR; rightEntropy -= p * Math.Log(p); } else { p = (double)pair.Value / totalL; leftEntropy -= p * Math.Log(p); } } } public void Process () { Console.WriteLine("Reading input..."); string line = ""; while ((line = sr.ReadLine()) != null) { total += addParagraph (line); } finalizeParagraph (); sr.Close (); Console.WriteLine("Building candidate word list..."); foreach (KeyValuePair pair in ps) { if (pair.Key.Length < 2 || pair.Key.Length > MaxWordLength) continue; double p = 0; for (int i=1; i= MinFreq && pair.Value / p > PSvPThreshold) words.Add (pair.Key, new Dictionary()); } renewStreamReader (); Console.WriteLine("Preparing word/adjacent character list..."); foreach(string cword in freq.Keys) { string wl = cword.Length > 1 ? cword.Substring(1) : "", wr = cword.Length > 1 ? cword.Substring(0, cword.Length - 1) : "", wc = cword.Length > 2 ? 
cword.Substring(1, cword.Length - 1) : ""; CharPos c = new CharPos('a', false); int frq = freq[cword]; if (words.ContainsKey(wl)) { c = new CharPos(cword[0], false); if (words[wl].ContainsKey(c)) words[wl][c] += frq; else words[wl].Add(c, frq); } if (words.ContainsKey(wr)) { c = new CharPos(cword[cword.Length - 1], true); if (words[wr].ContainsKey(c)) words[wr][c] += frq; else words[wr].Add(c, frq); } if (words.ContainsKey(wc)) { c = new CharPos(cword[0], false); if (words[wc].ContainsKey(c)) words[wc][c] += frq; else words[wc].Add(c, frq); c = new CharPos(cword[cword.Length - 1], true); if (words[wc].ContainsKey(c)) words[wc][c] += frq; else words[wc].Add(c, frq); } } Console.WriteLine("Calculating word information entropy..."); foreach (string word in words.Keys) { double leftEntropy = 0, rightEntropy = 0; wordInfoEntropy(word, out leftEntropy, out rightEntropy); if (leftEntropy < EntropyThreshold || rightEntropy < EntropyThreshold) continue; finalWords.Add(word); } Console.WriteLine("Done. Writing results."); if (ProcessOver != null) ProcessOver.Invoke(); } private int addParagraph (string paragraph) { int incr_total = 0; foreach (string sentence in regSplit.Split(paragraph)) { if (sentence.Length < 2) continue; for (int i = 0; i
(系统自动生成,下载前可以参看下载内容)

下载文件列表

相关说明

  • 本站资源为会员上传分享交流与学习,如有侵犯您的权益,请联系我们删除.
  • 本站是交换下载平台,提供交流渠道,下载内容来自于网络,除下载问题外,其它问题请自行百度
  • 本站已设置防盗链,请勿用迅雷、QQ旋风等多线程下载软件下载资源,下载后用WinRAR最新版进行解压.
  • 如果您发现内容无法下载,请稍后再次尝试;或者到消费记录里找到下载记录反馈给我们.
  • 下载后发现下载的内容跟说明不相符,请到消费记录里找到下载记录反馈给我们,经确认后退回积分.
  • 如下载前有疑问,可以通过点击"提供者"的名字,查看对方的联系方式,联系对方咨询.
 相关搜索: 分词 文本 语义
 输入关键字,在本站1000多万海量源码库中尽情搜索: