using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

namespace Sog.Service
{
    // Dirty-word filtering based on the Aho-Corasick automaton algorithm.

    /// <summary>
    /// A single node of the Aho-Corasick trie.
    /// </summary>
    public class AcNode
    {
        /// <summary>Character stored on this node.</summary>
        public char m_data;

        /// <summary>Child nodes keyed by their character.</summary>
        public Dictionary<char, AcNode> m_children = new Dictionary<char, AcNode>();

        /// <summary>True when a pattern ends on this node.</summary>
        public bool m_isEndChar = false;

        /// <summary>Length of the pattern ending here; only set when <see cref="m_isEndChar"/> is true.</summary>
        public int m_length = -1;

        /// <summary>Failure pointer used while matching.</summary>
        public AcNode m_fail = null;

        /// <summary>Creates a node holding <paramref name="data"/>.</summary>
        public AcNode(char data)
        {
            m_data = data;
        }

        /// <summary>
        /// Recursively clears every descendant and then removes all children of this node.
        /// </summary>
        public void Clear()
        {
            foreach (var it in m_children)
            {
                it.Value.Clear();
            }

            m_children.Clear();
        }
    }

    /// <summary>
    /// Dirty-word filter that loads a word list from a file and matches text against it.
    /// </summary>
    public class DirtyServiceAc : Singleton<DirtyServiceAc>
    {
        private AcNode m_root;

        /// <summary>
        /// Loads the word list (one pattern per line) from <paramref name="filename"/> and rebuilds the trie.
        /// Blank lines are skipped and patterns are lower-cased before insertion.
        /// Calling this again reloads the list.
        /// </summary>
        /// <param name="filename">Path of the word-list file. If it does not exist an error is logged and nothing changes.</param>
        public void InitFromFile(string filename)
        {
            if (!File.Exists(filename))
            {
                TraceLog.Error("DirtyServiceAc.InitFromFile file {0} not exists", filename);
                return;
            }

            // Read before touching the trie so that a read failure does not leave it emptied.
            string[] allLines = File.ReadAllLines(filename);

            // Reusing and clearing the root is what makes reload possible.
            if (m_root == null)
            {
                m_root = new AcNode('/');
            }

            m_root.Clear();

            foreach (var line in allLines)
            {
                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }

                // Culture-invariant folding keeps the patterns consistent with the folding done at match time.
                Insert(line.ToLowerInvariant());
            }

            BuildFailurePointer();
        }

        /// <summary>
        /// Inserts one pattern into the trie, creating nodes as needed.
        /// A pattern that is already present is logged and ignored.
        /// </summary>
        private void Insert(string text)
        {
            AcNode node = m_root;
            foreach (char c in text)
            {
                AcNode child;
                if (!node.m_children.TryGetValue(c, out child))
                {
                    // No such child yet: create it.
                    child = new AcNode(c);
                    node.m_children.Add(c, child);
                }

                node = child;
            }

            if (node.m_isEndChar)
            {
                // Duplicate insertion.
                TraceLog.Trace("DirtyServiceAc.Insert {0} is exist", text);
                return;
            }

            node.m_isEndChar = true;
            node.m_length = text.Length;
        }

        /// <summary>
        /// Walks the trie breadth-first and assigns the failure pointer of every node.
        /// The root's failure pointer is null; every other node falls back to the root
        /// when no node on its parent's failure chain has a child with the same character.
        /// </summary>
        private void BuildFailurePointer()
        {
            Queue<AcNode> queue = new Queue<AcNode>();
            m_root.m_fail = null;
            queue.Enqueue(m_root);

            while (queue.Count > 0)
            {
                AcNode parent = queue.Dequeue();
                foreach (var entry in parent.m_children)
                {
                    AcNode child = entry.Value;
                    AcNode target = null;

                    // For children of the root the chain is empty (root.m_fail is null),
                    // so they end up pointing at the root.
                    for (AcNode q = parent.m_fail; q != null; q = q.m_fail)
                    {
                        if (q.m_children.TryGetValue(child.m_data, out target))
                        {
                            break;
                        }
                    }

                    child.m_fail = target ?? m_root;
                    queue.Enqueue(child);
                }
            }
        }

        /// <summary>
        /// Finds dirty words in <paramref name="text"/>.
        /// </summary>
        /// <param name="text">Text to scan; callers pass it already lower-cased.</param>
        /// <param name="result">Cleared, then filled with (start index, length) pairs, one per hit.</param>
        /// <param name="bOnlyCheck">When true, returns as soon as the first hit is found.</param>
        /// <returns>True if at least one dirty word was found; false otherwise, including when no word list is loaded.</returns>
        private bool Match(string text, ref List<KeyValuePair<int, int>> result, bool bOnlyCheck = false)
        {
            result.Clear();

            // Nothing can match without a loaded trie or without text.
            if (m_root == null || string.IsNullOrEmpty(text))
            {
                return false;
            }

            AcNode p = m_root;
            for (int i = 0; i < text.Length; i++)
            {
                char c = text[i];
                AcNode next;

                // Follow failure pointers until some node has a child for c or the root is reached.
                while (!p.m_children.TryGetValue(c, out next) && p != m_root)
                {
                    p = p.m_fail ?? m_root;
                }

                p = next ?? m_root;

                // Report every pattern that ends at position i by walking the failure chain.
                for (AcNode tmp = p; tmp != null && tmp != m_root; tmp = tmp.m_fail)
                {
                    if (tmp.m_isEndChar)
                    {
                        int pos = i - tmp.m_length + 1;
                        result.Add(new KeyValuePair<int, int>(pos, tmp.m_length));
                        if (bOnlyCheck)
                        {
                            // Validation only: no need for a full scan.
                            return true;
                        }
                    }

                    if (tmp.m_fail == null)
                    {
                        TraceLog.Error("DirtyServiceAc Match is null data {0}", tmp.m_data);
                    }
                }
            }

            return result.Count > 0;
        }

        /// <summary>
        /// Replaces every character of every dirty word in <paramref name="content"/> with '*'.
        /// Matching is case-insensitive.
        /// </summary>
        /// <param name="content">Text to filter; may be null or empty.</param>
        /// <param name="outcontent">The filtered text, or <paramref name="content"/> unchanged when nothing was replaced.</param>
        /// <returns>True if something was replaced, false otherwise.</returns>
        public bool ReplaceDirtyText(string content, out string outcontent)
        {
            outcontent = content;
            if (string.IsNullOrEmpty(content))
            {
                return false;
            }

            List<KeyValuePair<int, int>> result = new List<KeyValuePair<int, int>>();
            if (!Match(content.ToLowerInvariant(), ref result))
            {
                return false;
            }

            // Mask on a copy of the original text so unmatched characters keep their case.
            char[] chars = content.ToCharArray();
            foreach (var hit in result)
            {
                int end = hit.Key + hit.Value;
                for (int i = hit.Key; i < end; i++)
                {
                    chars[i] = '*';
                }
            }

            outcontent = new string(chars);
            return true;
        }

        /// <summary>
        /// Validation only. Matching is case-insensitive.
        /// </summary>
        /// <param name="content">Text to check; may be null or empty.</param>
        /// <returns>True when <paramref name="content"/> contains no dirty word, false when it contains one.</returns>
        public bool CheckDirty(string content)
        {
            if (string.IsNullOrEmpty(content))
            {
                return true;
            }

            List<KeyValuePair<int, int>> result = new List<KeyValuePair<int, int>>();
            return !Match(content.ToLowerInvariant(), ref result, true);
        }
    }
}