You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

254 lines
7.2 KiB

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
namespace Sog.Service
{
// Dirty-word filtering, implemented with the Aho-Corasick automaton algorithm.

/// <summary>
/// A single node of the Aho-Corasick trie: one character, its child nodes,
/// an end-of-pattern marker and the failure pointer.
/// </summary>
public class AcNode
{
    /// <summary>Character stored in this node.</summary>
    public char m_data;

    /// <summary>Child nodes keyed by their character.</summary>
    public Dictionary<char, AcNode> m_children = new Dictionary<char, AcNode>();

    /// <summary>True when a pattern ends at this node.</summary>
    public bool m_isEndChar = false;

    /// <summary>Length of the pattern ending here; only meaningful when m_isEndChar is true.</summary>
    public int m_length = -1;

    /// <summary>Failure pointer.</summary>
    public AcNode m_fail = null;

    /// <summary>Creates a node holding <paramref name="data"/> with no children.</summary>
    public AcNode(char data)
    {
        m_data = data;
    }

    /// <summary>
    /// Empties the child dictionary of this node and of every descendant.
    /// Only m_children is touched; m_data, m_isEndChar, m_length and m_fail
    /// keep their current values.
    /// </summary>
    public void Clear()
    {
        // Explicit work list instead of recursion; each node's children are
        // queued before its own dictionary is emptied.
        Stack<AcNode> pending = new Stack<AcNode>();
        pending.Push(this);

        while (pending.Count > 0)
        {
            AcNode current = pending.Pop();
            foreach (AcNode child in current.m_children.Values)
            {
                pending.Push(child);
            }
            current.m_children.Clear();
        }
    }
}
/// <summary>
/// Dirty-word filter backed by an Aho-Corasick automaton. The word list is
/// loaded with <see cref="InitFromFile"/>; text is then checked with
/// <see cref="CheckDirty"/> or masked with <see cref="ReplaceDirtyText"/>.
/// Patterns and input are both lower-cased before matching, so matching is
/// case-insensitive.
/// </summary>
public class DirtyServiceAc : Singleton<DirtyServiceAc>
{
    // Root of the current automaton; null until InitFromFile has succeeded once.
    private AcNode m_root;

    /// <summary>
    /// Loads (or reloads) the dirty-word list from a text file, one pattern per line.
    /// Blank and whitespace-only lines are skipped; single-character patterns are accepted.
    /// </summary>
    /// <param name="filename">Path of the word-list file.</param>
    /// <remarks>
    /// If the file does not exist an error is logged and the current automaton
    /// (if any) is left untouched. The new automaton is built on the side and
    /// only swapped in once complete, so a failure while reading the file does
    /// not leave the filter empty.
    /// </remarks>
    public void InitFromFile(string filename)
    {
        if (!File.Exists(filename))
        {
            TraceLog.Error("DirtyServiceAc.InitFromFile file {0} not exists", filename);
            return;
        }

        // Read first: if this throws, the previously loaded automaton stays in place.
        string[] allline = File.ReadAllLines(filename);

        AcNode root = new AcNode('/');
        foreach (var line in allline)
        {
            // IsNullOrWhiteSpace also covers null and empty lines.
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            // Store patterns lower-cased; the invariant culture keeps the
            // mapping independent of the current thread culture.
            Insert(root, line.ToLowerInvariant());
        }

        BuildFailurePointer(root);

        // Publish the fully built automaton (this is what makes reload possible).
        m_root = root;
    }

    /// <summary>
    /// Adds one pattern to the trie under <paramref name="root"/>, creating nodes as needed.
    /// A pattern that is already present is only traced and otherwise ignored.
    /// </summary>
    private static void Insert(AcNode root, string text)
    {
        AcNode p = root;
        foreach (char c in text)
        {
            AcNode child;
            if (!p.m_children.TryGetValue(c, out child))
            {
                // No node for this character yet: create it.
                child = new AcNode(c);
                p.m_children.Add(c, child);
            }
            p = child;
        }

        if (p.m_isEndChar)
        {
            // Duplicate entry in the word list.
            TraceLog.Trace("DirtyServiceAc.Insert {0} is exist", text);
            return;
        }

        p.m_isEndChar = true;
        p.m_length = text.Length;
    }

    /// <summary>
    /// Computes the failure pointer of every node by a breadth-first walk of the trie.
    /// The root's failure pointer is null; every other node points to the node
    /// reached by following its parent's failure chain, or to the root when no
    /// such node exists.
    /// </summary>
    private static void BuildFailurePointer(AcNode root)
    {
        Queue<AcNode> queue = new Queue<AcNode>();
        root.m_fail = null;
        queue.Enqueue(root);

        while (queue.Count > 0)
        {
            AcNode p = queue.Dequeue();
            foreach (var pc in p.m_children)
            {
                AcNode child = pc.Value;

                // Default: fall back to the root. This also covers direct
                // children of the root, whose parent has a null failure pointer.
                AcNode target = root;

                AcNode q = p.m_fail;
                while (q != null)
                {
                    AcNode candidate;
                    if (q.m_children.TryGetValue(child.m_data, out candidate))
                    {
                        target = candidate;
                        break;
                    }
                    q = q.m_fail;
                }

                child.m_fail = target;
                queue.Enqueue(child);
            }
        }
    }

    /// <summary>
    /// Scans <paramref name="text"/> for dirty words.
    /// </summary>
    /// <param name="text">Text to scan; expected to be lower-cased already by the caller.</param>
    /// <param name="result">
    /// Cleared on entry, then filled with one (start index, length) pair per match.
    /// </param>
    /// <param name="bOnlyCheck">When true, returns as soon as the first match is found.</param>
    /// <returns>
    /// True if at least one dirty word was found. False when there is no match,
    /// when the text is null or empty, or when no word list has been loaded.
    /// </returns>
    private bool Match(string text, ref List<KeyValuePair<int, int>> result, bool bOnlyCheck = false)
    {
        result.Clear();

        // Take one snapshot of the root so the whole scan uses the same automaton.
        AcNode root = m_root;
        if (root == null || string.IsNullOrEmpty(text))
        {
            return false;
        }

        AcNode p = root;
        for (int i = 0; i < text.Length; i++)
        {
            char c = text[i];

            // Follow failure pointers until a node has a child for c, or the root is reached.
            AcNode next;
            while (!p.m_children.TryGetValue(c, out next) && p != root)
            {
                p = p.m_fail;
            }

            // next is null only when even the root has no child for c.
            p = next ?? root;

            // Walk the failure chain to report every pattern that ends at position i.
            AcNode tmp = p;
            while (tmp != root && tmp != null)
            {
                if (tmp.m_isEndChar)
                {
                    int pos = i - tmp.m_length + 1;
                    result.Add(new KeyValuePair<int, int>(pos, tmp.m_length));
                    if (bOnlyCheck)
                    {
                        // Validation only: the first hit is enough.
                        return true;
                    }
                }

                if (tmp.m_fail == null)
                {
                    TraceLog.Error("DirtyServiceAc Match is null data {0}", tmp.m_data);
                }
                tmp = tmp.m_fail;
            }
        }

        return result.Count > 0;
    }

    /// <summary>
    /// Replaces every character of every dirty word in <paramref name="content"/> with '*'.
    /// </summary>
    /// <param name="content">Text to filter; may be null or empty.</param>
    /// <param name="outcontent">
    /// The masked text when something was replaced, otherwise <paramref name="content"/> unchanged.
    /// </param>
    /// <returns>True if at least one dirty word was replaced, false otherwise.</returns>
    public bool ReplaceDirtyText(string content, out string outcontent)
    {
        outcontent = content;
        if (string.IsNullOrEmpty(content))
        {
            return false;
        }

        List<KeyValuePair<int, int>> result = new List<KeyValuePair<int, int>>();
        if (!Match(content.ToLowerInvariant(), ref result))
        {
            return false;
        }

        // Match offsets were computed on the lower-cased copy and are applied to
        // the original text; this relies on lower-casing preserving string length.
        char[] charArray = content.ToCharArray();
        foreach (var it in result)
        {
            int end = it.Key + it.Value;
            for (int i = it.Key; i < end; i++)
            {
                charArray[i] = '*';
            }
        }

        outcontent = new string(charArray);
        return true;
    }

    /// <summary>
    /// Validates <paramref name="content"/> without modifying it.
    /// </summary>
    /// <param name="content">Text to check; null or empty text is treated as clean.</param>
    /// <returns>True when no dirty word is present, false when at least one is found.</returns>
    public bool CheckDirty(string content)
    {
        if (string.IsNullOrEmpty(content))
        {
            return true;
        }

        List<KeyValuePair<int, int>> result = new List<KeyValuePair<int, int>>();
        return !Match(content.ToLowerInvariant(), ref result, true);
    }
}
}