You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
254 lines
7.2 KiB
254 lines
7.2 KiB
using System;
|
|
using System.Collections.Generic;
|
|
using System.IO;
|
|
using System.Linq;
|
|
using System.Text;
|
|
|
|
namespace Sog.Service
|
|
{
|
|
// Dirty-word filter implemented with the Aho-Corasick (AC automaton) algorithm
|
|
/// <summary>
/// One node of the Aho-Corasick trie used by the dirty-word filter.
/// </summary>
public class AcNode
{
    // Character this node represents (the key under which its parent stores it).
    public char m_data;

    // Child nodes, keyed by their character.
    public Dictionary<char, AcNode> m_children;

    // True when a pattern ends at this node.
    public bool m_isEndChar;

    // Length of the pattern ending here; only meaningful when m_isEndChar is true.
    public int m_length;

    // Failure pointer of the automaton.
    public AcNode m_fail;

    /// <summary>
    /// Creates a node for <paramref name="data"/> with no children, not marked
    /// as a pattern end, a length of -1 and no failure pointer.
    /// </summary>
    /// <param name="data">Character stored in the node.</param>
    public AcNode(char data)
    {
        m_data = data;
        m_children = new Dictionary<char, AcNode>();
        m_isEndChar = false;
        m_length = -1;
        m_fail = null;
    }

    /// <summary>
    /// Recursively empties the child dictionaries of the whole subtree rooted
    /// at this node. The node's own data, end flag, length and failure pointer
    /// are left as they are.
    /// </summary>
    public void Clear()
    {
        foreach (AcNode child in m_children.Values)
        {
            child.Clear();
        }

        m_children.Clear();
    }
}
|
|
/// <summary>
/// Dirty-word filter backed by an Aho-Corasick automaton. Patterns are loaded
/// from a text file (one pattern per line) and matched case-insensitively by
/// lower-casing both the patterns and the inspected text.
/// </summary>
public class DirtyServiceAc : Singleton<DirtyServiceAc>
{
    // Root of the pattern trie. Stays null until InitFromFile has successfully
    // found the word-list file at least once.
    private AcNode m_root;

    /// <summary>
    /// Loads (or reloads) the pattern list from <paramref name="filename"/>.
    /// Blank and whitespace-only lines are skipped; every other line is
    /// lower-cased and inserted as one pattern. If the file does not exist an
    /// error is logged and the current automaton is left untouched.
    /// </summary>
    /// <param name="filename">Path of the word-list file.</param>
    public void InitFromFile(string filename)
    {
        if (!File.Exists(filename))
        {
            TraceLog.Error("DirtyServiceAc.InitFromFile file {0} not exists", filename);
            return;
        }

        // The root object is reused and emptied so this method can be called
        // again to reload the list.
        // NOTE(review): the trie is rebuilt in place, so a match running at the
        // same time as a reload would see a partially built trie - confirm
        // reloads are serialized with matching.
        if (m_root == null)
        {
            m_root = new AcNode('/');
        }

        m_root.Clear();

        foreach (string line in File.ReadAllLines(filename))
        {
            // IsNullOrWhiteSpace also covers null and empty lines.
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            // Invariant lower-casing keeps the result independent of the
            // server's current culture.
            Insert(line.ToLowerInvariant());
        }

        BuildFailurePointer();
    }

    /// <summary>
    /// Inserts one pattern into the trie and marks its last node as a pattern
    /// end. A pattern that is already present is only traced, not re-added.
    /// </summary>
    /// <param name="text">Pattern to insert (already lower-cased by the caller).</param>
    private void Insert(string text)
    {
        AcNode current = m_root;
        foreach (char c in text)
        {
            AcNode next;
            if (!current.m_children.TryGetValue(c, out next))
            {
                // No edge for this character yet: create it.
                next = new AcNode(c);
                current.m_children.Add(c, next);
            }

            current = next;
        }

        if (current.m_isEndChar)
        {
            // Duplicate pattern in the word list.
            TraceLog.Trace("DirtyServiceAc.Insert {0} is exist", text);
            return;
        }

        current.m_isEndChar = true;
        current.m_length = text.Length;
    }

    /// <summary>
    /// Computes the failure pointer of every node with a breadth-first walk
    /// over the trie. Direct children of the root fail to the root; deeper
    /// nodes fail to the first node reachable through their parent's failure
    /// chain that has a child with the same character, or to the root if none.
    /// </summary>
    private void BuildFailurePointer()
    {
        Queue<AcNode> queue = new Queue<AcNode>();
        m_root.m_fail = null;
        queue.Enqueue(m_root);

        while (queue.Count > 0)
        {
            AcNode parent = queue.Dequeue();
            foreach (AcNode child in parent.m_children.Values)
            {
                // Default target; replaced below if a longer suffix exists.
                child.m_fail = m_root;

                if (parent != m_root)
                {
                    for (AcNode q = parent.m_fail; q != null; q = q.m_fail)
                    {
                        AcNode candidate;
                        if (q.m_children.TryGetValue(child.m_data, out candidate))
                        {
                            child.m_fail = candidate;
                            break;
                        }
                    }
                }

                queue.Enqueue(child);
            }
        }
    }

    /// <summary>
    /// Scans <paramref name="text"/> for every loaded pattern.
    /// </summary>
    /// <param name="text">Text to scan (already lower-cased by the caller).</param>
    /// <param name="result">
    /// Cleared on entry, then filled with (start index, length) pairs, one per
    /// match found.
    /// </param>
    /// <param name="bOnlyCheck">
    /// When true the scan stops at the first match, so <paramref name="result"/>
    /// holds at most one entry.
    /// </param>
    /// <returns>
    /// True if at least one pattern matched; false otherwise, including when
    /// no word list has been loaded or the text is null or empty.
    /// </returns>
    private bool Match(string text, ref List<KeyValuePair<int, int>> result, bool bOnlyCheck = false)
    {
        result.Clear();

        // Without a trie (InitFromFile never found its file) or without text
        // there is nothing that can match.
        if (m_root == null || string.IsNullOrEmpty(text))
        {
            return false;
        }

        AcNode state = m_root;
        for (int i = 0; i < text.Length; i++)
        {
            char c = text[i];

            // Follow failure pointers until a node with an edge for c is found
            // or the root is reached.
            AcNode next;
            while (!state.m_children.TryGetValue(c, out next) && state != m_root)
            {
                state = state.m_fail;
            }

            // TryGetValue leaves next null when even the root has no edge for c.
            state = next ?? m_root;

            // Walk the failure chain of the current state to report every
            // pattern that ends at position i.
            for (AcNode tmp = state; tmp != null && tmp != m_root; tmp = tmp.m_fail)
            {
                if (tmp.m_isEndChar)
                {
                    int pos = i - tmp.m_length + 1;
                    result.Add(new KeyValuePair<int, int>(pos, tmp.m_length));
                    if (bOnlyCheck)
                    {
                        // Validation only: the first hit is enough.
                        return true;
                    }
                }

                if (tmp.m_fail == null)
                {
                    TraceLog.Error("DirtyServiceAc Match is null data {0}", tmp.m_data);
                }
            }
        }

        return result.Count > 0;
    }

    /// <summary>
    /// Replaces every character covered by a dirty-word match with '*'.
    /// </summary>
    /// <param name="content">Text to filter.</param>
    /// <param name="outcontent">
    /// The masked text when something matched; otherwise the unchanged
    /// <paramref name="content"/>.
    /// </param>
    /// <returns>True if at least one dirty word was replaced, false otherwise.</returns>
    public bool ReplaceDirtyText(string content, out string outcontent)
    {
        outcontent = content;
        if (string.IsNullOrEmpty(content))
        {
            return false;
        }

        List<KeyValuePair<int, int>> result = new List<KeyValuePair<int, int>>();
        if (!Match(content.ToLowerInvariant(), ref result))
        {
            return false;
        }

        // Match positions come from the lower-cased copy and are applied to the
        // original characters, which relies on lower-casing not changing the
        // string length.
        char[] masked = content.ToCharArray();
        foreach (KeyValuePair<int, int> hit in result)
        {
            int end = hit.Key + hit.Value;
            for (int i = hit.Key; i < end; i++)
            {
                masked[i] = '*';
            }
        }

        outcontent = new string(masked);
        return true;
    }

    /// <summary>
    /// Validation-only check that stops at the first dirty word found.
    /// </summary>
    /// <param name="content">Text to check.</param>
    /// <returns>
    /// True when the text contains no dirty word (including null or empty
    /// text), false when it contains at least one.
    /// </returns>
    public bool CheckDirty(string content)
    {
        if (string.IsNullOrEmpty(content))
        {
            return true;
        }

        List<KeyValuePair<int, int>> result = new List<KeyValuePair<int, int>>();
        return !Match(content.ToLowerInvariant(), ref result, true);
    }
}
|
|
}
|
|
|