DFA
在社交媒体内容审核、在线评论过滤、新闻出版等场景中, 为维护内容的健康和合法性。需要过滤可能被认为是不当、冒犯或敏感的词汇和短语。
在实现文字过滤的算法中,DFA是比较好的实现算法。结合敏感词库实现业务也比较简单。
using namespace std;
// DFA节点
struct DFANode {
unordered_map<char, DFANode*> next; // 下一个节点的映射表
bool is_end; // 是否是敏感词的结束节点
};
class DFAFilter {
public:
static DFAFilter &Instance() {
static DFAFilter instance;
return instance;
}
DFAFilter() {
root_ = new DFANode();
}
~DFAFilter() {
Destroy(root_);
}
void AddWord(const string& word) {
DFANode* current = root_;
for (char c : word) {
if (current->next.find(c) == current->next.end()) {
current->next[c] = new DFANode();
}
current = current->next[c];
}
current->is_end = true;
}
bool ContainsWord(const string& text) {
size_t length = text.length();
size_t start = 0;
size_t pos = 0;
while (pos < length) {
DFANode* current = root_;
size_t match_len = 0;
for (size_t i = pos; i < length; ++i) {
char c = text[i];
auto it = current->next.find(c);
if (it != current->next.end()) {
current = it->second;
++match_len;
if (current->is_end) {
return true;
}
} else {
break;
}
}
if (match_len == 0) {
pos = start + 1;
start = pos;
} else {
pos += match_len;
}
}
return false;
}
string Filter(const string& text) {
string filtered_text;
size_t length = text.length();
size_t start = 0;
size_t pos = 0;
while (pos < length) {
DFANode* current = root_;
size_t match_len = 0;
for (size_t i = pos; i < length; ++i) {
char c = text[i];
auto it = current->next.find(c);
if (it != current->next.end()) {
current = it->second;
++match_len;
if (current->is_end) {
filtered_text += string(match_len, '*');
start = i + 1;
break;
}
} else {
break;
}
}
if (match_len == 0) {
filtered_text += text[start];
pos = start + 1;
start = pos;
} else {
pos += match_len;
}
}
return filtered_text;
}
private:
DFANode* root_;
// 递归销毁DFA节点
void Destroy(DFANode* node) {
if (node == nullptr) {
return;
}
for (auto& pair : node->next) {
Destroy(pair.second);
}
delete node;
}
};
将检测到的敏感词替换为*。
"这是一个包含****的文本,****也在这里。"
需要注意敏感词库构建大小, 最好是做一些裁剪和懒加载.