服务器之家

服务器之家 > 正文

Java使用DFA算法实现过滤多家公司自定义敏感字功能详解

时间:2020-12-22 15:32     来源/作者:jack-0023

本文实例讲述了Java使用DFA算法实现过滤多家公司自定义敏感字的功能。分享给大家供大家参考,具体如下:

背景

最近通讯业务有个需求:需要让多家客户公司可以各自定义敏感词,并按各自定义的规则进行过滤。这里选择了DFA算法来实现,不过和以前传统的DFA写法不太一样。

模式图

Java使用DFA算法实现过滤多家公司自定义敏感字功能详解

直接上代码

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/**
 * Sensitive-word filter that lets several companies each maintain their own keyword list
 * inside one shared character trie (a DFA).
 *
 * <p>Every trie node is a map. Its {@link Character} keys lead to child nodes; its
 * {@code "companyid<N>"} String keys hold company N's flag for that node: {@code "1"} when one
 * of the company's keywords ends on the node, {@code "0"} when the node is only an inner step
 * of a longer keyword.
 *
 * <p>All traversal state lives in local variables. Child nodes are plain {@link HashMap}s,
 * which are not synchronized, so {@link #savekeywords} and {@link #clear} must not run
 * concurrently with lookups unless the caller provides the locking.
 *
 * <p>Public member names are kept exactly as callers already know them.
 */
public class keywordfilter {
  /** Node flag: one of the company's keywords ends on this node. */
  private static final String WORD_END = "1";
  /** Node flag: the node is only an inner step of one of the company's keywords. */
  private static final String WORD_PREFIX = "0";
  /** Text written in place of every masked keyword. */
  private static final String MASK = "*";

  /**
   * Root of the shared trie. The declared type is kept for source compatibility; at runtime
   * the map is keyed by {@link Character} and always accessed through {@link #root()}.
   */
  @SuppressWarnings("rawtypes")
  public static Map<String, HashMap> currentmap = new ConcurrentHashMap<String, HashMap>();

  /**
   * No longer used by this class and always {@code null}.
   *
   * @deprecated kept only so existing references still compile
   */
  @Deprecated
  @SuppressWarnings("rawtypes")
  public static Map nowhash = null;

  /**
   * No longer used by this class and always {@code null}.
   *
   * @deprecated kept only so existing references still compile
   */
  @Deprecated
  public static Object wordmap;

  // Utility class: never instantiated.
  private keywordfilter() {
  }

  /** Builds the per-company flag key stored on trie nodes. */
  private static String getkey(int companyid) {
    return "companyid" + companyid;
  }

  /** Views the root map with the key/value types that are actually stored in it. */
  @SuppressWarnings({"unchecked", "rawtypes"})
  private static Map<Object, Object> root() {
    return (Map<Object, Object>) (Map) currentmap;
  }

  /** Returns the child node reached from {@code node} via {@code ch}, or {@code null}. */
  @SuppressWarnings("unchecked")
  private static Map<Object, Object> child(Map<Object, Object> node, char ch) {
    Object next = node.get(Character.valueOf(ch));
    if (next instanceof Map) {
      return (Map<Object, Object>) next;
    }
    return null;
  }

  /**
   * Walks the trie from the root along {@code txt} starting at {@code begin}.
   *
   * @param longest {@code true} to return the longest keyword found, {@code false} to stop at
   *     the first (shortest) one
   * @return length of the keyword flagged under {@code key} that starts at {@code begin}, or
   *     {@code 0} when no complete keyword starts there
   */
  private static int matchLength(String txt, int begin, String key, boolean longest) {
    Map<Object, Object> node = root();
    int matched = 0;
    for (int i = begin; i < txt.length(); i++) {
      node = child(node, txt.charAt(i));
      if (node == null) {
        break;
      }
      // Constant-first equals tolerates a missing flag or a non-String value.
      if (WORD_END.equals(node.get(key))) {
        matched = i - begin + 1;
        if (!longest) {
          break;
        }
      }
    }
    return matched;
  }

  /** Joins {@code words} with commas, without a trailing separator. */
  private static String joinWithCommas(List<String> words) {
    StringBuilder joined = new StringBuilder();
    for (String word : words) {
      if (joined.length() > 0) {
        joined.append(',');
      }
      joined.append(word);
    }
    return joined.toString();
  }

  /**
   * Removes every keyword of every company.
   */
  public static void clear() {
    currentmap.clear();
  }

  /**
   * Adds keywords to one company's list.
   *
   * <p>Each keyword is trimmed first; {@code null} and blank entries are skipped. Flags are
   * written on the node they describe, and an existing end-of-word flag is never downgraded.
   *
   * @param companyid company the keywords belong to
   * @param keywords keywords to add; {@code null} is treated as an empty list
   */
  public static void savekeywords(int companyid, List<String> keywords) {
    if (keywords == null) {
      return;
    }
    String key = getkey(companyid);
    for (String raw : keywords) {
      if (raw == null) {
        continue;
      }
      String keyword = raw.trim();
      if (keyword.length() == 0) {
        continue;
      }
      Map<Object, Object> node = root();
      for (int j = 0; j < keyword.length(); j++) {
        char ch = keyword.charAt(j);
        Map<Object, Object> next = child(node, ch);
        if (next == null) {
          next = new HashMap<Object, Object>();
          node.put(Character.valueOf(ch), next);
        }
        if (!next.containsKey(key)) {
          next.put(key, WORD_PREFIX);
        }
        node = next;
      }
      node.put(key, WORD_END);
    }
  }

  /**
   * Masks one company's keywords in a text.
   *
   * <p>The text is scanned left to right. At each position the longest keyword starting there
   * is replaced by a single {@code "*"}; if a longer candidate turns out to be incomplete, the
   * shorter keyword is still masked. Keywords are matched literally.
   *
   * @param companyid company whose keyword list applies
   * @param txt text to filter; {@code null} is treated as the empty string
   * @return a two-element list: index 0 is the filtered text, index 1 the distinct keywords
   *     that were masked, comma-separated in order of first appearance (empty if none)
   */
  public static List<String> repword(int companyid, String txt) {
    String text = txt == null ? "" : txt;
    String key = getkey(companyid);
    StringBuilder filtered = new StringBuilder(text.length());
    List<String> found = new ArrayList<String>();
    int i = 0;
    while (i < text.length()) {
      int len = matchLength(text, i, key, true);
      if (len > 0) {
        String keyword = text.substring(i, i + len);
        if (!found.contains(keyword)) {
          found.add(keyword);
        }
        filtered.append(MASK);
        i += len;
      } else {
        filtered.append(text.charAt(i));
        i++;
      }
    }
    List<String> result = new ArrayList<String>();
    result.add(filtered.toString());
    result.add(joinWithCommas(found));
    return result;
  }

  /**
   * Checks whether one of the company's keywords starts at {@code begin}.
   *
   * @return length of the first (shortest) complete keyword starting at {@code begin}, or
   *     {@code 0} when there is none
   */
  private static int checkkeywords(String txt, int companyid, int begin) {
    return matchLength(txt, begin, getkey(companyid), false);
  }

  /**
   * Lists the company's keywords that occur in a text.
   *
   * <p>The text is scanned left to right without being modified; at each position the
   * shortest keyword starting there is reported and skipped.
   *
   * @param txt text to inspect
   * @param companyid company whose keyword list applies
   * @return the distinct keywords found, comma-separated in order of first appearance, or
   *     {@code null} when none is found or {@code txt} is {@code null}
   */
  public static String gettxtkeywords(String txt, int companyid) {
    if (txt == null) {
      return null;
    }
    List<String> found = new ArrayList<String>();
    int i = 0;
    while (i < txt.length()) {
      int len = checkkeywords(txt, companyid, i);
      if (len > 0) {
        String keyword = txt.substring(i, i + len);
        if (!found.contains(keyword)) {
          found.add(keyword);
        }
        i += len;
      } else {
        i++;
      }
    }
    return found.isEmpty() ? null : joinWithCommas(found);
  }

  /**
   * Tells whether a text contains at least one of the company's keywords.
   *
   * <p>Declared static because the class cannot be instantiated.
   *
   * @param txt text to inspect; {@code null} yields {@code false}
   * @param companyid company whose keyword list applies
   * @return {@code true} if any keyword of the company occurs in {@code txt}
   */
  public static boolean iskeywords(String txt, int companyid) {
    if (txt == null) {
      return false;
    }
    for (int i = 0; i < txt.length(); i++) {
      if (checkkeywords(txt, companyid, i) > 0) {
        return true;
      }
    }
    return false;
  }

  /** Small demo: registers a few keywords for company 1 and filters a sample sentence. */
  public static void main(String[] arg) {
    List<String> keywords = new ArrayList<String>();
    keywords.add("傻×");
    keywords.add("汉奸");
    keywords.add("草");
    keywords.add("草泥马");
    keywordfilter.savekeywords(1, keywords);
    String txt = "是傻×汉奸傻a傻b傻c傻d汉奸傻×草泥马";
    List<String> list = repword(1, txt);
    System.out.println("文中包含的敏感字为:" + list.get(1));
    System.out.println("原文:" + txt);
    System.out.println("敏感字过滤后:" + list.get(0));
  }
}

希望本文所述对大家的Java程序设计有所帮助。

原文链接:https://my.oschina.net/grkj/blog/1522696

相关文章

热门资讯

2020微信伤感网名听哭了 让对方看到心疼的伤感网名大全
2020微信伤感网名听哭了 让对方看到心疼的伤感网名大全 2019-12-26
Intellij idea2020永久破解,亲测可用!!!
Intellij idea2020永久破解,亲测可用!!! 2020-07-29
歪歪漫画vip账号共享2020_yy漫画免费账号密码共享
歪歪漫画vip账号共享2020_yy漫画免费账号密码共享 2020-04-07
电视剧《琉璃》全集在线观看 琉璃美人煞1-59集免费观看地址
电视剧《琉璃》全集在线观看 琉璃美人煞1-59集免费观看地址 2020-08-12
背刺什么意思 网络词语背刺是什么梗
背刺什么意思 网络词语背刺是什么梗 2020-05-22
返回顶部