服务器之家

服务器之家 > 正文

Java实现AC自动机全文检索示例

时间:2020-08-04 15:46     来源/作者:Acce1erator

第一步,构建Trie树,定义Node类型:

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/**
 * Created by zhaoyy on 2017/2/7.
 */
interface Node {
 
  char value();
 
  boolean exists();
 
  boolean isRoot();
 
  Node parent();
 
  Node childOf(char c);
 
  Node fail();
 
  void setFail(Node node);
 
  void setExists(boolean exists);
 
  void add(Node child);
 
  List<Node> children();
}

第二步,实现两种Node,如果词汇全是可打印的ASCII字符,就采用AsciiNode,否则(比如包含汉字),使用基于hash表的MapNode;这两种Node均集成自AbstractNode:

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
/**
 * Created by zhaoyy on 2017/2/8.
 */
abstract class AbstractNode implements Node {
 
  private static final char EMPTY = '\0';
  private final char value;
  private final Node parent;
  private boolean exists;
  private Node fail;
 
  public AbstractNode(Node parent, char value) {
    this.parent = parent;
    this.value = value;
    this.exists = false;
    this.fail = null;
  }
 
  public AbstractNode() {
    this(null, EMPTY);
  }
 
 
  private static String tab(int n) {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < n; i++) {
      builder.append('\t');
    }
    return builder.toString();
  }
 
  private static String toString(Node node, int depth) {
    StringBuilder builder = new StringBuilder();
    String tab = tab(depth);
    Node fail = node.fail();
    Node parent = node.parent();
    builder
        .append(tab)
        .append('<')
        .append(node.value())
        .append(" exists=\"")
        .append(node.exists())
        .append('"')
        .append(" parent=\"")
        .append(parent == null ? "null" : parent.value())
        .append('"')
        .append(" fail=\"")
        .append(fail == null ? "null" : fail.value())
        .append("\">\r\n");
    for (Node child : node.children())
      builder.append(toString(child, depth + 1));
    builder
        .append(tab)
        .append("</")
        .append(node.value())
        .append(">\r\n");
 
    return builder.toString();
  }
 
  @Override
  public char value() {
    return value;
  }
 
 
  @Override
  public boolean exists() {
    return exists;
  }
 
  @Override
  public boolean isRoot() {
    return value == EMPTY;
  }
 
  @Override
  public Node parent() {
    return parent;
  }
 
  @Override
  public Node fail() {
    return fail;
  }
 
  @Override
  public void setFail(Node node) {
    this.fail = node;
  }
 
  @Override
  public void setExists(boolean exists) {
    this.exists = exists;
  }
 
  @Override
  public String toString() {
    return toString(this, 0);
  }
}
 
/////////////////////////////////////////////////////////////////////////////////////////////
 
/**
 * Created by zhaoyy on 2017/2/8.
 */
final class AsciiNode extends AbstractNode implements Node {
 
 
  private static final char FROM = 32;
  private static final char TO = 126;
  private final Node[] children;
 
 
  public AsciiNode() {
    super();
    this.children = new Node[TO - FROM + 1];
  }
 
  public AsciiNode(Node parent, char value) {
    super(parent, value);
    this.children = new Node[TO - FROM + 1];
  }
 
  @Override
  public Node childOf(char c) {
    if (c >= FROM && c <= TO)
      return children[(int) c - FROM];
    else return null;
  }
 
  @Override
  public void add(Node child) {
    int index = (int) child.value();
    children[index - FROM] = child;
  }
 
  @Override
  public List<Node> children() {
    List<Node> nodes = new ArrayList<Node>();
    for (Node child : children)
      if (child != null)
        nodes.add(child);
    return nodes;
  }
}
 
 
//////////////////////////////////////////////////////////////////////////////////////////////
 
/**
 * Created by zhaoyy on 2017/2/8.
 */
final class MapNode extends AbstractNode implements Node {
 
  private final Map<Character, Node> children;
 
  public MapNode() {
    super();
    this.children = new HashMap<Character, Node>();
  }
 
  public MapNode(Node parent, char value) {
    super(parent, value);
    this.children = new HashMap<Character, Node>();
  }
 
  @Override
  public Node childOf(char c) {
    return children.get(c);
  }
 
  @Override
  public void add(Node child) {
    children.put(child.value(), child);
  }
 
  @Override
  public List<Node> children() {
    List<Node> nodes = new ArrayList<Node>();
    nodes.addAll(children.values());
    return nodes;
  }
}

第三步,

首先定义一个Node构造器:

?
1
2
3
4
5
6
7
8
9
/**
 * Created by zhaoyy on 2017/2/8.
 */
public interface NodeMaker {
 
  Node make(Node parent, char value);
 
  Node makeRoot();
}

然后构建AC自动机,实现创建及查找方法:

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/**
 * Created by zhaoyy on 2017/2/7.
 */
public final class WordTable {
 
  private final Node root;
 
 
  private WordTable(Collection<? extends CharSequence> words, NodeMaker maker) {
    Node root = buildTrie(words, maker);
    setFailNode(root);
    this.root = root;
  }
 
  public static WordTable compile(Collection<? extends CharSequence> words) {
    if (words == null || words.isEmpty())
      throw new IllegalArgumentException();
    final NodeMaker maker;
    if (isAscii(words))
      maker = new NodeMaker() {
        @Override
        public Node make(Node parent, char value) {
          return new AsciiNode(parent, value);
        }
 
        @Override
        public Node makeRoot() {
          return new AsciiNode();
        }
      };
    else maker = new NodeMaker() {
      @Override
      public Node make(Node parent, char value) {
        return new MapNode(parent, value);
      }
 
      @Override
      public Node makeRoot() {
        return new MapNode();
      }
    };
    return new WordTable(words, maker);
  }
 
  private static boolean isAscii(Collection<? extends CharSequence> words) {
    for (CharSequence word : words) {
      int len = word.length();
      for (int i = 0; i < len; i++) {
        int c = (int) word.charAt(i);
        if (c < 32 || c > 126)
          return false;
      }
    }
    return true;
  }
 
  private static Node buildTrie(Collection<? extends CharSequence> sequences, NodeMaker maker) {
    Node root = maker.makeRoot();
    for (CharSequence sequence : sequences) {
      int len = sequence.length();
      Node current = root;
      for (int i = 0; i < len; i++) {
        char c = sequence.charAt(i);
        Node node = current.childOf(c);
        if (node == null) {
          node = maker.make(current, c);
          current.add(node);
        }
        current = node;
        if (i == len - 1)
          node.setExists(true);
      }
    }
    return root;
  }
 
  private static void setFailNode(final Node root) {
    root.setFail(null);
    Queue<Node> queue = new LinkedList<Node>();
    queue.add(root);
    while (!queue.isEmpty()) {
      Node parent = queue.poll();
      Node temp;
      for (Node child : parent.children()) {
        if (parent.isRoot())
          child.setFail(root);
        else {
          temp = parent.fail();
          while (temp != null) {
            Node node = temp.childOf(child.value());
            if (node != null) {
              child.setFail(node);
              break;
            }
            temp = temp.fail();
          }
          if (temp == null)
            child.setFail(root);
        }
        queue.add(child);
      }
    }
  }
 
 
  public boolean findAnyIn(CharSequence cs) {
    int len = cs.length();
    Node node = root;
    for (int i = 0; i < len; i++) {
      Node next = node.childOf(cs.charAt(i));
      if (next == null) {
        next = node.fail();
        if (next == null) {
          node = root;
          continue;
        }
      }
      if (next.exists())
        return true;
    }
 
    return false;
  }
 
  public List<MatchInfo> search(CharSequence cs) {
    if (cs == null || cs.length() == 0)
      return Collections.emptyList();
    List<MatchInfo> result = new ArrayList<MatchInfo>();
    int len = cs.length();
    Node node = root;
    for (int i = 0; i < len; i++) {
      Node next = node.childOf(cs.charAt(i));
      if (next == null) {
        next = node.fail();
        if (next == null) {
          node = root;
          continue;
        }
      }
      if (next.exists()) {
        MatchInfo info = new MatchInfo(i, next);
        result.add(info);
        node = root;
        continue;
      }
      node = next;
    }
    return result;
  }
 
  @Override
  public String toString() {
    return root.toString();
  }
}

定义一个保存查找结果的实体:

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
/**
 * Created by zhaoyy on 2017/2/7.
 */
public final class MatchInfo {
 
  private final int index;
  private final String word;
 
  public MatchInfo(int index, String word) {
    this.index = index;
    this.word = word;
  }
 
  public MatchInfo(int index, Node node) {
    StringBuilder builder = new StringBuilder();
    while (node != null) {
      if (!node.isRoot())
        builder.append(node.value());
      node = node.parent();
    }
    String word = builder.reverse().toString();
    this.index = index + 1 - word.length();
    this.word = word;
  }
 
 
  public int getIndex() {
    return index;
  }
 
  public String getWord() {
    return word;
  }
 
  @Override
  public String toString() {
    return index + ":" + word;
  }
}

第四步,调用Demo:

?
1
2
3
4
5
6
public static void main(String[] args) {
    List<String> list = Arrays.asList("say", "her", "he", "she", "shr", "alone");
    WordTable table = WordTable.compile(list);
    System.out.println(table);
    System.out.println(table.search("1shesaynothingabouthislivinghimalone"));
  }

以下是输出结果:

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
< exists="false" parent="null" fail="null">
 <s exists="false" parent=" " fail=" ">
 <a exists="false" parent="s" fail="a">
  <y exists="true" parent="a" fail=" ">
  </y>
 </a>
 <h exists="false" parent="s" fail="h">
  <e exists="true" parent="h" fail="e">
  </e>
  <r exists="true" parent="h" fail=" ">
  </r>
 </h>
 </s>
 <h exists="false" parent=" " fail=" ">
 <e exists="true" parent="h" fail=" ">
  <r exists="true" parent="e" fail=" ">
  </r>
 </e>
 </h>
 <a exists="false" parent=" " fail=" ">
 &lt;l exists="false" parent="a" fail=" ">
  <o exists="false" parent="l" fail=" ">
  <n exists="false" parent="o" fail=" ">
   <e exists="true" parent="n" fail=" ">
   </e>
  </n>
  </o>
 &lt;/l>
 </a>
</ >
 
[1:she, 4:say, 31:alone]

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。

原文链接:https://my.oschina.net/u/2541538/blog/833284

标签:

相关文章

热门资讯

2020微信伤感网名听哭了 让对方看到心疼的伤感网名大全
2020微信伤感网名听哭了 让对方看到心疼的伤感网名大全 2019-12-26
歪歪漫画vip账号共享2020_yy漫画免费账号密码共享
歪歪漫画vip账号共享2020_yy漫画免费账号密码共享 2020-04-07
Intellij idea2020永久破解,亲测可用!!!
Intellij idea2020永久破解,亲测可用!!! 2020-07-29
男生常说24816是什么意思?女生说13579是什么意思?
男生常说24816是什么意思?女生说13579是什么意思? 2019-09-17
沙雕群名称大全2019精选 今年最火的微信群名沙雕有创意
沙雕群名称大全2019精选 今年最火的微信群名沙雕有创意 2019-07-07
返回顶部