Java实现AC自动机全文检索示例
第一步,构建Trie树,定义Node类型:
/**
*Createdbyzhaoyyon2017/2/7.
*/
interfaceNode{
charvalue();
booleanexists();
booleanisRoot();
Nodeparent();
NodechildOf(charc);
Nodefail();
voidsetFail(Nodenode);
voidsetExists(booleanexists);
voidadd(Nodechild);
List<Node>children();
}
第二步,实现两种Node,如果词汇全是可打印的ASCII字符,就采用AsciiNode,否则(比如包含汉字),使用基于hash表的MapNode;这两种Node均集成自AbstractNode:
/**
*Createdbyzhaoyyon2017/2/8.
*/
abstractclassAbstractNodeimplementsNode{
privatestaticfinalcharEMPTY='\0';
privatefinalcharvalue;
privatefinalNodeparent;
privatebooleanexists;
privateNodefail;
publicAbstractNode(Nodeparent,charvalue){
this.parent=parent;
this.value=value;
this.exists=false;
this.fail=null;
}
publicAbstractNode(){
this(null,EMPTY);
}
privatestaticStringtab(intn){
StringBuilderbuilder=newStringBuilder();
for(inti=0;i<n;i++){
builder.append('\t');
}
returnbuilder.toString();
}
privatestaticStringtoString(Nodenode,intdepth){
StringBuilderbuilder=newStringBuilder();
Stringtab=tab(depth);
Nodefail=node.fail();
Nodeparent=node.parent();
builder
.append(tab)
.append('<')
.append(node.value())
.append("exists=\"")
.append(node.exists())
.append('"')
.append("parent=\"")
.append(parent==null?"null":parent.value())
.append('"')
.append("fail=\"")
.append(fail==null?"null":fail.value())
.append("\">\r\n");
for(Nodechild:node.children())
builder.append(toString(child,depth+1));
builder
.append(tab)
.append("</")
.append(node.value())
.append(">\r\n");
returnbuilder.toString();
}
@Override
publiccharvalue(){
returnvalue;
}
@Override
publicbooleanexists(){
returnexists;
}
@Override
publicbooleanisRoot(){
returnvalue==EMPTY;
}
@Override
publicNodeparent(){
returnparent;
}
@Override
publicNodefail(){
returnfail;
}
@Override
publicvoidsetFail(Nodenode){
this.fail=node;
}
@Override
publicvoidsetExists(booleanexists){
this.exists=exists;
}
@Override
publicStringtoString(){
returntoString(this,0);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////
/**
*Createdbyzhaoyyon2017/2/8.
*/
finalclassAsciiNodeextendsAbstractNodeimplementsNode{
privatestaticfinalcharFROM=32;
privatestaticfinalcharTO=126;
privatefinalNode[]children;
publicAsciiNode(){
super();
this.children=newNode[TO-FROM+1];
}
publicAsciiNode(Nodeparent,charvalue){
super(parent,value);
this.children=newNode[TO-FROM+1];
}
@Override
publicNodechildOf(charc){
if(c>=FROM&&c<=TO)
returnchildren[(int)c-FROM];
elsereturnnull;
}
@Override
publicvoidadd(Nodechild){
intindex=(int)child.value();
children[index-FROM]=child;
}
@Override
publicList<Node>children(){
List<Node>nodes=newArrayList<Node>();
for(Nodechild:children)
if(child!=null)
nodes.add(child);
returnnodes;
}
}
//////////////////////////////////////////////////////////////////////////////////////////////
/**
*Createdbyzhaoyyon2017/2/8.
*/
finalclassMapNodeextendsAbstractNodeimplementsNode{
privatefinalMap<Character,Node>children;
publicMapNode(){
super();
this.children=newHashMap<Character,Node>();
}
publicMapNode(Nodeparent,charvalue){
super(parent,value);
this.children=newHashMap<Character,Node>();
}
@Override
publicNodechildOf(charc){
returnchildren.get(c);
}
@Override
publicvoidadd(Nodechild){
children.put(child.value(),child);
}
@Override
publicList<Node>children(){
List<Node>nodes=newArrayList<Node>();
nodes.addAll(children.values());
returnnodes;
}
}
第三步,
首先定义一个Node构造器:
/**
*Createdbyzhaoyyon2017/2/8.
*/
publicinterfaceNodeMaker{
Nodemake(Nodeparent,charvalue);
NodemakeRoot();
}
然后构建AC自动机,实现创建及查找方法:
/**
*Createdbyzhaoyyon2017/2/7.
*/
publicfinalclassWordTable{
privatefinalNoderoot;
privateWordTable(Collection<?extendsCharSequence>words,NodeMakermaker){
Noderoot=buildTrie(words,maker);
setFailNode(root);
this.root=root;
}
publicstaticWordTablecompile(Collection<?extendsCharSequence>words){
if(words==null||words.isEmpty())
thrownewIllegalArgumentException();
finalNodeMakermaker;
if(isAscii(words))
maker=newNodeMaker(){
@Override
publicNodemake(Nodeparent,charvalue){
returnnewAsciiNode(parent,value);
}
@Override
publicNodemakeRoot(){
returnnewAsciiNode();
}
};
elsemaker=newNodeMaker(){
@Override
publicNodemake(Nodeparent,charvalue){
returnnewMapNode(parent,value);
}
@Override
publicNodemakeRoot(){
returnnewMapNode();
}
};
returnnewWordTable(words,maker);
}
privatestaticbooleanisAscii(Collection<?extendsCharSequence>words){
for(CharSequenceword:words){
intlen=word.length();
for(inti=0;i<len;i++){
intc=(int)word.charAt(i);
if(c<32||c>126)
returnfalse;
}
}
returntrue;
}
privatestaticNodebuildTrie(Collection<?extendsCharSequence>sequences,NodeMakermaker){
Noderoot=maker.makeRoot();
for(CharSequencesequence:sequences){
intlen=sequence.length();
Nodecurrent=root;
for(inti=0;i<len;i++){
charc=sequence.charAt(i);
Nodenode=current.childOf(c);
if(node==null){
node=maker.make(current,c);
current.add(node);
}
current=node;
if(i==len-1)
node.setExists(true);
}
}
returnroot;
}
privatestaticvoidsetFailNode(finalNoderoot){
root.setFail(null);
Queue<Node>queue=newLinkedList<Node>();
queue.add(root);
while(!queue.isEmpty()){
Nodeparent=queue.poll();
Nodetemp;
for(Nodechild:parent.children()){
if(parent.isRoot())
child.setFail(root);
else{
temp=parent.fail();
while(temp!=null){
Nodenode=temp.childOf(child.value());
if(node!=null){
child.setFail(node);
break;
}
temp=temp.fail();
}
if(temp==null)
child.setFail(root);
}
queue.add(child);
}
}
}
publicbooleanfindAnyIn(CharSequencecs){
intlen=cs.length();
Nodenode=root;
for(inti=0;i<len;i++){
Nodenext=node.childOf(cs.charAt(i));
if(next==null){
next=node.fail();
if(next==null){
node=root;
continue;
}
}
if(next.exists())
returntrue;
}
returnfalse;
}
publicList<MatchInfo>search(CharSequencecs){
if(cs==null||cs.length()==0)
returnCollections.emptyList();
List<MatchInfo>result=newArrayList<MatchInfo>();
intlen=cs.length();
Nodenode=root;
for(inti=0;i<len;i++){
Nodenext=node.childOf(cs.charAt(i));
if(next==null){
next=node.fail();
if(next==null){
node=root;
continue;
}
}
if(next.exists()){
MatchInfoinfo=newMatchInfo(i,next);
result.add(info);
node=root;
continue;
}
node=next;
}
returnresult;
}
@Override
publicStringtoString(){
returnroot.toString();
}
}
定义一个保存查找结果的实体:
/**
*Createdbyzhaoyyon2017/2/7.
*/
publicfinalclassMatchInfo{
privatefinalintindex;
privatefinalStringword;
publicMatchInfo(intindex,Stringword){
this.index=index;
this.word=word;
}
publicMatchInfo(intindex,Nodenode){
StringBuilderbuilder=newStringBuilder();
while(node!=null){
if(!node.isRoot())
builder.append(node.value());
node=node.parent();
}
Stringword=builder.reverse().toString();
this.index=index+1-word.length();
this.word=word;
}
publicintgetIndex(){
returnindex;
}
publicStringgetWord(){
returnword;
}
@Override
publicStringtoString(){
returnindex+":"+word;
}
}
第四步,调用Demo:
publicstaticvoidmain(String[]args){
List<String>list=Arrays.asList("say","her","he","she","shr","alone");
WordTabletable=WordTable.compile(list);
System.out.println(table);
System.out.println(table.search("1shesaynothingabouthislivinghimalone"));
}
以下是输出结果:
<exists="false"parent="null"fail="null"> <sexists="false"parent=""fail=""> <aexists="false"parent="s"fail="a"> <yexists="true"parent="a"fail=""> </y> </a> <hexists="false"parent="s"fail="h"> <eexists="true"parent="h"fail="e"> </e> <rexists="true"parent="h"fail=""> </r> </h> </s> <hexists="false"parent=""fail=""> <eexists="true"parent="h"fail=""> <rexists="true"parent="e"fail=""> </r> </e> </h> <aexists="false"parent=""fail=""> <lexists="false"parent="a"fail=""> <oexists="false"parent="l"fail=""> <nexists="false"parent="o"fail=""> <eexists="true"parent="n"fail=""> </e> </n> </o> </l> </a> </> [1:she,4:say,31:alone]
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。