golang-- 字典树

关注

发布于: 20 小时前

一.前言

看了百度团队在 infoq 上发表的一篇【如何在秒级完成词表匹配】(https://xie.infoq.cn/article/97b2df7e41456335627ce4cd4)的文章，文章业务背景介绍的很清楚，里面有提到字典树，看到结构图我能明白工作原理，但是总感觉自己处于一种懂与不懂的半懂状态，希望通过写这篇文章我能彻底将字典树闹明白。

二.介绍

概念

字典树是一个存储字符串的树，一个节点的最大子节点数等于字母表的大小。支持在 O(L)时间内的搜索，插入和删除操作，L 是键的长度。

应用场景

1.可以在 O(L)时间内插入和查找字符串，其中 L 表示单个单词的长度。

2.可以轻松地按字母顺序打印所有单词。

3.可以使用字典树完成前缀搜索，这也是字典书叫前缀树的原因之一。

字典树样式

拥有字段：

（图有些问题，讲 i 视作右边最后一个节点）

b|bcd|abc|abd|abcd|efg|hi

总结

根据字典树特性，我们可以发现在敏感词场景中用字典树确实是一个很合理的选择，它采用空间换时间，保证用户可以在极短时间内获取字符串是否安全的结论，具体流程如下图所示：

针对百度的图，有以下两种情况

1.china，由于二级没有任何匹配的节点，所以安全

2.abd，在左边，有 abd 到有效节点 d，所以属于不安全

三.实现

了解了字典树的应用和特性，我们下一步要做的就是字典树的实现了,以下是我按照个人理解实现的源码，如有疑问欢迎沟通

3.1 字典树实现

type Trie struct {	valid      bool                //判断是否是有效节点	value      interface{}         //节点的值	childMap   map[interface{}]int //子节点下是否有这个节点的map记录	childNodes []*Trie             //包含的所有节点}
func NewTrie() *Trie {	t := &Trie{}	return t}//新增数据时间复杂度是O(L),L是新增字符串的长度func (this *Trie) AddWord(word []byte) {	if len(word) < 1 {		return	}	length := len(word)	if this.childNodes == nil {		//不存在子级，需要重新创建		this.childNodes = append(this.childNodes, &Trie{			valid:      length == 1,			value:      word[0],			childNodes: nil,		})		this.childMap = make(map[interface{}]int)		this.childMap[word[0]] = 1		this.childNodes[0].AddWord(word[1:])	} else {		index := this.childMap[word[0]]		if index > 0 {			//子级里面以及有这个值了			if len(word) == 1 && this.childNodes[index-1].valid == false {				//已经存在这样的子级，且此值是有效值，当前节点无效，需要进行修改				this.childNodes[index-1].valid = true				return			} else {				if length <= 1 {					return				}				//已经存在这样的子级，且此值是有效值，当前节点也有效，则重复，进行下一步				this.childNodes[index-1].AddWord(word[1:])			}		} else {			//不存在这样的子级			this.childNodes = append(this.childNodes, &Trie{				valid:      length == 1,				value:      word[0],				childNodes: nil,			})			this.childMap[word[0]] = len(this.childMap)+1			this.childNodes[len(this.childNodes)-1].AddWord(word[1:])		}	}}//删除数据func (this *Trie) DelWord(word []byte) {	//不存在则不删除	if !this.SearchNode(word){		return	}	length:=len(word)	step:=1	prefix:=this	index:=this.childMap[word[step-1]]	node:=this.childNodes[index-1]	for step!=length{		step++		index = node.childMap[word[step-1]]		prefix = node		node = node.childNodes[index-1]	}	//找到了最后一个word节点	if len(node.childNodes)!=0{		//word节点的后面还有值,只是这个节点不生效		node.valid = false	}else{		//word节点后面没有值了,删除前面节点与后续节点所有关系，依靠golanggc进行回收		delindex := prefix.childMap[node.value]		prefix.childMap[node.value] = 0		prefix.childNodes = append(prefix.childNodes[:delindex], prefix.childNodes[delindex+1:]...)	}}
//按层分布func PrintTree(tries []*Trie) {	if len(tries) <1{		return	}	//按照树打印，一次打印一个子树	var inertries []*Trie	for i := 0; i < len(tries); i++ {		fmt.Print(string(tries[i].value.(uint8))+" ")		inertries = append(inertries,tries[i].childNodes...)	}	fmt.Print("\n")	PrintTree(inertries)}
//查询，时间复杂度是O(L),L是查询字符串的长度func (this *Trie) SearchNode(word []byte) bool {	if len(word) < 1 {		return false	}	index := this.childMap[word[0]]	if index == 0 {		return false	} else {		if len(word)==1{			return this.childNodes[index-1].valid		}		return this.childNodes[index-1].SearchNode(word[1:])	}}

复制代码

3.2 测试代码


func main() {	trie := NewTrie()	trie.AddWord([]byte("abc"))	trie.AddWord([]byte("abcd"))	trie.AddWord([]byte("bcd"))	trie.AddWord([]byte("efg"))	trie.AddWord([]byte("hi"))	trie.AddWord([]byte("b"))	PrintTree(trie.childNodes)	fmt.Println(trie.SearchNode([]byte("abcd")))	fmt.Println(trie.SearchNode([]byte("abc")))	fmt.Println(trie.SearchNode([]byte("b")))	fmt.Println(trie.SearchNode([]byte("hi")))	fmt.Println(trie.SearchNode([]byte("efg")))	fmt.Println(trie.SearchNode([]byte("china")))	fmt.Println(trie.SearchNode([]byte("ab")))	fmt.Println(trie.SearchNode([]byte("")))	trie.DelWord([]byte("ab"))//应该没有影响	fmt.Println("==================")	fmt.Println(trie.SearchNode([]byte("abcd")))	fmt.Println(trie.SearchNode([]byte("abc")))	fmt.Println(trie.SearchNode([]byte("abcd")))	fmt.Println(trie.SearchNode([]byte("ab")))	trie.DelWord([]byte("abc"))//导致abc会找不到	fmt.Println("===================")	fmt.Println(trie.SearchNode([]byte("abcd")))	fmt.Println(trie.SearchNode([]byte("abc")))	fmt.Println(trie.SearchNode([]byte("abcd")))	fmt.Println(trie.SearchNode([]byte("ab")))}

复制代码

参考文件

https://www.geeksforgeeks.org/advantages-trie-data-structure/

创作场景