分类: 大数据
2023-01-30 10:50:51
作者:京东物流 马瑞
Trie 树,即字典树,又称单词查找树或键树,是一种树形结构,典型应用是用于统计,排序和保存大量的字符串(但不仅限于字符串),所以经常被搜索引擎系统用于文本词频统计。它的优点是:利用字符串的公共前缀来减少查询时间,{BANNED}最佳大限度地减少无谓的字符串比较,查询效率比哈希树高。
Trie, also called digital tree and sometimes radix tree or prefix tree (as they can be searched by prefixes), is a kind of search tree—an ordered tree data structure that is used to store a dynamic set or associative array where the keys are usually strings. It is one of those data-structures that can be easily implemented.
{BANNED}最佳大限度地减少无谓的字符串比较,查询效率比较高。核心思想是空间换时间,利用字符串的公共前缀来降低查询时间的开销以达到提高效率的目的。
以字符串”hi” 和” 经海路” 的数据为例:
Java 的数据结构定义:
@Data public class TrieTreeNode { private Character data; private Map children; private boolean isEnd; // 前缀,冗余信息,可选 private String prefix; // 后缀,冗余信息,可选 private String suffix;
}
如果只是处理 26 个英文字符,data 可以通过 children 数组是否为空来判断。如果处理程序,默认 children 为空来判断是否为{BANNED}最佳后一个节点,则 isEnd 字段可以省略。
前缀 prefix 和 suffix 可以在数据处理的时候,方便拿到当前节点前缀和后缀信息,如果不需要可以去除。
public class KeyWord implements Serializable { /**
* 关键词内容
*/ private String word; //其他省略 }
public class KWSeeker { /**
* 所有的关键词
*/ private Set sensitiveWords; /**
* 关键词树
*/ private Map<String, Map> wordsTree = new ConcurrentHashMap<String, Map>(); /**
* {BANNED}最佳短的关键词长度。用于对短于这个长度的文本不处理的判断,以节省一定的效率
*/ private int wordLeastLen = 0; //其他处理方法,省略 }
/**
* 将指定的词构造到一棵树中。
*
* @param tree 构造出来的树
* @param word 指定的词
* @param KeyWord 对应的词
* @return
*/ public static Map<String, Map> makeTreeByWord(Map<String, Map> tree, String word,
KeyWord KeyWord) { if (StringUtils.isEmpty(word)) {
tree.putAll(EndTagUtil.buind(KeyWord)); return tree;
} String next = word.substring(0, 1); Map<String, Map> nextTree = tree.get(next); if (nextTree == null) {
nextTree = new HashMap<String, Map>();
} // 递归构造树结构 tree.put(next, makeTreeByWord(nextTree, word.substring(1), KeyWord)); return tree;
}
对关键词字符串,逐个字符进行构建。
/**
* 构造、生成词库树。并返回所有敏感词中{BANNED}最佳短的词的长度。
*
* @param sensitiveWords 词库
* @param wordsTree 聚合词库的树
* @return 返回所有敏感词中{BANNED}最佳短的词的长度。
*/ public int generalTree(Set sensitiveWords, Map wordsTree) { if (sensitiveWords == null || sensitiveWords.isEmpty() || wordsTree == null) { return 0;
}
wordsTreeTmp.clear(); int len = 0; for (KeyWord kw : sensitiveWords) { if (len == 0) {
len = kw.getWordLength();
} else if (kw.getWordLength() < len) {
len = kw.getWordLength();
}
AnalysisUtil
.makeTreeByWord(wordsTreeTmp, StringUtils.trimToEmpty(kw.getWord()), kw);
}
wordsTree.clear();
wordsTree.putAll(wordsTreeTmp); return len;
}
/**
* 将文本中的关键词提取出来。
*/ public List process(Map<String, Map> wordsTree, String text,
AbstractFragment fragment, int minLen) { // 词的前面一个字 String pre = null; // 词匹配的开始位置 int startPosition = 0; // 返回结果 List rs = new ArrayList(); while (true) { try { if (wordsTree == null || wordsTree.isEmpty() || StringUtils.isEmpty(text)) { return rs;
} if (text.length() < minLen) { return rs;
} String chr = text.substring(0, 1);
text = text.substring(1); Map<String, Map> nextWord = wordsTree.get(chr); // 没有对应的下一个字,表示这不是关键词的开头,进行下一个循环 if (nextWord == null) {
pre = chr; continue;
}
List keywords = Lists.newArrayList();
KeyWord kw = AnalysisUtil.getSensitiveWord(chr, pre, nextWord, text, keywords); if (keywords == null || keywords.size() == 0) { // 没有匹配到完整关键字,下一个循环 pre = chr; continue;
} for (KeyWord tmp : keywords) { // 同一个word多次出现记录在一起 SensitiveWordResult result = new SensitiveWordResult(startPosition, tmp.getWord());
int index = rs.indexOf(result); if (index > -1) {
rs.get(index).addPosition(startPosition, tmp.getWord());
} else {
rs.add(result);
}
} // 从text中去除当前已经匹配的内容,进行下一个循环匹配 // 这行注释了,避免"中国人",导致"国人"。搜索不出来,逐个字符遍历 // text = text.substring(kw.getWordLength() - 1); pre = kw.getWord().substring(kw.getWordLength() - 1, kw.getWordLength()); continue;
} finally { if (pre != null) {
startPosition = startPosition + pre.length();
}
}
}
} /**
* 查询文本开头的词是否在词库树中,如果在,则返回对应的词,如果不在,则返回null。return 返回找到的{BANNED}最佳长关键词
*
* @param append 追加的词
* @param pre 词的前一个字,如果为空,则表示前面没有内容
* @param nextWordsTree 下一层树
* @param text 剩余的文本内容
* @param keywords 返回的keywords,可能多个
* @return 返回找到的{BANNED}最佳长关键词
*/ public static KeyWord getSensitiveWord(String append, String pre, Map<String, Map> nextWordsTree, String text, List keywords) { if (nextWordsTree == null || nextWordsTree.isEmpty()) { return null;
} Map<String, Object> endTag = nextWordsTree.get(EndTagUtil.TREE_END_TAG); // 原始文本已到末尾 if (StringUtils.isEmpty(text)) { // 如果有结束符,则表示匹配成功,没有,则返回null if (endTag != null) {
keywords.add(checkPattern(getKeyWord(append, endTag), pre, null)); return checkPattern(getKeyWord(append, endTag), pre, null);
} else { return null;
}
} String next = text.substring(0, 1); String suffix = text.substring(0, 1); Map<String, Map> nextTree = nextWordsTree.get(next); // 没有遇到endTag,继续匹配 if (endTag == null) { if (nextTree != null && nextTree.size() > 0) { // 没有结束标志,则表示关键词没有结束,继续往下走。 return getSensitiveWord(append + next, pre, nextTree, text.substring(1), keywords);
} // 如果没有下一个匹配的字,表示匹配结束! return null;
} else { // endTag , 添加关键字 KeyWord tmp = getKeyWord(append, endTag);
keywords.add(checkPattern(tmp, pre, suffix));
} // 有下一个匹配的词则继续匹配,一直取到{BANNED}最佳大的匹配关键字 KeyWord tmp = null; if (nextTree != null && nextTree.size() > 0) { // 如果大于0,则表示还有更长的词,继续往下找 tmp = getSensitiveWord(append + next, pre, nextTree, text.substring(1), keywords); if (tmp == null) { // 没有更长的词,则就返回这个词。在返回之前,先判断它是模糊的,还是精确的 tmp = getKeyWord(append, endTag);
} return checkPattern(tmp, pre, suffix);
} // 没有往下的词了,返回该关键词。 return checkPattern(getKeyWord(append, endTag), pre, suffix);
}
思路是对某个字符串 text,逐个字符 ch,获取 ch 对应的词库树的 children,然后获取匹配到的单个或多个结果,将匹配到的关键词在 text 中的开始和结束下标进行记录,如后续需要 html 标记,或者字符替换可直接使用。如果未能在词库树中找到对应的 ch 的 children,或者词库树的 children 未能匹配到去除 ch 的子字符串,则继续循环。具体可再详细读一下代码。
Redis 实现了不定长压缩前缀的 radix tree,用在集群模式下存储 slot 对应的的所有 key 信息。
/* Representation of a radix tree as implemented in this file, that contains
* the strings "foo", "foobar" and "footer" after the insertion of each
* word. When the node represents a key inside the radix tree, we write it
* between [], otherwise it is written between ().
*
* This is the vanilla representation:
*
* (f) ""
* \
* (o) "f"
* \
* (o) "fo"
* \
* [t b] "foo"
* / \
* "foot" (e) (a) "foob"
* / \
* "foote" (r) (r) "fooba"
* / \
* "footer" [] [] "foobar"
*
* However, this implementation implements a very common optimization where
* successive nodes having a single child are "compressed" into the node
* itself as a string of characters, each representing a next-level child,
* and only the link to the node representing the last character node is
* provided inside the representation. So the above representation is turned
* into:
*
* ["foo"] ""
* |
* [t b] "foo"
* / \
* "foot" ("er") ("ar") "foob"
* / \
* "footer" [] [] "foobar"
*
* However this optimization makes the implementation a bit more complex.
* For instance if a key "first" is added in the above radix tree, a
* "node splitting" operation is needed, since the "foo" prefix is no longer
* composed of nodes having a single child one after the other. This is the
* above tree and the resulting node splitting after this event happens:
*
*
* (f) ""
* /
* (i o) "f"
* / \
* "firs" ("rst") (o) "fo"
* / \
* "first" [] [t b] "foo"
* / \
* "foot" ("er") ("ar") "foob"
* / \
* "footer" [] [] "foobar"
*
* Similarly after deletion, if a new chain of nodes having a single child
* is created (the chain must also not include nodes that represent keys),
* it must be compressed back into a single node.
*
*/ #define RAX_NODE_MAX_SIZE ((1<<29)-1) typedef struct raxNode { uint32_t iskey:1; /* Does this node contain a key? */ uint32_t isnull:1; /* Associated value is NULL (don't store it). */ uint32_t iscompr:1; /* Node is compressed. */ uint32_t size:29; /* Number of children, or compressed string len. */ unsigned char data[];
} raxNode; typedef struct rax { raxNode *head; uint64_t numele; uint64_t numnodes;
} rax; typedef struct raxStack { void **stack; /* Points to static_items or an heap allocated array. */ size_t items, maxitems; /* Number of items contained and total space. */ void *static_items[RAX_STACK_STATIC_ITEMS]; int oom; /* True if pushing into this stack failed for OOM at some point. */ } raxStack;
如 Redis 源码中的注释所写,RAX 进行了一些优化,并不会将一个字符串直接按照每个字符进行树的构建,而是在 Insert 有冲突时节点分割处理,在 Delete 时如果子节点和父节点都只有一个,则需要进行合并操作。
对于 RAX 有兴趣的同学,可以看一下 rax.h、rax.c 的相关代码。
Linux radix 树{BANNED}最佳广泛的用途是用于内存管理,结构 address_space 通过 radix 树跟踪绑定到地址映射上的核心页,该 radix 树允许内存管理代码快速查找标识为 dirty 或 writeback 的页。Linux radix 树的 API 函数在 lib/radix-tree.c 中实现。
Linux 基数树(radix tree)是将指针与 long 整数键值相关联的机制,它存储有效率,并且可快速查询,用于指针与整数值的映射(如:IDR 机制)、内存管理等。
struct radix_tree_node { unsigned int path; unsigned int count; union { struct { struct radix_tree_node *parent; void *private_data;
}; struct rcu_head rcu_head; }; /* For tree user */ struct list_head private_list; void __rcu *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
};
关于 Linux 内核使用 Radix Tree 的具体代码,有兴趣的同学可以继续深入。
Trie 树在单词搜索、统计、排序等领域有大量的应用。文章从基础概念到具体的脏话过滤的应用、Redis 的 RAX 和 Linux 内核的 Radix Tree 对 Trie 树做了介绍。数据结构和算法是程序高性能的基础,本文抛砖引玉,希望大家对 Trie 树有所了解,并在未来开发过程实践和应用 Trie 树解决中类似情景的问题。