[分词] 基于 Lucene8 版本的逆向最大匹配分词器 (依赖本地词典)
作者:alexgaoyh
- 2024-01-24 河南
本文字数:1981 字
阅读完需:约 6 分钟
介绍
在搜索相关业务场景中,强依赖中文分词的结果,本文是基于Lucene8版本的混合分词器(分词合并)的补充,可以将自定义分词的结果进行添加。
实现代码
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;
/**
 * Lucene {@link Analyzer} that segments text by reverse maximum matching against a
 * local dictionary: the input is scanned from the end, and at each position the
 * longest dictionary word ending there is taken. Only dictionary words are emitted;
 * characters not covered by any dictionary word are skipped.
 */
public class ReverseMaximumMatchingAnalyzer extends Analyzer {

    /** Words that may be emitted as tokens. */
    private final Set<String> dictionary;

    /**
     * Creates an analyzer backed by a plain-text word list (UTF-8, one word per line).
     *
     * @param dictionaryPath path to the dictionary file
     * @throws UncheckedIOException if the dictionary cannot be read
     */
    public ReverseMaximumMatchingAnalyzer(Path dictionaryPath) {
        this.dictionary = loadDictionary(dictionaryPath);
    }

    /**
     * Loads one trimmed word per line, skipping blank lines (a blank line would
     * otherwise put the empty string into the set and match zero-length tokens).
     */
    private static Set<String> loadDictionary(Path dictionaryPath) {
        Set<String> words = new HashSet<>();
        try (BufferedReader reader = Files.newBufferedReader(dictionaryPath, StandardCharsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null) {
                String word = line.trim();
                if (!word.isEmpty()) {
                    words.add(word);
                }
            }
        } catch (IOException e) {
            // Fail fast instead of silently returning an empty dictionary, which
            // would make the analyzer emit no tokens at all with no indication why.
            throw new UncheckedIOException("Failed to load dictionary: " + dictionaryPath, e);
        }
        return words;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new ReverseMaximumMatchingTokenizer(dictionary);
        TokenFilter filter = new LowerCaseFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, filter);
    }

    /**
     * Tokenizer performing reverse maximum matching. The whole input is buffered
     * up front (so words can never be split at a read boundary), segmented from
     * the end of the text, and the resulting spans are emitted in forward offset
     * order, as Lucene consumers expect.
     */
    private static class ReverseMaximumMatchingTokenizer extends Tokenizer {

        private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
        private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);

        private final Set<String> dictionary;
        /** Length of the longest dictionary word; bounds the match window. */
        private final int maxWordLength;

        /** Entire input text, read once per reset. */
        private char[] text = new char[0];
        private int length = 0;
        /** True once the input has been read and segmented for this stream. */
        private boolean consumed = false;
        /** Pre-computed [start, end) spans of the tokens, in forward order. */
        private final Deque<int[]> pending = new ArrayDeque<>();

        public ReverseMaximumMatchingTokenizer(Set<String> dictionary) {
            this.dictionary = dictionary;
            int max = 1;
            for (String word : dictionary) {
                max = Math.max(max, word.length());
            }
            this.maxWordLength = max;
        }

        @Override
        public boolean incrementToken() throws IOException {
            if (!consumed) {
                readAllInput();
                segment();
                consumed = true;
            }
            int[] span = pending.pollFirst();
            if (span == null) {
                return false;
            }
            clearAttributes();
            termAttribute.copyBuffer(text, span[0], span[1] - span[0]);
            offsetAttribute.setOffset(correctOffset(span[0]), correctOffset(span[1]));
            return true;
        }

        /**
         * Reads the whole input into {@link #text}. The original implementation set
         * {@code length = buffer.length} regardless of how many chars {@code read}
         * returned (tokenizing stale buffer contents) and discarded any token that
         * straddled a buffer refill; buffering the full input fixes both problems.
         */
        private void readAllInput() throws IOException {
            StringBuilder collected = new StringBuilder();
            char[] chunk = new char[1024];
            int read;
            while ((read = input.read(chunk)) != -1) {
                collected.append(chunk, 0, read);
            }
            length = collected.length();
            text = new char[length];
            collected.getChars(0, length, text, 0);
        }

        /** Runs reverse maximum matching over {@link #text}, filling {@link #pending}. */
        private void segment() {
            int end = length;
            while (end > 0) {
                int matchStart = -1;
                int longest = Math.min(maxWordLength, end);
                // Longest-first: take the longest dictionary word ending at 'end'.
                for (int len = longest; len >= 1; len--) {
                    if (dictionary.contains(new String(text, end - len, len))) {
                        matchStart = end - len;
                        break;
                    }
                }
                if (matchStart >= 0) {
                    // addFirst keeps spans in forward order while we scan backwards.
                    pending.addFirst(new int[] {matchStart, end});
                    end = matchStart;
                } else {
                    end--; // no dictionary word ends here; skip this character
                }
            }
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            text = new char[0];
            length = 0;
            consumed = false;
            pending.clear();
        }

        @Override
        public void end() throws IOException {
            super.end();
            // Report the final offset via the attribute (the original only mutated
            // a private field here, which no consumer could observe).
            offsetAttribute.setOffset(correctOffset(length), correctOffset(length));
        }
    }
}
复制代码
总结
此代码在 Lucene8.6.2 版本下通过测试,对中文分词补充基于逆向最大匹配的分词结果,从而提升质量。
参考
划线
评论
复制
发布于: 刚刚 · 阅读数: 2
版权声明: 本文为 InfoQ 作者【alexgaoyh】的原创文章。
原文链接:【http://xie.infoq.cn/article/7a1f2dcd7184cc6c031713e39】。
本文遵守【CC-BY 4.0】协议,转载请保留原文出处及本版权声明。
alexgaoyh
关注
DevOps 2013-12-08 加入
https://gitee.com/alexgaoyh







评论