1500字范文,内容丰富有趣,写作好帮手!
1500字范文 > 搜索引擎之分词器学习

搜索引擎之分词器学习

时间:2020-08-23 02:38:54

相关推荐

搜索引擎之分词器学习

分词器实现代码:

package com.zd.tokenizer;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**
 * Dictionary-based word segmenter that uses forward maximum matching over a character trie.
 *
 * <p>The dictionary file holds one word per line. Segmentation scans the text left to right and,
 * at each position, emits the longest dictionary word starting there; when no dictionary word
 * starts at that position, the single character (code point) at that position is emitted as its
 * own token.
 */
public class Tokenizer {

    /** One trie node: outgoing edges keyed by character, plus an end-of-word flag. */
    private static final class TrieNode {
        private final Map<Character, TrieNode> children = new HashMap<>();
        // An explicit flag (rather than a sentinel key in the map) so that dictionary entries
        // may themselves contain any character, including spaces.
        private boolean terminal;
    }

    /** Root of the trie; it represents the empty prefix and carries no character. */
    private final TrieNode dictionary;

    /**
     * Builds a tokenizer from a dictionary file.
     *
     * @param dictionaryFilePath path of a UTF-8 text file containing one word per line
     * @throws IOException if the file cannot be opened or read
     */
    public Tokenizer(String dictionaryFilePath) throws IOException {
        this.dictionary = new TrieNode();
        this.loadDictionary(dictionaryFilePath);
    }

    /**
     * Reads the dictionary file and inserts every non-blank, trimmed line into the trie.
     *
     * @param dictionaryFilePath path of the dictionary file
     * @throws IOException if the file cannot be opened or read
     */
    private void loadDictionary(String dictionaryFilePath) throws IOException {
        // try-with-resources guarantees the underlying file handle is released even on failure.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(dictionaryFilePath), StandardCharsets.UTF_8))) {
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                // A UTF-8 byte-order mark would otherwise become part of the first word.
                if (firstLine && !line.isEmpty() && line.charAt(0) == '\uFEFF') {
                    line = line.substring(1);
                }
                firstLine = false;

                line = line.trim();
                if (line.isEmpty()) {
                    continue;
                }

                // Walk/extend the path for this word, one character per trie level.
                TrieNode node = this.dictionary;
                for (int i = 0; i < line.length(); i++) {
                    Character key = line.charAt(i);
                    TrieNode next = node.children.get(key);
                    if (next == null) {
                        next = new TrieNode();
                        node.children.put(key, next);
                    }
                    node = next;
                }
                node.terminal = true;
            }
        }
    }

    /**
     * Splits the text into tokens using forward maximum matching.
     *
     * <p>The text is trimmed first. At each position the longest dictionary word starting there
     * is emitted (so with both "张三" and "张三丰" in the dictionary, "张三丰" wins when present and
     * "张三" is used otherwise). If no dictionary word starts at the position, the single code
     * point there is emitted as a token.
     *
     * @param text the text to segment; may be {@code null}
     * @return the tokens in order of appearance, or {@code null} if {@code text} is {@code null}
     *     or blank after trimming
     */
    public List<String> participie(String text) {
        if (text == null) {
            return null;
        }
        text = text.trim();
        if (text.length() == 0) {
            return null;
        }

        List<String> tokens = new ArrayList<>();
        int start = 0;
        while (start < text.length()) {
            TrieNode node = this.dictionary;
            // Exclusive end index of the longest complete word seen so far; -1 means none.
            // Remembering it lets us fall back to a shorter word when a longer path dead-ends.
            int matchEnd = -1;
            for (int j = start; j < text.length(); j++) {
                node = node.children.get(text.charAt(j));
                if (node == null) {
                    break;
                }
                if (node.terminal) {
                    matchEnd = j + 1;
                }
            }

            if (matchEnd < 0) {
                // No dictionary word starts here: emit one code point, keeping surrogate
                // pairs together so the token is always a valid string.
                matchEnd = start + Character.charCount(text.codePointAt(start));
            }
            tokens.add(text.substring(start, matchEnd));
            start = matchEnd;
        }
        return tokens;
    }

    /**
     * Demo entry point: loads {@code /dictionary.txt} from the classpath, segments a sample
     * sentence and prints one token per line.
     *
     * @param args unused
     * @throws IOException if the dictionary cannot be read
     */
    public static void main(String[] args) throws IOException {
        String dictionaryPath = Objects.requireNonNull(
                Tokenizer.class.getResource("/dictionary.txt"),
                "dictionary.txt not found on the classpath").getPath();
        Tokenizer tk = new Tokenizer(dictionaryPath);
        List<String> tokens = tk.participie("想过离开,以这种方式存在,是因为那些旁白那些姿态那些伤害");
        for (String s : tokens) {
            System.out.println(s);
        }
    }
}

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。