Basic Use of Chinese Word Segmentation in Lucene 8.x
This article demonstrates the basics only through a simple Document-based example.
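Before the indexing example, here is a minimal sketch (not part of the original article; the class name AnalyzerDemo, the field name "content", and the sample sentence are chosen purely for illustration) of what SmartChineseAnalyzer does with a Chinese sentence: it reads the text through a TokenStream and prints one segmented term per line, showing that the analyzer produces word-level terms rather than single characters.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;

public class AnalyzerDemo {
    public static void main(String[] args) throws IOException {
        // SmartChineseAnalyzer segments Chinese text into words instead of single characters
        try (SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
             TokenStream tokenStream = analyzer.tokenStream("content", "廣州在哪里怎么走,有什么好玩的地方")) {
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.println(term.toString()); // one segmented term per line
            }
            tokenStream.end();
        }
    }
}

The exact terms depend on the analyzer's dictionary; the point is that the query text and the indexed text in the example below are segmented by the same analyzer, which is why the Chinese query can match the indexed content.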
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Paths;

/**
 * @author liangzh
 * @create 2021-05-05-15:51
 */
public class Lucene_Index_CRUD {

    public static void main(String[] args) {
        // Add documents to the index first, then search/update/delete as needed
        // addIndex();
        searchIndex();
        // updateIndex();
        // deleteIndex();
    }

    private static void searchIndex() {
        try {
            Directory directory = FSDirectory.open(Paths.get("Lucene_db"));
            IndexReader indexReader = DirectoryReader.open(directory);
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);

            // A TermQuery does not analyze the keyword:
            // Query query = new TermQuery(new Term("title", "title1"));
            SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
            QueryParser queryParser = new QueryParser("title", analyzer);
            Query query = queryParser.parse("廣州");

            TopDocs topDocs = indexSearcher.search(query, 10);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            System.out.println("Number of hits: " + scoreDocs.length);
            for (int i = 0; i < scoreDocs.length; i++) {
                int id = scoreDocs[i].doc;
                float score = scoreDocs[i].score;
                System.out.println("id:" + id + "; score:" + score);
                Document document = indexSearcher.doc(id);
                String articleId = document.get("articleId");
                String title = document.get("title");
                String content = document.get("content");
                System.out.println("articleId: " + articleId + "; title:" + title + "; content:" + content);
            }
        } catch (IOException | ParseException e) {
            e.printStackTrace();
        }
    }

    private static void deleteIndex() {
        try {
            // Location of the index directory
            Directory directory = FSDirectory.open(Paths.get("Lucene_db"));
            // Create the analyzer; SmartChineseAnalyzer is used here
            Analyzer analyzer = new SmartChineseAnalyzer();
            // IndexWriterConfig configures how the index is created/opened
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
            indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            // IndexWriter performs add, delete and update operations on the index
            IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
            // Delete a single document
            indexWriter.deleteDocuments(new Term("articleId", "0002"));
            // Delete everything:
            // indexWriter.deleteAll();
            // Commit the changes
            indexWriter.commit();
            // Close the writer
            indexWriter.close();
            System.out.println("========== index deleted successfully ==========");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // An update deletes the old document and adds the new one
    private static void updateIndex() {
        try {
            // Location of the index directory
            Directory directory = FSDirectory.open(Paths.get("Lucene_db"));
            // Create the analyzer; SmartChineseAnalyzer is used here
            Analyzer analyzer = new SmartChineseAnalyzer();
            // IndexWriterConfig configures how the index is created/opened
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
            indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            // IndexWriter performs add, delete and update operations on the index
            IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
            // In Lucene, one Document instance represents one record
            Document document2 = new Document();
            // StringField is indexed as a single token (not analyzed);
            // Store.YES means the original value is stored and can be read back from search results
            document2.add(new StringField("articleId", "0002", Field.Store.YES));
            document2.add(new TextField("title", "title1", Field.Store.YES));
            document2.add(new TextField("content", "廣州在哪里怎么走,有什么好玩的地方", Field.Store.YES));
            // Replace the document matching the Term with the new document
            indexWriter.updateDocument(new Term("articleId", "0001"), document2);
            // Commit the changes
            indexWriter.commit();
            // Close the writer
            indexWriter.close();
            System.out.println("========== index updated successfully ==========");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private static void addIndex() {
        try {
            // Location of the index directory
            Directory directory = FSDirectory.open(Paths.get("Lucene_db"));
            // Create the analyzer; here the Chinese analyzer is used
            SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
            // IndexWriterConfig configures how the index is created/opened
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
            indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            // IndexWriter performs add, delete and update operations on the index
            IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
            // In Lucene, one Document instance represents one record
            Document document = new Document();
            // StringField is indexed as a single token (not analyzed);
            // Store.YES means the original value is stored and can be read back from search results
            document.add(new StringField("articleId", "0003", Field.Store.YES));
            document.add(new TextField("title", "廣州怎么走", Field.Store.YES));
            document.add(new TextField("content", "廣州在哪里怎么走,有什么好玩的地方", Field.Store.YES));
            // Write the document to the index
            indexWriter.addDocument(document);
            // Commit the changes
            indexWriter.commit();
            // Close the writer
            indexWriter.close();
            System.out.println("========== index added successfully ==========");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
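Note: besides lucene-core, the example assumes the lucene-queryparser and lucene-analyzers-smartcn artifacts (SmartChineseAnalyzer ships in the smartcn module) are on the classpath at a matching 8.x version; the original article does not list the exact build coordinates. To try it end to end, run addIndex() first so that searchIndex() has documents in Lucene_db to match.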