当前位置：首页 > news >正文

使用Apache Lucene构建高效的全文搜索服务

news 2025/9/24 15:07:35

使用Apache Lucene构建高效的全文搜索服务

在现代应用程序中，全文搜索功能是不可或缺的一部分。无论是电子商务网站、内容管理系统，还是数据分析平台，快速、准确地搜索大量数据是提升用户体验的关键。Apache Lucene 是一个强大的全文搜索引擎库，它提供了高效的索引和搜索功能，能够轻松集成到Java应用程序中。本文将介绍如何使用Apache Lucene构建一个高效的全文搜索服务，并通过一个实际的Java代码示例来展示其核心功能。

1. Lucene简介

Apache Lucene 是一个高性能、全功能的文本搜索引擎库，使用Java编写。它提供了强大的索引和搜索功能，支持多种查询类型，如布尔查询、范围查询、模糊查询等。Lucene的核心优势在于其高效的索引结构和灵活的API，使得开发者可以轻松地构建复杂的搜索功能。

2. 项目结构

在这个示例中，我们将构建一个简单的搜索服务，用于索引和搜索拍卖交易历史记录（AtcoinDealhistory）。项目的主要类 LuceneService 负责管理索引的创建、更新和搜索操作。

3. 索引创建与更新

在Lucene中，索引的创建和更新是通过 IndexWriter 来完成的。IndexWriter 负责将文档（Document）添加到索引中，并确保索引的高效存储和检索。

/**
 * Writes the given deal-history records into the Lucene index.
 *
 * <p>Each record becomes one document with the stored fields {@code id},
 * {@code auction_id}, {@code auction_name}, {@code amount}, {@code data},
 * {@code picture} and, when a deal date is present, {@code dealdate}
 * (formatted {@code yyyy-MM-dd}) plus a {@code date} doc-values field holding
 * the epoch milliseconds for sorting.
 *
 * <p>Records without an id or an auction id are skipped instead of being
 * indexed as empty documents, and individual null field values are simply left
 * out of the document. All documents are committed once after the loop rather
 * than once per document.
 *
 * @param list records to index; {@code null} is treated as an empty list
 * @throws IOException if the index cannot be opened or written
 */
public void indexDocument(List<AtcoinDealhistory> list) throws IOException {
    long startTime = System.currentTimeMillis();

    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    // NOTE(review): with at most 20 buffered documents the 2048 MB RAM buffer
    // is unlikely to ever be the flush trigger - confirm these values are intended.
    config.setMaxBufferedDocs(20);
    config.setRAMBufferSizeMB(2048.0);
    config.setUseCompoundFile(true);

    // Created once per call; SimpleDateFormat is not thread-safe, so it stays local.
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

    // try-with-resources guarantees the writer (and its write lock) is released.
    try (IndexWriter indexWriter = new IndexWriter(directory, config)) {
        if (list != null) {
            for (AtcoinDealhistory atcoinDealhistory : list) {
                // Without these two keys the document could neither be identified nor
                // mapped back to a record by the search side, so skip the record.
                if (atcoinDealhistory == null
                        || atcoinDealhistory.getId() == null
                        || atcoinDealhistory.getAuctionId() == null) {
                    continue;
                }

                Document doc = new Document();
                doc.add(new StringField("id", atcoinDealhistory.getId().toString(), Store.YES));
                doc.add(new StringField("auction_id", atcoinDealhistory.getAuctionId(), Store.YES));
                addStoredText(doc, "auction_name", atcoinDealhistory.getAuctionName());
                addStoredString(doc, "amount", atcoinDealhistory.getAmount());
                addStoredText(doc, "data", atcoinDealhistory.getData());
                addStoredString(doc, "picture", atcoinDealhistory.getPicture());

                if (atcoinDealhistory.getDealdate() != null) {
                    // Doc-values field used for sorting by date.
                    doc.add(new SortedNumericDocValuesField(
                            "date", atcoinDealhistory.getDealdate().getTime()));
                    // Stored string form, used for display and range queries.
                    doc.add(new StringField(
                            "dealdate", dateFormat.format(atcoinDealhistory.getDealdate()), Store.YES));
                }

                indexWriter.addDocument(doc);
            }
        }
        // One commit for the whole batch; committing per document forces a
        // flush and fsync every time and dominates the indexing cost.
        indexWriter.commit();
    }

    long endTime = System.currentTimeMillis();
    System.out.println("Index creation time: " + (endTime - startTime) + " milliseconds");
}

/** Adds a stored, untokenized field unless the value is null. */
private static void addStoredString(Document doc, String name, String value) {
    if (value != null) {
        doc.add(new StringField(name, value, Store.YES));
    }
}

/** Adds a stored, analyzed field unless the value is null. */
private static void addStoredText(Document doc, String name, String value) {
    if (value != null) {
        doc.add(new TextField(name, value, Store.YES));
    }
}

在这个方法中，我们首先配置了 IndexWriter，然后遍历 AtcoinDealhistory 对象列表，将每个对象的字段添加到 Document 中，并将其写入索引。我们还处理了日期字段，确保它们可以用于排序和存储。

4. 搜索功能

Lucene 的搜索功能是通过 IndexSearcher 来实现的。IndexSearcher 负责执行查询并返回匹配的文档。我们可以使用多种查询类型来构建复杂的搜索条件。

/**
 * Searches the index using the criteria carried by the given object.
 *
 * <p>Three optional criteria are combined with {@code MUST}: an inclusive
 * deal-date range (both bounds required), an inclusive amount range (both
 * bounds required) and one term clause per analyzed token of the auction name.
 * At most 100 hits are returned, ordered by score and then by date descending.
 *
 * @param atcoinDealhistory holder of the query criteria
 * @return the matching records rebuilt from the stored fields; never {@code null}
 * @throws IOException if the index cannot be opened or read
 * @throws org.apache.lucene.queryparser.classic.ParseException declared for
 *         callers; not thrown by the code visible here
 */
public List<AtcoinDealhistory> search(AtcoinDealhistory atcoinDealhistory)
        throws IOException, org.apache.lucene.queryparser.classic.ParseException {
    List<AtcoinDealhistory> results = new ArrayList<>();
    long startTime = System.currentTimeMillis();

    try (DirectoryReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();

        // Date range: "dealdate" is indexed as a yyyy-MM-dd string, so a
        // lexicographic term range is also a chronological range.
        String startQueryDealDate = atcoinDealhistory.getStartQueryDealDate();
        String endQueryDealDate = atcoinDealhistory.getEndQueryDealDate();
        if (startQueryDealDate != null && endQueryDealDate != null) {
            TermRangeQuery dateRangeQuery = TermRangeQuery.newStringRange(
                    "dealdate", startQueryDealDate, endQueryDealDate, true, true);
            booleanQueryBuilder.add(dateRangeQuery, Occur.MUST);
        }

        // Amount range.
        // NOTE(review): this compares the decimal strings lexicographically
        // ("100" sorts before "20"), so it is only numerically correct when all
        // amounts have the same number of digits. A proper fix needs the amount
        // indexed as a numeric point field, which is outside this method.
        Integer startQueryAmount = atcoinDealhistory.getStartQueryAmount();
        Integer endQueryAmount = atcoinDealhistory.getEndQueryAmount();
        if (startQueryAmount != null && endQueryAmount != null) {
            TermRangeQuery amountRangeQuery = TermRangeQuery.newStringRange(
                    "amount",
                    Integer.toString(startQueryAmount),
                    Integer.toString(endQueryAmount),
                    true,
                    true);
            booleanQueryBuilder.add(amountRangeQuery, Occur.MUST);
        }

        // Keyword clauses: skipped when no auction name was supplied, so a
        // range-only search no longer fails on a null query string.
        String auctionName = atcoinDealhistory.getAuctionName();
        if (auctionName != null) {
            for (String term : analyzeQueryString(analyzer, auctionName)) {
                booleanQueryBuilder.add(new TermQuery(new Term("auction_name", term)), Occur.MUST);
            }
        }

        // "date" is written as a SortedNumericDocValuesField, which must be sorted
        // with SortedNumericSortField; a plain SortField of type LONG expects
        // NUMERIC doc values and is rejected by Lucene at search time.
        SortField sortDate = new org.apache.lucene.search.SortedNumericSortField(
                "date", SortField.Type.LONG, true);
        Sort sort = new Sort(SortField.FIELD_SCORE, sortDate);

        ScoreDoc[] hits = searcher.search(booleanQueryBuilder.build(), 100, sort).scoreDocs;
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

        for (ScoreDoc hit : hits) {
            Document doc = searcher.doc(hit.doc);
            AtcoinDealhistory result = new AtcoinDealhistory();
            result.setId(Long.valueOf(doc.get("id")));
            result.setAuctionId(doc.get("auction_id"));
            result.setAuctionName(doc.get("auction_name"));
            result.setAmount(doc.get("amount"));
            result.setData(doc.get("data"));
            result.setPicture(doc.get("picture"));

            String storedDealDate = doc.get("dealdate");
            if (storedDealDate != null) {
                try {
                    result.setDealdate(dateFormat.parse(storedDealDate));
                } catch (java.text.ParseException e) {
                    // A malformed stored date should not discard the whole hit.
                    e.printStackTrace();
                }
            }
            results.add(result);
        }
    }

    long endTime = System.currentTimeMillis();
    System.out.println("Index search time: " + (endTime - startTime) + " milliseconds");
    return results;
}

在这个方法中，我们首先打开索引目录并创建 IndexSearcher。然后，我们构建了一个布尔查询（BooleanQuery），用于处理日期范围、金额范围和关键词查询。最后，我们执行查询并遍历结果，将匹配的文档转换为 AtcoinDealhistory 对象并返回。

5. 分词器

Lucene 的分词器（Analyzer）用于将文本分解为单词或词语。在这个示例中，我们使用了 StandardAnalyzer，它是 Lucene 提供的标准分词器，适用于大多数英文文本；对于中文文本，它只会按单个汉字切分，如果需要更好的中文分词效果，可以换用下文介绍的 IK 分词器（即构造函数中被注释掉的 ReIKAnalyzer）。

	/**
	 * Creates the service: picks the analyzer and opens the index directory
	 * located at {@code INDEX_DIR}.
	 *
	 * @throws IOException if the index directory cannot be opened
	 */
	public LuceneService() throws IOException {
		// Standard Lucene analyzer. The original left `new ReIKAnalyzer(false)`
		// commented out here as a third-party alternative.
		analyzer = new StandardAnalyzer();
		// File-system directory holding the index.
		directory = FSDirectory.open(Paths.get(INDEX_DIR));
	}

/**
 * Runs the query string through the analyzer and collects the resulting terms.
 *
 * <p>The text is analyzed as content of the {@code auction_name} field.
 *
 * @param analyzer    analyzer used to tokenize the text
 * @param queryString text to tokenize; may be {@code null}
 * @return the terms in the order the analyzer produced them; an empty list
 *         when {@code queryString} is {@code null}
 * @throws IOException if the token stream fails
 */
public List<String> analyzeQueryString(Analyzer analyzer, String queryString) throws IOException {
    List<String> terms = new ArrayList<>();
    // StringReader rejects null, so a missing keyword simply yields no terms.
    if (queryString == null) {
        return terms;
    }

    try (TokenStream tokenStream = analyzer.tokenStream("auction_name", new StringReader(queryString))) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        // The TokenStream workflow requires reset() before the first incrementToken()
        // and end() after the last one.
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
    }
    return terms;
}

这个方法将查询字符串分解为多个词语，并将它们添加到列表中，以便在搜索时使用。

6. 总结

通过这个示例，我们展示了如何使用Apache Lucene构建一个高效的全文搜索服务。Lucene提供了强大的索引和搜索功能，使得开发者可以轻松地处理复杂的搜索需求。无论是处理结构化数据还是非结构化文本，Lucene都能提供高效的解决方案。

7. 附录

maven 依赖

<!-- Search module -->

<!-- Lucene core -->
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>8.11.0</version> <!-- pick the version that fits your project -->
</dependency>

<!-- Lucene common analyzers -->
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-common</artifactId>
    <version>8.11.0</version>
</dependency>

<!-- Further Lucene modules can be added as needed -->
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-queryparser</artifactId>
    <version>8.11.0</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>8.11.0</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-memory</artifactId>
    <version>8.11.0</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-queries</artifactId>
    <version>8.11.0</version>
</dependency>

<!--
<dependency>
    <groupId>org.truenewx</groupId>
    <artifactId>ik-analyzer-lucene</artifactId>
    <version>5.0.1</version>
</dependency>
-->
<dependency>
    <groupId>com.github.keran213539</groupId>
    <artifactId>IK_Analyzer</artifactId>
    <version>2012FF_hf1_1</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.ansj/ansj_seg -->
<!--
<dependency>
    <groupId>org.ansj</groupId>
    <artifactId>ansj_seg</artifactId>
    <version>5.1.6</version>
</dependency>
-->

多线程索引创建

单线程逐条创建索引耗时较长，因此改用线程池并行处理每一条记录：

// 创建或更新索引public void indexDocument(List<AtcoinDealhistory> list) throws IOException {long startTime = System.currentTimeMillis(); // 记录开始时间//引入多线程ExecutorService executorService = Executors.newFixedThreadPool(numThreads);try {for (AtcoinDealhistory atcoinDealhistory : list) {executorService.submit(() -> {try {indexDocument(this.directory,this.analyzer,atcoinDealhistory);} catch (IOException e) {e.printStackTrace();}});}executorService.shutdown();executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);long endTime = System.currentTimeMillis(); // 记录结束时间// 计算索引创建时间long indexCreationTime = endTime - startTime;System.out.println("Index creation time: " + indexCreationTime/1000 + " milliseconds");} catch (Exception e) {System.out.println(e.getMessage());e.printStackTrace();} }

IK分词器重写

原版 IKAnalyzer 是针对较早版本的 Lucene API 编写的，无法直接在本文使用的 Lucene 8.11.0 上运行，因此需要重写 Analyzer 和 Tokenizer 这两个适配类：

package com.atcoin.busi.test;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.cfg.Configuration;public class ReIKAnalyzer extends Analyzer {private boolean useSmart;//分词器配置项private Configuration cfg;public Configuration getCfg() {return cfg;}public void setCfg(Configuration cfg) {this.cfg = cfg;}public boolean useSmart() {return useSmart;}public void setUseSmart(boolean useSmart) {this.useSmart = useSmart;}/*** IK分词器Lucene 5.4.0 Analyzer接口实现类** 默认细粒度切分算法*/public ReIKAnalyzer() {this(false);}/*** IK分词器Lucene 5.4.0 Analyzer接口实现类** @param useSmart*            当为true时，分词器进行智能切分*/public ReIKAnalyzer(boolean useSmart) {super();this.useSmart = useSmart;}/*** IK分词器Lucene 5.4.0 Analyzer接口实现类** @param cfg*/public ReIKAnalyzer(Configuration cfg) {super();this.setCfg(cfg);}/*** 重载Analyzer接口，构造分词组件** @param fieldName*            the name of the fields content passed to the*            TokenStreamComponents sink as a reader*/@Overrideprotected TokenStreamComponents createComponents(String fieldName) {Tokenizer _IKTokenizer = new ReIKTokenizer(this.useSmart());return new TokenStreamComponents(_IKTokenizer);}}

package com.atcoin.busi.test;
import java.io.IOException;import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;public class ReIKTokenizer extends Tokenizer {// IK分词器实现private IKSegmenter _IKImplement;// 词元文本属性private final CharTermAttribute termAtt;// 词元位移属性private final OffsetAttribute offsetAtt;// 词元分类属性（该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量）private final TypeAttribute typeAtt;// 记录最后一个词元的结束位置private int endPosition;/*** Lucene 5.4.0 Tokenizer适配器类构造函数** @param in* @param useSmart*/public ReIKTokenizer(boolean useSmart) {super();offsetAtt = addAttribute(OffsetAttribute.class);termAtt = addAttribute(CharTermAttribute.class);typeAtt = addAttribute(TypeAttribute.class);_IKImplement = new IKSegmenter(input, useSmart);}/*** Lucene 5.4.0 Tokenizer适配器类构造函数** @param in* @param cfg*/public ReIKTokenizer(Configuration cfg) {super();offsetAtt = addAttribute(OffsetAttribute.class);termAtt = addAttribute(CharTermAttribute.class);typeAtt = addAttribute(TypeAttribute.class);_IKImplement = new IKSegmenter(input, cfg);}@Overridepublic boolean incrementToken() throws IOException {// 清除所有的词元属性clearAttributes();Lexeme nextLexeme = _IKImplement.next();if (nextLexeme != null) {// 将Lexeme转成Attributes// 设置词元文本termAtt.append(nextLexeme.getLexemeText());// 设置词元长度termAtt.setLength(nextLexeme.getLength());// 设置词元位移offsetAtt.setOffset(nextLexeme.getBeginPosition(),nextLexeme.getEndPosition());// 记录分词的最后位置endPosition = nextLexeme.getEndPosition();// 记录词元分类typeAtt.setType(nextLexeme.getLexemeTypeString());// 返会true告知还有下个词元return true;}// 返会false告知词元输出完毕return false;}@Overridepublic void reset() throws IOException {super.reset();_IKImplement.reset(input);}@Overridepublic final void end() {// set final offsetint finalOffset = correctOffset(this.endPosition);offsetAtt.setOffset(finalOffset, finalOffset);}
}

接口调用

package com.atcoin.busi.controller;
import java.io.IOException;
import java.util.List;import javax.annotation.PreDestroy;import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;import com.atcoin.busi.domain.AtcoinDealhistory;
import com.atcoin.busi.service.IAtcoinDealhistoryService;
import com.atcoin.busi.service.impl.LuceneService;@RestController
@RequestMapping("/lucene")
public class AtcoinLuceneController {@Autowiredprivate IAtcoinDealhistoryService atcoinDealhistoryService;@Autowiredprivate LuceneService luceneService;//    @PostMapping("/index")
//    public String indexDocument(@RequestParam String id, @RequestParam String content) {
//        try {
//            luceneService.indexDocument(id, content);
//            return "Document indexed successfully.";
//        } catch (IOException e) {
//            e.printStackTrace();
//            return "Failed to index document.";
//        }
//    }@GetMapping("/createIndex")public String createIndex() {try {List<AtcoinDealhistory> list = atcoinDealhistoryService.selectAtcoinDealhistoryIndex();luceneService.indexDocument(list);return "Document indexed successfully.";} catch (IOException e) {e.printStackTrace();return "Failed to index document.";}}@GetMapping("/search")public List<AtcoinDealhistory> search(@RequestParam String keywords) {try {return luceneService.search(keywords);} catch (IOException | org.apache.lucene.queryparser.classic.ParseException e) {e.printStackTrace();return null;}}// 在应用关闭时关闭Lucene资源@PreDestroypublic void close() {try {luceneService.close();} catch (IOException e) {e.printStackTrace();}}
}