Refactor phases 5 - search - tests
This commit is contained in:
parent
039b5a5f0a
commit
c8659bd45d
|
|
@ -0,0 +1,16 @@
|
|||
Slice 3 patch for the generic search platform.
|
||||
|
||||
Contents:
|
||||
- long-text CHUNK representations for generic and TED documents
|
||||
- representation selection mode for generic search (PRIMARY_ONLY / PRIMARY_AND_CHUNKS / ALL)
|
||||
- chunk-aware document collapse and matchedRepresentationCount in fused results
|
||||
- recency-aware scoring boost
|
||||
- lightweight search metrics endpoint: GET /api/search/metrics
|
||||
|
||||
Assumptions:
|
||||
- apply on top of Slice 2 and the Slice 2 fix patch
|
||||
- no additional DB migration is required in this slice
|
||||
|
||||
Notes:
|
||||
- Maven compile was not available in the patch generation environment, so this patch has not been verified with a Maven build
|
||||
- this patch intentionally defers TED and Mail structured search to later slices
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
# Generic Search Slice Test Plan
|
||||
|
||||
This patch adds a minimal but useful integration-test baseline for the new generic search slices.
|
||||
|
||||
## What is covered
|
||||
|
||||
- PostgreSQL full-text search over `DOC.doc_text_representation.search_vector`
|
||||
- PostgreSQL trigram search over document title / summary / representation text
|
||||
- hybrid orchestration and document-level collapse
|
||||
- representation selection modes (`PRIMARY_ONLY`, `PRIMARY_AND_CHUNKS`)
|
||||
- REST endpoint smoke tests for:
|
||||
- `POST /api/search`
|
||||
- `POST /api/search/debug`
|
||||
- `GET /api/search/metrics`
|
||||
|
||||
## Recommended execution order
|
||||
|
||||
1. Apply the search-slice DB migration(s) or ensure the runtime schema already contains the lexical search columns.
|
||||
2. Run the new integration tests with PostgreSQL Testcontainers.
|
||||
3. Start the application locally and try the included Postman requests.
|
||||
4. Add semantic engine integration tests only after the lexical tests are green.
|
||||
|
||||
## Notes
|
||||
|
||||
- The test application intentionally imports only the DOC domain services and lexical search beans.
|
||||
- Semantic/vector beans are left out to keep the test context small and deterministic.
|
||||
- The base test class adds the `search_config` and `search_vector` columns if they are not already present.
|
||||
4
pom.xml
4
pom.xml
|
|
@ -238,6 +238,10 @@
|
|||
<version>1.21.4</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.datatype</groupId>
|
||||
<artifactId>jackson-datatype-jsr310</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
|
|
|||
|
|
@ -0,0 +1,92 @@
|
|||
{
|
||||
"info": {
|
||||
"name": "DIP Generic Search",
|
||||
"_postman_id": "2d8f227e-4f38-45c0-9d59-b0642773c993",
|
||||
"description": "Sample requests for the generic lexical search slices (full-text, trigram, hybrid, debug, metrics).",
|
||||
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
|
||||
},
|
||||
"variable": [
|
||||
{"key": "baseUrl", "value": "http://localhost:8889/api"}
|
||||
],
|
||||
"item": [
|
||||
{
|
||||
"name": "Search - fulltext exact",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||
"url": "{{baseUrl}}/search",
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"queryText\": \"framework agreement\",\n \"modes\": [\"FULLTEXT\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Search - trigram fuzzy title",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||
"url": "{{baseUrl}}/search",
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"queryText\": \"Viena school renovtion\",\n \"modes\": [\"TRIGRAM\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Search - hybrid lexical",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||
"url": "{{baseUrl}}/search",
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"queryText\": \"Maintenance manual\",\n \"modes\": [\"HYBRID\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Search - chunk-aware",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||
"url": "{{baseUrl}}/search",
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"queryText\": \"district heating optimization\",\n \"modes\": [\"FULLTEXT\"],\n \"documentTypes\": [\"TEXT\"],\n \"documentFamilies\": [\"GENERIC\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_AND_CHUNKS\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Search - createdFrom filter",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||
"url": "{{baseUrl}}/search",
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"queryText\": \"framework agreement\",\n \"modes\": [\"FULLTEXT\"],\n \"createdFrom\": \"2026-01-01T00:00:00Z\",\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Search - debug",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||
"url": "{{baseUrl}}/search/debug",
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"queryText\": \"maintenence manual\",\n \"modes\": [\"HYBRID\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Search - metrics",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "{{baseUrl}}/search/metrics"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -15,5 +15,9 @@ public interface DocumentTextRepresentationRepository extends JpaRepository<Docu
|
|||
|
||||
List<DocumentTextRepresentation> findByPrimaryRepresentationTrue();
|
||||
|
||||
long countByPrimaryRepresentationTrue();
|
||||
|
||||
long countByRepresentationType(RepresentationType representationType);
|
||||
|
||||
Optional<DocumentTextRepresentation> findFirstByDocument_IdAndPrimaryRepresentationTrue(UUID documentId);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ public class DocumentRepresentationService {
|
|||
.primaryRepresentation(command.primaryRepresentation())
|
||||
.textBody(command.textBody())
|
||||
.build();
|
||||
DocumentTextRepresentation saved = representationRepository.save(representation);
|
||||
DocumentTextRepresentation saved = representationRepository.saveAndFlush(representation);
|
||||
lexicalIndexService.indexRepresentation(saved.getId());
|
||||
return saved;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ import at.procon.ted.config.TedProcessorProperties;
|
|||
import at.procon.ted.model.entity.Organization;
|
||||
import at.procon.ted.model.entity.ProcurementDocument;
|
||||
import at.procon.ted.model.entity.ProcurementLot;
|
||||
import at.procon.ted.service.TedPhase2GenericDocumentService;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
|
@ -30,7 +29,7 @@ import org.springframework.transaction.annotation.Transactional;
|
|||
public class TedNoticeProjectionService {
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
private final TedPhase2GenericDocumentService tedPhase2GenericDocumentService;
|
||||
private final TedGenericDocumentRootService tedGenericDocumentRootService;
|
||||
private final DocumentRepository documentRepository;
|
||||
private final TedNoticeProjectionRepository projectionRepository;
|
||||
private final TedNoticeLotRepository lotRepository;
|
||||
|
|
@ -42,9 +41,8 @@ public class TedNoticeProjectionService {
|
|||
return null;
|
||||
}
|
||||
|
||||
TedPhase2GenericDocumentService.TedGenericDocumentSyncResult syncResult =
|
||||
tedPhase2GenericDocumentService.syncTedDocument(legacyDocument);
|
||||
return registerOrRefreshProjection(legacyDocument, syncResult.documentId());
|
||||
UUID genericDocumentId = tedGenericDocumentRootService.ensureGenericTedDocumentRoot(legacyDocument);
|
||||
return registerOrRefreshProjection(legacyDocument, genericDocumentId);
|
||||
}
|
||||
|
||||
@Transactional
|
||||
|
|
@ -55,7 +53,7 @@ public class TedNoticeProjectionService {
|
|||
|
||||
UUID resolvedDocumentId = genericDocumentId;
|
||||
if (resolvedDocumentId == null) {
|
||||
resolvedDocumentId = tedPhase2GenericDocumentService.ensureGenericTedDocument(legacyDocument);
|
||||
resolvedDocumentId = tedGenericDocumentRootService.ensureGenericTedDocumentRoot(legacyDocument);
|
||||
}
|
||||
|
||||
UUID finalResolvedDocumentId = resolvedDocumentId;
|
||||
|
|
|
|||
|
|
@ -425,8 +425,8 @@ public class GenericDocumentImportService {
|
|||
draft.languageCode(),
|
||||
null,
|
||||
draft.chunkIndex(),
|
||||
null,
|
||||
null,
|
||||
draft.chunkStartOffset(),
|
||||
draft.chunkEndOffset(),
|
||||
draft.primary(),
|
||||
draft.textBody()
|
||||
));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,97 @@
|
|||
package at.procon.dip.normalization.impl;
|
||||
|
||||
import at.procon.dip.domain.document.ContentRole;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
||||
import at.procon.dip.normalization.spi.TextRepresentationBuilder;
|
||||
import at.procon.dip.normalization.spi.TextRepresentationDraft;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.core.annotation.Order;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.util.StringUtils;
|
||||
|
||||
@Component
|
||||
@Order(200)
|
||||
@RequiredArgsConstructor
|
||||
public class ChunkedLongTextRepresentationBuilder implements TextRepresentationBuilder {
|
||||
|
||||
public static final String BUILDER_KEY = "long-text-chunker";
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
|
||||
@Override
|
||||
public boolean supports(DocumentType documentType) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<TextRepresentationDraft> build(RepresentationBuildRequest request) {
|
||||
if (!properties.getSearch().isChunkingEnabled()) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
String baseText = request.extractionResult().derivedTextByRole().get(ContentRole.NORMALIZED_TEXT);
|
||||
if (!StringUtils.hasText(baseText)) {
|
||||
baseText = request.extractionResult().derivedTextByRole().get(ContentRole.HTML_CLEAN);
|
||||
}
|
||||
if (!StringUtils.hasText(baseText)) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
int target = Math.max(400, properties.getSearch().getChunkTargetChars());
|
||||
int overlap = Math.max(0, Math.min(target / 3, properties.getSearch().getChunkOverlapChars()));
|
||||
if (baseText.length() <= target + overlap) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
List<TextRepresentationDraft> drafts = new ArrayList<>();
|
||||
int start = 0;
|
||||
int chunkIndex = 0;
|
||||
while (start < baseText.length() && chunkIndex < properties.getSearch().getMaxChunksPerDocument()) {
|
||||
int end = Math.min(baseText.length(), start + target);
|
||||
if (end < baseText.length()) {
|
||||
int boundary = findBoundary(baseText, end, Math.min(baseText.length(), end + 160));
|
||||
if (boundary > start + 200) {
|
||||
end = boundary;
|
||||
}
|
||||
}
|
||||
|
||||
String chunk = baseText.substring(start, end).trim();
|
||||
if (StringUtils.hasText(chunk)) {
|
||||
drafts.add(new TextRepresentationDraft(
|
||||
RepresentationType.CHUNK,
|
||||
BUILDER_KEY,
|
||||
request.detectionResult().languageCode(),
|
||||
chunk,
|
||||
false,
|
||||
chunkIndex,
|
||||
start,
|
||||
end,
|
||||
ContentRole.NORMALIZED_TEXT,
|
||||
Boolean.TRUE
|
||||
));
|
||||
chunkIndex++;
|
||||
}
|
||||
|
||||
if (end >= baseText.length()) {
|
||||
break;
|
||||
}
|
||||
start = Math.max(end - overlap, start + 1);
|
||||
}
|
||||
return drafts;
|
||||
}
|
||||
|
||||
private int findBoundary(String text, int preferred, int max) {
|
||||
for (int i = preferred; i < max; i++) {
|
||||
char c = text.charAt(i);
|
||||
if (c == '\n' || c == '.' || c == '!' || c == '?' || c == ';') {
|
||||
return i + 1;
|
||||
}
|
||||
}
|
||||
return preferred;
|
||||
}
|
||||
}
|
||||
|
|
@ -41,7 +41,6 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
|
|||
String semantic = buildSemanticText(title, summary, request.detectionResult().documentType());
|
||||
|
||||
List<TextRepresentationDraft> drafts = new ArrayList<>();
|
||||
/*
|
||||
drafts.add(new TextRepresentationDraft(
|
||||
RepresentationType.FULLTEXT,
|
||||
BUILDER_KEY,
|
||||
|
|
@ -49,10 +48,11 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
|
|||
baseText,
|
||||
false,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
ContentRole.NORMALIZED_TEXT,
|
||||
Boolean.TRUE
|
||||
Boolean.FALSE
|
||||
));
|
||||
*/
|
||||
drafts.add(new TextRepresentationDraft(
|
||||
RepresentationType.SEMANTIC_TEXT,
|
||||
BUILDER_KEY,
|
||||
|
|
@ -60,10 +60,11 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
|
|||
semantic,
|
||||
true,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
ContentRole.NORMALIZED_TEXT,
|
||||
Boolean.TRUE
|
||||
));
|
||||
/*
|
||||
if (StringUtils.hasText(title)) {
|
||||
drafts.add(new TextRepresentationDraft(
|
||||
RepresentationType.TITLE_ABSTRACT,
|
||||
|
|
@ -72,11 +73,24 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
|
|||
title + "\n\n" + summary,
|
||||
false,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
ContentRole.NORMALIZED_TEXT,
|
||||
Boolean.FALSE
|
||||
));
|
||||
}
|
||||
*/
|
||||
drafts.add(new TextRepresentationDraft(
|
||||
RepresentationType.SUMMARY,
|
||||
BUILDER_KEY,
|
||||
request.detectionResult().languageCode(),
|
||||
summary,
|
||||
false,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
ContentRole.NORMALIZED_TEXT,
|
||||
Boolean.FALSE
|
||||
));
|
||||
return drafts;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -61,10 +61,11 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
|
|||
semanticText,
|
||||
true,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
ContentRole.NORMALIZED_TEXT,
|
||||
Boolean.TRUE
|
||||
));
|
||||
/*
|
||||
drafts.add(new TextRepresentationDraft(
|
||||
RepresentationType.FULLTEXT,
|
||||
BUILDER_KEY,
|
||||
|
|
@ -72,8 +73,10 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
|
|||
normalizedText,
|
||||
false,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
ContentRole.NORMALIZED_TEXT,
|
||||
Boolean.TRUE
|
||||
Boolean.FALSE
|
||||
));
|
||||
if (StringUtils.hasText(title)) {
|
||||
drafts.add(new TextRepresentationDraft(
|
||||
|
|
@ -83,6 +86,8 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
|
|||
title + "\n\n" + summary,
|
||||
false,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
ContentRole.NORMALIZED_TEXT,
|
||||
Boolean.FALSE
|
||||
));
|
||||
|
|
@ -94,10 +99,11 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
|
|||
summary,
|
||||
false,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
ContentRole.NORMALIZED_TEXT,
|
||||
Boolean.FALSE
|
||||
));
|
||||
*/
|
||||
return drafts;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -13,6 +13,8 @@ public record TextRepresentationDraft(
|
|||
String textBody,
|
||||
boolean primary,
|
||||
Integer chunkIndex,
|
||||
Integer chunkStartOffset,
|
||||
Integer chunkEndOffset,
|
||||
ContentRole sourceContentRole,
|
||||
Boolean queueForEmbedding
|
||||
) {
|
||||
|
|
@ -22,6 +24,7 @@ public record TextRepresentationDraft(
|
|||
String textBody,
|
||||
boolean primary,
|
||||
Integer chunkIndex) {
|
||||
this(representationType, null, languageCode, textBody, primary, chunkIndex, ContentRole.NORMALIZED_TEXT, null);
|
||||
this(representationType, null, languageCode, textBody, primary, chunkIndex, null, null, ContentRole.NORMALIZED_TEXT, null);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -14,7 +14,6 @@ import at.procon.dip.processing.spi.DocumentProcessingPolicy;
|
|||
import at.procon.dip.processing.spi.StructuredDocumentProcessor;
|
||||
import at.procon.dip.processing.spi.StructuredProcessingRequest;
|
||||
import at.procon.ted.model.entity.ProcurementDocument;
|
||||
import at.procon.ted.service.TedPhase2GenericDocumentService;
|
||||
import at.procon.ted.service.XmlParserService;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.LinkedHashMap;
|
||||
|
|
@ -32,7 +31,6 @@ public class TedStructuredDocumentProcessor implements StructuredDocumentProcess
|
|||
|
||||
private final XmlParserService xmlParserService;
|
||||
private final DocumentService documentService;
|
||||
private final TedPhase2GenericDocumentService tedPhase2GenericDocumentService;
|
||||
private final TedNoticeProjectionService tedNoticeProjectionService;
|
||||
|
||||
@Override
|
||||
|
|
@ -77,7 +75,6 @@ public class TedStructuredDocumentProcessor implements StructuredDocumentProcess
|
|||
}
|
||||
documentService.save(canonical);
|
||||
|
||||
tedPhase2GenericDocumentService.syncTedDocument(tedDocument);
|
||||
tedNoticeProjectionService.registerOrRefreshProjection(tedDocument, canonical.getId());
|
||||
|
||||
Map<String, Object> payload = new LinkedHashMap<>();
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package at.procon.dip.search.dto;
|
|||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
|
|
@ -27,6 +28,13 @@ public class SearchHit {
|
|||
private String languageCode;
|
||||
private String mimeType;
|
||||
|
||||
private RepresentationType representationType;
|
||||
private boolean primaryRepresentation;
|
||||
private Integer chunkIndex;
|
||||
private Integer chunkStartOffset;
|
||||
private Integer chunkEndOffset;
|
||||
private int matchedRepresentationCount;
|
||||
|
||||
private SearchEngineType primaryEngine;
|
||||
private SearchMatchField matchedField;
|
||||
private String snippet;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,22 @@
|
|||
package at.procon.dip.search.dto;
|
||||
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import java.util.Map;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * Snapshot of lightweight search metrics, as returned by the search orchestrator's
 * {@code metrics()} method.
 *
 * <p>NOTE(review): the per-field meanings below are inferred from the field names and from the
 * {@code recordSearch(...)} call sites; the metrics service that fills them is not shown here,
 * so confirm against it.
 */
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class SearchMetricsResponse {
|
||||
// presumably the number of regular (non-debug) search requests recorded - confirm
private long totalSearchRequests;
|
||||
// presumably the number of debug search requests recorded - confirm
private long totalDebugRequests;
|
||||
// presumably the sum of fused hit counts passed to recordSearch(...) - confirm
private long totalCollapsedHitsReturned;
|
||||
// counter per search engine type
private Map<SearchEngineType, Long> engineExecutions;
|
||||
// counter per representation type
private Map<RepresentationType, Long> representationCounts;
|
||||
// presumably the number of stored primary representations - confirm
private long primaryRepresentationCount;
|
||||
// presumably the number of stored CHUNK representations - confirm
private long chunkRepresentationCount;
|
||||
}
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
package at.procon.dip.search.dto;
|
||||
|
||||
/**
|
||||
* Controls which document text representations participate in generic search
|
||||
* when no explicit representationTypes filter is supplied.
|
||||
*/
|
||||
public enum SearchRepresentationSelectionMode {
|
||||
/** Only representations flagged as primary are searched. */
PRIMARY_ONLY,
|
||||
/** Primary representations plus representations of type CHUNK are searched; used when no mode is given. */
PRIMARY_AND_CHUNKS,
|
||||
/** No implicit restriction: every representation is searched. */
ALL
|
||||
}
|
||||
|
|
@ -40,4 +40,8 @@ public class SearchRequest {
|
|||
|
||||
@Builder.Default
|
||||
private boolean collapseByDocument = true;
|
||||
|
||||
@Builder.Default
|
||||
private SearchRepresentationSelectionMode representationSelectionMode =
|
||||
SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package at.procon.dip.search.rank;
|
|||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.api.SearchExecutionPlan;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchResponse;
|
||||
|
|
@ -57,8 +58,20 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi
|
|||
normalized.forEach((engine, hits) -> {
|
||||
for (SearchHit hit : hits) {
|
||||
Aggregate aggregate = aggregates.computeIfAbsent(hit.getDocumentId(), id -> new Aggregate());
|
||||
aggregate.bestByEngine.put(engine, hit);
|
||||
if (aggregate.representative == null || hit.getNormalizedScore() > aggregate.representative.getNormalizedScore()) {
|
||||
SearchHit currentBestForEngine = aggregate.bestByEngine.get(engine);
|
||||
if (currentBestForEngine == null
|
||||
|| hit.getNormalizedScore() > currentBestForEngine.getNormalizedScore()
|
||||
|| (Double.compare(hit.getNormalizedScore(), currentBestForEngine.getNormalizedScore()) == 0
|
||||
&& representationPriority(hit) < representationPriority(currentBestForEngine))) {
|
||||
aggregate.bestByEngine.put(engine, hit);
|
||||
}
|
||||
if (hit.getRepresentationId() != null) {
|
||||
aggregate.representationIds.add(hit.getRepresentationId());
|
||||
}
|
||||
if (aggregate.representative == null
|
||||
|| hit.getNormalizedScore() > aggregate.representative.getNormalizedScore()
|
||||
|| (Double.compare(hit.getNormalizedScore(), aggregate.representative.getNormalizedScore()) == 0
|
||||
&& representationPriority(hit) < representationPriority(aggregate.representative))) {
|
||||
aggregate.representative = hit;
|
||||
}
|
||||
}
|
||||
|
|
@ -69,8 +82,12 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi
|
|||
SearchHit representative = aggregate.representative;
|
||||
double finalScore = weight(SearchEngineType.POSTGRES_FULLTEXT, aggregate) +
|
||||
weight(SearchEngineType.POSTGRES_TRIGRAM, aggregate) +
|
||||
weight(SearchEngineType.PGVECTOR_SEMANTIC, aggregate);
|
||||
fused.add(representative.toBuilder().finalScore(finalScore).build());
|
||||
weight(SearchEngineType.PGVECTOR_SEMANTIC, aggregate) +
|
||||
recencyBoost(representative);
|
||||
fused.add(representative.toBuilder()
|
||||
.finalScore(finalScore)
|
||||
.matchedRepresentationCount(aggregate.representationIds.size())
|
||||
.build());
|
||||
}
|
||||
return fused;
|
||||
}
|
||||
|
|
@ -97,7 +114,10 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi
|
|||
case POSTGRES_TRIGRAM -> hit.getNormalizedScore() * properties.getSearch().getTrigramWeight();
|
||||
case PGVECTOR_SEMANTIC -> hit.getNormalizedScore() * properties.getSearch().getSemanticWeight();
|
||||
};
|
||||
merged.add(hit.toBuilder().finalScore(finalScore).build());
|
||||
merged.add(hit.toBuilder()
|
||||
.finalScore(finalScore + recencyBoost(hit))
|
||||
.matchedRepresentationCount(1)
|
||||
.build());
|
||||
}
|
||||
});
|
||||
return merged;
|
||||
|
|
@ -117,8 +137,42 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi
|
|||
hits.sort(comparator);
|
||||
}
|
||||
|
||||
private double recencyBoost(SearchHit hit) {
|
||||
if (properties.getSearch().getRecencyBoostWeight() <= 0.0d || hit.getCreatedAt() == null) {
|
||||
return 0.0d;
|
||||
}
|
||||
double halfLifeDays = Math.max(1.0d, properties.getSearch().getRecencyHalfLifeDays());
|
||||
double ageDays = Math.max(0.0d, java.time.Duration.between(hit.getCreatedAt(), java.time.OffsetDateTime.now()).toSeconds() / 86400.0d);
|
||||
double normalized = Math.exp(-Math.log(2.0d) * (ageDays / halfLifeDays));
|
||||
return normalized * properties.getSearch().getRecencyBoostWeight();
|
||||
}
|
||||
|
||||
private int representationPriority(SearchHit hit) {
|
||||
if (hit == null) {
|
||||
return Integer.MAX_VALUE;
|
||||
}
|
||||
if (hit.isPrimaryRepresentation()) {
|
||||
return 0;
|
||||
}
|
||||
RepresentationType type = hit.getRepresentationType();
|
||||
if (type == RepresentationType.SEMANTIC_TEXT) {
|
||||
return 1;
|
||||
}
|
||||
if (type == RepresentationType.TITLE_ABSTRACT) {
|
||||
return 2;
|
||||
}
|
||||
if (type == RepresentationType.SUMMARY) {
|
||||
return 3;
|
||||
}
|
||||
if (type == RepresentationType.CHUNK) {
|
||||
return 4;
|
||||
}
|
||||
return 5;
|
||||
}
|
||||
|
||||
/**
 * Per-document accumulator used while fusing engine results: one instance is created for each
 * distinct document id encountered.
 */
private static final class Aggregate {
|
||||
// best hit seen so far for this document, per search engine
private final Map<SearchEngineType, SearchHit> bestByEngine = new EnumMap<>(SearchEngineType.class);
|
||||
// distinct non-null representation ids that matched; its size becomes matchedRepresentationCount
private final Set<UUID> representationIds = new java.util.LinkedHashSet<>();
|
||||
// best hit across all engines; the fused result is built from it
private SearchHit representative;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ public class DocumentFullTextSearchRepositoryImpl implements DocumentFullTextSea
|
|||
SELECT
|
||||
d.id AS document_id,
|
||||
dtr.id AS representation_id,
|
||||
CAST(dtr.representation_type AS text) AS representation_type,
|
||||
CAST(d.document_type AS text) AS document_type,
|
||||
CAST(d.document_family AS text) AS document_family,
|
||||
CAST(d.visibility AS text) AS visibility,
|
||||
|
|
@ -31,23 +32,56 @@ public class DocumentFullTextSearchRepositoryImpl implements DocumentFullTextSea
|
|||
d.mime_type AS mime_type,
|
||||
d.created_at AS created_at,
|
||||
d.updated_at AS updated_at,
|
||||
ts_headline('simple', COALESCE(dtr.text_body, ''), websearch_to_tsquery('simple', :queryText),
|
||||
'MaxFragments=2, MinWords=5, MaxWords=20') AS snippet,
|
||||
ts_rank_cd(dtr.search_vector, websearch_to_tsquery('simple', :queryText)) AS score
|
||||
ts_headline(
|
||||
CASE
|
||||
WHEN NULLIF(dtr.search_config, '') IS NULL THEN 'simple'::regconfig
|
||||
ELSE dtr.search_config::regconfig
|
||||
END,
|
||||
COALESCE(dtr.text_body, ''),
|
||||
websearch_to_tsquery(
|
||||
CASE
|
||||
WHEN NULLIF(dtr.search_config, '') IS NULL THEN 'simple'::regconfig
|
||||
ELSE dtr.search_config::regconfig
|
||||
END,
|
||||
:queryText
|
||||
),
|
||||
'MaxFragments=2, MinWords=5, MaxWords=20'
|
||||
) AS snippet,
|
||||
ts_rank_cd(
|
||||
dtr.search_vector,
|
||||
websearch_to_tsquery(
|
||||
CASE
|
||||
WHEN NULLIF(dtr.search_config, '') IS NULL THEN 'simple'::regconfig
|
||||
ELSE dtr.search_config::regconfig
|
||||
END,
|
||||
:queryText
|
||||
)
|
||||
) AS score
|
||||
FROM doc.doc_text_representation dtr
|
||||
JOIN doc.doc_document d ON d.id = dtr.document_id
|
||||
LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
|
||||
WHERE dtr.search_vector IS NOT NULL
|
||||
AND dtr.search_vector @@ websearch_to_tsquery('simple', :queryText)
|
||||
AND dtr.search_vector @@ websearch_to_tsquery(
|
||||
CASE
|
||||
WHEN NULLIF(dtr.search_config, '') IS NULL THEN 'simple'::regconfig
|
||||
ELSE dtr.search_config::regconfig
|
||||
END,
|
||||
:queryText
|
||||
)
|
||||
""");
|
||||
|
||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
params.addValue("queryText", context.getRequest().getQueryText());
|
||||
|
||||
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
||||
|
||||
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
|
||||
params.addValue("limit", limit);
|
||||
|
||||
return jdbcTemplate.query(sql.toString(), params,
|
||||
new SearchHitRowMapper(SearchEngineType.POSTGRES_FULLTEXT, SearchMatchField.REPRESENTATION_TEXT));
|
||||
return jdbcTemplate.query(
|
||||
sql.toString(),
|
||||
params,
|
||||
new SearchHitRowMapper(SearchEngineType.POSTGRES_FULLTEXT, SearchMatchField.REPRESENTATION_TEXT)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -33,6 +33,11 @@ public class DocumentSemanticSearchRepository {
|
|||
d.summary AS summary,
|
||||
COALESCE(dtr.language_code, d.language_code) AS language_code,
|
||||
d.mime_type AS mime_type,
|
||||
CAST(dtr.representation_type AS text) AS representation_type,
|
||||
dtr.is_primary AS is_primary,
|
||||
dtr.chunk_index AS chunk_index,
|
||||
dtr.chunk_start_offset AS chunk_start_offset,
|
||||
dtr.chunk_end_offset AS chunk_end_offset,
|
||||
d.created_at AS created_at,
|
||||
d.updated_at AS updated_at,
|
||||
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package at.procon.dip.search.repository;
|
|||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchMatchField;
|
||||
|
|
@ -33,6 +34,11 @@ final class SearchHitRowMapper implements RowMapper<SearchHit> {
|
|||
.summary(safeGetString(rs, "summary"))
|
||||
.languageCode(safeGetString(rs, "language_code"))
|
||||
.mimeType(safeGetString(rs, "mime_type"))
|
||||
.representationType(parseRepresentationType(safeGetString(rs, "representation_type")))
|
||||
.primaryRepresentation(safeGetBoolean(rs, "is_primary"))
|
||||
.chunkIndex(safeGetInteger(rs, "chunk_index"))
|
||||
.chunkStartOffset(safeGetInteger(rs, "chunk_start_offset"))
|
||||
.chunkEndOffset(safeGetInteger(rs, "chunk_end_offset"))
|
||||
.primaryEngine(engineType)
|
||||
.matchedField(matchedField == null || matchedField.isBlank()
|
||||
? defaultField
|
||||
|
|
@ -51,4 +57,25 @@ final class SearchHitRowMapper implements RowMapper<SearchHit> {
|
|||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private Integer safeGetInteger(ResultSet rs, String column) {
|
||||
try {
|
||||
int value = rs.getInt(column);
|
||||
return rs.wasNull() ? null : value;
|
||||
} catch (SQLException ignore) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private boolean safeGetBoolean(ResultSet rs, String column) {
|
||||
try {
|
||||
return rs.getBoolean(column) && !rs.wasNull();
|
||||
} catch (SQLException ignore) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private RepresentationType parseRepresentationType(String value) {
|
||||
return value == null || value.isBlank() ? null : RepresentationType.valueOf(value);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import at.procon.dip.domain.document.DocumentFamily;
|
|||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
|
@ -56,7 +57,19 @@ final class SearchSqlFilterSupport {
|
|||
sql.append(" AND CAST(").append(representationAlias).append(".representation_type AS text) IN (:representationTypes)");
|
||||
params.addValue("representationTypes", enumNames(representationTypes));
|
||||
} else {
|
||||
sql.append(" AND ").append(representationAlias).append(".is_primary = true");
|
||||
SearchRepresentationSelectionMode selectionMode = context.getRequest().getRepresentationSelectionMode();
|
||||
if (selectionMode == null) {
|
||||
selectionMode = SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS;
|
||||
}
|
||||
switch (selectionMode) {
|
||||
case PRIMARY_ONLY -> sql.append(" AND ").append(representationAlias).append(".is_primary = true");
|
||||
case PRIMARY_AND_CHUNKS -> sql.append(" AND (")
|
||||
.append(representationAlias).append(".is_primary = true OR CAST(")
|
||||
.append(representationAlias).append(".representation_type AS text) = 'CHUNK')");
|
||||
case ALL -> {
|
||||
// no implicit representation restriction
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (context.getRequest().getCreatedFrom() != null) {
|
||||
|
|
|
|||
|
|
@ -28,17 +28,21 @@ public class DefaultSearchOrchestrator implements SearchOrchestrator {
|
|||
private final SearchPlanner planner;
|
||||
private final List<SearchEngine> engines;
|
||||
private final SearchResultFusionService fusionService;
|
||||
private final SearchMetricsService metricsService;
|
||||
|
||||
@Override
|
||||
public SearchResponse search(SearchRequest request, SearchDocumentScope scope) {
|
||||
SearchExecution execution = executeInternal(request, scope);
|
||||
return fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
|
||||
SearchResponse response = fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
|
||||
metricsService.recordSearch(execution.engineResults(), response.getHits().size(), false);
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope) {
|
||||
SearchExecution execution = executeInternal(request, scope);
|
||||
SearchResponse fused = fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
|
||||
metricsService.recordSearch(execution.engineResults(), fused.getHits().size(), true);
|
||||
|
||||
List<SearchEngineDebugResult> debugResults = new ArrayList<>();
|
||||
int topLimit = properties.getSearch().getDebugTopHitsPerEngine();
|
||||
|
|
@ -56,6 +60,11 @@ public class DefaultSearchOrchestrator implements SearchOrchestrator {
|
|||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public at.procon.dip.search.dto.SearchMetricsResponse metrics() {
|
||||
return metricsService.snapshot();
|
||||
}
|
||||
|
||||
private SearchExecution executeInternal(SearchRequest request, SearchDocumentScope scope) {
|
||||
int page = request.getPage() == null || request.getPage() < 0 ? 0 : request.getPage();
|
||||
int requestedSize = request.getSize() == null || request.getSize() <= 0
|
||||
|
|
|
|||
|
|
@ -2,6 +2,9 @@ package at.procon.dip.search.service;
|
|||
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
import jakarta.persistence.EntityManager;
|
||||
import jakarta.persistence.PersistenceContext;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.jdbc.core.JdbcTemplate;
|
||||
|
|
@ -18,6 +21,9 @@ public class DocumentLexicalIndexService {
|
|||
private final NamedParameterJdbcTemplate namedParameterJdbcTemplate;
|
||||
private final JdbcTemplate jdbcTemplate;
|
||||
|
||||
@PersistenceContext
|
||||
private EntityManager entityManager;
|
||||
|
||||
/**
|
||||
* New Slice 2 name kept for current code.
|
||||
*/
|
||||
|
|
@ -26,9 +32,6 @@ public class DocumentLexicalIndexService {
|
|||
refreshRepresentationLexicalIndex(representationId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Backward-compatible Slice 1 method name.
|
||||
*/
|
||||
@Transactional
|
||||
public void refreshRepresentationLexicalIndex(UUID representationId) {
|
||||
if (!isLexicalSearchSchemaAvailable()) {
|
||||
|
|
@ -36,25 +39,32 @@ public class DocumentLexicalIndexService {
|
|||
return;
|
||||
}
|
||||
|
||||
entityManager.flush();
|
||||
|
||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
params.addValue("representationId", representationId);
|
||||
namedParameterJdbcTemplate.update("""
|
||||
UPDATE doc.doc_text_representation
|
||||
SET search_config = CASE
|
||||
WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'
|
||||
WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'
|
||||
ELSE 'simple'
|
||||
|
||||
int updated = namedParameterJdbcTemplate.update("""
|
||||
UPDATE doc.doc_text_representation
|
||||
SET search_config = CASE
|
||||
WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'
|
||||
WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'
|
||||
ELSE 'simple'
|
||||
END,
|
||||
search_vector = to_tsvector(
|
||||
CASE
|
||||
WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'::regconfig
|
||||
WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'::regconfig
|
||||
ELSE 'simple'::regconfig
|
||||
END,
|
||||
search_vector = to_tsvector(
|
||||
CASE
|
||||
WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'::regconfig
|
||||
WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'::regconfig
|
||||
ELSE 'simple'::regconfig
|
||||
END,
|
||||
coalesce(text_body, '')
|
||||
)
|
||||
WHERE id = :representationId
|
||||
""", params);
|
||||
coalesce(text_body, '')
|
||||
)
|
||||
WHERE id = :representationId
|
||||
""", params);
|
||||
|
||||
if (updated == 0) {
|
||||
log.warn("Lexical indexing updated 0 rows for representation {}", representationId);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,55 @@
|
|||
package at.procon.dip.search.service;
|
||||
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchMetricsResponse;
|
||||
import java.util.Arrays;
|
||||
import java.util.EnumMap;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class SearchMetricsService {
|
||||
|
||||
private final DocumentTextRepresentationRepository representationRepository;
|
||||
|
||||
private final AtomicLong totalSearchRequests = new AtomicLong();
|
||||
private final AtomicLong totalDebugRequests = new AtomicLong();
|
||||
private final AtomicLong totalCollapsedHitsReturned = new AtomicLong();
|
||||
private final Map<SearchEngineType, AtomicLong> engineExecutions = new ConcurrentHashMap<>();
|
||||
|
||||
public void recordSearch(Map<SearchEngineType, ?> engineResults, int collapsedHits, boolean debug) {
|
||||
totalSearchRequests.incrementAndGet();
|
||||
if (debug) {
|
||||
totalDebugRequests.incrementAndGet();
|
||||
}
|
||||
totalCollapsedHitsReturned.addAndGet(collapsedHits);
|
||||
engineResults.keySet().forEach(engine -> engineExecutions
|
||||
.computeIfAbsent(engine, key -> new AtomicLong())
|
||||
.incrementAndGet());
|
||||
}
|
||||
|
||||
public SearchMetricsResponse snapshot() {
|
||||
Map<SearchEngineType, Long> engineCounts = new EnumMap<>(SearchEngineType.class);
|
||||
engineExecutions.forEach((engine, value) -> engineCounts.put(engine, value.get()));
|
||||
|
||||
Map<RepresentationType, Long> representationCounts = new EnumMap<>(RepresentationType.class);
|
||||
Arrays.stream(RepresentationType.values())
|
||||
.forEach(type -> representationCounts.put(type, representationRepository.countByRepresentationType(type)));
|
||||
|
||||
return SearchMetricsResponse.builder()
|
||||
.totalSearchRequests(totalSearchRequests.get())
|
||||
.totalDebugRequests(totalDebugRequests.get())
|
||||
.totalCollapsedHitsReturned(totalCollapsedHitsReturned.get())
|
||||
.engineExecutions(engineCounts)
|
||||
.representationCounts(representationCounts)
|
||||
.primaryRepresentationCount(representationRepository.countByPrimaryRepresentationTrue())
|
||||
.chunkRepresentationCount(representationRepository.countByRepresentationType(RepresentationType.CHUNK))
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
package at.procon.dip.search.service;
|
||||
|
||||
import at.procon.dip.search.dto.SearchDebugResponse;
|
||||
import at.procon.dip.search.dto.SearchMetricsResponse;
|
||||
import at.procon.dip.search.dto.SearchRequest;
|
||||
import at.procon.dip.search.dto.SearchResponse;
|
||||
import at.procon.dip.search.spi.SearchDocumentScope;
|
||||
|
|
@ -8,4 +9,6 @@ import at.procon.dip.search.spi.SearchDocumentScope;
|
|||
public interface SearchOrchestrator {
|
||||
SearchResponse search(SearchRequest request, SearchDocumentScope scope);
|
||||
SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope);
|
||||
|
||||
SearchMetricsResponse metrics();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package at.procon.dip.search.web;
|
||||
|
||||
import at.procon.dip.search.dto.SearchDebugResponse;
|
||||
import at.procon.dip.search.dto.SearchMetricsResponse;
|
||||
import at.procon.dip.search.dto.SearchRequest;
|
||||
import at.procon.dip.search.dto.SearchResponse;
|
||||
import at.procon.dip.search.service.SearchOrchestrator;
|
||||
|
|
@ -9,6 +10,7 @@ import jakarta.validation.Valid;
|
|||
import java.util.Set;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
|
@ -30,6 +32,11 @@ public class GenericSearchController {
|
|||
return searchOrchestrator.debug(request, buildScope(request));
|
||||
}
|
||||
|
||||
@GetMapping("/metrics")
|
||||
public SearchMetricsResponse metrics() {
|
||||
return searchOrchestrator.metrics();
|
||||
}
|
||||
|
||||
private SearchDocumentScope buildScope(SearchRequest request) {
|
||||
String scopeLanguage = (request.getLanguageCodes() == null || request.getLanguageCodes().isEmpty())
|
||||
? null
|
||||
|
|
|
|||
|
|
@ -234,6 +234,41 @@ public class TedProcessorProperties {
|
|||
private double trigramWeight = 0.20;
|
||||
private double semanticWeight = 0.45;
|
||||
|
||||
|
||||
/**
|
||||
* Enable chunk representations for long documents.
|
||||
*/
|
||||
private boolean chunkingEnabled = true;
|
||||
|
||||
/**
|
||||
* Target chunk size in characters for CHUNK representations.
|
||||
*/
|
||||
@Positive
|
||||
private int chunkTargetChars = 1800;
|
||||
|
||||
/**
|
||||
* Overlap between consecutive chunks in characters.
|
||||
*/
|
||||
@Min(0)
|
||||
private int chunkOverlapChars = 200;
|
||||
|
||||
/**
|
||||
* Maximum CHUNK representations generated per document.
|
||||
*/
|
||||
@Positive
|
||||
private int maxChunksPerDocument = 12;
|
||||
|
||||
/**
|
||||
* Additional score weight for recency.
|
||||
*/
|
||||
private double recencyBoostWeight = 0.05;
|
||||
|
||||
/**
|
||||
* Half-life in days used for recency decay.
|
||||
*/
|
||||
@Positive
|
||||
private int recencyHalfLifeDays = 30;
|
||||
|
||||
/**
|
||||
* Startup backfill limit for missing DOC lexical vectors.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -134,6 +134,18 @@ ted:
|
|||
fulltext-weight: 0.35
|
||||
trigram-weight: 0.20
|
||||
semantic-weight: 0.45
|
||||
# Additional score weight for recency
|
||||
recency-boost-weight: 0.05
|
||||
# Recency half-life in days
|
||||
recency-half-life-days: 30
|
||||
# Enable chunk representations for long documents
|
||||
chunking-enabled: true
|
||||
# Target chunk size in characters
|
||||
chunk-target-chars: 1800
|
||||
# Overlap between consecutive chunks
|
||||
chunk-overlap-chars: 200
|
||||
# Maximum number of chunks generated per document
|
||||
max-chunks-per-document: 12
|
||||
# Startup backfill limit for missing lexical vectors
|
||||
startup-lexical-backfill-limit: 500
|
||||
# Number of top hits per engine returned by /search/debug
|
||||
|
|
@ -142,7 +154,7 @@ ted:
|
|||
# TED Daily Package Download configuration
|
||||
download:
|
||||
# Enable/disable automatic package download
|
||||
enabled: false
|
||||
enabled: true
|
||||
# Use service-based Camel route
|
||||
use-service-based: false
|
||||
# Base URL for TED Daily Packages
|
||||
|
|
@ -177,7 +189,7 @@ ted:
|
|||
# IMAP Mail configuration
|
||||
mail:
|
||||
# Enable/disable mail processing
|
||||
enabled: true
|
||||
enabled: false
|
||||
# IMAP server hostname
|
||||
host: mail.mymagenta.business
|
||||
# IMAP server port (993 for IMAPS)
|
||||
|
|
|
|||
|
|
@ -100,8 +100,10 @@ import static org.assertj.core.api.Assertions.assertThat;
|
|||
})
|
||||
class MailBundleProcessingIntegrationTest {
|
||||
|
||||
private static final int HOST_PORT = 15433;
|
||||
|
||||
@Container
|
||||
static PostgreSQLContainer<?> postgres = new FixedPortPostgreSQLContainer<>("postgres:16-alpine", 15432)
|
||||
static PostgreSQLContainer<?> postgres = new FixedPortPostgreSQLContainer<>("postgres:16-alpine", HOST_PORT)
|
||||
.withDatabaseName("dip_test")
|
||||
.withUsername("test")
|
||||
.withPassword("test")
|
||||
|
|
|
|||
Loading…
Reference in New Issue