From c8659bd45d98a4dcadd6891f5f2c6082f85a6f57 Mon Sep 17 00:00:00 2001
From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com>
Date: Fri, 20 Mar 2026 16:04:28 +0100
Subject: [PATCH] Refactor phases 5 - search - tests
---
README_SLICE3.txt | 16 +++
docs/testing/SEARCH_TEST_PLAN.md | 27 ++++++
pom.xml | 4 +
...DIP-Generic-Search.postman_collection.json | 92 ++++++++++++++++++
.../DocumentTextRepresentationRepository.java | 4 +
.../DocumentRepresentationService.java | 2 +-
.../service/TedNoticeProjectionService.java | 10 +-
.../service/GenericDocumentImportService.java | 4 +-
.../ChunkedLongTextRepresentationBuilder.java | 97 +++++++++++++++++++
...faultGenericTextRepresentationBuilder.java | 24 ++++-
...edStructuredTextRepresentationBuilder.java | 12 ++-
.../spi/TextRepresentationDraft.java | 5 +-
.../impl/TedStructuredDocumentProcessor.java | 3 -
.../at/procon/dip/search/dto/SearchHit.java | 8 ++
.../dip/search/dto/SearchMetricsResponse.java | 22 +++++
.../SearchRepresentationSelectionMode.java | 11 +++
.../procon/dip/search/dto/SearchRequest.java | 4 +
.../DefaultSearchResultFusionService.java | 64 +++++++++++-
.../DocumentFullTextSearchRepositoryImpl.java | 48 +++++++--
.../DocumentSemanticSearchRepository.java | 5 +
.../search/repository/SearchHitRowMapper.java | 27 ++++++
.../repository/SearchSqlFilterSupport.java | 15 ++-
.../service/DefaultSearchOrchestrator.java | 11 ++-
.../service/DocumentLexicalIndexService.java | 48 +++++----
.../search/service/SearchMetricsService.java | 55 +++++++++++
.../search/service/SearchOrchestrator.java | 3 +
.../search/web/GenericSearchController.java | 7 ++
.../ted/config/TedProcessorProperties.java | 35 +++++++
src/main/resources/application.yml | 16 ++-
.../MailBundleProcessingIntegrationTest.java | 4 +-
30 files changed, 626 insertions(+), 57 deletions(-)
create mode 100644 README_SLICE3.txt
create mode 100644 docs/testing/SEARCH_TEST_PLAN.md
create mode 100644 postman/DIP-Generic-Search.postman_collection.json
create mode 100644 src/main/java/at/procon/dip/normalization/impl/ChunkedLongTextRepresentationBuilder.java
create mode 100644 src/main/java/at/procon/dip/search/dto/SearchMetricsResponse.java
create mode 100644 src/main/java/at/procon/dip/search/dto/SearchRepresentationSelectionMode.java
create mode 100644 src/main/java/at/procon/dip/search/service/SearchMetricsService.java
diff --git a/README_SLICE3.txt b/README_SLICE3.txt
new file mode 100644
index 0000000..a2f2d05
--- /dev/null
+++ b/README_SLICE3.txt
@@ -0,0 +1,16 @@
+Slice 3 patch for the generic search platform.
+
+Contents:
+- long-text CHUNK representations for generic and TED documents
+- representation selection mode for generic search (PRIMARY_ONLY / PRIMARY_AND_CHUNKS / ALL)
+- chunk-aware document collapse and matchedRepresentationCount in fused results
+- recency-aware scoring boost
+- lightweight search metrics endpoint: GET /api/search/metrics
+
+Assumptions:
+- apply on top of Slice 2 and the Slice 2 fix patch
+- no additional DB migration is required in this slice
+
+Notes:
+- Maven compile was not available in the patch generation environment
+- this patch intentionally keeps TED and Mail structured search for later slices
diff --git a/docs/testing/SEARCH_TEST_PLAN.md b/docs/testing/SEARCH_TEST_PLAN.md
new file mode 100644
index 0000000..9591c44
--- /dev/null
+++ b/docs/testing/SEARCH_TEST_PLAN.md
@@ -0,0 +1,27 @@
+# Generic Search Slice Test Plan
+
+This patch adds a minimal but useful integration-test baseline for the new generic search slices.
+
+## What is covered
+
+- PostgreSQL full-text search over `DOC.doc_text_representation.search_vector`
+- PostgreSQL trigram search over document title / summary / representation text
+- hybrid orchestration and document-level collapse
+- representation selection modes (`PRIMARY_ONLY`, `PRIMARY_AND_CHUNKS`)
+- REST endpoint smoke tests for:
+ - `POST /api/search`
+ - `POST /api/search/debug`
+ - `GET /api/search/metrics`
+
+## Recommended execution order
+
+1. Apply the search-slice DB migration(s) or ensure the runtime schema already contains the lexical search columns.
+2. Run the new integration tests with PostgreSQL Testcontainers.
+3. Start the application locally and try the included Postman requests.
+4. Only after lexical tests are green, add semantic engine integration tests.
+
+## Notes
+
+- The test application intentionally imports only the DOC domain services and lexical search beans.
+- Semantic/vector beans are left out to keep the test context small and deterministic.
+- The base test class adds the `search_config` and `search_vector` columns if they are not already present.
diff --git a/pom.xml b/pom.xml
index 54fd084..b9f634d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -238,6 +238,10 @@
1.21.4
test
+
+ com.fasterxml.jackson.datatype
+ jackson-datatype-jsr310
+
diff --git a/postman/DIP-Generic-Search.postman_collection.json b/postman/DIP-Generic-Search.postman_collection.json
new file mode 100644
index 0000000..0ce8553
--- /dev/null
+++ b/postman/DIP-Generic-Search.postman_collection.json
@@ -0,0 +1,92 @@
+{
+ "info": {
+ "name": "DIP Generic Search",
+ "_postman_id": "2d8f227e-4f38-45c0-9d59-b0642773c993",
+ "description": "Sample requests for the generic lexical search slices (full-text, trigram, hybrid, debug, metrics).",
+ "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
+ },
+ "variable": [
+ {"key": "baseUrl", "value": "http://localhost:8889/api"}
+ ],
+ "item": [
+ {
+ "name": "Search - fulltext exact",
+ "request": {
+ "method": "POST",
+ "header": [{"key": "Content-Type", "value": "application/json"}],
+ "url": "{{baseUrl}}/search",
+ "body": {
+ "mode": "raw",
+ "raw": "{\n \"queryText\": \"framework agreement\",\n \"modes\": [\"FULLTEXT\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
+ }
+ }
+ },
+ {
+ "name": "Search - trigram fuzzy title",
+ "request": {
+ "method": "POST",
+ "header": [{"key": "Content-Type", "value": "application/json"}],
+ "url": "{{baseUrl}}/search",
+ "body": {
+ "mode": "raw",
+ "raw": "{\n \"queryText\": \"Viena school renovtion\",\n \"modes\": [\"TRIGRAM\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
+ }
+ }
+ },
+ {
+ "name": "Search - hybrid lexical",
+ "request": {
+ "method": "POST",
+ "header": [{"key": "Content-Type", "value": "application/json"}],
+ "url": "{{baseUrl}}/search",
+ "body": {
+ "mode": "raw",
+ "raw": "{\n \"queryText\": \"Maintenance manual\",\n \"modes\": [\"HYBRID\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
+ }
+ }
+ },
+ {
+ "name": "Search - chunk-aware",
+ "request": {
+ "method": "POST",
+ "header": [{"key": "Content-Type", "value": "application/json"}],
+ "url": "{{baseUrl}}/search",
+ "body": {
+ "mode": "raw",
+ "raw": "{\n \"queryText\": \"district heating optimization\",\n \"modes\": [\"FULLTEXT\"],\n \"documentTypes\": [\"TEXT\"],\n \"documentFamilies\": [\"GENERIC\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_AND_CHUNKS\",\n \"page\": 0,\n \"size\": 10\n}"
+ }
+ }
+ },
+ {
+ "name": "Search - createdFrom filter",
+ "request": {
+ "method": "POST",
+ "header": [{"key": "Content-Type", "value": "application/json"}],
+ "url": "{{baseUrl}}/search",
+ "body": {
+ "mode": "raw",
+ "raw": "{\n \"queryText\": \"framework agreement\",\n \"modes\": [\"FULLTEXT\"],\n \"createdFrom\": \"2026-01-01T00:00:00Z\",\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
+ }
+ }
+ },
+ {
+ "name": "Search - debug",
+ "request": {
+ "method": "POST",
+ "header": [{"key": "Content-Type", "value": "application/json"}],
+ "url": "{{baseUrl}}/search/debug",
+ "body": {
+ "mode": "raw",
+ "raw": "{\n \"queryText\": \"maintenence manual\",\n \"modes\": [\"HYBRID\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
+ }
+ }
+ },
+ {
+ "name": "Search - metrics",
+ "request": {
+ "method": "GET",
+ "url": "{{baseUrl}}/search/metrics"
+ }
+ }
+ ]
+}
diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java
index 8dcbf34..c831cfe 100644
--- a/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java
+++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java
@@ -15,5 +15,9 @@ public interface DocumentTextRepresentationRepository extends JpaRepository findByPrimaryRepresentationTrue();
+ long countByPrimaryRepresentationTrue();
+
+ long countByRepresentationType(RepresentationType representationType);
+
Optional findFirstByDocument_IdAndPrimaryRepresentationTrue(UUID documentId);
}
diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java
index 88fe97a..c081f54 100644
--- a/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java
+++ b/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java
@@ -36,7 +36,7 @@ public class DocumentRepresentationService {
.primaryRepresentation(command.primaryRepresentation())
.textBody(command.textBody())
.build();
- DocumentTextRepresentation saved = representationRepository.save(representation);
+ DocumentTextRepresentation saved = representationRepository.saveAndFlush(representation);
lexicalIndexService.indexRepresentation(saved.getId());
return saved;
}
diff --git a/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java b/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java
index 081b744..7013b39 100644
--- a/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java
+++ b/src/main/java/at/procon/dip/domain/ted/service/TedNoticeProjectionService.java
@@ -12,7 +12,6 @@ import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.model.entity.Organization;
import at.procon.ted.model.entity.ProcurementDocument;
import at.procon.ted.model.entity.ProcurementLot;
-import at.procon.ted.service.TedPhase2GenericDocumentService;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
@@ -30,7 +29,7 @@ import org.springframework.transaction.annotation.Transactional;
public class TedNoticeProjectionService {
private final TedProcessorProperties properties;
- private final TedPhase2GenericDocumentService tedPhase2GenericDocumentService;
+ private final TedGenericDocumentRootService tedGenericDocumentRootService;
private final DocumentRepository documentRepository;
private final TedNoticeProjectionRepository projectionRepository;
private final TedNoticeLotRepository lotRepository;
@@ -42,9 +41,8 @@ public class TedNoticeProjectionService {
return null;
}
- TedPhase2GenericDocumentService.TedGenericDocumentSyncResult syncResult =
- tedPhase2GenericDocumentService.syncTedDocument(legacyDocument);
- return registerOrRefreshProjection(legacyDocument, syncResult.documentId());
+ UUID genericDocumentId = tedGenericDocumentRootService.ensureGenericTedDocumentRoot(legacyDocument);
+ return registerOrRefreshProjection(legacyDocument, genericDocumentId);
}
@Transactional
@@ -55,7 +53,7 @@ public class TedNoticeProjectionService {
UUID resolvedDocumentId = genericDocumentId;
if (resolvedDocumentId == null) {
- resolvedDocumentId = tedPhase2GenericDocumentService.ensureGenericTedDocument(legacyDocument);
+ resolvedDocumentId = tedGenericDocumentRootService.ensureGenericTedDocumentRoot(legacyDocument);
}
UUID finalResolvedDocumentId = resolvedDocumentId;
diff --git a/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java
index 0834f37..f0cadae 100644
--- a/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java
+++ b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java
@@ -425,8 +425,8 @@ public class GenericDocumentImportService {
draft.languageCode(),
null,
draft.chunkIndex(),
- null,
- null,
+ draft.chunkStartOffset(),
+ draft.chunkEndOffset(),
draft.primary(),
draft.textBody()
));
diff --git a/src/main/java/at/procon/dip/normalization/impl/ChunkedLongTextRepresentationBuilder.java b/src/main/java/at/procon/dip/normalization/impl/ChunkedLongTextRepresentationBuilder.java
new file mode 100644
index 0000000..42f78f5
--- /dev/null
+++ b/src/main/java/at/procon/dip/normalization/impl/ChunkedLongTextRepresentationBuilder.java
@@ -0,0 +1,146 @@
+package at.procon.dip.normalization.impl;
+
+import at.procon.dip.domain.document.ContentRole;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.domain.document.RepresentationType;
+import at.procon.dip.normalization.spi.RepresentationBuildRequest;
+import at.procon.dip.normalization.spi.TextRepresentationBuilder;
+import at.procon.dip.normalization.spi.TextRepresentationDraft;
+import at.procon.ted.config.TedProcessorProperties;
+import java.util.ArrayList;
+import java.util.List;
+import lombok.RequiredArgsConstructor;
+import org.springframework.core.annotation.Order;
+import org.springframework.stereotype.Component;
+import org.springframework.util.StringUtils;
+
+/**
+ * Splits long document text into overlapping {@link RepresentationType#CHUNK} drafts.
+ *
+ * <p>The text is taken from the {@code NORMALIZED_TEXT} derived content, falling back to
+ * {@code HTML_CLEAN}. Texts no longer than one chunk plus the overlap produce no drafts.
+ * Chunk sizing and limits are read from the {@code search} section of {@link TedProcessorProperties}.
+ */
+@Component
+@Order(200)
+@RequiredArgsConstructor
+public class ChunkedLongTextRepresentationBuilder implements TextRepresentationBuilder {
+
+    public static final String BUILDER_KEY = "long-text-chunker";
+
+    /** Lower bound applied to the configured chunk target size, in chars. */
+    private static final int MIN_CHUNK_TARGET_CHARS = 400;
+
+    /** How far past the preferred cut position a natural boundary is searched for, in chars. */
+    private static final int BOUNDARY_LOOKAHEAD_CHARS = 160;
+
+    /** A found boundary is only used if it lies more than this many chars after the chunk start. */
+    private static final int MIN_BOUNDARY_DISTANCE_CHARS = 200;
+
+    private final TedProcessorProperties properties;
+
+    /** Chunking applies to every document type. */
+    @Override
+    public boolean supports(DocumentType documentType) {
+        return true;
+    }
+
+    /**
+     * Builds the CHUNK drafts for the request's extracted text.
+     *
+     * @param request build request carrying the extraction and detection results
+     * @return the chunk drafts in text order; empty when chunking is disabled, no text is
+     *         available, or the text is short enough to need no chunking
+     */
+    @Override
+    public List<TextRepresentationDraft> build(RepresentationBuildRequest request) {
+        if (!properties.getSearch().isChunkingEnabled()) {
+            return List.of();
+        }
+
+        String baseText = request.extractionResult().derivedTextByRole().get(ContentRole.NORMALIZED_TEXT);
+        if (!StringUtils.hasText(baseText)) {
+            baseText = request.extractionResult().derivedTextByRole().get(ContentRole.HTML_CLEAN);
+        }
+        if (!StringUtils.hasText(baseText)) {
+            return List.of();
+        }
+
+        int length = baseText.length();
+        int target = Math.max(MIN_CHUNK_TARGET_CHARS, properties.getSearch().getChunkTargetChars());
+        // The configured overlap is clamped to the range [0, target / 3].
+        int overlap = Math.max(0, Math.min(target / 3, properties.getSearch().getChunkOverlapChars()));
+        if (length <= target + overlap) {
+            return List.of();
+        }
+
+        List<TextRepresentationDraft> drafts = new ArrayList<>();
+        int start = 0;
+        int chunkIndex = 0;
+        while (start < length && chunkIndex < properties.getSearch().getMaxChunksPerDocument()) {
+            int end = Math.min(length, start + target);
+            if (end < length) {
+                int boundary = findBoundary(baseText, end, Math.min(length, end + BOUNDARY_LOOKAHEAD_CHARS));
+                if (boundary > start + MIN_BOUNDARY_DISTANCE_CHARS) {
+                    end = boundary;
+                }
+                if (splitsSurrogatePair(baseText, end)) {
+                    end++; // keep the low surrogate together with its high surrogate
+                }
+            }
+
+            String chunk = baseText.substring(start, end).trim();
+            if (StringUtils.hasText(chunk)) {
+                // Offsets describe the untrimmed [start, end) window in the base text.
+                // NOTE(review): the source role is NORMALIZED_TEXT even when the text came from the
+                // HTML_CLEAN fallback - confirm this is intended.
+                drafts.add(new TextRepresentationDraft(
+                        RepresentationType.CHUNK,
+                        BUILDER_KEY,
+                        request.detectionResult().languageCode(),
+                        chunk,
+                        false,
+                        chunkIndex,
+                        start,
+                        end,
+                        ContentRole.NORMALIZED_TEXT,
+                        Boolean.TRUE
+                ));
+                chunkIndex++;
+            }
+
+            if (end >= length) {
+                break;
+            }
+            start = Math.max(end - overlap, start + 1);
+            if (splitsSurrogatePair(baseText, start)) {
+                start--; // begin the next chunk on the high surrogate
+            }
+        }
+        return drafts;
+    }
+
+    /**
+     * Finds the cut position just after the first line break or sentence-ending character
+     * in {@code [preferred, max)}.
+     *
+     * @return the index after the boundary character, or {@code preferred} if none is found
+     */
+    private int findBoundary(String text, int preferred, int max) {
+        for (int i = preferred; i < max; i++) {
+            char c = text.charAt(i);
+            if (c == '\n' || c == '.' || c == '!' || c == '?' || c == ';') {
+                return i + 1;
+            }
+        }
+        return preferred;
+    }
+
+    /** Returns whether cutting {@code text} at {@code index} would separate a surrogate pair. */
+    private static boolean splitsSurrogatePair(String text, int index) {
+        return index > 0
+                && index < text.length()
+                && Character.isHighSurrogate(text.charAt(index - 1))
+                && Character.isLowSurrogate(text.charAt(index));
+    }
+}
diff --git a/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java b/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java
index fd82a8a..93bea89 100644
--- a/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java
+++ b/src/main/java/at/procon/dip/normalization/impl/DefaultGenericTextRepresentationBuilder.java
@@ -41,7 +41,6 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
String semantic = buildSemanticText(title, summary, request.detectionResult().documentType());
List drafts = new ArrayList<>();
- /*
drafts.add(new TextRepresentationDraft(
RepresentationType.FULLTEXT,
BUILDER_KEY,
@@ -49,10 +48,11 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
baseText,
false,
null,
+ null,
+ null,
ContentRole.NORMALIZED_TEXT,
- Boolean.TRUE
+ Boolean.FALSE
));
- */
drafts.add(new TextRepresentationDraft(
RepresentationType.SEMANTIC_TEXT,
BUILDER_KEY,
@@ -60,10 +60,11 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
semantic,
true,
null,
+ null,
+ null,
ContentRole.NORMALIZED_TEXT,
Boolean.TRUE
));
- /*
if (StringUtils.hasText(title)) {
drafts.add(new TextRepresentationDraft(
RepresentationType.TITLE_ABSTRACT,
@@ -72,11 +73,24 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
title + "\n\n" + summary,
false,
null,
+ null,
+ null,
ContentRole.NORMALIZED_TEXT,
Boolean.FALSE
));
}
- */
+ drafts.add(new TextRepresentationDraft(
+ RepresentationType.SUMMARY,
+ BUILDER_KEY,
+ request.detectionResult().languageCode(),
+ summary,
+ false,
+ null,
+ null,
+ null,
+ ContentRole.NORMALIZED_TEXT,
+ Boolean.FALSE
+ ));
return drafts;
}
diff --git a/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java b/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java
index 834082c..4d9289a 100644
--- a/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java
+++ b/src/main/java/at/procon/dip/normalization/impl/TedStructuredTextRepresentationBuilder.java
@@ -61,10 +61,11 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
semanticText,
true,
null,
+ null,
+ null,
ContentRole.NORMALIZED_TEXT,
Boolean.TRUE
));
- /*
drafts.add(new TextRepresentationDraft(
RepresentationType.FULLTEXT,
BUILDER_KEY,
@@ -72,8 +73,10 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
normalizedText,
false,
null,
+ null,
+ null,
ContentRole.NORMALIZED_TEXT,
- Boolean.TRUE
+ Boolean.FALSE
));
if (StringUtils.hasText(title)) {
drafts.add(new TextRepresentationDraft(
@@ -83,6 +86,8 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
title + "\n\n" + summary,
false,
null,
+ null,
+ null,
ContentRole.NORMALIZED_TEXT,
Boolean.FALSE
));
@@ -94,10 +99,11 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
summary,
false,
null,
+ null,
+ null,
ContentRole.NORMALIZED_TEXT,
Boolean.FALSE
));
- */
return drafts;
}
diff --git a/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java
index 4b7322d..cdb5f06 100644
--- a/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java
+++ b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java
@@ -13,6 +13,8 @@ public record TextRepresentationDraft(
String textBody,
boolean primary,
Integer chunkIndex,
+ Integer chunkStartOffset,
+ Integer chunkEndOffset,
ContentRole sourceContentRole,
Boolean queueForEmbedding
) {
@@ -22,6 +24,7 @@ public record TextRepresentationDraft(
String textBody,
boolean primary,
Integer chunkIndex) {
- this(representationType, null, languageCode, textBody, primary, chunkIndex, ContentRole.NORMALIZED_TEXT, null);
+ this(representationType, null, languageCode, textBody, primary, chunkIndex, null, null, ContentRole.NORMALIZED_TEXT, null);
}
}
+
diff --git a/src/main/java/at/procon/dip/processing/impl/TedStructuredDocumentProcessor.java b/src/main/java/at/procon/dip/processing/impl/TedStructuredDocumentProcessor.java
index a6aeeab..ee05ffc 100644
--- a/src/main/java/at/procon/dip/processing/impl/TedStructuredDocumentProcessor.java
+++ b/src/main/java/at/procon/dip/processing/impl/TedStructuredDocumentProcessor.java
@@ -14,7 +14,6 @@ import at.procon.dip.processing.spi.DocumentProcessingPolicy;
import at.procon.dip.processing.spi.StructuredDocumentProcessor;
import at.procon.dip.processing.spi.StructuredProcessingRequest;
import at.procon.ted.model.entity.ProcurementDocument;
-import at.procon.ted.service.TedPhase2GenericDocumentService;
import at.procon.ted.service.XmlParserService;
import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap;
@@ -32,7 +31,6 @@ public class TedStructuredDocumentProcessor implements StructuredDocumentProcess
private final XmlParserService xmlParserService;
private final DocumentService documentService;
- private final TedPhase2GenericDocumentService tedPhase2GenericDocumentService;
private final TedNoticeProjectionService tedNoticeProjectionService;
@Override
@@ -77,7 +75,6 @@ public class TedStructuredDocumentProcessor implements StructuredDocumentProcess
}
documentService.save(canonical);
- tedPhase2GenericDocumentService.syncTedDocument(tedDocument);
tedNoticeProjectionService.registerOrRefreshProjection(tedDocument, canonical.getId());
Map payload = new LinkedHashMap<>();
diff --git a/src/main/java/at/procon/dip/search/dto/SearchHit.java b/src/main/java/at/procon/dip/search/dto/SearchHit.java
index 263389f..d231ab5 100644
--- a/src/main/java/at/procon/dip/search/dto/SearchHit.java
+++ b/src/main/java/at/procon/dip/search/dto/SearchHit.java
@@ -3,6 +3,7 @@ package at.procon.dip.search.dto;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.domain.document.RepresentationType;
import java.time.OffsetDateTime;
import java.util.UUID;
import lombok.AllArgsConstructor;
@@ -27,6 +28,13 @@ public class SearchHit {
private String languageCode;
private String mimeType;
+ private RepresentationType representationType;
+ private boolean primaryRepresentation;
+ private Integer chunkIndex;
+ private Integer chunkStartOffset;
+ private Integer chunkEndOffset;
+ private int matchedRepresentationCount;
+
private SearchEngineType primaryEngine;
private SearchMatchField matchedField;
private String snippet;
diff --git a/src/main/java/at/procon/dip/search/dto/SearchMetricsResponse.java b/src/main/java/at/procon/dip/search/dto/SearchMetricsResponse.java
new file mode 100644
index 0000000..037260d
--- /dev/null
+++ b/src/main/java/at/procon/dip/search/dto/SearchMetricsResponse.java
@@ -0,0 +1,22 @@
+package at.procon.dip.search.dto;
+
+import at.procon.dip.domain.document.RepresentationType;
+import java.util.Map;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+public class SearchMetricsResponse { // search metrics DTO; presumably the body of GET /api/search/metrics - TODO confirm
+ private long totalSearchRequests; // presumably the number of regular search calls served - verify against SearchMetricsService
+ private long totalDebugRequests; // presumably the number of debug search calls served - verify against SearchMetricsService
+ private long totalCollapsedHitsReturned; // presumably hits returned after document-level collapse - verify
+ private Map engineExecutions; // NOTE(review): raw Map - type arguments are missing here; declare the key/value types
+ private Map representationCounts; // NOTE(review): raw Map; presumably keyed by RepresentationType (imported above, otherwise unused) - confirm
+ private long primaryRepresentationCount; // presumably the count of representations flagged as primary - confirm
+ private long chunkRepresentationCount; // presumably the count of CHUNK-type representations - confirm
+}
diff --git a/src/main/java/at/procon/dip/search/dto/SearchRepresentationSelectionMode.java b/src/main/java/at/procon/dip/search/dto/SearchRepresentationSelectionMode.java
new file mode 100644
index 0000000..f32c183
--- /dev/null
+++ b/src/main/java/at/procon/dip/search/dto/SearchRepresentationSelectionMode.java
@@ -0,0 +1,11 @@
+package at.procon.dip.search.dto;
+
+/**
+ * Controls which document text representations participate in generic search
+ * when no explicit representationTypes filter is supplied.
+ */
+public enum SearchRepresentationSelectionMode {
+ PRIMARY_ONLY, // presumably restricts search to primary representations - confirm in the SQL filter code
+ PRIMARY_AND_CHUNKS, // presumably primary plus CHUNK representations - confirm; this is the SearchRequest builder default
+ ALL // presumably applies no restriction by representation kind - confirm
+}
diff --git a/src/main/java/at/procon/dip/search/dto/SearchRequest.java b/src/main/java/at/procon/dip/search/dto/SearchRequest.java
index 236e583..0b6becb 100644
--- a/src/main/java/at/procon/dip/search/dto/SearchRequest.java
+++ b/src/main/java/at/procon/dip/search/dto/SearchRequest.java
@@ -40,4 +40,8 @@ public class SearchRequest {
@Builder.Default
private boolean collapseByDocument = true;
+
+ @Builder.Default
+ private SearchRepresentationSelectionMode representationSelectionMode =
+ SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS;
}
diff --git a/src/main/java/at/procon/dip/search/rank/DefaultSearchResultFusionService.java b/src/main/java/at/procon/dip/search/rank/DefaultSearchResultFusionService.java
index 38c0198..e61c010 100644
--- a/src/main/java/at/procon/dip/search/rank/DefaultSearchResultFusionService.java
+++ b/src/main/java/at/procon/dip/search/rank/DefaultSearchResultFusionService.java
@@ -2,6 +2,7 @@ package at.procon.dip.search.rank;
import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.api.SearchExecutionPlan;
+import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchHit;
import at.procon.dip.search.dto.SearchResponse;
@@ -57,8 +58,20 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi
normalized.forEach((engine, hits) -> {
for (SearchHit hit : hits) {
Aggregate aggregate = aggregates.computeIfAbsent(hit.getDocumentId(), id -> new Aggregate());
- aggregate.bestByEngine.put(engine, hit);
- if (aggregate.representative == null || hit.getNormalizedScore() > aggregate.representative.getNormalizedScore()) {
+ SearchHit currentBestForEngine = aggregate.bestByEngine.get(engine);
+ if (currentBestForEngine == null
+ || hit.getNormalizedScore() > currentBestForEngine.getNormalizedScore()
+ || (Double.compare(hit.getNormalizedScore(), currentBestForEngine.getNormalizedScore()) == 0
+ && representationPriority(hit) < representationPriority(currentBestForEngine))) {
+ aggregate.bestByEngine.put(engine, hit);
+ }
+ if (hit.getRepresentationId() != null) {
+ aggregate.representationIds.add(hit.getRepresentationId());
+ }
+ if (aggregate.representative == null
+ || hit.getNormalizedScore() > aggregate.representative.getNormalizedScore()
+ || (Double.compare(hit.getNormalizedScore(), aggregate.representative.getNormalizedScore()) == 0
+ && representationPriority(hit) < representationPriority(aggregate.representative))) {
aggregate.representative = hit;
}
}
@@ -69,8 +82,12 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi
SearchHit representative = aggregate.representative;
double finalScore = weight(SearchEngineType.POSTGRES_FULLTEXT, aggregate) +
weight(SearchEngineType.POSTGRES_TRIGRAM, aggregate) +
- weight(SearchEngineType.PGVECTOR_SEMANTIC, aggregate);
- fused.add(representative.toBuilder().finalScore(finalScore).build());
+ weight(SearchEngineType.PGVECTOR_SEMANTIC, aggregate) +
+ recencyBoost(representative);
+ fused.add(representative.toBuilder()
+ .finalScore(finalScore)
+ .matchedRepresentationCount(aggregate.representationIds.size())
+ .build());
}
return fused;
}
@@ -97,7 +114,10 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi
case POSTGRES_TRIGRAM -> hit.getNormalizedScore() * properties.getSearch().getTrigramWeight();
case PGVECTOR_SEMANTIC -> hit.getNormalizedScore() * properties.getSearch().getSemanticWeight();
};
- merged.add(hit.toBuilder().finalScore(finalScore).build());
+ merged.add(hit.toBuilder()
+ .finalScore(finalScore + recencyBoost(hit))
+ .matchedRepresentationCount(1)
+ .build());
}
});
return merged;
@@ -117,8 +137,42 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi
hits.sort(comparator);
}
+    /** Additive boost that halves per half-life (min. 1 day) of hit age; 0 when the weight is not positive or createdAt is null. */
+    private double recencyBoost(SearchHit hit) {
+        if (properties.getSearch().getRecencyBoostWeight() <= 0.0d || hit.getCreatedAt() == null) {
+            return 0.0d;
+        }
+        long elapsedSeconds = java.time.Duration.between(hit.getCreatedAt(), java.time.OffsetDateTime.now()).toSeconds();
+        double halfLives = Math.max(0.0d, elapsedSeconds / 86400.0d) / Math.max(1.0d, properties.getSearch().getRecencyHalfLifeDays());
+        return Math.exp(-Math.log(2.0d) * halfLives) * properties.getSearch().getRecencyBoostWeight();
+    }
+
+    /**
+     * Tie-break rank for equal scores (lower wins): primary, SEMANTIC_TEXT, TITLE_ABSTRACT, SUMMARY, CHUNK, then any other type.
+     */
+    private int representationPriority(SearchHit hit) {
+        if (hit == null) {
+            return Integer.MAX_VALUE;
+        }
+        if (hit.isPrimaryRepresentation()) {
+            return 0;
+        }
+        RepresentationType kind = hit.getRepresentationType();
+        if (kind == null) {
+            return 5;
+        }
+        return switch (kind) {
+            case SEMANTIC_TEXT -> 1;
+            case TITLE_ABSTRACT -> 2;
+            case SUMMARY -> 3;
+            case CHUNK -> 4;
+            default -> 5;
+        };
+    }
+
private static final class Aggregate {
private final Map bestByEngine = new EnumMap<>(SearchEngineType.class);
+ private final Set representationIds = new java.util.LinkedHashSet<>();
private SearchHit representative;
}
}
diff --git a/src/main/java/at/procon/dip/search/repository/DocumentFullTextSearchRepositoryImpl.java b/src/main/java/at/procon/dip/search/repository/DocumentFullTextSearchRepositoryImpl.java
index ce18494..a2b635e 100644
--- a/src/main/java/at/procon/dip/search/repository/DocumentFullTextSearchRepositoryImpl.java
+++ b/src/main/java/at/procon/dip/search/repository/DocumentFullTextSearchRepositoryImpl.java
@@ -22,6 +22,7 @@ public class DocumentFullTextSearchRepositoryImpl implements DocumentFullTextSea
SELECT
d.id AS document_id,
dtr.id AS representation_id,
+ CAST(dtr.representation_type AS text) AS representation_type,
CAST(d.document_type AS text) AS document_type,
CAST(d.document_family AS text) AS document_family,
CAST(d.visibility AS text) AS visibility,
@@ -31,23 +32,56 @@ public class DocumentFullTextSearchRepositoryImpl implements DocumentFullTextSea
d.mime_type AS mime_type,
d.created_at AS created_at,
d.updated_at AS updated_at,
- ts_headline('simple', COALESCE(dtr.text_body, ''), websearch_to_tsquery('simple', :queryText),
- 'MaxFragments=2, MinWords=5, MaxWords=20') AS snippet,
- ts_rank_cd(dtr.search_vector, websearch_to_tsquery('simple', :queryText)) AS score
+ ts_headline(
+ CASE
+ WHEN NULLIF(dtr.search_config, '') IS NULL THEN 'simple'::regconfig
+ ELSE dtr.search_config::regconfig
+ END,
+ COALESCE(dtr.text_body, ''),
+ websearch_to_tsquery(
+ CASE
+ WHEN NULLIF(dtr.search_config, '') IS NULL THEN 'simple'::regconfig
+ ELSE dtr.search_config::regconfig
+ END,
+ :queryText
+ ),
+ 'MaxFragments=2, MinWords=5, MaxWords=20'
+ ) AS snippet,
+ ts_rank_cd(
+ dtr.search_vector,
+ websearch_to_tsquery(
+ CASE
+ WHEN NULLIF(dtr.search_config, '') IS NULL THEN 'simple'::regconfig
+ ELSE dtr.search_config::regconfig
+ END,
+ :queryText
+ )
+ ) AS score
FROM doc.doc_text_representation dtr
JOIN doc.doc_document d ON d.id = dtr.document_id
LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
WHERE dtr.search_vector IS NOT NULL
- AND dtr.search_vector @@ websearch_to_tsquery('simple', :queryText)
+ AND dtr.search_vector @@ websearch_to_tsquery(
+ CASE
+ WHEN NULLIF(dtr.search_config, '') IS NULL THEN 'simple'::regconfig
+ ELSE dtr.search_config::regconfig
+ END,
+ :queryText
+ )
""");
MapSqlParameterSource params = new MapSqlParameterSource();
params.addValue("queryText", context.getRequest().getQueryText());
+
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
+
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
params.addValue("limit", limit);
- return jdbcTemplate.query(sql.toString(), params,
- new SearchHitRowMapper(SearchEngineType.POSTGRES_FULLTEXT, SearchMatchField.REPRESENTATION_TEXT));
+ return jdbcTemplate.query(
+ sql.toString(),
+ params,
+ new SearchHitRowMapper(SearchEngineType.POSTGRES_FULLTEXT, SearchMatchField.REPRESENTATION_TEXT)
+ );
}
-}
+}
\ No newline at end of file
diff --git a/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java b/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java
index 63c4a1c..e6d2b9f 100644
--- a/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java
+++ b/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java
@@ -33,6 +33,11 @@ public class DocumentSemanticSearchRepository {
d.summary AS summary,
COALESCE(dtr.language_code, d.language_code) AS language_code,
d.mime_type AS mime_type,
+ CAST(dtr.representation_type AS text) AS representation_type,
+ dtr.is_primary AS is_primary,
+ dtr.chunk_index AS chunk_index,
+ dtr.chunk_start_offset AS chunk_start_offset,
+ dtr.chunk_end_offset AS chunk_end_offset,
d.created_at AS created_at,
d.updated_at AS updated_at,
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
diff --git a/src/main/java/at/procon/dip/search/repository/SearchHitRowMapper.java b/src/main/java/at/procon/dip/search/repository/SearchHitRowMapper.java
index 7121243..9feb403 100644
--- a/src/main/java/at/procon/dip/search/repository/SearchHitRowMapper.java
+++ b/src/main/java/at/procon/dip/search/repository/SearchHitRowMapper.java
@@ -3,6 +3,7 @@ package at.procon.dip.search.repository;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchHit;
import at.procon.dip.search.dto.SearchMatchField;
@@ -33,6 +34,11 @@ final class SearchHitRowMapper implements RowMapper {
.summary(safeGetString(rs, "summary"))
.languageCode(safeGetString(rs, "language_code"))
.mimeType(safeGetString(rs, "mime_type"))
+ .representationType(parseRepresentationType(safeGetString(rs, "representation_type")))
+ .primaryRepresentation(safeGetBoolean(rs, "is_primary"))
+ .chunkIndex(safeGetInteger(rs, "chunk_index"))
+ .chunkStartOffset(safeGetInteger(rs, "chunk_start_offset"))
+ .chunkEndOffset(safeGetInteger(rs, "chunk_end_offset"))
.primaryEngine(engineType)
.matchedField(matchedField == null || matchedField.isBlank()
? defaultField
@@ -51,4 +57,25 @@ final class SearchHitRowMapper implements RowMapper {
return null;
}
}
+
+    /**
+     * Reads {@code column} from the current row as a nullable integer.
+     *
+     * @param rs     result set positioned on the row to read
+     * @param column column label to look up
+     * @return the column value, or {@code null} when the value is SQL NULL or
+     *         when reading the column raises an {@link SQLException}
+     */
+    private Integer safeGetInteger(ResultSet rs, String column) {
+        try {
+            int raw = rs.getInt(column);
+            // getInt() reports SQL NULL as 0, so wasNull() must be consulted right after the read.
+            if (rs.wasNull()) {
+                return null;
+            }
+            return raw;
+        } catch (SQLException ignore) {
+            // Best effort: any read failure is reported as "no value".
+            return null;
+        }
+    }
+
+    /**
+     * Reads {@code column} from the current row as a boolean flag.
+     *
+     * @param rs     result set positioned on the row to read
+     * @param column column label to look up
+     * @return {@code true} only when the column holds a non-NULL true value;
+     *         {@code false} otherwise, including when reading raises an {@link SQLException}
+     */
+    private boolean safeGetBoolean(ResultSet rs, String column) {
+        try {
+            if (!rs.getBoolean(column)) {
+                return false;
+            }
+            // A true read is only trusted when the driver does not flag the value as SQL NULL.
+            return !rs.wasNull();
+        } catch (SQLException ignore) {
+            // Best effort: any read failure is reported as false.
+            return false;
+        }
+    }
+
+    /**
+     * Converts the textual representation type of a row into its enum constant.
+     *
+     * @param value enum constant name, may be {@code null} or blank
+     * @return the matching constant, or {@code null} for a {@code null}/blank input
+     * @throws IllegalArgumentException if {@code value} is not the name of a constant
+     */
+    private RepresentationType parseRepresentationType(String value) {
+        if (value == null || value.isBlank()) {
+            return null;
+        }
+        return RepresentationType.valueOf(value);
+    }
}
diff --git a/src/main/java/at/procon/dip/search/repository/SearchSqlFilterSupport.java b/src/main/java/at/procon/dip/search/repository/SearchSqlFilterSupport.java
index 9ed4811..5efdde6 100644
--- a/src/main/java/at/procon/dip/search/repository/SearchSqlFilterSupport.java
+++ b/src/main/java/at/procon/dip/search/repository/SearchSqlFilterSupport.java
@@ -5,6 +5,7 @@ import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.search.api.SearchExecutionContext;
+import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
import java.util.Collection;
import java.util.List;
import java.util.Set;
@@ -56,7 +57,19 @@ final class SearchSqlFilterSupport {
sql.append(" AND CAST(").append(representationAlias).append(".representation_type AS text) IN (:representationTypes)");
params.addValue("representationTypes", enumNames(representationTypes));
} else {
- sql.append(" AND ").append(representationAlias).append(".is_primary = true");
+ SearchRepresentationSelectionMode selectionMode = context.getRequest().getRepresentationSelectionMode();
+ if (selectionMode == null) {
+ selectionMode = SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS;
+ }
+ switch (selectionMode) {
+ case PRIMARY_ONLY -> sql.append(" AND ").append(representationAlias).append(".is_primary = true");
+ case PRIMARY_AND_CHUNKS -> sql.append(" AND (")
+ .append(representationAlias).append(".is_primary = true OR CAST(")
+ .append(representationAlias).append(".representation_type AS text) = 'CHUNK')");
+ case ALL -> {
+ // no implicit representation restriction
+ }
+ }
}
if (context.getRequest().getCreatedFrom() != null) {
diff --git a/src/main/java/at/procon/dip/search/service/DefaultSearchOrchestrator.java b/src/main/java/at/procon/dip/search/service/DefaultSearchOrchestrator.java
index b1b33b3..9799168 100644
--- a/src/main/java/at/procon/dip/search/service/DefaultSearchOrchestrator.java
+++ b/src/main/java/at/procon/dip/search/service/DefaultSearchOrchestrator.java
@@ -28,17 +28,21 @@ public class DefaultSearchOrchestrator implements SearchOrchestrator {
private final SearchPlanner planner;
private final List engines;
private final SearchResultFusionService fusionService;
+ private final SearchMetricsService metricsService;
@Override
public SearchResponse search(SearchRequest request, SearchDocumentScope scope) {
SearchExecution execution = executeInternal(request, scope);
- return fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
+ SearchResponse response = fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
+ metricsService.recordSearch(execution.engineResults(), response.getHits().size(), false); // hit count is taken from the fused response; false = not a debug request
+ return response;
}
@Override
public SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope) {
SearchExecution execution = executeInternal(request, scope);
SearchResponse fused = fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
+ metricsService.recordSearch(execution.engineResults(), fused.getHits().size(), true);
List debugResults = new ArrayList<>();
int topLimit = properties.getSearch().getDebugTopHitsPerEngine();
@@ -56,6 +60,11 @@ public class DefaultSearchOrchestrator implements SearchOrchestrator {
.build();
}
+ @Override
+ public at.procon.dip.search.dto.SearchMetricsResponse metrics() { // NOTE(review): fully-qualified return type; an import may be preferable — confirm against the file's import block
+ return metricsService.snapshot(); // hands back the SearchMetricsService snapshot unchanged
+ }
+
private SearchExecution executeInternal(SearchRequest request, SearchDocumentScope scope) {
int page = request.getPage() == null || request.getPage() < 0 ? 0 : request.getPage();
int requestedSize = request.getSize() == null || request.getSize() <= 0
diff --git a/src/main/java/at/procon/dip/search/service/DocumentLexicalIndexService.java b/src/main/java/at/procon/dip/search/service/DocumentLexicalIndexService.java
index 976af52..0d37c12 100644
--- a/src/main/java/at/procon/dip/search/service/DocumentLexicalIndexService.java
+++ b/src/main/java/at/procon/dip/search/service/DocumentLexicalIndexService.java
@@ -2,6 +2,9 @@ package at.procon.dip.search.service;
import java.util.List;
import java.util.UUID;
+
+import jakarta.persistence.EntityManager;
+import jakarta.persistence.PersistenceContext;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.jdbc.core.JdbcTemplate;
@@ -18,6 +21,9 @@ public class DocumentLexicalIndexService {
private final NamedParameterJdbcTemplate namedParameterJdbcTemplate;
private final JdbcTemplate jdbcTemplate;
+ @PersistenceContext
+ private EntityManager entityManager;
+
/**
* New Slice 2 name kept for current code.
*/
@@ -26,9 +32,6 @@ public class DocumentLexicalIndexService {
refreshRepresentationLexicalIndex(representationId);
}
- /**
- * Backward-compatible Slice 1 method name.
- */
@Transactional
public void refreshRepresentationLexicalIndex(UUID representationId) {
if (!isLexicalSearchSchemaAvailable()) {
@@ -36,25 +39,32 @@ public class DocumentLexicalIndexService {
return;
}
+ entityManager.flush();
+
MapSqlParameterSource params = new MapSqlParameterSource();
params.addValue("representationId", representationId);
- namedParameterJdbcTemplate.update("""
- UPDATE doc.doc_text_representation
- SET search_config = CASE
- WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'
- WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'
- ELSE 'simple'
+
+ int updated = namedParameterJdbcTemplate.update("""
+ UPDATE doc.doc_text_representation
+ SET search_config = CASE
+ WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'
+ WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'
+ ELSE 'simple'
+ END,
+ search_vector = to_tsvector(
+ CASE
+ WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'::regconfig
+ WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'::regconfig
+ ELSE 'simple'::regconfig
END,
- search_vector = to_tsvector(
- CASE
- WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'::regconfig
- WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'::regconfig
- ELSE 'simple'::regconfig
- END,
- coalesce(text_body, '')
- )
- WHERE id = :representationId
- """, params);
+ coalesce(text_body, '')
+ )
+ WHERE id = :representationId
+ """, params);
+
+ if (updated == 0) {
+ log.warn("Lexical indexing updated 0 rows for representation {}", representationId);
+ }
}
/**
diff --git a/src/main/java/at/procon/dip/search/service/SearchMetricsService.java b/src/main/java/at/procon/dip/search/service/SearchMetricsService.java
new file mode 100644
index 0000000..86257b1
--- /dev/null
+++ b/src/main/java/at/procon/dip/search/service/SearchMetricsService.java
@@ -0,0 +1,55 @@
+package at.procon.dip.search.service;
+
+import at.procon.dip.domain.document.RepresentationType;
+import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
+import at.procon.dip.search.dto.SearchEngineType;
+import at.procon.dip.search.dto.SearchMetricsResponse;
+import java.util.Arrays;
+import java.util.EnumMap;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicLong;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Service;
+
+/**
+ * Keeps in-memory counters about executed searches and combines them with
+ * representation counts read from the repository when a snapshot is requested.
+ *
+ * <p>The counters are held in {@link AtomicLong} instances and a
+ * {@link ConcurrentHashMap}; they exist only in this service instance.
+ */
+@Service
+@RequiredArgsConstructor
+public class SearchMetricsService {
+
+    private final DocumentTextRepresentationRepository representationRepository;
+
+    private final AtomicLong totalSearchRequests = new AtomicLong();
+    private final AtomicLong totalDebugRequests = new AtomicLong();
+    private final AtomicLong totalCollapsedHitsReturned = new AtomicLong();
+    private final Map<SearchEngineType, AtomicLong> engineExecutions = new ConcurrentHashMap<>();
+
+    /**
+     * Records one executed search.
+     *
+     * @param engineResults per-engine results of the execution; only the keys are used,
+     *                      each one increments that engine's execution counter
+     * @param collapsedHits number of hits added to the returned-hits total
+     * @param debug         whether the call should also count as a debug request
+     */
+    public void recordSearch(Map<SearchEngineType, ?> engineResults, int collapsedHits, boolean debug) {
+        // Debug requests are counted in the overall total as well.
+        totalSearchRequests.incrementAndGet();
+        if (debug) {
+            totalDebugRequests.incrementAndGet();
+        }
+        totalCollapsedHitsReturned.addAndGet(collapsedHits);
+        engineResults.keySet().forEach(engine -> engineExecutions
+                .computeIfAbsent(engine, key -> new AtomicLong())
+                .incrementAndGet());
+    }
+
+    /**
+     * Builds a response from the current counter values and the repository counts.
+     *
+     * @return the request/hit totals, per-engine execution counts, per-type
+     *         representation counts and the primary/chunk representation counts
+     */
+    public SearchMetricsResponse snapshot() {
+        Map<SearchEngineType, Long> engineCounts = new EnumMap<>(SearchEngineType.class);
+        engineExecutions.forEach((engine, counter) -> engineCounts.put(engine, counter.get()));
+
+        Map<RepresentationType, Long> representationCounts = new EnumMap<>(RepresentationType.class);
+        Arrays.stream(RepresentationType.values())
+                .forEach(type -> representationCounts.put(type, representationRepository.countByRepresentationType(type)));
+
+        return SearchMetricsResponse.builder()
+                .totalSearchRequests(totalSearchRequests.get())
+                .totalDebugRequests(totalDebugRequests.get())
+                .totalCollapsedHitsReturned(totalCollapsedHitsReturned.get())
+                .engineExecutions(engineCounts)
+                .representationCounts(representationCounts)
+                .primaryRepresentationCount(representationRepository.countByPrimaryRepresentationTrue())
+                // Reuse the CHUNK count gathered above: saves a query and keeps both reported values identical.
+                .chunkRepresentationCount(representationCounts.get(RepresentationType.CHUNK))
+                .build();
+    }
+}
diff --git a/src/main/java/at/procon/dip/search/service/SearchOrchestrator.java b/src/main/java/at/procon/dip/search/service/SearchOrchestrator.java
index b5f8c36..c22e57c 100644
--- a/src/main/java/at/procon/dip/search/service/SearchOrchestrator.java
+++ b/src/main/java/at/procon/dip/search/service/SearchOrchestrator.java
@@ -1,6 +1,7 @@
package at.procon.dip.search.service;
import at.procon.dip.search.dto.SearchDebugResponse;
+import at.procon.dip.search.dto.SearchMetricsResponse;
import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.search.dto.SearchResponse;
import at.procon.dip.search.spi.SearchDocumentScope;
@@ -8,4 +9,6 @@ import at.procon.dip.search.spi.SearchDocumentScope;
public interface SearchOrchestrator {
SearchResponse search(SearchRequest request, SearchDocumentScope scope);
SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope);
+
+ SearchMetricsResponse metrics(); // returns search metrics; unlike search/debug it takes no request or scope
}
diff --git a/src/main/java/at/procon/dip/search/web/GenericSearchController.java b/src/main/java/at/procon/dip/search/web/GenericSearchController.java
index 91bcb30..752af22 100644
--- a/src/main/java/at/procon/dip/search/web/GenericSearchController.java
+++ b/src/main/java/at/procon/dip/search/web/GenericSearchController.java
@@ -1,6 +1,7 @@
package at.procon.dip.search.web;
import at.procon.dip.search.dto.SearchDebugResponse;
+import at.procon.dip.search.dto.SearchMetricsResponse;
import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.search.dto.SearchResponse;
import at.procon.dip.search.service.SearchOrchestrator;
@@ -9,6 +10,7 @@ import jakarta.validation.Valid;
import java.util.Set;
import lombok.RequiredArgsConstructor;
import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
@@ -30,6 +32,11 @@ public class GenericSearchController {
return searchOrchestrator.debug(request, buildScope(request));
}
+ @GetMapping("/metrics") // NOTE(review): presumably relative to a class-level request mapping — confirm
+ public SearchMetricsResponse metrics() {
+ return searchOrchestrator.metrics(); // pure delegation; no request body or parameters are read
+ }
+
private SearchDocumentScope buildScope(SearchRequest request) {
String scopeLanguage = (request.getLanguageCodes() == null || request.getLanguageCodes().isEmpty())
? null
diff --git a/src/main/java/at/procon/ted/config/TedProcessorProperties.java b/src/main/java/at/procon/ted/config/TedProcessorProperties.java
index 4397bc0..9799f5f 100644
--- a/src/main/java/at/procon/ted/config/TedProcessorProperties.java
+++ b/src/main/java/at/procon/ted/config/TedProcessorProperties.java
@@ -234,6 +234,41 @@ public class TedProcessorProperties {
private double trigramWeight = 0.20;
private double semanticWeight = 0.45;
+
+ /**
+ * Enable chunk representations for long documents.
+ */
+ private boolean chunkingEnabled = true;
+
+ /**
+ * Target chunk size in characters for CHUNK representations.
+ */
+ @Positive
+ private int chunkTargetChars = 1800;
+
+ /**
+ * Overlap between consecutive chunks in characters.
+ */
+ @Min(0)
+ private int chunkOverlapChars = 200;
+
+ /**
+ * Maximum CHUNK representations generated per document.
+ */
+ @Positive
+ private int maxChunksPerDocument = 12;
+
+ /**
+ * Additional score weight for recency.
+ */
+ private double recencyBoostWeight = 0.05;
+
+ /**
+ * Half-life in days used for recency decay.
+ */
+ @Positive
+ private int recencyHalfLifeDays = 30;
+
/**
* Startup backfill limit for missing DOC lexical vectors.
*/
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
index 88a317a..7e0fa09 100644
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@@ -134,6 +134,18 @@ ted:
fulltext-weight: 0.35
trigram-weight: 0.20
semantic-weight: 0.45
+ # Additional score weight for recency
+ recency-boost-weight: 0.05
+ # Recency half-life in days
+ recency-half-life-days: 30
+ # Enable chunk representations for long documents
+ chunking-enabled: true
+ # Target chunk size in characters
+ chunk-target-chars: 1800
+ # Overlap between consecutive chunks
+ chunk-overlap-chars: 200
+ # Maximum number of chunks generated per document
+ max-chunks-per-document: 12
# Startup backfill limit for missing lexical vectors
startup-lexical-backfill-limit: 500
# Number of top hits per engine returned by /search/debug
@@ -142,7 +154,7 @@ ted:
# TED Daily Package Download configuration
download:
# Enable/disable automatic package download
- enabled: false
+ enabled: true
# User service-based camel route
use-service-based: false
# Base URL for TED Daily Packages
@@ -177,7 +189,7 @@ ted:
# IMAP Mail configuration
mail:
# Enable/disable mail processing
- enabled: true
+ enabled: false
# IMAP server hostname
host: mail.mymagenta.business
# IMAP server port (993 for IMAPS)
diff --git a/src/test/java/at/procon/dip/ingestion/integration/MailBundleProcessingIntegrationTest.java b/src/test/java/at/procon/dip/ingestion/integration/MailBundleProcessingIntegrationTest.java
index 7157cec..d171706 100644
--- a/src/test/java/at/procon/dip/ingestion/integration/MailBundleProcessingIntegrationTest.java
+++ b/src/test/java/at/procon/dip/ingestion/integration/MailBundleProcessingIntegrationTest.java
@@ -100,8 +100,10 @@ import static org.assertj.core.api.Assertions.assertThat;
})
class MailBundleProcessingIntegrationTest {
+ private static final int HOST_PORT = 15433;
+
@Container
- static PostgreSQLContainer> postgres = new FixedPortPostgreSQLContainer<>("postgres:16-alpine", 15432)
+ static PostgreSQLContainer> postgres = new FixedPortPostgreSQLContainer<>("postgres:16-alpine", HOST_PORT)
.withDatabaseName("dip_test")
.withUsername("test")
.withPassword("test")