Refactor phases 5 - search - tests
This commit is contained in:
parent
039b5a5f0a
commit
c8659bd45d
|
|
@ -0,0 +1,16 @@
|
||||||
|
Slice 3 patch for the generic search platform.
|
||||||
|
|
||||||
|
Contents:
|
||||||
|
- long-text CHUNK representations for generic and TED documents
|
||||||
|
- representation selection mode for generic search (PRIMARY_ONLY / PRIMARY_AND_CHUNKS / ALL)
|
||||||
|
- chunk-aware document collapse and matchedRepresentationCount in fused results
|
||||||
|
- recency-aware scoring boost
|
||||||
|
- lightweight search metrics endpoint: GET /api/search/metrics
|
||||||
|
|
||||||
|
Assumptions:
|
||||||
|
- apply on top of Slice 2 and the Slice 2 fix patch
|
||||||
|
- no additional DB migration is required in this slice
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Maven compile was not available in the patch generation environment
|
||||||
|
- this patch intentionally defers TED and Mail structured search to later slices
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
# Generic Search Slice Test Plan
|
||||||
|
|
||||||
|
This patch adds a minimal but useful integration-test baseline for the new generic search slices.
|
||||||
|
|
||||||
|
## What is covered
|
||||||
|
|
||||||
|
- PostgreSQL full-text search over `DOC.doc_text_representation.search_vector`
|
||||||
|
- PostgreSQL trigram search over document title / summary / representation text
|
||||||
|
- hybrid orchestration and document-level collapse
|
||||||
|
- representation selection modes (`PRIMARY_ONLY`, `PRIMARY_AND_CHUNKS`)
|
||||||
|
- REST endpoint smoke tests for:
|
||||||
|
- `POST /api/search`
|
||||||
|
- `POST /api/search/debug`
|
||||||
|
- `GET /api/search/metrics`
|
||||||
|
|
||||||
|
## Recommended execution order
|
||||||
|
|
||||||
|
1. Apply the search-slice DB migration(s) or ensure the runtime schema already contains the lexical search columns.
|
||||||
|
2. Run the new integration tests with PostgreSQL Testcontainers.
|
||||||
|
3. Start the application locally and try the included Postman requests.
|
||||||
|
4. Only after lexical tests are green, add semantic engine integration tests.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The test application intentionally imports only the DOC domain services and lexical search beans.
|
||||||
|
- Semantic/vector beans are left out to keep the test context small and deterministic.
|
||||||
|
- The base test class adds the `search_config` and `search_vector` columns if they are not already present.
|
||||||
4
pom.xml
4
pom.xml
|
|
@ -238,6 +238,10 @@
|
||||||
<version>1.21.4</version>
|
<version>1.21.4</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.fasterxml.jackson.datatype</groupId>
|
||||||
|
<artifactId>jackson-datatype-jsr310</artifactId>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,92 @@
|
||||||
|
{
|
||||||
|
"info": {
|
||||||
|
"name": "DIP Generic Search",
|
||||||
|
"_postman_id": "2d8f227e-4f38-45c0-9d59-b0642773c993",
|
||||||
|
"description": "Sample requests for the generic lexical search slices (full-text, trigram, hybrid, debug, metrics).",
|
||||||
|
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
|
||||||
|
},
|
||||||
|
"variable": [
|
||||||
|
{"key": "baseUrl", "value": "http://localhost:8889/api"}
|
||||||
|
],
|
||||||
|
"item": [
|
||||||
|
{
|
||||||
|
"name": "Search - fulltext exact",
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||||
|
"url": "{{baseUrl}}/search",
|
||||||
|
"body": {
|
||||||
|
"mode": "raw",
|
||||||
|
"raw": "{\n \"queryText\": \"framework agreement\",\n \"modes\": [\"FULLTEXT\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Search - trigram fuzzy title",
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||||
|
"url": "{{baseUrl}}/search",
|
||||||
|
"body": {
|
||||||
|
"mode": "raw",
|
||||||
|
"raw": "{\n \"queryText\": \"Viena school renovtion\",\n \"modes\": [\"TRIGRAM\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Search - hybrid lexical",
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||||
|
"url": "{{baseUrl}}/search",
|
||||||
|
"body": {
|
||||||
|
"mode": "raw",
|
||||||
|
"raw": "{\n \"queryText\": \"Maintenance manual\",\n \"modes\": [\"HYBRID\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Search - chunk-aware",
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||||
|
"url": "{{baseUrl}}/search",
|
||||||
|
"body": {
|
||||||
|
"mode": "raw",
|
||||||
|
"raw": "{\n \"queryText\": \"district heating optimization\",\n \"modes\": [\"FULLTEXT\"],\n \"documentTypes\": [\"TEXT\"],\n \"documentFamilies\": [\"GENERIC\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_AND_CHUNKS\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Search - createdFrom filter",
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||||
|
"url": "{{baseUrl}}/search",
|
||||||
|
"body": {
|
||||||
|
"mode": "raw",
|
||||||
|
"raw": "{\n \"queryText\": \"framework agreement\",\n \"modes\": [\"FULLTEXT\"],\n \"createdFrom\": \"2026-01-01T00:00:00Z\",\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Search - debug",
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||||
|
"url": "{{baseUrl}}/search/debug",
|
||||||
|
"body": {
|
||||||
|
"mode": "raw",
|
||||||
|
"raw": "{\n \"queryText\": \"maintenence manual\",\n \"modes\": [\"HYBRID\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Search - metrics",
|
||||||
|
"request": {
|
||||||
|
"method": "GET",
|
||||||
|
"url": "{{baseUrl}}/search/metrics"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -15,5 +15,9 @@ public interface DocumentTextRepresentationRepository extends JpaRepository<Docu
|
||||||
|
|
||||||
List<DocumentTextRepresentation> findByPrimaryRepresentationTrue();
|
List<DocumentTextRepresentation> findByPrimaryRepresentationTrue();
|
||||||
|
|
||||||
|
long countByPrimaryRepresentationTrue();
|
||||||
|
|
||||||
|
long countByRepresentationType(RepresentationType representationType);
|
||||||
|
|
||||||
Optional<DocumentTextRepresentation> findFirstByDocument_IdAndPrimaryRepresentationTrue(UUID documentId);
|
Optional<DocumentTextRepresentation> findFirstByDocument_IdAndPrimaryRepresentationTrue(UUID documentId);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -36,7 +36,7 @@ public class DocumentRepresentationService {
|
||||||
.primaryRepresentation(command.primaryRepresentation())
|
.primaryRepresentation(command.primaryRepresentation())
|
||||||
.textBody(command.textBody())
|
.textBody(command.textBody())
|
||||||
.build();
|
.build();
|
||||||
DocumentTextRepresentation saved = representationRepository.save(representation);
|
DocumentTextRepresentation saved = representationRepository.saveAndFlush(representation);
|
||||||
lexicalIndexService.indexRepresentation(saved.getId());
|
lexicalIndexService.indexRepresentation(saved.getId());
|
||||||
return saved;
|
return saved;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,6 @@ import at.procon.ted.config.TedProcessorProperties;
|
||||||
import at.procon.ted.model.entity.Organization;
|
import at.procon.ted.model.entity.Organization;
|
||||||
import at.procon.ted.model.entity.ProcurementDocument;
|
import at.procon.ted.model.entity.ProcurementDocument;
|
||||||
import at.procon.ted.model.entity.ProcurementLot;
|
import at.procon.ted.model.entity.ProcurementLot;
|
||||||
import at.procon.ted.service.TedPhase2GenericDocumentService;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
|
@ -30,7 +29,7 @@ import org.springframework.transaction.annotation.Transactional;
|
||||||
public class TedNoticeProjectionService {
|
public class TedNoticeProjectionService {
|
||||||
|
|
||||||
private final TedProcessorProperties properties;
|
private final TedProcessorProperties properties;
|
||||||
private final TedPhase2GenericDocumentService tedPhase2GenericDocumentService;
|
private final TedGenericDocumentRootService tedGenericDocumentRootService;
|
||||||
private final DocumentRepository documentRepository;
|
private final DocumentRepository documentRepository;
|
||||||
private final TedNoticeProjectionRepository projectionRepository;
|
private final TedNoticeProjectionRepository projectionRepository;
|
||||||
private final TedNoticeLotRepository lotRepository;
|
private final TedNoticeLotRepository lotRepository;
|
||||||
|
|
@ -42,9 +41,8 @@ public class TedNoticeProjectionService {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
TedPhase2GenericDocumentService.TedGenericDocumentSyncResult syncResult =
|
UUID genericDocumentId = tedGenericDocumentRootService.ensureGenericTedDocumentRoot(legacyDocument);
|
||||||
tedPhase2GenericDocumentService.syncTedDocument(legacyDocument);
|
return registerOrRefreshProjection(legacyDocument, genericDocumentId);
|
||||||
return registerOrRefreshProjection(legacyDocument, syncResult.documentId());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Transactional
|
@Transactional
|
||||||
|
|
@ -55,7 +53,7 @@ public class TedNoticeProjectionService {
|
||||||
|
|
||||||
UUID resolvedDocumentId = genericDocumentId;
|
UUID resolvedDocumentId = genericDocumentId;
|
||||||
if (resolvedDocumentId == null) {
|
if (resolvedDocumentId == null) {
|
||||||
resolvedDocumentId = tedPhase2GenericDocumentService.ensureGenericTedDocument(legacyDocument);
|
resolvedDocumentId = tedGenericDocumentRootService.ensureGenericTedDocumentRoot(legacyDocument);
|
||||||
}
|
}
|
||||||
|
|
||||||
UUID finalResolvedDocumentId = resolvedDocumentId;
|
UUID finalResolvedDocumentId = resolvedDocumentId;
|
||||||
|
|
|
||||||
|
|
@ -425,8 +425,8 @@ public class GenericDocumentImportService {
|
||||||
draft.languageCode(),
|
draft.languageCode(),
|
||||||
null,
|
null,
|
||||||
draft.chunkIndex(),
|
draft.chunkIndex(),
|
||||||
null,
|
draft.chunkStartOffset(),
|
||||||
null,
|
draft.chunkEndOffset(),
|
||||||
draft.primary(),
|
draft.primary(),
|
||||||
draft.textBody()
|
draft.textBody()
|
||||||
));
|
));
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,97 @@
|
||||||
|
package at.procon.dip.normalization.impl;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
||||||
|
import at.procon.dip.normalization.spi.TextRepresentationBuilder;
|
||||||
|
import at.procon.dip.normalization.spi.TextRepresentationDraft;
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.core.annotation.Order;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
@Order(200)
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class ChunkedLongTextRepresentationBuilder implements TextRepresentationBuilder {
|
||||||
|
|
||||||
|
public static final String BUILDER_KEY = "long-text-chunker";
|
||||||
|
|
||||||
|
private final TedProcessorProperties properties;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(DocumentType documentType) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<TextRepresentationDraft> build(RepresentationBuildRequest request) {
|
||||||
|
if (!properties.getSearch().isChunkingEnabled()) {
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
|
||||||
|
String baseText = request.extractionResult().derivedTextByRole().get(ContentRole.NORMALIZED_TEXT);
|
||||||
|
if (!StringUtils.hasText(baseText)) {
|
||||||
|
baseText = request.extractionResult().derivedTextByRole().get(ContentRole.HTML_CLEAN);
|
||||||
|
}
|
||||||
|
if (!StringUtils.hasText(baseText)) {
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
|
||||||
|
int target = Math.max(400, properties.getSearch().getChunkTargetChars());
|
||||||
|
int overlap = Math.max(0, Math.min(target / 3, properties.getSearch().getChunkOverlapChars()));
|
||||||
|
if (baseText.length() <= target + overlap) {
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
|
||||||
|
List<TextRepresentationDraft> drafts = new ArrayList<>();
|
||||||
|
int start = 0;
|
||||||
|
int chunkIndex = 0;
|
||||||
|
while (start < baseText.length() && chunkIndex < properties.getSearch().getMaxChunksPerDocument()) {
|
||||||
|
int end = Math.min(baseText.length(), start + target);
|
||||||
|
if (end < baseText.length()) {
|
||||||
|
int boundary = findBoundary(baseText, end, Math.min(baseText.length(), end + 160));
|
||||||
|
if (boundary > start + 200) {
|
||||||
|
end = boundary;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
String chunk = baseText.substring(start, end).trim();
|
||||||
|
if (StringUtils.hasText(chunk)) {
|
||||||
|
drafts.add(new TextRepresentationDraft(
|
||||||
|
RepresentationType.CHUNK,
|
||||||
|
BUILDER_KEY,
|
||||||
|
request.detectionResult().languageCode(),
|
||||||
|
chunk,
|
||||||
|
false,
|
||||||
|
chunkIndex,
|
||||||
|
start,
|
||||||
|
end,
|
||||||
|
ContentRole.NORMALIZED_TEXT,
|
||||||
|
Boolean.TRUE
|
||||||
|
));
|
||||||
|
chunkIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (end >= baseText.length()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
start = Math.max(end - overlap, start + 1);
|
||||||
|
}
|
||||||
|
return drafts;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int findBoundary(String text, int preferred, int max) {
|
||||||
|
for (int i = preferred; i < max; i++) {
|
||||||
|
char c = text.charAt(i);
|
||||||
|
if (c == '\n' || c == '.' || c == '!' || c == '?' || c == ';') {
|
||||||
|
return i + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return preferred;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -41,7 +41,6 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
|
||||||
String semantic = buildSemanticText(title, summary, request.detectionResult().documentType());
|
String semantic = buildSemanticText(title, summary, request.detectionResult().documentType());
|
||||||
|
|
||||||
List<TextRepresentationDraft> drafts = new ArrayList<>();
|
List<TextRepresentationDraft> drafts = new ArrayList<>();
|
||||||
/*
|
|
||||||
drafts.add(new TextRepresentationDraft(
|
drafts.add(new TextRepresentationDraft(
|
||||||
RepresentationType.FULLTEXT,
|
RepresentationType.FULLTEXT,
|
||||||
BUILDER_KEY,
|
BUILDER_KEY,
|
||||||
|
|
@ -49,10 +48,11 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
|
||||||
baseText,
|
baseText,
|
||||||
false,
|
false,
|
||||||
null,
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.TRUE
|
Boolean.FALSE
|
||||||
));
|
));
|
||||||
*/
|
|
||||||
drafts.add(new TextRepresentationDraft(
|
drafts.add(new TextRepresentationDraft(
|
||||||
RepresentationType.SEMANTIC_TEXT,
|
RepresentationType.SEMANTIC_TEXT,
|
||||||
BUILDER_KEY,
|
BUILDER_KEY,
|
||||||
|
|
@ -60,10 +60,11 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
|
||||||
semantic,
|
semantic,
|
||||||
true,
|
true,
|
||||||
null,
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.TRUE
|
Boolean.TRUE
|
||||||
));
|
));
|
||||||
/*
|
|
||||||
if (StringUtils.hasText(title)) {
|
if (StringUtils.hasText(title)) {
|
||||||
drafts.add(new TextRepresentationDraft(
|
drafts.add(new TextRepresentationDraft(
|
||||||
RepresentationType.TITLE_ABSTRACT,
|
RepresentationType.TITLE_ABSTRACT,
|
||||||
|
|
@ -72,11 +73,24 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
|
||||||
title + "\n\n" + summary,
|
title + "\n\n" + summary,
|
||||||
false,
|
false,
|
||||||
null,
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.FALSE
|
Boolean.FALSE
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
*/
|
drafts.add(new TextRepresentationDraft(
|
||||||
|
RepresentationType.SUMMARY,
|
||||||
|
BUILDER_KEY,
|
||||||
|
request.detectionResult().languageCode(),
|
||||||
|
summary,
|
||||||
|
false,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
ContentRole.NORMALIZED_TEXT,
|
||||||
|
Boolean.FALSE
|
||||||
|
));
|
||||||
return drafts;
|
return drafts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -61,10 +61,11 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
|
||||||
semanticText,
|
semanticText,
|
||||||
true,
|
true,
|
||||||
null,
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.TRUE
|
Boolean.TRUE
|
||||||
));
|
));
|
||||||
/*
|
|
||||||
drafts.add(new TextRepresentationDraft(
|
drafts.add(new TextRepresentationDraft(
|
||||||
RepresentationType.FULLTEXT,
|
RepresentationType.FULLTEXT,
|
||||||
BUILDER_KEY,
|
BUILDER_KEY,
|
||||||
|
|
@ -72,8 +73,10 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
|
||||||
normalizedText,
|
normalizedText,
|
||||||
false,
|
false,
|
||||||
null,
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.TRUE
|
Boolean.FALSE
|
||||||
));
|
));
|
||||||
if (StringUtils.hasText(title)) {
|
if (StringUtils.hasText(title)) {
|
||||||
drafts.add(new TextRepresentationDraft(
|
drafts.add(new TextRepresentationDraft(
|
||||||
|
|
@ -83,6 +86,8 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
|
||||||
title + "\n\n" + summary,
|
title + "\n\n" + summary,
|
||||||
false,
|
false,
|
||||||
null,
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.FALSE
|
Boolean.FALSE
|
||||||
));
|
));
|
||||||
|
|
@ -94,10 +99,11 @@ public class TedStructuredTextRepresentationBuilder implements TextRepresentatio
|
||||||
summary,
|
summary,
|
||||||
false,
|
false,
|
||||||
null,
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
ContentRole.NORMALIZED_TEXT,
|
ContentRole.NORMALIZED_TEXT,
|
||||||
Boolean.FALSE
|
Boolean.FALSE
|
||||||
));
|
));
|
||||||
*/
|
|
||||||
return drafts;
|
return drafts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,8 @@ public record TextRepresentationDraft(
|
||||||
String textBody,
|
String textBody,
|
||||||
boolean primary,
|
boolean primary,
|
||||||
Integer chunkIndex,
|
Integer chunkIndex,
|
||||||
|
Integer chunkStartOffset,
|
||||||
|
Integer chunkEndOffset,
|
||||||
ContentRole sourceContentRole,
|
ContentRole sourceContentRole,
|
||||||
Boolean queueForEmbedding
|
Boolean queueForEmbedding
|
||||||
) {
|
) {
|
||||||
|
|
@ -22,6 +24,7 @@ public record TextRepresentationDraft(
|
||||||
String textBody,
|
String textBody,
|
||||||
boolean primary,
|
boolean primary,
|
||||||
Integer chunkIndex) {
|
Integer chunkIndex) {
|
||||||
this(representationType, null, languageCode, textBody, primary, chunkIndex, ContentRole.NORMALIZED_TEXT, null);
|
this(representationType, null, languageCode, textBody, primary, chunkIndex, null, null, ContentRole.NORMALIZED_TEXT, null);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,6 @@ import at.procon.dip.processing.spi.DocumentProcessingPolicy;
|
||||||
import at.procon.dip.processing.spi.StructuredDocumentProcessor;
|
import at.procon.dip.processing.spi.StructuredDocumentProcessor;
|
||||||
import at.procon.dip.processing.spi.StructuredProcessingRequest;
|
import at.procon.dip.processing.spi.StructuredProcessingRequest;
|
||||||
import at.procon.ted.model.entity.ProcurementDocument;
|
import at.procon.ted.model.entity.ProcurementDocument;
|
||||||
import at.procon.ted.service.TedPhase2GenericDocumentService;
|
|
||||||
import at.procon.ted.service.XmlParserService;
|
import at.procon.ted.service.XmlParserService;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
|
|
@ -32,7 +31,6 @@ public class TedStructuredDocumentProcessor implements StructuredDocumentProcess
|
||||||
|
|
||||||
private final XmlParserService xmlParserService;
|
private final XmlParserService xmlParserService;
|
||||||
private final DocumentService documentService;
|
private final DocumentService documentService;
|
||||||
private final TedPhase2GenericDocumentService tedPhase2GenericDocumentService;
|
|
||||||
private final TedNoticeProjectionService tedNoticeProjectionService;
|
private final TedNoticeProjectionService tedNoticeProjectionService;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
@ -77,7 +75,6 @@ public class TedStructuredDocumentProcessor implements StructuredDocumentProcess
|
||||||
}
|
}
|
||||||
documentService.save(canonical);
|
documentService.save(canonical);
|
||||||
|
|
||||||
tedPhase2GenericDocumentService.syncTedDocument(tedDocument);
|
|
||||||
tedNoticeProjectionService.registerOrRefreshProjection(tedDocument, canonical.getId());
|
tedNoticeProjectionService.registerOrRefreshProjection(tedDocument, canonical.getId());
|
||||||
|
|
||||||
Map<String, Object> payload = new LinkedHashMap<>();
|
Map<String, Object> payload = new LinkedHashMap<>();
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ package at.procon.dip.search.dto;
|
||||||
import at.procon.dip.domain.access.DocumentVisibility;
|
import at.procon.dip.domain.access.DocumentVisibility;
|
||||||
import at.procon.dip.domain.document.DocumentFamily;
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
import at.procon.dip.domain.document.DocumentType;
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
import java.time.OffsetDateTime;
|
import java.time.OffsetDateTime;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
|
|
@ -27,6 +28,13 @@ public class SearchHit {
|
||||||
private String languageCode;
|
private String languageCode;
|
||||||
private String mimeType;
|
private String mimeType;
|
||||||
|
|
||||||
|
private RepresentationType representationType;
|
||||||
|
private boolean primaryRepresentation;
|
||||||
|
private Integer chunkIndex;
|
||||||
|
private Integer chunkStartOffset;
|
||||||
|
private Integer chunkEndOffset;
|
||||||
|
private int matchedRepresentationCount;
|
||||||
|
|
||||||
private SearchEngineType primaryEngine;
|
private SearchEngineType primaryEngine;
|
||||||
private SearchMatchField matchedField;
|
private SearchMatchField matchedField;
|
||||||
private String snippet;
|
private String snippet;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,22 @@
|
||||||
|
package at.procon.dip.search.dto;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import java.util.Map;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
/**
 * DTO carrying aggregated search counters.
 *
 * <p>Holds three scalar request/hit totals, a per-engine execution count map, a
 * per-representation-type count map, and separate counts for primary and chunk
 * representations. Accessors, builder and constructors are generated by Lombok.
 *
 * <p>NOTE(review): presumably the response body of {@code GET /api/search/metrics};
 * field meanings are inferred from their names only - confirm against the producing service.
 */
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class SearchMetricsResponse {
|
||||||
|
private long totalSearchRequests;
|
||||||
|
private long totalDebugRequests;
|
||||||
|
private long totalCollapsedHitsReturned;
|
||||||
|
private Map<SearchEngineType, Long> engineExecutions;
|
||||||
|
private Map<RepresentationType, Long> representationCounts;
|
||||||
|
private long primaryRepresentationCount;
|
||||||
|
private long chunkRepresentationCount;
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,11 @@
|
||||||
|
package at.procon.dip.search.dto;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Controls which document text representations participate in generic search
|
||||||
|
* when no explicit representationTypes filter is supplied.
|
||||||
|
*/
|
||||||
|
public enum SearchRepresentationSelectionMode {
|
||||||
|
// Presumably restricts search to representations flagged as primary - TODO confirm against the query planner.
PRIMARY_ONLY,
|
||||||
|
// SearchRequest initializes its representationSelectionMode field to this value.
// Presumably searches primary representations plus CHUNK representations - TODO confirm.
PRIMARY_AND_CHUNKS,
|
||||||
|
// Presumably applies no representation restriction - TODO confirm.
ALL
|
||||||
|
}
|
||||||
|
|
@ -40,4 +40,8 @@ public class SearchRequest {
|
||||||
|
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
private boolean collapseByDocument = true;
|
private boolean collapseByDocument = true;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
private SearchRepresentationSelectionMode representationSelectionMode =
|
||||||
|
SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ package at.procon.dip.search.rank;
|
||||||
|
|
||||||
import at.procon.dip.search.api.SearchExecutionContext;
|
import at.procon.dip.search.api.SearchExecutionContext;
|
||||||
import at.procon.dip.search.api.SearchExecutionPlan;
|
import at.procon.dip.search.api.SearchExecutionPlan;
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
import at.procon.dip.search.dto.SearchEngineType;
|
import at.procon.dip.search.dto.SearchEngineType;
|
||||||
import at.procon.dip.search.dto.SearchHit;
|
import at.procon.dip.search.dto.SearchHit;
|
||||||
import at.procon.dip.search.dto.SearchResponse;
|
import at.procon.dip.search.dto.SearchResponse;
|
||||||
|
|
@ -57,8 +58,20 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi
|
||||||
normalized.forEach((engine, hits) -> {
|
normalized.forEach((engine, hits) -> {
|
||||||
for (SearchHit hit : hits) {
|
for (SearchHit hit : hits) {
|
||||||
Aggregate aggregate = aggregates.computeIfAbsent(hit.getDocumentId(), id -> new Aggregate());
|
Aggregate aggregate = aggregates.computeIfAbsent(hit.getDocumentId(), id -> new Aggregate());
|
||||||
|
SearchHit currentBestForEngine = aggregate.bestByEngine.get(engine);
|
||||||
|
if (currentBestForEngine == null
|
||||||
|
|| hit.getNormalizedScore() > currentBestForEngine.getNormalizedScore()
|
||||||
|
|| (Double.compare(hit.getNormalizedScore(), currentBestForEngine.getNormalizedScore()) == 0
|
||||||
|
&& representationPriority(hit) < representationPriority(currentBestForEngine))) {
|
||||||
aggregate.bestByEngine.put(engine, hit);
|
aggregate.bestByEngine.put(engine, hit);
|
||||||
if (aggregate.representative == null || hit.getNormalizedScore() > aggregate.representative.getNormalizedScore()) {
|
}
|
||||||
|
if (hit.getRepresentationId() != null) {
|
||||||
|
aggregate.representationIds.add(hit.getRepresentationId());
|
||||||
|
}
|
||||||
|
if (aggregate.representative == null
|
||||||
|
|| hit.getNormalizedScore() > aggregate.representative.getNormalizedScore()
|
||||||
|
|| (Double.compare(hit.getNormalizedScore(), aggregate.representative.getNormalizedScore()) == 0
|
||||||
|
&& representationPriority(hit) < representationPriority(aggregate.representative))) {
|
||||||
aggregate.representative = hit;
|
aggregate.representative = hit;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -69,8 +82,12 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi
|
||||||
SearchHit representative = aggregate.representative;
|
SearchHit representative = aggregate.representative;
|
||||||
double finalScore = weight(SearchEngineType.POSTGRES_FULLTEXT, aggregate) +
|
double finalScore = weight(SearchEngineType.POSTGRES_FULLTEXT, aggregate) +
|
||||||
weight(SearchEngineType.POSTGRES_TRIGRAM, aggregate) +
|
weight(SearchEngineType.POSTGRES_TRIGRAM, aggregate) +
|
||||||
weight(SearchEngineType.PGVECTOR_SEMANTIC, aggregate);
|
weight(SearchEngineType.PGVECTOR_SEMANTIC, aggregate) +
|
||||||
fused.add(representative.toBuilder().finalScore(finalScore).build());
|
recencyBoost(representative);
|
||||||
|
fused.add(representative.toBuilder()
|
||||||
|
.finalScore(finalScore)
|
||||||
|
.matchedRepresentationCount(aggregate.representationIds.size())
|
||||||
|
.build());
|
||||||
}
|
}
|
||||||
return fused;
|
return fused;
|
||||||
}
|
}
|
||||||
|
|
@ -97,7 +114,10 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi
|
||||||
case POSTGRES_TRIGRAM -> hit.getNormalizedScore() * properties.getSearch().getTrigramWeight();
|
case POSTGRES_TRIGRAM -> hit.getNormalizedScore() * properties.getSearch().getTrigramWeight();
|
||||||
case PGVECTOR_SEMANTIC -> hit.getNormalizedScore() * properties.getSearch().getSemanticWeight();
|
case PGVECTOR_SEMANTIC -> hit.getNormalizedScore() * properties.getSearch().getSemanticWeight();
|
||||||
};
|
};
|
||||||
merged.add(hit.toBuilder().finalScore(finalScore).build());
|
merged.add(hit.toBuilder()
|
||||||
|
.finalScore(finalScore + recencyBoost(hit))
|
||||||
|
.matchedRepresentationCount(1)
|
||||||
|
.build());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return merged;
|
return merged;
|
||||||
|
|
@ -117,8 +137,42 @@ public class DefaultSearchResultFusionService implements SearchResultFusionServi
|
||||||
hits.sort(comparator);
|
hits.sort(comparator);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private double recencyBoost(SearchHit hit) {
|
||||||
|
if (properties.getSearch().getRecencyBoostWeight() <= 0.0d || hit.getCreatedAt() == null) {
|
||||||
|
return 0.0d;
|
||||||
|
}
|
||||||
|
double halfLifeDays = Math.max(1.0d, properties.getSearch().getRecencyHalfLifeDays());
|
||||||
|
double ageDays = Math.max(0.0d, java.time.Duration.between(hit.getCreatedAt(), java.time.OffsetDateTime.now()).toSeconds() / 86400.0d);
|
||||||
|
double normalized = Math.exp(-Math.log(2.0d) * (ageDays / halfLifeDays));
|
||||||
|
return normalized * properties.getSearch().getRecencyBoostWeight();
|
||||||
|
}
|
||||||
|
|
||||||
|
private int representationPriority(SearchHit hit) {
|
||||||
|
if (hit == null) {
|
||||||
|
return Integer.MAX_VALUE;
|
||||||
|
}
|
||||||
|
if (hit.isPrimaryRepresentation()) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
RepresentationType type = hit.getRepresentationType();
|
||||||
|
if (type == RepresentationType.SEMANTIC_TEXT) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (type == RepresentationType.TITLE_ABSTRACT) {
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
if (type == RepresentationType.SUMMARY) {
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
if (type == RepresentationType.CHUNK) {
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
return 5;
|
||||||
|
}
|
||||||
|
|
||||||
private static final class Aggregate {
|
private static final class Aggregate {
|
||||||
private final Map<SearchEngineType, SearchHit> bestByEngine = new EnumMap<>(SearchEngineType.class);
|
private final Map<SearchEngineType, SearchHit> bestByEngine = new EnumMap<>(SearchEngineType.class);
|
||||||
|
private final Set<UUID> representationIds = new java.util.LinkedHashSet<>();
|
||||||
private SearchHit representative;
|
private SearchHit representative;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,7 @@ public class DocumentFullTextSearchRepositoryImpl implements DocumentFullTextSea
|
||||||
SELECT
|
SELECT
|
||||||
d.id AS document_id,
|
d.id AS document_id,
|
||||||
dtr.id AS representation_id,
|
dtr.id AS representation_id,
|
||||||
|
CAST(dtr.representation_type AS text) AS representation_type,
|
||||||
CAST(d.document_type AS text) AS document_type,
|
CAST(d.document_type AS text) AS document_type,
|
||||||
CAST(d.document_family AS text) AS document_family,
|
CAST(d.document_family AS text) AS document_family,
|
||||||
CAST(d.visibility AS text) AS visibility,
|
CAST(d.visibility AS text) AS visibility,
|
||||||
|
|
@ -31,23 +32,56 @@ public class DocumentFullTextSearchRepositoryImpl implements DocumentFullTextSea
|
||||||
d.mime_type AS mime_type,
|
d.mime_type AS mime_type,
|
||||||
d.created_at AS created_at,
|
d.created_at AS created_at,
|
||||||
d.updated_at AS updated_at,
|
d.updated_at AS updated_at,
|
||||||
ts_headline('simple', COALESCE(dtr.text_body, ''), websearch_to_tsquery('simple', :queryText),
|
ts_headline(
|
||||||
'MaxFragments=2, MinWords=5, MaxWords=20') AS snippet,
|
CASE
|
||||||
ts_rank_cd(dtr.search_vector, websearch_to_tsquery('simple', :queryText)) AS score
|
WHEN NULLIF(dtr.search_config, '') IS NULL THEN 'simple'::regconfig
|
||||||
|
ELSE dtr.search_config::regconfig
|
||||||
|
END,
|
||||||
|
COALESCE(dtr.text_body, ''),
|
||||||
|
websearch_to_tsquery(
|
||||||
|
CASE
|
||||||
|
WHEN NULLIF(dtr.search_config, '') IS NULL THEN 'simple'::regconfig
|
||||||
|
ELSE dtr.search_config::regconfig
|
||||||
|
END,
|
||||||
|
:queryText
|
||||||
|
),
|
||||||
|
'MaxFragments=2, MinWords=5, MaxWords=20'
|
||||||
|
) AS snippet,
|
||||||
|
ts_rank_cd(
|
||||||
|
dtr.search_vector,
|
||||||
|
websearch_to_tsquery(
|
||||||
|
CASE
|
||||||
|
WHEN NULLIF(dtr.search_config, '') IS NULL THEN 'simple'::regconfig
|
||||||
|
ELSE dtr.search_config::regconfig
|
||||||
|
END,
|
||||||
|
:queryText
|
||||||
|
)
|
||||||
|
) AS score
|
||||||
FROM doc.doc_text_representation dtr
|
FROM doc.doc_text_representation dtr
|
||||||
JOIN doc.doc_document d ON d.id = dtr.document_id
|
JOIN doc.doc_document d ON d.id = dtr.document_id
|
||||||
LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
|
LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
|
||||||
WHERE dtr.search_vector IS NOT NULL
|
WHERE dtr.search_vector IS NOT NULL
|
||||||
AND dtr.search_vector @@ websearch_to_tsquery('simple', :queryText)
|
AND dtr.search_vector @@ websearch_to_tsquery(
|
||||||
|
CASE
|
||||||
|
WHEN NULLIF(dtr.search_config, '') IS NULL THEN 'simple'::regconfig
|
||||||
|
ELSE dtr.search_config::regconfig
|
||||||
|
END,
|
||||||
|
:queryText
|
||||||
|
)
|
||||||
""");
|
""");
|
||||||
|
|
||||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||||
params.addValue("queryText", context.getRequest().getQueryText());
|
params.addValue("queryText", context.getRequest().getQueryText());
|
||||||
|
|
||||||
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
||||||
|
|
||||||
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
|
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
|
||||||
params.addValue("limit", limit);
|
params.addValue("limit", limit);
|
||||||
|
|
||||||
return jdbcTemplate.query(sql.toString(), params,
|
return jdbcTemplate.query(
|
||||||
new SearchHitRowMapper(SearchEngineType.POSTGRES_FULLTEXT, SearchMatchField.REPRESENTATION_TEXT));
|
sql.toString(),
|
||||||
|
params,
|
||||||
|
new SearchHitRowMapper(SearchEngineType.POSTGRES_FULLTEXT, SearchMatchField.REPRESENTATION_TEXT)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -33,6 +33,11 @@ public class DocumentSemanticSearchRepository {
|
||||||
d.summary AS summary,
|
d.summary AS summary,
|
||||||
COALESCE(dtr.language_code, d.language_code) AS language_code,
|
COALESCE(dtr.language_code, d.language_code) AS language_code,
|
||||||
d.mime_type AS mime_type,
|
d.mime_type AS mime_type,
|
||||||
|
CAST(dtr.representation_type AS text) AS representation_type,
|
||||||
|
dtr.is_primary AS is_primary,
|
||||||
|
dtr.chunk_index AS chunk_index,
|
||||||
|
dtr.chunk_start_offset AS chunk_start_offset,
|
||||||
|
dtr.chunk_end_offset AS chunk_end_offset,
|
||||||
d.created_at AS created_at,
|
d.created_at AS created_at,
|
||||||
d.updated_at AS updated_at,
|
d.updated_at AS updated_at,
|
||||||
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
|
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ package at.procon.dip.search.repository;
|
||||||
import at.procon.dip.domain.access.DocumentVisibility;
|
import at.procon.dip.domain.access.DocumentVisibility;
|
||||||
import at.procon.dip.domain.document.DocumentFamily;
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
import at.procon.dip.domain.document.DocumentType;
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
import at.procon.dip.search.dto.SearchEngineType;
|
import at.procon.dip.search.dto.SearchEngineType;
|
||||||
import at.procon.dip.search.dto.SearchHit;
|
import at.procon.dip.search.dto.SearchHit;
|
||||||
import at.procon.dip.search.dto.SearchMatchField;
|
import at.procon.dip.search.dto.SearchMatchField;
|
||||||
|
|
@ -33,6 +34,11 @@ final class SearchHitRowMapper implements RowMapper<SearchHit> {
|
||||||
.summary(safeGetString(rs, "summary"))
|
.summary(safeGetString(rs, "summary"))
|
||||||
.languageCode(safeGetString(rs, "language_code"))
|
.languageCode(safeGetString(rs, "language_code"))
|
||||||
.mimeType(safeGetString(rs, "mime_type"))
|
.mimeType(safeGetString(rs, "mime_type"))
|
||||||
|
.representationType(parseRepresentationType(safeGetString(rs, "representation_type")))
|
||||||
|
.primaryRepresentation(safeGetBoolean(rs, "is_primary"))
|
||||||
|
.chunkIndex(safeGetInteger(rs, "chunk_index"))
|
||||||
|
.chunkStartOffset(safeGetInteger(rs, "chunk_start_offset"))
|
||||||
|
.chunkEndOffset(safeGetInteger(rs, "chunk_end_offset"))
|
||||||
.primaryEngine(engineType)
|
.primaryEngine(engineType)
|
||||||
.matchedField(matchedField == null || matchedField.isBlank()
|
.matchedField(matchedField == null || matchedField.isBlank()
|
||||||
? defaultField
|
? defaultField
|
||||||
|
|
@ -51,4 +57,25 @@ final class SearchHitRowMapper implements RowMapper<SearchHit> {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private Integer safeGetInteger(ResultSet rs, String column) {
|
||||||
|
try {
|
||||||
|
int value = rs.getInt(column);
|
||||||
|
return rs.wasNull() ? null : value;
|
||||||
|
} catch (SQLException ignore) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean safeGetBoolean(ResultSet rs, String column) {
|
||||||
|
try {
|
||||||
|
return rs.getBoolean(column) && !rs.wasNull();
|
||||||
|
} catch (SQLException ignore) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private RepresentationType parseRepresentationType(String value) {
|
||||||
|
return value == null || value.isBlank() ? null : RepresentationType.valueOf(value);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ import at.procon.dip.domain.document.DocumentFamily;
|
||||||
import at.procon.dip.domain.document.DocumentType;
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
import at.procon.dip.domain.document.RepresentationType;
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
import at.procon.dip.search.api.SearchExecutionContext;
|
import at.procon.dip.search.api.SearchExecutionContext;
|
||||||
|
import at.procon.dip.search.dto.SearchRepresentationSelectionMode;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
@ -56,7 +57,19 @@ final class SearchSqlFilterSupport {
|
||||||
sql.append(" AND CAST(").append(representationAlias).append(".representation_type AS text) IN (:representationTypes)");
|
sql.append(" AND CAST(").append(representationAlias).append(".representation_type AS text) IN (:representationTypes)");
|
||||||
params.addValue("representationTypes", enumNames(representationTypes));
|
params.addValue("representationTypes", enumNames(representationTypes));
|
||||||
} else {
|
} else {
|
||||||
sql.append(" AND ").append(representationAlias).append(".is_primary = true");
|
SearchRepresentationSelectionMode selectionMode = context.getRequest().getRepresentationSelectionMode();
|
||||||
|
if (selectionMode == null) {
|
||||||
|
selectionMode = SearchRepresentationSelectionMode.PRIMARY_AND_CHUNKS;
|
||||||
|
}
|
||||||
|
switch (selectionMode) {
|
||||||
|
case PRIMARY_ONLY -> sql.append(" AND ").append(representationAlias).append(".is_primary = true");
|
||||||
|
case PRIMARY_AND_CHUNKS -> sql.append(" AND (")
|
||||||
|
.append(representationAlias).append(".is_primary = true OR CAST(")
|
||||||
|
.append(representationAlias).append(".representation_type AS text) = 'CHUNK')");
|
||||||
|
case ALL -> {
|
||||||
|
// no implicit representation restriction
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (context.getRequest().getCreatedFrom() != null) {
|
if (context.getRequest().getCreatedFrom() != null) {
|
||||||
|
|
|
||||||
|
|
@ -28,17 +28,21 @@ public class DefaultSearchOrchestrator implements SearchOrchestrator {
|
||||||
private final SearchPlanner planner;
|
private final SearchPlanner planner;
|
||||||
private final List<SearchEngine> engines;
|
private final List<SearchEngine> engines;
|
||||||
private final SearchResultFusionService fusionService;
|
private final SearchResultFusionService fusionService;
|
||||||
|
private final SearchMetricsService metricsService;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public SearchResponse search(SearchRequest request, SearchDocumentScope scope) {
|
public SearchResponse search(SearchRequest request, SearchDocumentScope scope) {
|
||||||
SearchExecution execution = executeInternal(request, scope);
|
SearchExecution execution = executeInternal(request, scope);
|
||||||
return fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
|
SearchResponse response = fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
|
||||||
|
metricsService.recordSearch(execution.engineResults(), response.getHits().size(), false);
|
||||||
|
return response;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope) {
|
public SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope) {
|
||||||
SearchExecution execution = executeInternal(request, scope);
|
SearchExecution execution = executeInternal(request, scope);
|
||||||
SearchResponse fused = fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
|
SearchResponse fused = fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
|
||||||
|
metricsService.recordSearch(execution.engineResults(), fused.getHits().size(), true);
|
||||||
|
|
||||||
List<SearchEngineDebugResult> debugResults = new ArrayList<>();
|
List<SearchEngineDebugResult> debugResults = new ArrayList<>();
|
||||||
int topLimit = properties.getSearch().getDebugTopHitsPerEngine();
|
int topLimit = properties.getSearch().getDebugTopHitsPerEngine();
|
||||||
|
|
@ -56,6 +60,11 @@ public class DefaultSearchOrchestrator implements SearchOrchestrator {
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the current search metrics by delegating to the metrics service's snapshot.
 *
 * @return the snapshot produced by {@code metricsService.snapshot()}
 */
@Override
|
||||||
|
public at.procon.dip.search.dto.SearchMetricsResponse metrics() {
|
||||||
|
return metricsService.snapshot();
|
||||||
|
}
|
||||||
|
|
||||||
private SearchExecution executeInternal(SearchRequest request, SearchDocumentScope scope) {
|
private SearchExecution executeInternal(SearchRequest request, SearchDocumentScope scope) {
|
||||||
int page = request.getPage() == null || request.getPage() < 0 ? 0 : request.getPage();
|
int page = request.getPage() == null || request.getPage() < 0 ? 0 : request.getPage();
|
||||||
int requestedSize = request.getSize() == null || request.getSize() <= 0
|
int requestedSize = request.getSize() == null || request.getSize() <= 0
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,9 @@ package at.procon.dip.search.service;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
|
||||||
|
import jakarta.persistence.EntityManager;
|
||||||
|
import jakarta.persistence.PersistenceContext;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.springframework.jdbc.core.JdbcTemplate;
|
import org.springframework.jdbc.core.JdbcTemplate;
|
||||||
|
|
@ -18,6 +21,9 @@ public class DocumentLexicalIndexService {
|
||||||
private final NamedParameterJdbcTemplate namedParameterJdbcTemplate;
|
private final NamedParameterJdbcTemplate namedParameterJdbcTemplate;
|
||||||
private final JdbcTemplate jdbcTemplate;
|
private final JdbcTemplate jdbcTemplate;
|
||||||
|
|
||||||
|
@PersistenceContext
|
||||||
|
private EntityManager entityManager;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* New Slice 2 name kept for current code.
|
* New Slice 2 name kept for current code.
|
||||||
*/
|
*/
|
||||||
|
|
@ -26,9 +32,6 @@ public class DocumentLexicalIndexService {
|
||||||
refreshRepresentationLexicalIndex(representationId);
|
refreshRepresentationLexicalIndex(representationId);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Backward-compatible Slice 1 method name.
|
|
||||||
*/
|
|
||||||
@Transactional
|
@Transactional
|
||||||
public void refreshRepresentationLexicalIndex(UUID representationId) {
|
public void refreshRepresentationLexicalIndex(UUID representationId) {
|
||||||
if (!isLexicalSearchSchemaAvailable()) {
|
if (!isLexicalSearchSchemaAvailable()) {
|
||||||
|
|
@ -36,9 +39,12 @@ public class DocumentLexicalIndexService {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
entityManager.flush();
|
||||||
|
|
||||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||||
params.addValue("representationId", representationId);
|
params.addValue("representationId", representationId);
|
||||||
namedParameterJdbcTemplate.update("""
|
|
||||||
|
int updated = namedParameterJdbcTemplate.update("""
|
||||||
UPDATE doc.doc_text_representation
|
UPDATE doc.doc_text_representation
|
||||||
SET search_config = CASE
|
SET search_config = CASE
|
||||||
WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'
|
WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'
|
||||||
|
|
@ -55,6 +61,10 @@ public class DocumentLexicalIndexService {
|
||||||
)
|
)
|
||||||
WHERE id = :representationId
|
WHERE id = :representationId
|
||||||
""", params);
|
""", params);
|
||||||
|
|
||||||
|
if (updated == 0) {
|
||||||
|
log.warn("Lexical indexing updated 0 rows for representation {}", representationId);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,55 @@
|
||||||
|
package at.procon.dip.search.service;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||||
|
import at.procon.dip.search.dto.SearchEngineType;
|
||||||
|
import at.procon.dip.search.dto.SearchMetricsResponse;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.EnumMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class SearchMetricsService {
|
||||||
|
|
||||||
|
private final DocumentTextRepresentationRepository representationRepository;
|
||||||
|
|
||||||
|
private final AtomicLong totalSearchRequests = new AtomicLong();
|
||||||
|
private final AtomicLong totalDebugRequests = new AtomicLong();
|
||||||
|
private final AtomicLong totalCollapsedHitsReturned = new AtomicLong();
|
||||||
|
private final Map<SearchEngineType, AtomicLong> engineExecutions = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
|
public void recordSearch(Map<SearchEngineType, ?> engineResults, int collapsedHits, boolean debug) {
|
||||||
|
totalSearchRequests.incrementAndGet();
|
||||||
|
if (debug) {
|
||||||
|
totalDebugRequests.incrementAndGet();
|
||||||
|
}
|
||||||
|
totalCollapsedHitsReturned.addAndGet(collapsedHits);
|
||||||
|
engineResults.keySet().forEach(engine -> engineExecutions
|
||||||
|
.computeIfAbsent(engine, key -> new AtomicLong())
|
||||||
|
.incrementAndGet());
|
||||||
|
}
|
||||||
|
|
||||||
|
public SearchMetricsResponse snapshot() {
|
||||||
|
Map<SearchEngineType, Long> engineCounts = new EnumMap<>(SearchEngineType.class);
|
||||||
|
engineExecutions.forEach((engine, value) -> engineCounts.put(engine, value.get()));
|
||||||
|
|
||||||
|
Map<RepresentationType, Long> representationCounts = new EnumMap<>(RepresentationType.class);
|
||||||
|
Arrays.stream(RepresentationType.values())
|
||||||
|
.forEach(type -> representationCounts.put(type, representationRepository.countByRepresentationType(type)));
|
||||||
|
|
||||||
|
return SearchMetricsResponse.builder()
|
||||||
|
.totalSearchRequests(totalSearchRequests.get())
|
||||||
|
.totalDebugRequests(totalDebugRequests.get())
|
||||||
|
.totalCollapsedHitsReturned(totalCollapsedHitsReturned.get())
|
||||||
|
.engineExecutions(engineCounts)
|
||||||
|
.representationCounts(representationCounts)
|
||||||
|
.primaryRepresentationCount(representationRepository.countByPrimaryRepresentationTrue())
|
||||||
|
.chunkRepresentationCount(representationRepository.countByRepresentationType(RepresentationType.CHUNK))
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
package at.procon.dip.search.service;
|
package at.procon.dip.search.service;
|
||||||
|
|
||||||
import at.procon.dip.search.dto.SearchDebugResponse;
|
import at.procon.dip.search.dto.SearchDebugResponse;
|
||||||
|
import at.procon.dip.search.dto.SearchMetricsResponse;
|
||||||
import at.procon.dip.search.dto.SearchRequest;
|
import at.procon.dip.search.dto.SearchRequest;
|
||||||
import at.procon.dip.search.dto.SearchResponse;
|
import at.procon.dip.search.dto.SearchResponse;
|
||||||
import at.procon.dip.search.spi.SearchDocumentScope;
|
import at.procon.dip.search.spi.SearchDocumentScope;
|
||||||
|
|
@ -8,4 +9,6 @@ import at.procon.dip.search.spi.SearchDocumentScope;
|
||||||
public interface SearchOrchestrator {
|
public interface SearchOrchestrator {
|
||||||
SearchResponse search(SearchRequest request, SearchDocumentScope scope);
|
SearchResponse search(SearchRequest request, SearchDocumentScope scope);
|
||||||
SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope);
|
SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope);
|
||||||
|
|
||||||
|
SearchMetricsResponse metrics();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
package at.procon.dip.search.web;
|
package at.procon.dip.search.web;
|
||||||
|
|
||||||
import at.procon.dip.search.dto.SearchDebugResponse;
|
import at.procon.dip.search.dto.SearchDebugResponse;
|
||||||
|
import at.procon.dip.search.dto.SearchMetricsResponse;
|
||||||
import at.procon.dip.search.dto.SearchRequest;
|
import at.procon.dip.search.dto.SearchRequest;
|
||||||
import at.procon.dip.search.dto.SearchResponse;
|
import at.procon.dip.search.dto.SearchResponse;
|
||||||
import at.procon.dip.search.service.SearchOrchestrator;
|
import at.procon.dip.search.service.SearchOrchestrator;
|
||||||
|
|
@ -9,6 +10,7 @@ import jakarta.validation.Valid;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import org.springframework.web.bind.annotation.PostMapping;
|
import org.springframework.web.bind.annotation.PostMapping;
|
||||||
|
import org.springframework.web.bind.annotation.GetMapping;
|
||||||
import org.springframework.web.bind.annotation.RequestBody;
|
import org.springframework.web.bind.annotation.RequestBody;
|
||||||
import org.springframework.web.bind.annotation.RequestMapping;
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
import org.springframework.web.bind.annotation.RestController;
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
|
|
@ -30,6 +32,11 @@ public class GenericSearchController {
|
||||||
return searchOrchestrator.debug(request, buildScope(request));
|
return searchOrchestrator.debug(request, buildScope(request));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Handles GET requests on this controller's {@code /metrics} sub-path.
 *
 * @return the metrics reported by the search orchestrator
 */
@GetMapping("/metrics")
|
||||||
|
public SearchMetricsResponse metrics() {
|
||||||
|
return searchOrchestrator.metrics();
|
||||||
|
}
|
||||||
|
|
||||||
private SearchDocumentScope buildScope(SearchRequest request) {
|
private SearchDocumentScope buildScope(SearchRequest request) {
|
||||||
String scopeLanguage = (request.getLanguageCodes() == null || request.getLanguageCodes().isEmpty())
|
String scopeLanguage = (request.getLanguageCodes() == null || request.getLanguageCodes().isEmpty())
|
||||||
? null
|
? null
|
||||||
|
|
|
||||||
|
|
@ -234,6 +234,41 @@ public class TedProcessorProperties {
|
||||||
private double trigramWeight = 0.20;
|
private double trigramWeight = 0.20;
|
||||||
private double semanticWeight = 0.45;
|
private double semanticWeight = 0.45;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enable chunk representations for long documents.
|
||||||
|
*/
|
||||||
|
private boolean chunkingEnabled = true;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Target chunk size in characters for CHUNK representations.
|
||||||
|
*/
|
||||||
|
@Positive
|
||||||
|
private int chunkTargetChars = 1800;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Overlap between consecutive chunks in characters.
|
||||||
|
*/
|
||||||
|
@Min(0)
|
||||||
|
private int chunkOverlapChars = 200;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Maximum CHUNK representations generated per document.
|
||||||
|
*/
|
||||||
|
@Positive
|
||||||
|
private int maxChunksPerDocument = 12;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Additional score weight for recency.
|
||||||
|
*/
|
||||||
|
private double recencyBoostWeight = 0.05;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Half-life in days used for recency decay.
|
||||||
|
*/
|
||||||
|
@Positive
|
||||||
|
private int recencyHalfLifeDays = 30;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Startup backfill limit for missing DOC lexical vectors.
|
* Startup backfill limit for missing DOC lexical vectors.
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
|
|
@ -134,6 +134,18 @@ ted:
|
||||||
fulltext-weight: 0.35
|
fulltext-weight: 0.35
|
||||||
trigram-weight: 0.20
|
trigram-weight: 0.20
|
||||||
semantic-weight: 0.45
|
semantic-weight: 0.45
|
||||||
|
# Additional score weight for recency
|
||||||
|
recency-boost-weight: 0.05
|
||||||
|
# Recency half-life in days
|
||||||
|
recency-half-life-days: 30
|
||||||
|
# Enable chunk representations for long documents
|
||||||
|
chunking-enabled: true
|
||||||
|
# Target chunk size in characters
|
||||||
|
chunk-target-chars: 1800
|
||||||
|
# Overlap between consecutive chunks
|
||||||
|
chunk-overlap-chars: 200
|
||||||
|
# Maximum number of chunks generated per document
|
||||||
|
max-chunks-per-document: 12
|
||||||
# Startup backfill limit for missing lexical vectors
|
# Startup backfill limit for missing lexical vectors
|
||||||
startup-lexical-backfill-limit: 500
|
startup-lexical-backfill-limit: 500
|
||||||
# Number of top hits per engine returned by /search/debug
|
# Number of top hits per engine returned by /search/debug
|
||||||
|
|
@ -142,7 +154,7 @@ ted:
|
||||||
# TED Daily Package Download configuration
|
# TED Daily Package Download configuration
|
||||||
download:
|
download:
|
||||||
# Enable/disable automatic package download
|
# Enable/disable automatic package download
|
||||||
enabled: false
|
enabled: true
|
||||||
# Use service-based camel route
|
# Use service-based camel route
|
||||||
use-service-based: false
|
use-service-based: false
|
||||||
# Base URL for TED Daily Packages
|
# Base URL for TED Daily Packages
|
||||||
|
|
@ -177,7 +189,7 @@ ted:
|
||||||
# IMAP Mail configuration
|
# IMAP Mail configuration
|
||||||
mail:
|
mail:
|
||||||
# Enable/disable mail processing
|
# Enable/disable mail processing
|
||||||
enabled: true
|
enabled: false
|
||||||
# IMAP server hostname
|
# IMAP server hostname
|
||||||
host: mail.mymagenta.business
|
host: mail.mymagenta.business
|
||||||
# IMAP server port (993 for IMAPS)
|
# IMAP server port (993 for IMAPS)
|
||||||
|
|
|
||||||
|
|
@ -100,8 +100,10 @@ import static org.assertj.core.api.Assertions.assertThat;
|
||||||
})
|
})
|
||||||
class MailBundleProcessingIntegrationTest {
|
class MailBundleProcessingIntegrationTest {
|
||||||
|
|
||||||
|
private static final int HOST_PORT = 15433;
|
||||||
|
|
||||||
@Container
|
@Container
|
||||||
static PostgreSQLContainer<?> postgres = new FixedPortPostgreSQLContainer<>("postgres:16-alpine", 15432)
|
static PostgreSQLContainer<?> postgres = new FixedPortPostgreSQLContainer<>("postgres:16-alpine", HOST_PORT)
|
||||||
.withDatabaseName("dip_test")
|
.withDatabaseName("dip_test")
|
||||||
.withUsername("test")
|
.withUsername("test")
|
||||||
.withPassword("test")
|
.withPassword("test")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue