Refactor phases 5 - search - tests
parent
039b5a5f0a
commit
c8659bd45d
@ -0,0 +1,16 @@
|
|||||||
|
Slice 3 patch for the generic search platform.
|
||||||
|
|
||||||
|
Contents:
|
||||||
|
- long-text CHUNK representations for generic and TED documents
|
||||||
|
- representation selection mode for generic search (PRIMARY_ONLY / PRIMARY_AND_CHUNKS / ALL)
|
||||||
|
- chunk-aware document collapse and matchedRepresentationCount in fused results
|
||||||
|
- recency-aware scoring boost
|
||||||
|
- lightweight search metrics endpoint: GET /api/search/metrics
|
||||||
|
|
||||||
|
Assumptions:
|
||||||
|
- apply on top of Slice 2 and the Slice 2 fix patch
|
||||||
|
- no additional DB migration is required in this slice
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Maven compile was not available in the patch generation environment
|
||||||
|
- this patch intentionally keeps TED and Mail structured search for later slices
|
||||||
@ -0,0 +1,27 @@
|
|||||||
|
# Generic Search Slice Test Plan
|
||||||
|
|
||||||
|
This patch adds a minimal but useful integration-test baseline for the new generic search slices.
|
||||||
|
|
||||||
|
## What is covered
|
||||||
|
|
||||||
|
- PostgreSQL full-text search over `DOC.doc_text_representation.search_vector`
|
||||||
|
- PostgreSQL trigram search over document title / summary / representation text
|
||||||
|
- hybrid orchestration and document-level collapse
|
||||||
|
- representation selection modes (`PRIMARY_ONLY`, `PRIMARY_AND_CHUNKS`)
|
||||||
|
- REST endpoint smoke tests for:
|
||||||
|
- `POST /api/search`
|
||||||
|
- `POST /api/search/debug`
|
||||||
|
- `GET /api/search/metrics`
|
||||||
|
|
||||||
|
## Recommended execution order
|
||||||
|
|
||||||
|
1. Apply the search-slice DB migration(s) or ensure the runtime schema already contains the lexical search columns.
|
||||||
|
2. Run the new integration tests with PostgreSQL Testcontainers.
|
||||||
|
3. Start the application locally and try the included Postman requests.
|
||||||
|
4. Only after lexical tests are green, add semantic engine integration tests.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The test application intentionally imports only the DOC domain services and lexical search beans.
|
||||||
|
- Semantic/vector beans are left out to keep the test context small and deterministic.
|
||||||
|
- The base test class adds the `search_config` and `search_vector` columns if they are not already present.
|
||||||
@ -0,0 +1,92 @@
|
|||||||
|
{
|
||||||
|
"info": {
|
||||||
|
"name": "DIP Generic Search",
|
||||||
|
"_postman_id": "2d8f227e-4f38-45c0-9d59-b0642773c993",
|
||||||
|
"description": "Sample requests for the generic lexical search slices (full-text, trigram, hybrid, debug, metrics).",
|
||||||
|
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
|
||||||
|
},
|
||||||
|
"variable": [
|
||||||
|
{"key": "baseUrl", "value": "http://localhost:8889/api"}
|
||||||
|
],
|
||||||
|
"item": [
|
||||||
|
{
|
||||||
|
"name": "Search - fulltext exact",
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||||
|
"url": "{{baseUrl}}/search",
|
||||||
|
"body": {
|
||||||
|
"mode": "raw",
|
||||||
|
"raw": "{\n \"queryText\": \"framework agreement\",\n \"modes\": [\"FULLTEXT\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Search - trigram fuzzy title",
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||||
|
"url": "{{baseUrl}}/search",
|
||||||
|
"body": {
|
||||||
|
"mode": "raw",
|
||||||
|
"raw": "{\n \"queryText\": \"Viena school renovtion\",\n \"modes\": [\"TRIGRAM\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Search - hybrid lexical",
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||||
|
"url": "{{baseUrl}}/search",
|
||||||
|
"body": {
|
||||||
|
"mode": "raw",
|
||||||
|
"raw": "{\n \"queryText\": \"Maintenance manual\",\n \"modes\": [\"HYBRID\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Search - chunk-aware",
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||||
|
"url": "{{baseUrl}}/search",
|
||||||
|
"body": {
|
||||||
|
"mode": "raw",
|
||||||
|
"raw": "{\n \"queryText\": \"district heating optimization\",\n \"modes\": [\"FULLTEXT\"],\n \"documentTypes\": [\"TEXT\"],\n \"documentFamilies\": [\"GENERIC\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_AND_CHUNKS\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Search - createdFrom filter",
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||||
|
"url": "{{baseUrl}}/search",
|
||||||
|
"body": {
|
||||||
|
"mode": "raw",
|
||||||
|
"raw": "{\n \"queryText\": \"framework agreement\",\n \"modes\": [\"FULLTEXT\"],\n \"createdFrom\": \"2026-01-01T00:00:00Z\",\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Search - debug",
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"header": [{"key": "Content-Type", "value": "application/json"}],
|
||||||
|
"url": "{{baseUrl}}/search/debug",
|
||||||
|
"body": {
|
||||||
|
"mode": "raw",
|
||||||
|
"raw": "{\n \"queryText\": \"maintenence manual\",\n \"modes\": [\"HYBRID\"],\n \"collapseByDocument\": true,\n \"representationSelectionMode\": \"PRIMARY_ONLY\",\n \"page\": 0,\n \"size\": 10\n}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Search - metrics",
|
||||||
|
"request": {
|
||||||
|
"method": "GET",
|
||||||
|
"url": "{{baseUrl}}/search/metrics"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@ -0,0 +1,97 @@
|
|||||||
|
package at.procon.dip.normalization.impl;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
||||||
|
import at.procon.dip.normalization.spi.TextRepresentationBuilder;
|
||||||
|
import at.procon.dip.normalization.spi.TextRepresentationDraft;
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.core.annotation.Order;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
@Order(200)
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class ChunkedLongTextRepresentationBuilder implements TextRepresentationBuilder {
|
||||||
|
|
||||||
|
public static final String BUILDER_KEY = "long-text-chunker";
|
||||||
|
|
||||||
|
private final TedProcessorProperties properties;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(DocumentType documentType) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<TextRepresentationDraft> build(RepresentationBuildRequest request) {
|
||||||
|
if (!properties.getSearch().isChunkingEnabled()) {
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
|
||||||
|
String baseText = request.extractionResult().derivedTextByRole().get(ContentRole.NORMALIZED_TEXT);
|
||||||
|
if (!StringUtils.hasText(baseText)) {
|
||||||
|
baseText = request.extractionResult().derivedTextByRole().get(ContentRole.HTML_CLEAN);
|
||||||
|
}
|
||||||
|
if (!StringUtils.hasText(baseText)) {
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
|
||||||
|
int target = Math.max(400, properties.getSearch().getChunkTargetChars());
|
||||||
|
int overlap = Math.max(0, Math.min(target / 3, properties.getSearch().getChunkOverlapChars()));
|
||||||
|
if (baseText.length() <= target + overlap) {
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
|
||||||
|
List<TextRepresentationDraft> drafts = new ArrayList<>();
|
||||||
|
int start = 0;
|
||||||
|
int chunkIndex = 0;
|
||||||
|
while (start < baseText.length() && chunkIndex < properties.getSearch().getMaxChunksPerDocument()) {
|
||||||
|
int end = Math.min(baseText.length(), start + target);
|
||||||
|
if (end < baseText.length()) {
|
||||||
|
int boundary = findBoundary(baseText, end, Math.min(baseText.length(), end + 160));
|
||||||
|
if (boundary > start + 200) {
|
||||||
|
end = boundary;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
String chunk = baseText.substring(start, end).trim();
|
||||||
|
if (StringUtils.hasText(chunk)) {
|
||||||
|
drafts.add(new TextRepresentationDraft(
|
||||||
|
RepresentationType.CHUNK,
|
||||||
|
BUILDER_KEY,
|
||||||
|
request.detectionResult().languageCode(),
|
||||||
|
chunk,
|
||||||
|
false,
|
||||||
|
chunkIndex,
|
||||||
|
start,
|
||||||
|
end,
|
||||||
|
ContentRole.NORMALIZED_TEXT,
|
||||||
|
Boolean.TRUE
|
||||||
|
));
|
||||||
|
chunkIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (end >= baseText.length()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
start = Math.max(end - overlap, start + 1);
|
||||||
|
}
|
||||||
|
return drafts;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int findBoundary(String text, int preferred, int max) {
|
||||||
|
for (int i = preferred; i < max; i++) {
|
||||||
|
char c = text.charAt(i);
|
||||||
|
if (c == '\n' || c == '.' || c == '!' || c == '?' || c == ';') {
|
||||||
|
return i + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return preferred;
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,22 @@
|
|||||||
|
package at.procon.dip.search.dto;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import java.util.Map;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
/**
 * Payload returned by {@code GET /api/search/metrics}: in-memory request
 * counters combined with representation counts read from the repository.
 * Counter values reflect activity since the last application start
 * (they are held in {@code AtomicLong}s, not persisted).
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SearchMetricsResponse {
    /** Number of search requests recorded (debug requests included). */
    private long totalSearchRequests;
    /** Number of search requests that were debug requests. */
    private long totalDebugRequests;
    /** Running sum of collapsed (document-level) hits returned across requests. */
    private long totalCollapsedHitsReturned;
    /** How many times each search engine participated in a request. */
    private Map<SearchEngineType, Long> engineExecutions;
    /** Stored representation count per representation type. */
    private Map<RepresentationType, Long> representationCounts;
    /** Count of representations flagged as primary. */
    private long primaryRepresentationCount;
    /** Count of CHUNK representations (long-text chunking output). */
    private long chunkRepresentationCount;
}
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
package at.procon.dip.search.dto;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Controls which document text representations participate in generic search
|
||||||
|
* when no explicit representationTypes filter is supplied.
|
||||||
|
*/
|
||||||
|
public enum SearchRepresentationSelectionMode {
    /** Search only each document's primary text representation. */
    PRIMARY_ONLY,
    /** Search the primary representation plus long-text CHUNK representations. */
    PRIMARY_AND_CHUNKS,
    /** Search all stored text representations, regardless of type. */
    ALL
}
|
||||||
@ -0,0 +1,55 @@
|
|||||||
|
package at.procon.dip.search.service;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||||
|
import at.procon.dip.search.dto.SearchEngineType;
|
||||||
|
import at.procon.dip.search.dto.SearchMetricsResponse;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.EnumMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class SearchMetricsService {
|
||||||
|
|
||||||
|
private final DocumentTextRepresentationRepository representationRepository;
|
||||||
|
|
||||||
|
private final AtomicLong totalSearchRequests = new AtomicLong();
|
||||||
|
private final AtomicLong totalDebugRequests = new AtomicLong();
|
||||||
|
private final AtomicLong totalCollapsedHitsReturned = new AtomicLong();
|
||||||
|
private final Map<SearchEngineType, AtomicLong> engineExecutions = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
|
public void recordSearch(Map<SearchEngineType, ?> engineResults, int collapsedHits, boolean debug) {
|
||||||
|
totalSearchRequests.incrementAndGet();
|
||||||
|
if (debug) {
|
||||||
|
totalDebugRequests.incrementAndGet();
|
||||||
|
}
|
||||||
|
totalCollapsedHitsReturned.addAndGet(collapsedHits);
|
||||||
|
engineResults.keySet().forEach(engine -> engineExecutions
|
||||||
|
.computeIfAbsent(engine, key -> new AtomicLong())
|
||||||
|
.incrementAndGet());
|
||||||
|
}
|
||||||
|
|
||||||
|
public SearchMetricsResponse snapshot() {
|
||||||
|
Map<SearchEngineType, Long> engineCounts = new EnumMap<>(SearchEngineType.class);
|
||||||
|
engineExecutions.forEach((engine, value) -> engineCounts.put(engine, value.get()));
|
||||||
|
|
||||||
|
Map<RepresentationType, Long> representationCounts = new EnumMap<>(RepresentationType.class);
|
||||||
|
Arrays.stream(RepresentationType.values())
|
||||||
|
.forEach(type -> representationCounts.put(type, representationRepository.countByRepresentationType(type)));
|
||||||
|
|
||||||
|
return SearchMetricsResponse.builder()
|
||||||
|
.totalSearchRequests(totalSearchRequests.get())
|
||||||
|
.totalDebugRequests(totalDebugRequests.get())
|
||||||
|
.totalCollapsedHitsReturned(totalCollapsedHitsReturned.get())
|
||||||
|
.engineExecutions(engineCounts)
|
||||||
|
.representationCounts(representationCounts)
|
||||||
|
.primaryRepresentationCount(representationRepository.countByPrimaryRepresentationTrue())
|
||||||
|
.chunkRepresentationCount(representationRepository.countByRepresentationType(RepresentationType.CHUNK))
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue