diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java index e466387..88fe97a 100644 --- a/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java +++ b/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java @@ -3,8 +3,8 @@ package at.procon.dip.domain.document.service; import at.procon.dip.domain.document.entity.DocumentContent; import at.procon.dip.domain.document.entity.DocumentTextRepresentation; import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository; -import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand; import at.procon.dip.search.service.DocumentLexicalIndexService; +import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand; import java.util.List; import java.util.UUID; import lombok.RequiredArgsConstructor; @@ -37,7 +37,7 @@ public class DocumentRepresentationService { .textBody(command.textBody()) .build(); DocumentTextRepresentation saved = representationRepository.save(representation); - lexicalIndexService.refreshRepresentationLexicalIndex(saved.getId()); + lexicalIndexService.indexRepresentation(saved.getId()); return saved; } diff --git a/src/main/java/at/procon/dip/search/api/SearchExecutionContext.java b/src/main/java/at/procon/dip/search/api/SearchExecutionContext.java index 5a6eb02..8336be8 100644 --- a/src/main/java/at/procon/dip/search/api/SearchExecutionContext.java +++ b/src/main/java/at/procon/dip/search/api/SearchExecutionContext.java @@ -5,10 +5,9 @@ import at.procon.dip.search.spi.SearchDocumentScope; import lombok.Builder; import lombok.Getter; -@Getter @Builder +@Getter public class SearchExecutionContext { - private final SearchRequest request; private final SearchDocumentScope scope; private final int page; diff --git 
a/src/main/java/at/procon/dip/search/api/SearchExecutionPlan.java b/src/main/java/at/procon/dip/search/api/SearchExecutionPlan.java index 3475488..f8e9afc 100644 --- a/src/main/java/at/procon/dip/search/api/SearchExecutionPlan.java +++ b/src/main/java/at/procon/dip/search/api/SearchExecutionPlan.java @@ -6,10 +6,9 @@ import java.util.List; import lombok.Builder; import lombok.Getter; -@Getter @Builder +@Getter public class SearchExecutionPlan { - private final List engines; private final boolean collapseByDocument; private final SearchSortMode sortMode; diff --git a/src/main/java/at/procon/dip/search/dto/SearchDebugResponse.java b/src/main/java/at/procon/dip/search/dto/SearchDebugResponse.java new file mode 100644 index 0000000..9fa1d47 --- /dev/null +++ b/src/main/java/at/procon/dip/search/dto/SearchDebugResponse.java @@ -0,0 +1,19 @@ +package at.procon.dip.search.dto; + +import at.procon.dip.search.api.SearchExecutionPlan; +import java.util.List; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class SearchDebugResponse { + private SearchRequest request; + private SearchExecutionPlan plan; + private List engineResults; + private SearchResponse fusedResponse; +} diff --git a/src/main/java/at/procon/dip/search/dto/SearchEngineDebugResult.java b/src/main/java/at/procon/dip/search/dto/SearchEngineDebugResult.java new file mode 100644 index 0000000..a8fa784 --- /dev/null +++ b/src/main/java/at/procon/dip/search/dto/SearchEngineDebugResult.java @@ -0,0 +1,17 @@ +package at.procon.dip.search.dto; + +import java.util.List; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class SearchEngineDebugResult { + private SearchEngineType engineType; + private int hitCount; + private List topHits; +} diff --git 
a/src/main/java/at/procon/dip/search/dto/SearchEngineType.java b/src/main/java/at/procon/dip/search/dto/SearchEngineType.java index 71ceac3..f6828a8 100644 --- a/src/main/java/at/procon/dip/search/dto/SearchEngineType.java +++ b/src/main/java/at/procon/dip/search/dto/SearchEngineType.java @@ -2,5 +2,6 @@ package at.procon.dip.search.dto; public enum SearchEngineType { POSTGRES_FULLTEXT, - POSTGRES_TRIGRAM + POSTGRES_TRIGRAM, + PGVECTOR_SEMANTIC } diff --git a/src/main/java/at/procon/dip/search/dto/SearchHit.java b/src/main/java/at/procon/dip/search/dto/SearchHit.java index 5344b39..263389f 100644 --- a/src/main/java/at/procon/dip/search/dto/SearchHit.java +++ b/src/main/java/at/procon/dip/search/dto/SearchHit.java @@ -15,7 +15,6 @@ import lombok.NoArgsConstructor; @NoArgsConstructor @AllArgsConstructor public class SearchHit { - private UUID documentId; private UUID representationId; diff --git a/src/main/java/at/procon/dip/search/dto/SearchMode.java b/src/main/java/at/procon/dip/search/dto/SearchMode.java index cc659ce..0fa6de8 100644 --- a/src/main/java/at/procon/dip/search/dto/SearchMode.java +++ b/src/main/java/at/procon/dip/search/dto/SearchMode.java @@ -3,5 +3,6 @@ package at.procon.dip.search.dto; public enum SearchMode { FULLTEXT, TRIGRAM, + SEMANTIC, HYBRID } diff --git a/src/main/java/at/procon/dip/search/dto/SearchRequest.java b/src/main/java/at/procon/dip/search/dto/SearchRequest.java index f5c4b9f..236e583 100644 --- a/src/main/java/at/procon/dip/search/dto/SearchRequest.java +++ b/src/main/java/at/procon/dip/search/dto/SearchRequest.java @@ -4,10 +4,8 @@ import at.procon.dip.domain.access.DocumentVisibility; import at.procon.dip.domain.document.DocumentFamily; import at.procon.dip.domain.document.DocumentType; import at.procon.dip.domain.document.RepresentationType; -import jakarta.validation.constraints.Min; import jakarta.validation.constraints.NotBlank; import java.time.OffsetDateTime; -import java.util.LinkedHashSet; import java.util.Set; import 
lombok.AllArgsConstructor; import lombok.Builder; @@ -24,7 +22,7 @@ public class SearchRequest { private String queryText; @Builder.Default - private Set modes = new LinkedHashSet<>(Set.of(SearchMode.HYBRID)); + private Set modes = Set.of(SearchMode.HYBRID); private Set documentTypes; private Set documentFamilies; @@ -34,10 +32,7 @@ public class SearchRequest { private OffsetDateTime createdFrom; private OffsetDateTime createdTo; - @Min(0) private Integer page; - - @Min(1) private Integer size; @Builder.Default diff --git a/src/main/java/at/procon/dip/search/dto/SearchResponse.java b/src/main/java/at/procon/dip/search/dto/SearchResponse.java index ddf8922..d42f504 100644 --- a/src/main/java/at/procon/dip/search/dto/SearchResponse.java +++ b/src/main/java/at/procon/dip/search/dto/SearchResponse.java @@ -12,7 +12,6 @@ import lombok.NoArgsConstructor; @NoArgsConstructor @AllArgsConstructor public class SearchResponse { - private List hits; private int page; private int size; diff --git a/src/main/java/at/procon/dip/search/engine/fulltext/PostgresFullTextSearchEngine.java b/src/main/java/at/procon/dip/search/engine/fulltext/PostgresFullTextSearchEngine.java index ab406f8..23452af 100644 --- a/src/main/java/at/procon/dip/search/engine/fulltext/PostgresFullTextSearchEngine.java +++ b/src/main/java/at/procon/dip/search/engine/fulltext/PostgresFullTextSearchEngine.java @@ -1,15 +1,11 @@ package at.procon.dip.search.engine.fulltext; -import at.procon.dip.domain.access.DocumentVisibility; -import at.procon.dip.domain.document.DocumentFamily; -import at.procon.dip.domain.document.DocumentType; import at.procon.dip.search.api.SearchExecutionContext; import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchHit; -import at.procon.dip.search.dto.SearchMatchField; import at.procon.dip.search.engine.SearchEngine; import at.procon.dip.search.repository.DocumentFullTextSearchRepository; -import at.procon.dip.search.repository.FullTextSearchRow; +import 
at.procon.ted.config.TedProcessorProperties; import java.util.List; import lombok.RequiredArgsConstructor; import org.springframework.stereotype.Component; @@ -19,6 +15,7 @@ import org.springframework.stereotype.Component; public class PostgresFullTextSearchEngine implements SearchEngine { private final DocumentFullTextSearchRepository repository; + private final TedProcessorProperties properties; @Override public SearchEngineType type() { @@ -32,40 +29,6 @@ public class PostgresFullTextSearchEngine implements SearchEngine { @Override public List execute(SearchExecutionContext context) { - return repository.search(context).stream() - .map(this::mapRow) - .toList(); - } - - private SearchHit mapRow(FullTextSearchRow row) { - return SearchHit.builder() - .documentId(row.documentId()) - .representationId(row.representationId()) - .documentType(parseDocumentType(row.documentType())) - .documentFamily(parseDocumentFamily(row.documentFamily())) - .visibility(parseVisibility(row.visibility())) - .title(row.title()) - .summary(row.summary()) - .languageCode(row.languageCode()) - .mimeType(row.mimeType()) - .primaryEngine(SearchEngineType.POSTGRES_FULLTEXT) - .matchedField(SearchMatchField.REPRESENTATION_TEXT) - .snippet(row.snippet()) - .rawScore(row.score() == null ? 0.0d : row.score()) - .createdAt(row.createdAt()) - .updatedAt(row.updatedAt()) - .build(); - } - - private DocumentType parseDocumentType(String value) { - return value == null ? null : DocumentType.valueOf(value); - } - - private DocumentFamily parseDocumentFamily(String value) { - return value == null ? null : DocumentFamily.valueOf(value); - } - - private DocumentVisibility parseVisibility(String value) { - return value == null ? 
null : DocumentVisibility.valueOf(value); + return repository.search(context, properties.getSearch().getFulltextCandidateLimit()); } } diff --git a/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java b/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java new file mode 100644 index 0000000..f73bdda --- /dev/null +++ b/src/main/java/at/procon/dip/search/engine/semantic/PgVectorSemanticSearchEngine.java @@ -0,0 +1,45 @@ +package at.procon.dip.search.engine.semantic; + +import at.procon.dip.search.api.SearchExecutionContext; +import at.procon.dip.search.dto.SearchEngineType; +import at.procon.dip.search.dto.SearchHit; +import at.procon.dip.search.engine.SearchEngine; +import at.procon.dip.search.repository.DocumentSemanticSearchRepository; +import at.procon.dip.search.service.SemanticQueryEmbeddingService; +import at.procon.ted.config.TedProcessorProperties; +import java.util.List; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Component; + +@Component +@RequiredArgsConstructor +public class PgVectorSemanticSearchEngine implements SearchEngine { + + private final TedProcessorProperties properties; + private final SemanticQueryEmbeddingService queryEmbeddingService; + private final DocumentSemanticSearchRepository repository; + + @Override + public SearchEngineType type() { + return SearchEngineType.PGVECTOR_SEMANTIC; + } + + @Override + public boolean supports(SearchExecutionContext context) { + return properties.getVectorization().isEnabled() + && context.getRequest().getQueryText() != null + && !context.getRequest().getQueryText().isBlank(); + } + + @Override + public List execute(SearchExecutionContext context) { + return queryEmbeddingService.buildQueryEmbedding(context.getRequest().getQueryText()) + .map(query -> repository.search( + context, + query.modelId(), + query.vectorString(), + properties.getSearch().getSemanticCandidateLimit(), + 
properties.getSearch().getSimilarityThreshold())) + .orElse(List.of()); + } +} diff --git a/src/main/java/at/procon/dip/search/engine/trigram/PostgresTrigramSearchEngine.java b/src/main/java/at/procon/dip/search/engine/trigram/PostgresTrigramSearchEngine.java index 1a3a951..dce85c9 100644 --- a/src/main/java/at/procon/dip/search/engine/trigram/PostgresTrigramSearchEngine.java +++ b/src/main/java/at/procon/dip/search/engine/trigram/PostgresTrigramSearchEngine.java @@ -1,15 +1,11 @@ package at.procon.dip.search.engine.trigram; -import at.procon.dip.domain.access.DocumentVisibility; -import at.procon.dip.domain.document.DocumentFamily; -import at.procon.dip.domain.document.DocumentType; import at.procon.dip.search.api.SearchExecutionContext; import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchHit; -import at.procon.dip.search.dto.SearchMatchField; import at.procon.dip.search.engine.SearchEngine; import at.procon.dip.search.repository.DocumentTrigramSearchRepository; -import at.procon.dip.search.repository.TrigramSearchRow; +import at.procon.ted.config.TedProcessorProperties; import java.util.List; import lombok.RequiredArgsConstructor; import org.springframework.stereotype.Component; @@ -19,6 +15,7 @@ import org.springframework.stereotype.Component; public class PostgresTrigramSearchEngine implements SearchEngine { private final DocumentTrigramSearchRepository repository; + private final TedProcessorProperties properties; @Override public SearchEngineType type() { @@ -32,44 +29,9 @@ public class PostgresTrigramSearchEngine implements SearchEngine { @Override public List execute(SearchExecutionContext context) { - return repository.search(context).stream() - .map(this::mapRow) - .toList(); - } - - private SearchHit mapRow(TrigramSearchRow row) { - return SearchHit.builder() - .documentId(row.documentId()) - .representationId(row.representationId()) - .documentType(parseDocumentType(row.documentType())) - 
.documentFamily(parseDocumentFamily(row.documentFamily())) - .visibility(parseVisibility(row.visibility())) - .title(row.title()) - .summary(row.summary()) - .languageCode(row.languageCode()) - .mimeType(row.mimeType()) - .primaryEngine(SearchEngineType.POSTGRES_TRIGRAM) - .matchedField(parseMatchField(row.matchedField())) - .snippet(row.snippet()) - .rawScore(row.score() == null ? 0.0d : row.score()) - .createdAt(row.createdAt()) - .updatedAt(row.updatedAt()) - .build(); - } - - private SearchMatchField parseMatchField(String value) { - return value == null ? SearchMatchField.REPRESENTATION_TEXT : SearchMatchField.valueOf(value); - } - - private DocumentType parseDocumentType(String value) { - return value == null ? null : DocumentType.valueOf(value); - } - - private DocumentFamily parseDocumentFamily(String value) { - return value == null ? null : DocumentFamily.valueOf(value); - } - - private DocumentVisibility parseVisibility(String value) { - return value == null ? null : DocumentVisibility.valueOf(value); + return repository.search( + context, + properties.getSearch().getTrigramCandidateLimit(), + properties.getSearch().getTrigramSimilarityThreshold()); } } diff --git a/src/main/java/at/procon/dip/search/plan/DefaultSearchPlanner.java b/src/main/java/at/procon/dip/search/plan/DefaultSearchPlanner.java index 1cfefe1..99769ab 100644 --- a/src/main/java/at/procon/dip/search/plan/DefaultSearchPlanner.java +++ b/src/main/java/at/procon/dip/search/plan/DefaultSearchPlanner.java @@ -5,6 +5,7 @@ import at.procon.dip.search.api.SearchExecutionPlan; import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchMode; import java.util.ArrayList; +import java.util.LinkedHashSet; import java.util.List; import java.util.Set; import org.springframework.stereotype.Component; @@ -15,17 +16,23 @@ public class DefaultSearchPlanner implements SearchPlanner { @Override public SearchExecutionPlan plan(SearchExecutionContext context) { Set modes = 
context.getRequest().getModes(); - List engines = new ArrayList<>(); + if (modes == null || modes.isEmpty()) { + modes = Set.of(SearchMode.HYBRID); + } - if (modes == null || modes.isEmpty() || modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.FULLTEXT)) { + Set engines = new LinkedHashSet<>(); + if (modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.FULLTEXT)) { engines.add(SearchEngineType.POSTGRES_FULLTEXT); } - if (modes == null || modes.isEmpty() || modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.TRIGRAM)) { + if (modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.TRIGRAM)) { engines.add(SearchEngineType.POSTGRES_TRIGRAM); } + if (modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.SEMANTIC)) { + engines.add(SearchEngineType.PGVECTOR_SEMANTIC); + } return SearchExecutionPlan.builder() - .engines(engines) + .engines(new ArrayList<>(engines)) .collapseByDocument(context.getRequest().isCollapseByDocument()) .sortMode(context.getRequest().getSortMode()) .build(); diff --git a/src/main/java/at/procon/dip/search/rank/DefaultSearchResultFusionService.java b/src/main/java/at/procon/dip/search/rank/DefaultSearchResultFusionService.java index 3b80557..38c0198 100644 --- a/src/main/java/at/procon/dip/search/rank/DefaultSearchResultFusionService.java +++ b/src/main/java/at/procon/dip/search/rank/DefaultSearchResultFusionService.java @@ -5,103 +5,120 @@ import at.procon.dip.search.api.SearchExecutionPlan; import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchHit; import at.procon.dip.search.dto.SearchResponse; +import at.procon.dip.search.dto.SearchSortMode; +import at.procon.ted.config.TedProcessorProperties; import java.util.ArrayList; import java.util.Comparator; +import java.util.EnumMap; import java.util.LinkedHashMap; -import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.UUID; +import 
lombok.RequiredArgsConstructor; import org.springframework.stereotype.Component; @Component +@RequiredArgsConstructor public class DefaultSearchResultFusionService implements SearchResultFusionService { - private static final double FULLTEXT_WEIGHT = 0.60d; - private static final double TRIGRAM_WEIGHT = 0.40d; - - private final SearchScoreNormalizer scoreNormalizer; - - public DefaultSearchResultFusionService(SearchScoreNormalizer scoreNormalizer) { - this.scoreNormalizer = scoreNormalizer; - } + private final SearchScoreNormalizer normalizer; + private final TedProcessorProperties properties; @Override public SearchResponse fuse(SearchExecutionContext context, SearchExecutionPlan plan, Map> engineResults) { - Map> normalizedResults = new LinkedHashMap<>(); - for (Map.Entry> entry : engineResults.entrySet()) { - normalizedResults.put(entry.getKey(), scoreNormalizer.normalize(entry.getKey(), entry.getValue())); - } - - List ranked = plan.isCollapseByDocument() - ? collapseByDocument(normalizedResults) - : flatten(normalizedResults); + Map> normalized = new EnumMap<>(SearchEngineType.class); + engineResults.forEach((engine, hits) -> normalized.put(engine, normalizer.normalize(engine, hits))); - ranked.sort(Comparator - .comparingDouble(SearchHit::getFinalScore).reversed() - .thenComparing(SearchHit::getUpdatedAt, Comparator.nullsLast(Comparator.reverseOrder()))); + List fused = plan.isCollapseByDocument() + ? collapse(normalized) + : mergeWithoutCollapse(normalized); - int totalHits = ranked.size(); - int fromIndex = Math.min(context.getPage() * context.getSize(), ranked.size()); - int toIndex = Math.min(fromIndex + context.getSize(), ranked.size()); - List pageHits = ranked.subList(fromIndex, toIndex); + sort(fused, plan.getSortMode()); + long total = fused.size(); + int fromIndex = Math.min(context.getPage() * context.getSize(), fused.size()); + int toIndex = Math.min(fromIndex + context.getSize(), fused.size()); + List paged = fromIndex >= toIndex ? 
List.of() : fused.subList(fromIndex, toIndex); return SearchResponse.builder() - .hits(new ArrayList<>(pageHits)) + .hits(paged) .page(context.getPage()) .size(context.getSize()) - .totalHits(totalHits) - .truncated(toIndex < totalHits) - .enginesUsed(new LinkedHashSet<>(normalizedResults.keySet())) + .totalHits(total) + .truncated(total > toIndex) + .enginesUsed(engineResults.keySet()) .build(); } - private List flatten(Map> normalizedResults) { - List merged = new ArrayList<>(); - for (Map.Entry> entry : normalizedResults.entrySet()) { - for (SearchHit hit : entry.getValue()) { - merged.add(hit.toBuilder().finalScore(weight(entry.getKey()) * hit.getNormalizedScore()).build()); + private List collapse(Map> normalized) { + Map aggregates = new LinkedHashMap<>(); + normalized.forEach((engine, hits) -> { + for (SearchHit hit : hits) { + Aggregate aggregate = aggregates.computeIfAbsent(hit.getDocumentId(), id -> new Aggregate()); + aggregate.bestByEngine.put(engine, hit); + if (aggregate.representative == null || hit.getNormalizedScore() > aggregate.representative.getNormalizedScore()) { + aggregate.representative = hit; + } } + }); + + List fused = new ArrayList<>(); + for (Aggregate aggregate : aggregates.values()) { + SearchHit representative = aggregate.representative; + double finalScore = weight(SearchEngineType.POSTGRES_FULLTEXT, aggregate) + + weight(SearchEngineType.POSTGRES_TRIGRAM, aggregate) + + weight(SearchEngineType.PGVECTOR_SEMANTIC, aggregate); + fused.add(representative.toBuilder().finalScore(finalScore).build()); } - return merged; + return fused; } - private List collapseByDocument(Map> normalizedResults) { - Map collapsed = new LinkedHashMap<>(); - Map accumulatedScores = new LinkedHashMap<>(); - Set docOrder = new LinkedHashSet<>(); - - for (Map.Entry> entry : normalizedResults.entrySet()) { - double weight = weight(entry.getKey()); - for (SearchHit hit : entry.getValue()) { - docOrder.add(hit.getDocumentId()); - double contribution = weight * 
hit.getNormalizedScore(); - accumulatedScores.merge(hit.getDocumentId(), contribution, Double::sum); - - SearchHit existing = collapsed.get(hit.getDocumentId()); - if (existing == null || hit.getNormalizedScore() > existing.getNormalizedScore()) { - collapsed.put(hit.getDocumentId(), hit); - } - } + private double weight(SearchEngineType engineType, Aggregate aggregate) { + SearchHit hit = aggregate.bestByEngine.get(engineType); + if (hit == null) { + return 0.0d; } + TedProcessorProperties.SearchProperties search = properties.getSearch(); + return switch (engineType) { + case POSTGRES_FULLTEXT -> hit.getNormalizedScore() * search.getFulltextWeight(); + case POSTGRES_TRIGRAM -> hit.getNormalizedScore() * search.getTrigramWeight(); + case PGVECTOR_SEMANTIC -> hit.getNormalizedScore() * search.getSemanticWeight(); + }; + } - List results = new ArrayList<>(docOrder.size()); - for (UUID documentId : docOrder) { - SearchHit base = collapsed.get(documentId); - if (base != null) { - results.add(base.toBuilder().finalScore(accumulatedScores.getOrDefault(documentId, 0.0d)).build()); + private List mergeWithoutCollapse(Map> normalized) { + List merged = new ArrayList<>(); + normalized.forEach((engine, hits) -> { + for (SearchHit hit : hits) { + double finalScore = switch (engine) { + case POSTGRES_FULLTEXT -> hit.getNormalizedScore() * properties.getSearch().getFulltextWeight(); + case POSTGRES_TRIGRAM -> hit.getNormalizedScore() * properties.getSearch().getTrigramWeight(); + case PGVECTOR_SEMANTIC -> hit.getNormalizedScore() * properties.getSearch().getSemanticWeight(); + }; + merged.add(hit.toBuilder().finalScore(finalScore).build()); } - } - return results; + }); + return merged; } - private double weight(SearchEngineType engineType) { - return switch (engineType) { - case POSTGRES_FULLTEXT -> FULLTEXT_WEIGHT; - case POSTGRES_TRIGRAM -> TRIGRAM_WEIGHT; + private void sort(List hits, SearchSortMode sortMode) { + Comparator comparator = switch (sortMode) { + case 
CREATED_AT_DESC -> Comparator.comparing(SearchHit::getCreatedAt, + Comparator.nullsLast(Comparator.reverseOrder())); + case TITLE_ASC -> Comparator.comparing(hit -> hit.getTitle() == null ? "" : hit.getTitle(), + String.CASE_INSENSITIVE_ORDER); + case SCORE_DESC -> Comparator.comparingDouble(SearchHit::getFinalScore).reversed(); }; + if (sortMode != SearchSortMode.SCORE_DESC) { + comparator = comparator.thenComparing(Comparator.comparingDouble(SearchHit::getFinalScore).reversed()); + } + hits.sort(comparator); + } + + private static final class Aggregate { + private final Map bestByEngine = new EnumMap<>(SearchEngineType.class); + private SearchHit representative; } } diff --git a/src/main/java/at/procon/dip/search/rank/DefaultSearchScoreNormalizer.java b/src/main/java/at/procon/dip/search/rank/DefaultSearchScoreNormalizer.java index 35f05c1..15872d9 100644 --- a/src/main/java/at/procon/dip/search/rank/DefaultSearchScoreNormalizer.java +++ b/src/main/java/at/procon/dip/search/rank/DefaultSearchScoreNormalizer.java @@ -2,7 +2,6 @@ package at.procon.dip.search.rank; import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchHit; -import java.util.ArrayList; import java.util.List; import org.springframework.stereotype.Component; @@ -14,15 +13,22 @@ public class DefaultSearchScoreNormalizer implements SearchScoreNormalizer { if (hits == null || hits.isEmpty()) { return List.of(); } - double max = hits.stream().mapToDouble(SearchHit::getRawScore).max().orElse(0.0d); - if (max <= 0.0d) { - max = 1.0d; + double max = hits.stream().mapToDouble(SearchHit::getRawScore).max().orElse(1.0d); + double divisor = max > 0.0d ? 
max : 1.0d; + return hits.stream() + .map(hit -> hit.toBuilder() + .normalizedScore(clamp(hit.getRawScore() / divisor)) + .build()) + .toList(); + } + + private double clamp(double value) { + if (value < 0.0d) { + return 0.0d; } - List normalized = new ArrayList<>(hits.size()); - for (SearchHit hit : hits) { - double score = Math.max(0.0d, Math.min(1.0d, hit.getRawScore() / max)); - normalized.add(hit.toBuilder().normalizedScore(score).build()); + if (value > 1.0d) { + return 1.0d; } - return normalized; + return value; } } diff --git a/src/main/java/at/procon/dip/search/rank/SearchResultFusionService.java b/src/main/java/at/procon/dip/search/rank/SearchResultFusionService.java index d41c17d..508b0ce 100644 --- a/src/main/java/at/procon/dip/search/rank/SearchResultFusionService.java +++ b/src/main/java/at/procon/dip/search/rank/SearchResultFusionService.java @@ -9,9 +9,7 @@ import java.util.List; import java.util.Map; public interface SearchResultFusionService { - SearchResponse fuse( - SearchExecutionContext context, - SearchExecutionPlan plan, - Map> engineResults - ); + SearchResponse fuse(SearchExecutionContext context, + SearchExecutionPlan plan, + Map> engineResults); } diff --git a/src/main/java/at/procon/dip/search/repository/DocumentFullTextSearchRepository.java b/src/main/java/at/procon/dip/search/repository/DocumentFullTextSearchRepository.java index db355ca..667efcf 100644 --- a/src/main/java/at/procon/dip/search/repository/DocumentFullTextSearchRepository.java +++ b/src/main/java/at/procon/dip/search/repository/DocumentFullTextSearchRepository.java @@ -1,8 +1,10 @@ package at.procon.dip.search.repository; import at.procon.dip.search.api.SearchExecutionContext; +import at.procon.dip.search.dto.SearchHit; import java.util.List; public interface DocumentFullTextSearchRepository { - List search(SearchExecutionContext context); + + List search(SearchExecutionContext context, int limit); } diff --git 
a/src/main/java/at/procon/dip/search/repository/DocumentFullTextSearchRepositoryImpl.java b/src/main/java/at/procon/dip/search/repository/DocumentFullTextSearchRepositoryImpl.java index aa50443..ce18494 100644 --- a/src/main/java/at/procon/dip/search/repository/DocumentFullTextSearchRepositoryImpl.java +++ b/src/main/java/at/procon/dip/search/repository/DocumentFullTextSearchRepositoryImpl.java @@ -1,72 +1,53 @@ package at.procon.dip.search.repository; import at.procon.dip.search.api.SearchExecutionContext; -import jakarta.persistence.EntityManager; -import jakarta.persistence.PersistenceContext; -import jakarta.persistence.Query; -import java.util.ArrayList; +import at.procon.dip.search.dto.SearchEngineType; +import at.procon.dip.search.dto.SearchHit; +import at.procon.dip.search.dto.SearchMatchField; import java.util.List; -import java.util.Map; +import lombok.RequiredArgsConstructor; +import org.springframework.jdbc.core.namedparam.MapSqlParameterSource; +import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate; import org.springframework.stereotype.Repository; @Repository -public class DocumentFullTextSearchRepositoryImpl extends AbstractNativeSearchRepository implements DocumentFullTextSearchRepository { +@RequiredArgsConstructor +public class DocumentFullTextSearchRepositoryImpl implements DocumentFullTextSearchRepository { - @PersistenceContext - private EntityManager entityManager; + private final NamedParameterJdbcTemplate jdbcTemplate; @Override - public List search(SearchExecutionContext context) { + public List search(SearchExecutionContext context, int limit) { StringBuilder sql = new StringBuilder(""" SELECT d.id AS document_id, dtr.id AS representation_id, + CAST(d.document_type AS text) AS document_type, + CAST(d.document_family AS text) AS document_family, + CAST(d.visibility AS text) AS visibility, d.title AS title, d.summary AS summary, COALESCE(dtr.language_code, d.language_code) AS language_code, d.mime_type AS mime_type, - 
d.document_type AS document_type, - d.document_family AS document_family, - d.visibility AS visibility, d.created_at AS created_at, d.updated_at AS updated_at, - ts_headline('simple', COALESCE(dtr.text_body, ''), websearch_to_tsquery('simple', :queryText)) AS snippet, + ts_headline('simple', COALESCE(dtr.text_body, ''), websearch_to_tsquery('simple', :queryText), + 'MaxFragments=2, MinWords=5, MaxWords=20') AS snippet, ts_rank_cd(dtr.search_vector, websearch_to_tsquery('simple', :queryText)) AS score - FROM DOC.doc_text_representation dtr - JOIN DOC.doc_document d ON d.id = dtr.document_id - LEFT JOIN DOC.doc_tenant dt ON dt.id = d.owner_tenant_id - WHERE dtr.search_vector @@ websearch_to_tsquery('simple', :queryText) + FROM doc.doc_text_representation dtr + JOIN doc.doc_document d ON d.id = dtr.document_id + LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id + WHERE dtr.search_vector IS NOT NULL + AND dtr.search_vector @@ websearch_to_tsquery('simple', :queryText) """); - Map params = newParams(); - params.put("queryText", context.getRequest().getQueryText().trim()); - appendGenericFilters(sql, params, context); + MapSqlParameterSource params = new MapSqlParameterSource(); + params.addValue("queryText", context.getRequest().getQueryText()); + SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true); sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit"); - params.put("limit", engineLimit(context)); + params.addValue("limit", limit); - Query query = entityManager.createNativeQuery(sql.toString()); - bindParameters(query, params); - - List rows = query.getResultList(); - List results = new ArrayList<>(rows.size()); - for (Object row : rows) { - Object[] cols = (Object[]) row; - results.add(new FullTextSearchRow( - asUuid(cols[0]), - asUuid(cols[1]), - asString(cols[2]), - asString(cols[3]), - asString(cols[4]), - asString(cols[5]), - asString(cols[6]), - asString(cols[7]), - asString(cols[8]), - asOffsetDateTime(cols[9]), 
- asOffsetDateTime(cols[10]), - asString(cols[11]), - asDouble(cols[12]) - )); - } - return results; + return jdbcTemplate.query(sql.toString(), params, + new SearchHitRowMapper(SearchEngineType.POSTGRES_FULLTEXT, SearchMatchField.REPRESENTATION_TEXT)); } } diff --git a/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java b/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java new file mode 100644 index 0000000..63c4a1c --- /dev/null +++ b/src/main/java/at/procon/dip/search/repository/DocumentSemanticSearchRepository.java @@ -0,0 +1,61 @@ +package at.procon.dip.search.repository; + +import at.procon.dip.search.api.SearchExecutionContext; +import at.procon.dip.search.dto.SearchEngineType; +import at.procon.dip.search.dto.SearchHit; +import at.procon.dip.search.dto.SearchMatchField; +import java.util.List; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import org.springframework.jdbc.core.namedparam.MapSqlParameterSource; +import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate; +import org.springframework.stereotype.Repository; + +@Repository +@RequiredArgsConstructor +public class DocumentSemanticSearchRepository { + + private final NamedParameterJdbcTemplate jdbcTemplate; + + public List search(SearchExecutionContext context, + UUID modelId, + String queryVector, + int limit, + double threshold) { + StringBuilder sql = new StringBuilder(""" + SELECT + d.id AS document_id, + dtr.id AS representation_id, + CAST(d.document_type AS text) AS document_type, + CAST(d.document_family AS text) AS document_family, + CAST(d.visibility AS text) AS visibility, + d.title AS title, + d.summary AS summary, + COALESCE(dtr.language_code, d.language_code) AS language_code, + d.mime_type AS mime_type, + d.created_at AS created_at, + d.updated_at AS updated_at, + LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet, + (1 - (de.embedding_vector <=> 
CAST(:queryVector AS vector))) AS score + FROM doc.doc_embedding de + JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id + JOIN doc.doc_document d ON d.id = de.document_id + LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id + WHERE de.embedding_status = 'COMPLETED' + AND de.embedding_vector IS NOT NULL + AND de.model_id = :modelId + AND (1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) >= :threshold + """); + + MapSqlParameterSource params = new MapSqlParameterSource(); + params.addValue("queryVector", queryVector); + params.addValue("modelId", modelId); + params.addValue("threshold", threshold); + SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true); + sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit"); + params.addValue("limit", limit); + + return jdbcTemplate.query(sql.toString(), params, + new SearchHitRowMapper(SearchEngineType.PGVECTOR_SEMANTIC, SearchMatchField.REPRESENTATION_TEXT)); + } +} diff --git a/src/main/java/at/procon/dip/search/repository/DocumentTrigramSearchRepository.java b/src/main/java/at/procon/dip/search/repository/DocumentTrigramSearchRepository.java index 59082cf..03f4c97 100644 --- a/src/main/java/at/procon/dip/search/repository/DocumentTrigramSearchRepository.java +++ b/src/main/java/at/procon/dip/search/repository/DocumentTrigramSearchRepository.java @@ -1,8 +1,10 @@ package at.procon.dip.search.repository; import at.procon.dip.search.api.SearchExecutionContext; +import at.procon.dip.search.dto.SearchHit; import java.util.List; public interface DocumentTrigramSearchRepository { - List search(SearchExecutionContext context); + + List search(SearchExecutionContext context, int limit, double threshold); } diff --git a/src/main/java/at/procon/dip/search/repository/DocumentTrigramSearchRepositoryImpl.java b/src/main/java/at/procon/dip/search/repository/DocumentTrigramSearchRepositoryImpl.java index 4f7c47e..a2a6975 100644 --- 
a/src/main/java/at/procon/dip/search/repository/DocumentTrigramSearchRepositoryImpl.java +++ b/src/main/java/at/procon/dip/search/repository/DocumentTrigramSearchRepositoryImpl.java @@ -1,102 +1,60 @@ package at.procon.dip.search.repository; import at.procon.dip.search.api.SearchExecutionContext; -import jakarta.persistence.EntityManager; -import jakarta.persistence.PersistenceContext; -import jakarta.persistence.Query; -import java.util.ArrayList; +import at.procon.dip.search.dto.SearchEngineType; +import at.procon.dip.search.dto.SearchHit; +import at.procon.dip.search.dto.SearchMatchField; import java.util.List; -import java.util.Map; +import lombok.RequiredArgsConstructor; +import org.springframework.jdbc.core.namedparam.MapSqlParameterSource; +import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate; import org.springframework.stereotype.Repository; @Repository -public class DocumentTrigramSearchRepositoryImpl extends AbstractNativeSearchRepository implements DocumentTrigramSearchRepository { +@RequiredArgsConstructor +public class DocumentTrigramSearchRepositoryImpl implements DocumentTrigramSearchRepository { - @PersistenceContext - private EntityManager entityManager; + private final NamedParameterJdbcTemplate jdbcTemplate; @Override - public List search(SearchExecutionContext context) { - StringBuilder sql = new StringBuilder(""" - SELECT - d.id AS document_id, - dtr.id AS representation_id, - d.title AS title, - d.summary AS summary, - COALESCE(dtr.language_code, d.language_code) AS language_code, - d.mime_type AS mime_type, - d.document_type AS document_type, - d.document_family AS document_family, - d.visibility AS visibility, - d.created_at AS created_at, - d.updated_at AS updated_at, - CASE - WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText) - AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText) - THEN COALESCE(d.title, '') - WHEN 
similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText) - THEN COALESCE(d.summary, '') - ELSE LEFT(COALESCE(dtr.text_body, ''), 400) - END AS snippet, - GREATEST( - similarity(COALESCE(d.title, ''), :queryText), - similarity(COALESCE(d.summary, ''), :queryText), - similarity(COALESCE(dtr.text_body, ''), :queryText) - ) AS score, - CASE - WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText) - AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText) - THEN 'DOCUMENT_TITLE' - WHEN similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText) - THEN 'DOCUMENT_SUMMARY' - ELSE 'REPRESENTATION_TEXT' - END AS matched_field - FROM DOC.doc_text_representation dtr - JOIN DOC.doc_document d ON d.id = dtr.document_id - LEFT JOIN DOC.doc_tenant dt ON dt.id = d.owner_tenant_id - WHERE ( - COALESCE(d.title, '') % :queryText - OR COALESCE(d.summary, '') % :queryText - OR COALESCE(dtr.text_body, '') % :queryText - ) - """); + public List search(SearchExecutionContext context, int limit, double threshold) { + String scoreExpr = "GREATEST(" + + "similarity(COALESCE(d.title, ''), :queryText), " + + "similarity(COALESCE(d.summary, ''), :queryText), " + + "similarity(COALESCE(dtr.text_body, ''), :queryText))"; - Map params = newParams(); - params.put("queryText", context.getRequest().getQueryText().trim()); - appendGenericFilters(sql, params, context); - sql.append(" AND GREATEST(") - .append(" similarity(COALESCE(d.title, ''), :queryText),") - .append(" similarity(COALESCE(d.summary, ''), :queryText),") - .append(" similarity(COALESCE(dtr.text_body, ''), :queryText)") - .append(") >= :minSimilarity"); - sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit"); - params.put("minSimilarity", 0.10d); - params.put("limit", engineLimit(context)); + StringBuilder sql = new StringBuilder("SELECT " + + 
"d.id AS document_id, " + + "dtr.id AS representation_id, " + + "CAST(d.document_type AS text) AS document_type, " + + "CAST(d.document_family AS text) AS document_family, " + + "CAST(d.visibility AS text) AS visibility, " + + "d.title AS title, " + + "d.summary AS summary, " + + "COALESCE(dtr.language_code, d.language_code) AS language_code, " + + "d.mime_type AS mime_type, " + + "d.created_at AS created_at, " + + "d.updated_at AS updated_at, " + + "LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet, " + + scoreExpr + " AS score, " + + "CASE " + + "WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText) " + + " AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_TITLE' " + + "WHEN similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_SUMMARY' " + + "ELSE 'REPRESENTATION_TEXT' END AS matched_field " + + "FROM doc.doc_text_representation dtr " + + "JOIN doc.doc_document d ON d.id = dtr.document_id " + + "LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id " + + "WHERE " + scoreExpr + " >= :threshold"); - Query query = entityManager.createNativeQuery(sql.toString()); - bindParameters(query, params); + MapSqlParameterSource params = new MapSqlParameterSource(); + params.addValue("queryText", context.getRequest().getQueryText()); + params.addValue("threshold", threshold); + SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true); + sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit"); + params.addValue("limit", limit); - List rows = query.getResultList(); - List results = new ArrayList<>(rows.size()); - for (Object row : rows) { - Object[] cols = (Object[]) row; - results.add(new TrigramSearchRow( - asUuid(cols[0]), - asUuid(cols[1]), - asString(cols[2]), - asString(cols[3]), - asString(cols[4]), - asString(cols[5]), - 
asString(cols[6]), - asString(cols[7]), - asString(cols[8]), - asOffsetDateTime(cols[9]), - asOffsetDateTime(cols[10]), - asString(cols[11]), - asDouble(cols[12]), - asString(cols[13]) - )); - } - return results; + return jdbcTemplate.query(sql.toString(), params, + new SearchHitRowMapper(SearchEngineType.POSTGRES_TRIGRAM, SearchMatchField.REPRESENTATION_TEXT)); } } diff --git a/src/main/java/at/procon/dip/search/repository/SearchHitRowMapper.java b/src/main/java/at/procon/dip/search/repository/SearchHitRowMapper.java new file mode 100644 index 0000000..7121243 --- /dev/null +++ b/src/main/java/at/procon/dip/search/repository/SearchHitRowMapper.java @@ -0,0 +1,54 @@ +package at.procon.dip.search.repository; + +import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.search.dto.SearchEngineType; +import at.procon.dip.search.dto.SearchHit; +import at.procon.dip.search.dto.SearchMatchField; +import java.sql.ResultSet; +import java.sql.SQLException; +import org.springframework.jdbc.core.RowMapper; + +final class SearchHitRowMapper implements RowMapper { + + private final SearchEngineType engineType; + private final SearchMatchField defaultField; + + SearchHitRowMapper(SearchEngineType engineType, SearchMatchField defaultField) { + this.engineType = engineType; + this.defaultField = defaultField; + } + + @Override + public SearchHit mapRow(ResultSet rs, int rowNum) throws SQLException { + String matchedField = safeGetString(rs, "matched_field"); + return SearchHit.builder() + .documentId(rs.getObject("document_id", java.util.UUID.class)) + .representationId(rs.getObject("representation_id", java.util.UUID.class)) + .documentType(DocumentType.valueOf(rs.getString("document_type"))) + .documentFamily(DocumentFamily.valueOf(rs.getString("document_family"))) + .visibility(DocumentVisibility.valueOf(rs.getString("visibility"))) + 
.title(safeGetString(rs, "title")) + .summary(safeGetString(rs, "summary")) + .languageCode(safeGetString(rs, "language_code")) + .mimeType(safeGetString(rs, "mime_type")) + .primaryEngine(engineType) + .matchedField(matchedField == null || matchedField.isBlank() + ? defaultField + : SearchMatchField.valueOf(matchedField)) + .snippet(safeGetString(rs, "snippet")) + .rawScore(rs.getDouble("score")) + .createdAt(rs.getObject("created_at", java.time.OffsetDateTime.class)) + .updatedAt(rs.getObject("updated_at", java.time.OffsetDateTime.class)) + .build(); + } + + private String safeGetString(ResultSet rs, String column) { + try { + return rs.getString(column); + } catch (SQLException ignore) { + return null; + } + } +} diff --git a/src/main/java/at/procon/dip/search/repository/SearchSqlFilterSupport.java b/src/main/java/at/procon/dip/search/repository/SearchSqlFilterSupport.java new file mode 100644 index 0000000..9ed4811 --- /dev/null +++ b/src/main/java/at/procon/dip/search/repository/SearchSqlFilterSupport.java @@ -0,0 +1,84 @@ +package at.procon.dip.search.repository; + +import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.RepresentationType; +import at.procon.dip.search.api.SearchExecutionContext; +import java.util.Collection; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import org.springframework.jdbc.core.namedparam.MapSqlParameterSource; +import org.springframework.util.CollectionUtils; + +final class SearchSqlFilterSupport { + + private SearchSqlFilterSupport() { + } + + static void appendCommonFilters(StringBuilder sql, + MapSqlParameterSource params, + SearchExecutionContext context, + String documentAlias, + String representationAlias, + boolean tenantJoinPresent) { + Set documentTypes = firstNonEmpty(context.getRequest().getDocumentTypes(), 
context.getScope().documentTypes()); + if (!CollectionUtils.isEmpty(documentTypes)) { + sql.append(" AND CAST(").append(documentAlias).append(".document_type AS text) IN (:documentTypes)"); + params.addValue("documentTypes", enumNames(documentTypes)); + } + + Set documentFamilies = firstNonEmpty(context.getRequest().getDocumentFamilies(), context.getScope().documentFamilies()); + if (!CollectionUtils.isEmpty(documentFamilies)) { + sql.append(" AND CAST(").append(documentAlias).append(".document_family AS text) IN (:documentFamilies)"); + params.addValue("documentFamilies", enumNames(documentFamilies)); + } + + Set visibilities = firstNonEmpty(context.getRequest().getVisibilities(), context.getScope().visibilities()); + if (!CollectionUtils.isEmpty(visibilities)) { + sql.append(" AND CAST(").append(documentAlias).append(".visibility AS text) IN (:visibilities)"); + params.addValue("visibilities", enumNames(visibilities)); + } + + Set languageCodes = context.getRequest().getLanguageCodes(); + if (CollectionUtils.isEmpty(languageCodes) && context.getScope().languageCode() != null && !context.getScope().languageCode().isBlank()) { + languageCodes = Set.of(context.getScope().languageCode()); + } + if (!CollectionUtils.isEmpty(languageCodes)) { + sql.append(" AND COALESCE(").append(representationAlias).append(".language_code, ") + .append(documentAlias).append(".language_code, '') IN (:languageCodes)"); + params.addValue("languageCodes", languageCodes); + } + + Set representationTypes = context.getRequest().getRepresentationTypes(); + if (!CollectionUtils.isEmpty(representationTypes)) { + sql.append(" AND CAST(").append(representationAlias).append(".representation_type AS text) IN (:representationTypes)"); + params.addValue("representationTypes", enumNames(representationTypes)); + } else { + sql.append(" AND ").append(representationAlias).append(".is_primary = true"); + } + + if (context.getRequest().getCreatedFrom() != null) { + sql.append(" AND 
").append(documentAlias).append(".created_at >= :createdFrom"); + params.addValue("createdFrom", context.getRequest().getCreatedFrom()); + } + if (context.getRequest().getCreatedTo() != null) { + sql.append(" AND ").append(documentAlias).append(".created_at <= :createdTo"); + params.addValue("createdTo", context.getRequest().getCreatedTo()); + } + + if (tenantJoinPresent && !CollectionUtils.isEmpty(context.getScope().ownerTenantKeys())) { + sql.append(" AND dt.tenant_key IN (:ownerTenantKeys)"); + params.addValue("ownerTenantKeys", context.getScope().ownerTenantKeys()); + } + } + + private static Set firstNonEmpty(Set primary, Set fallback) { + return !CollectionUtils.isEmpty(primary) ? primary : fallback; + } + + private static List enumNames(Collection> values) { + return values.stream().map(Enum::name).collect(Collectors.toList()); + } +} diff --git a/src/main/java/at/procon/dip/search/service/DefaultSearchOrchestrator.java b/src/main/java/at/procon/dip/search/service/DefaultSearchOrchestrator.java index 88dd269..b1b33b3 100644 --- a/src/main/java/at/procon/dip/search/service/DefaultSearchOrchestrator.java +++ b/src/main/java/at/procon/dip/search/service/DefaultSearchOrchestrator.java @@ -2,6 +2,8 @@ package at.procon.dip.search.service; import at.procon.dip.search.api.SearchExecutionContext; import at.procon.dip.search.api.SearchExecutionPlan; +import at.procon.dip.search.dto.SearchDebugResponse; +import at.procon.dip.search.dto.SearchEngineDebugResult; import at.procon.dip.search.dto.SearchEngineType; import at.procon.dip.search.dto.SearchHit; import at.procon.dip.search.dto.SearchRequest; @@ -10,6 +12,8 @@ import at.procon.dip.search.engine.SearchEngine; import at.procon.dip.search.plan.SearchPlanner; import at.procon.dip.search.rank.SearchResultFusionService; import at.procon.dip.search.spi.SearchDocumentScope; +import at.procon.ted.config.TedProcessorProperties; +import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import 
java.util.Map; @@ -20,28 +24,66 @@ import org.springframework.stereotype.Service; @RequiredArgsConstructor public class DefaultSearchOrchestrator implements SearchOrchestrator { + private final TedProcessorProperties properties; private final SearchPlanner planner; private final List engines; private final SearchResultFusionService fusionService; @Override public SearchResponse search(SearchRequest request, SearchDocumentScope scope) { + SearchExecution execution = executeInternal(request, scope); + return fusionService.fuse(execution.context(), execution.plan(), execution.engineResults()); + } + + @Override + public SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope) { + SearchExecution execution = executeInternal(request, scope); + SearchResponse fused = fusionService.fuse(execution.context(), execution.plan(), execution.engineResults()); + + List debugResults = new ArrayList<>(); + int topLimit = properties.getSearch().getDebugTopHitsPerEngine(); + execution.engineResults().forEach((engine, hits) -> debugResults.add(SearchEngineDebugResult.builder() + .engineType(engine) + .hitCount(hits.size()) + .topHits(hits.stream().limit(topLimit).toList()) + .build())); + + return SearchDebugResponse.builder() + .request(request) + .plan(execution.plan()) + .engineResults(debugResults) + .fusedResponse(fused) + .build(); + } + + private SearchExecution executeInternal(SearchRequest request, SearchDocumentScope scope) { + int page = request.getPage() == null || request.getPage() < 0 ? 0 : request.getPage(); + int requestedSize = request.getSize() == null || request.getSize() <= 0 + ? properties.getSearch().getDefaultPageSize() + : request.getSize(); + int size = Math.min(requestedSize, properties.getSearch().getMaxPageSize()); + SearchExecutionContext context = SearchExecutionContext.builder() .request(request) .scope(scope) - .page(request.getPage() == null ? 0 : request.getPage()) - .size(request.getSize() == null ? 
20 : request.getSize()) + .page(page) + .size(size) .build(); SearchExecutionPlan plan = planner.plan(context); - Map> engineResults = new LinkedHashMap<>(); for (SearchEngine engine : engines) { if (plan.getEngines().contains(engine.type()) && engine.supports(context)) { engineResults.put(engine.type(), engine.execute(context)); } } + return new SearchExecution(context, plan, engineResults); + } - return fusionService.fuse(context, plan, engineResults); + private record SearchExecution( + SearchExecutionContext context, + SearchExecutionPlan plan, + Map> engineResults + ) { } } diff --git a/src/main/java/at/procon/dip/search/service/DocumentLexicalIndexService.java b/src/main/java/at/procon/dip/search/service/DocumentLexicalIndexService.java index 3cc8ef3..976af52 100644 --- a/src/main/java/at/procon/dip/search/service/DocumentLexicalIndexService.java +++ b/src/main/java/at/procon/dip/search/service/DocumentLexicalIndexService.java @@ -1,27 +1,45 @@ package at.procon.dip.search.service; -import jakarta.persistence.EntityManager; -import jakarta.persistence.PersistenceContext; -import jakarta.transaction.Transactional; +import java.util.List; import java.util.UUID; +import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.springframework.jdbc.core.JdbcTemplate; +import org.springframework.jdbc.core.namedparam.MapSqlParameterSource; +import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate; import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; @Service -@Transactional +@RequiredArgsConstructor @Slf4j public class DocumentLexicalIndexService { - @PersistenceContext - private EntityManager entityManager; + private final NamedParameterJdbcTemplate namedParameterJdbcTemplate; + private final JdbcTemplate jdbcTemplate; + /** + * New Slice 2 name kept for current code. 
+ */ + @Transactional + public void indexRepresentation(UUID representationId) { + refreshRepresentationLexicalIndex(representationId); + } + + /** + * Backward-compatible Slice 1 method name. + */ + @Transactional public void refreshRepresentationLexicalIndex(UUID representationId) { if (!isLexicalSearchSchemaAvailable()) { - log.debug("Skipping lexical index refresh for representation {} because search columns are not available yet", representationId); + log.debug("Skipping lexical indexing for representation {} because search_vector columns are not present yet", representationId); return; } - entityManager.createNativeQuery(""" - UPDATE DOC.doc_text_representation + + MapSqlParameterSource params = new MapSqlParameterSource(); + params.addValue("representationId", representationId); + namedParameterJdbcTemplate.update(""" + UPDATE doc.doc_text_representation SET search_config = CASE WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german' WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english' @@ -36,18 +54,39 @@ public class DocumentLexicalIndexService { coalesce(text_body, '') ) WHERE id = :representationId - """) - .setParameter("representationId", representationId) - .executeUpdate(); + """, params); + } + + /** + * New Slice 2 method kept for current startup runner. + */ + @Transactional + public int backfillMissingVectors(int limit) { + if (!isLexicalSearchSchemaAvailable()) { + return 0; + } + List ids = jdbcTemplate.query(""" + SELECT id + FROM doc.doc_text_representation + WHERE search_vector IS NULL + ORDER BY created_at ASC + LIMIT ? + """, (rs, rowNum) -> rs.getObject(1, UUID.class), limit); + ids.forEach(this::refreshRepresentationLexicalIndex); + return ids.size(); } + /** + * Backward-compatible Slice 1 method name. + */ + @Transactional public void refreshAllMissingLexicalIndexes() { if (!isLexicalSearchSchemaAvailable()) { log.info("Lexical search columns are not available yet. 
Skipping startup backfill for DOC lexical indexes."); return; } - entityManager.createNativeQuery(""" - UPDATE DOC.doc_text_representation + jdbcTemplate.update(""" + UPDATE doc.doc_text_representation SET search_config = CASE WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german' WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english' @@ -62,19 +101,27 @@ public class DocumentLexicalIndexService { coalesce(text_body, '') ) WHERE search_vector IS NULL - """) - .executeUpdate(); + """); + } + + /** + * New Slice 2 name kept for current code. + */ + public boolean searchVectorColumnsPresent() { + return isLexicalSearchSchemaAvailable(); } - private boolean isLexicalSearchSchemaAvailable() { - Number count = (Number) entityManager.createNativeQuery(""" + /** + * Backward-compatible Slice 1 method name. + */ + public boolean isLexicalSearchSchemaAvailable() { + Integer count = jdbcTemplate.queryForObject(""" SELECT COUNT(*) FROM information_schema.columns WHERE table_schema = 'doc' AND table_name = 'doc_text_representation' - AND column_name IN ('search_config', 'search_vector') - """) - .getSingleResult(); - return count != null && count.intValue() >= 2; + AND column_name IN ('search_vector', 'search_config') + """, Integer.class); + return count != null && count >= 2; } } diff --git a/src/main/java/at/procon/dip/search/service/SearchLexicalIndexStartupRunner.java b/src/main/java/at/procon/dip/search/service/SearchLexicalIndexStartupRunner.java new file mode 100644 index 0000000..c834c24 --- /dev/null +++ b/src/main/java/at/procon/dip/search/service/SearchLexicalIndexStartupRunner.java @@ -0,0 +1,25 @@ +package at.procon.dip.search.service; + +import at.procon.ted.config.TedProcessorProperties; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.boot.ApplicationArguments; +import org.springframework.boot.ApplicationRunner; +import org.springframework.stereotype.Component; + +@Component 
+@RequiredArgsConstructor +@Slf4j +public class SearchLexicalIndexStartupRunner implements ApplicationRunner { + + private final TedProcessorProperties properties; + private final DocumentLexicalIndexService lexicalIndexService; + + @Override + public void run(ApplicationArguments args) { + int updated = lexicalIndexService.backfillMissingVectors(properties.getSearch().getStartupLexicalBackfillLimit()); + if (updated > 0) { + log.info("Search lexical index startup backfill updated {} representations", updated); + } + } +} diff --git a/src/main/java/at/procon/dip/search/service/SearchOrchestrator.java b/src/main/java/at/procon/dip/search/service/SearchOrchestrator.java index 0131bbd..b5f8c36 100644 --- a/src/main/java/at/procon/dip/search/service/SearchOrchestrator.java +++ b/src/main/java/at/procon/dip/search/service/SearchOrchestrator.java @@ -1,9 +1,11 @@ package at.procon.dip.search.service; +import at.procon.dip.search.dto.SearchDebugResponse; import at.procon.dip.search.dto.SearchRequest; import at.procon.dip.search.dto.SearchResponse; import at.procon.dip.search.spi.SearchDocumentScope; public interface SearchOrchestrator { SearchResponse search(SearchRequest request, SearchDocumentScope scope); + SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope); } diff --git a/src/main/java/at/procon/dip/search/service/SemanticQueryEmbeddingService.java b/src/main/java/at/procon/dip/search/service/SemanticQueryEmbeddingService.java new file mode 100644 index 0000000..2117771 --- /dev/null +++ b/src/main/java/at/procon/dip/search/service/SemanticQueryEmbeddingService.java @@ -0,0 +1,39 @@ +package at.procon.dip.search.service; + +import at.procon.dip.domain.document.entity.DocumentEmbeddingModel; +import at.procon.dip.domain.document.service.DocumentEmbeddingService; +import at.procon.ted.config.TedProcessorProperties; +import at.procon.ted.service.VectorizationService; +import java.util.Optional; +import java.util.UUID; +import 
lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; + +@Service +@RequiredArgsConstructor +@Slf4j +public class SemanticQueryEmbeddingService { + + private final TedProcessorProperties properties; + private final DocumentEmbeddingService documentEmbeddingService; + private final VectorizationService vectorizationService; + + public Optional buildQueryEmbedding(String queryText) { + if (!properties.getVectorization().isEnabled()) { + return Optional.empty(); + } + try { + DocumentEmbeddingModel model = documentEmbeddingService.findActiveModelByKey( + properties.getVectorization().getModelName()); + float[] vector = vectorizationService.generateQueryEmbedding(queryText); + return Optional.of(new QueryEmbedding(model.getId(), vectorizationService.floatArrayToVectorString(vector))); + } catch (Exception e) { + log.warn("Failed to generate semantic query embedding: {}", e.getMessage()); + return Optional.empty(); + } + } + + public record QueryEmbedding(UUID modelId, String vectorString) { + } +} diff --git a/src/main/java/at/procon/dip/search/web/GenericSearchController.java b/src/main/java/at/procon/dip/search/web/GenericSearchController.java index f819de0..91bcb30 100644 --- a/src/main/java/at/procon/dip/search/web/GenericSearchController.java +++ b/src/main/java/at/procon/dip/search/web/GenericSearchController.java @@ -1,5 +1,6 @@ package at.procon.dip.search.web; +import at.procon.dip.search.dto.SearchDebugResponse; import at.procon.dip.search.dto.SearchRequest; import at.procon.dip.search.dto.SearchResponse; import at.procon.dip.search.service.SearchOrchestrator; @@ -21,15 +22,24 @@ public class GenericSearchController { @PostMapping public SearchResponse search(@Valid @RequestBody SearchRequest request) { - SearchDocumentScope scope = new SearchDocumentScope( + return searchOrchestrator.search(request, buildScope(request)); + } + + @PostMapping("/debug") + public SearchDebugResponse debug(@Valid 
@RequestBody SearchRequest request) { + return searchOrchestrator.debug(request, buildScope(request)); + } + + private SearchDocumentScope buildScope(SearchRequest request) { + String scopeLanguage = (request.getLanguageCodes() == null || request.getLanguageCodes().isEmpty()) + ? null + : request.getLanguageCodes().iterator().next(); + return new SearchDocumentScope( Set.of(), request.getDocumentTypes(), request.getDocumentFamilies(), request.getVisibilities(), - request.getLanguageCodes() == null || request.getLanguageCodes().isEmpty() - ? null - : request.getLanguageCodes().iterator().next() + scopeLanguage ); - return searchOrchestrator.search(request, scope); } } diff --git a/src/main/java/at/procon/ted/config/TedProcessorProperties.java b/src/main/java/at/procon/ted/config/TedProcessorProperties.java index 59bf0e7..4397bc0 100644 --- a/src/main/java/at/procon/ted/config/TedProcessorProperties.java +++ b/src/main/java/at/procon/ted/config/TedProcessorProperties.java @@ -209,6 +209,42 @@ public class TedProcessorProperties { * Similarity threshold for vector search (0.0 - 1.0). */ private double similarityThreshold = 0.7; + + /** + * Minimum trigram similarity for fuzzy lexical matches. + */ + private double trigramSimilarityThreshold = 0.12; + + /** + * Candidate limits per search engine before fusion/collapse. + */ + @Positive + private int fulltextCandidateLimit = 120; + + @Positive + private int trigramCandidateLimit = 120; + + @Positive + private int semanticCandidateLimit = 120; + + /** + * Hybrid fusion weights. + */ + private double fulltextWeight = 0.35; + private double trigramWeight = 0.20; + private double semanticWeight = 0.45; + + /** + * Startup backfill limit for missing DOC lexical vectors. + */ + @Positive + private int startupLexicalBackfillLimit = 500; + + /** + * Number of hits per engine returned by the debug endpoint. 
+ */ + @Positive + private int debugTopHitsPerEngine = 10; } /** diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 1fa8a4f..88a317a 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -124,11 +124,25 @@ ted: max-page-size: 100 # Similarity threshold for vector search (0.0 - 1.0) similarity-threshold: 0.7 + # Minimum trigram similarity for fuzzy lexical matches + trigram-similarity-threshold: 0.12 + # Candidate limits per engine before fusion/collapse + fulltext-candidate-limit: 120 + trigram-candidate-limit: 120 + semantic-candidate-limit: 120 + # Hybrid fusion weights + fulltext-weight: 0.35 + trigram-weight: 0.20 + semantic-weight: 0.45 + # Startup backfill limit for missing lexical vectors + startup-lexical-backfill-limit: 500 + # Number of top hits per engine returned by /search/debug + debug-top-hits-per-engine: 10 # TED Daily Package Download configuration download: # Enable/disable automatic package download - enabled: true + enabled: false # User service-based camel route use-service-based: false # Base URL for TED Daily Packages @@ -142,7 +156,7 @@ ted: # Max consecutive 404 errors before stopping max-consecutive-404: 4 # Polling interval (milliseconds) - 30 minutes - poll-interval: 3600000 + poll-interval: 1800000 # Retry interval for tail NOT_FOUND packages - 6 hours not-found-retry-interval: 21600000 # Grace period after year end before a previous-year tail 404 is treated as final @@ -163,7 +177,7 @@ ted: # IMAP Mail configuration mail: # Enable/disable mail processing - enabled: false + enabled: true # IMAP server hostname host: mail.mymagenta.business # IMAP server port (993 for IMAPS) @@ -185,7 +199,7 @@ ted: # Polling delay in milliseconds (1 minute) delay: 60000 # Max messages per poll - max-messages-per-poll: 10 + max-messages-per-poll: 100 # Output directory for processed attachments attachment-output-directory: /ted.europe/mail-attachments # Enable/disable MIME file 
input processing @@ -195,7 +209,7 @@ ted: # File pattern for MIME files (regex) mime-input-pattern: .*\\.eml # Polling interval for MIME input directory (milliseconds) - mime-input-poll-interval: 10000 + mime-input-poll-interval: 1000000 # Phase 3 TED projection configuration projection: @@ -225,7 +239,7 @@ ted: # Polling interval for the generic route poll-interval: 15000 # Maximum files per poll - max-messages-per-poll: 10 + max-messages-per-poll: 200 # Optional default owner tenant; leave empty for PUBLIC docs like TED or public knowledge docs default-owner-tenant-key: # Default visibility when no explicit access context is provided @@ -247,7 +261,7 @@ ted: # Enable Phase 4.1 TED package adapter on top of the generic DOC ingestion SPI ted-package-adapter-enabled: true # Enable Phase 4.1 mail/document adapter on top of the generic DOC ingestion SPI - mail-adapter-enabled: false + mail-adapter-enabled: true # Optional dedicated mail owner tenant, falls back to default-owner-tenant-key mail-default-owner-tenant-key: # Visibility for imported mail messages and attachments diff --git a/src/main/resources/db/migration/V9__search_slice2_generic_search_support.sql b/src/main/resources/db/migration/V9__search_slice2_generic_search_support.sql new file mode 100644 index 0000000..f4ccfdf --- /dev/null +++ b/src/main/resources/db/migration/V9__search_slice2_generic_search_support.sql @@ -0,0 +1,26 @@ +-- Slice 1 + Slice 2 generic search support for DOC documents. +-- Adds lexical-search support columns/indexes and pg_trgm extension. 
+ +CREATE EXTENSION IF NOT EXISTS pg_trgm; + +ALTER TABLE DOC.doc_text_representation + ADD COLUMN IF NOT EXISTS search_config VARCHAR(64); + +ALTER TABLE DOC.doc_text_representation + ADD COLUMN IF NOT EXISTS search_vector tsvector; + +CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector + ON DOC.doc_text_representation + USING GIN (search_vector); + +CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm + ON DOC.doc_document + USING GIN (title gin_trgm_ops); + +CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm + ON DOC.doc_document + USING GIN (summary gin_trgm_ops); + +CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm + ON DOC.doc_text_representation + USING GIN (text_body gin_trgm_ops);