Refactor phase 5 - semantic search - slice 2

master
trifonovt 1 month ago
parent 47894257a4
commit 039b5a5f0a

@@ -3,8 +3,8 @@ package at.procon.dip.domain.document.service;
 import at.procon.dip.domain.document.entity.DocumentContent;
 import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
 import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
-import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
 import at.procon.dip.search.service.DocumentLexicalIndexService;
+import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
 import java.util.List;
 import java.util.UUID;
 import lombok.RequiredArgsConstructor;
@@ -37,7 +37,7 @@ public class DocumentRepresentationService {
                 .textBody(command.textBody())
                 .build();
         DocumentTextRepresentation saved = representationRepository.save(representation);
-        lexicalIndexService.refreshRepresentationLexicalIndex(saved.getId());
+        lexicalIndexService.indexRepresentation(saved.getId());
         return saved;
     }

@@ -5,10 +5,9 @@ import at.procon.dip.search.spi.SearchDocumentScope;
 import lombok.Builder;
 import lombok.Getter;
+@Getter
 @Builder
-@Getter
 public class SearchExecutionContext {
     private final SearchRequest request;
     private final SearchDocumentScope scope;
     private final int page;

@@ -6,10 +6,9 @@ import java.util.List;
 import lombok.Builder;
 import lombok.Getter;
+@Getter
 @Builder
-@Getter
 public class SearchExecutionPlan {
     private final List<SearchEngineType> engines;
     private final boolean collapseByDocument;
     private final SearchSortMode sortMode;

@@ -0,0 +1,19 @@
+package at.procon.dip.search.dto;
+
+import at.procon.dip.search.api.SearchExecutionPlan;
+import java.util.List;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+public class SearchDebugResponse {
+    private SearchRequest request;
+    private SearchExecutionPlan plan;
+    private List<SearchEngineDebugResult> engineResults;
+    private SearchResponse fusedResponse;
+}

@@ -0,0 +1,17 @@
+package at.procon.dip.search.dto;
+
+import java.util.List;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+public class SearchEngineDebugResult {
+    private SearchEngineType engineType;
+    private int hitCount;
+    private List<SearchHit> topHits;
+}

@@ -2,5 +2,6 @@ package at.procon.dip.search.dto;
 public enum SearchEngineType {
     POSTGRES_FULLTEXT,
-    POSTGRES_TRIGRAM
+    POSTGRES_TRIGRAM,
+    PGVECTOR_SEMANTIC
 }

@@ -15,7 +15,6 @@ import lombok.NoArgsConstructor;
 @NoArgsConstructor
 @AllArgsConstructor
 public class SearchHit {
     private UUID documentId;
     private UUID representationId;

@@ -3,5 +3,6 @@ package at.procon.dip.search.dto;
 public enum SearchMode {
     FULLTEXT,
     TRIGRAM,
+    SEMANTIC,
     HYBRID
 }

@@ -4,10 +4,8 @@ import at.procon.dip.domain.access.DocumentVisibility;
 import at.procon.dip.domain.document.DocumentFamily;
 import at.procon.dip.domain.document.DocumentType;
 import at.procon.dip.domain.document.RepresentationType;
-import jakarta.validation.constraints.Min;
 import jakarta.validation.constraints.NotBlank;
 import java.time.OffsetDateTime;
-import java.util.LinkedHashSet;
 import java.util.Set;
 import lombok.AllArgsConstructor;
 import lombok.Builder;
@@ -24,7 +22,7 @@ public class SearchRequest {
     private String queryText;
     @Builder.Default
-    private Set<SearchMode> modes = new LinkedHashSet<>(Set.of(SearchMode.HYBRID));
+    private Set<SearchMode> modes = Set.of(SearchMode.HYBRID);
     private Set<DocumentType> documentTypes;
     private Set<DocumentFamily> documentFamilies;
@@ -34,10 +32,7 @@ public class SearchRequest {
     private OffsetDateTime createdFrom;
     private OffsetDateTime createdTo;
-    @Min(0)
     private Integer page;
-    @Min(1)
     private Integer size;
     @Builder.Default

@@ -12,7 +12,6 @@ import lombok.NoArgsConstructor;
 @NoArgsConstructor
 @AllArgsConstructor
 public class SearchResponse {
     private List<SearchHit> hits;
     private int page;
     private int size;

@@ -1,15 +1,11 @@
 package at.procon.dip.search.engine.fulltext;
-import at.procon.dip.domain.access.DocumentVisibility;
-import at.procon.dip.domain.document.DocumentFamily;
-import at.procon.dip.domain.document.DocumentType;
 import at.procon.dip.search.api.SearchExecutionContext;
 import at.procon.dip.search.dto.SearchEngineType;
 import at.procon.dip.search.dto.SearchHit;
-import at.procon.dip.search.dto.SearchMatchField;
 import at.procon.dip.search.engine.SearchEngine;
 import at.procon.dip.search.repository.DocumentFullTextSearchRepository;
-import at.procon.dip.search.repository.FullTextSearchRow;
+import at.procon.ted.config.TedProcessorProperties;
 import java.util.List;
 import lombok.RequiredArgsConstructor;
 import org.springframework.stereotype.Component;
@@ -19,6 +15,7 @@ import org.springframework.stereotype.Component;
 public class PostgresFullTextSearchEngine implements SearchEngine {
     private final DocumentFullTextSearchRepository repository;
+    private final TedProcessorProperties properties;

     @Override
     public SearchEngineType type() {
@@ -32,40 +29,6 @@ public class PostgresFullTextSearchEngine implements SearchEngine {
     @Override
     public List<SearchHit> execute(SearchExecutionContext context) {
-        return repository.search(context).stream()
-                .map(this::mapRow)
-                .toList();
-    }
-
-    private SearchHit mapRow(FullTextSearchRow row) {
-        return SearchHit.builder()
-                .documentId(row.documentId())
-                .representationId(row.representationId())
-                .documentType(parseDocumentType(row.documentType()))
-                .documentFamily(parseDocumentFamily(row.documentFamily()))
-                .visibility(parseVisibility(row.visibility()))
-                .title(row.title())
-                .summary(row.summary())
-                .languageCode(row.languageCode())
-                .mimeType(row.mimeType())
-                .primaryEngine(SearchEngineType.POSTGRES_FULLTEXT)
-                .matchedField(SearchMatchField.REPRESENTATION_TEXT)
-                .snippet(row.snippet())
-                .rawScore(row.score() == null ? 0.0d : row.score())
-                .createdAt(row.createdAt())
-                .updatedAt(row.updatedAt())
-                .build();
-    }
-
-    private DocumentType parseDocumentType(String value) {
-        return value == null ? null : DocumentType.valueOf(value);
-    }
-
-    private DocumentFamily parseDocumentFamily(String value) {
-        return value == null ? null : DocumentFamily.valueOf(value);
-    }
-
-    private DocumentVisibility parseVisibility(String value) {
-        return value == null ? null : DocumentVisibility.valueOf(value);
+        return repository.search(context, properties.getSearch().getFulltextCandidateLimit());
     }
 }

@@ -0,0 +1,45 @@
+package at.procon.dip.search.engine.semantic;
+
+import at.procon.dip.search.api.SearchExecutionContext;
+import at.procon.dip.search.dto.SearchEngineType;
+import at.procon.dip.search.dto.SearchHit;
+import at.procon.dip.search.engine.SearchEngine;
+import at.procon.dip.search.repository.DocumentSemanticSearchRepository;
+import at.procon.dip.search.service.SemanticQueryEmbeddingService;
+import at.procon.ted.config.TedProcessorProperties;
+import java.util.List;
+import lombok.RequiredArgsConstructor;
+import org.springframework.stereotype.Component;
+
+@Component
+@RequiredArgsConstructor
+public class PgVectorSemanticSearchEngine implements SearchEngine {
+    private final TedProcessorProperties properties;
+    private final SemanticQueryEmbeddingService queryEmbeddingService;
+    private final DocumentSemanticSearchRepository repository;
+
+    @Override
+    public SearchEngineType type() {
+        return SearchEngineType.PGVECTOR_SEMANTIC;
+    }
+
+    @Override
+    public boolean supports(SearchExecutionContext context) {
+        return properties.getVectorization().isEnabled()
+                && context.getRequest().getQueryText() != null
+                && !context.getRequest().getQueryText().isBlank();
+    }
+
+    @Override
+    public List<SearchHit> execute(SearchExecutionContext context) {
+        return queryEmbeddingService.buildQueryEmbedding(context.getRequest().getQueryText())
+                .map(query -> repository.search(
+                        context,
+                        query.modelId(),
+                        query.vectorString(),
+                        properties.getSearch().getSemanticCandidateLimit(),
+                        properties.getSearch().getSimilarityThreshold()))
+                .orElse(List.of());
+    }
+}
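Note on the failure modes wired up above: if vectorization is disabled or the query text is blank, supports() returns false and the orchestrator never invokes this engine, so a HYBRID request with vectorization disabled behaves exactly like a FULLTEXT+TRIGRAM request. If embedding generation fails at runtime, buildQueryEmbedding() returns an empty Optional and execute() degrades to an empty hit list rather than failing the whole search.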

@@ -1,15 +1,11 @@
 package at.procon.dip.search.engine.trigram;
-import at.procon.dip.domain.access.DocumentVisibility;
-import at.procon.dip.domain.document.DocumentFamily;
-import at.procon.dip.domain.document.DocumentType;
 import at.procon.dip.search.api.SearchExecutionContext;
 import at.procon.dip.search.dto.SearchEngineType;
 import at.procon.dip.search.dto.SearchHit;
-import at.procon.dip.search.dto.SearchMatchField;
 import at.procon.dip.search.engine.SearchEngine;
 import at.procon.dip.search.repository.DocumentTrigramSearchRepository;
-import at.procon.dip.search.repository.TrigramSearchRow;
+import at.procon.ted.config.TedProcessorProperties;
 import java.util.List;
 import lombok.RequiredArgsConstructor;
 import org.springframework.stereotype.Component;
@@ -19,6 +15,7 @@ import org.springframework.stereotype.Component;
 public class PostgresTrigramSearchEngine implements SearchEngine {
     private final DocumentTrigramSearchRepository repository;
+    private final TedProcessorProperties properties;

     @Override
     public SearchEngineType type() {
@@ -32,44 +29,9 @@ public class PostgresTrigramSearchEngine implements SearchEngine {
     @Override
     public List<SearchHit> execute(SearchExecutionContext context) {
-        return repository.search(context).stream()
-                .map(this::mapRow)
-                .toList();
-    }
-
-    private SearchHit mapRow(TrigramSearchRow row) {
-        return SearchHit.builder()
-                .documentId(row.documentId())
-                .representationId(row.representationId())
-                .documentType(parseDocumentType(row.documentType()))
-                .documentFamily(parseDocumentFamily(row.documentFamily()))
-                .visibility(parseVisibility(row.visibility()))
-                .title(row.title())
-                .summary(row.summary())
-                .languageCode(row.languageCode())
-                .mimeType(row.mimeType())
-                .primaryEngine(SearchEngineType.POSTGRES_TRIGRAM)
-                .matchedField(parseMatchField(row.matchedField()))
-                .snippet(row.snippet())
-                .rawScore(row.score() == null ? 0.0d : row.score())
-                .createdAt(row.createdAt())
-                .updatedAt(row.updatedAt())
-                .build();
-    }
-
-    private SearchMatchField parseMatchField(String value) {
-        return value == null ? SearchMatchField.REPRESENTATION_TEXT : SearchMatchField.valueOf(value);
-    }
-
-    private DocumentType parseDocumentType(String value) {
-        return value == null ? null : DocumentType.valueOf(value);
-    }
-
-    private DocumentFamily parseDocumentFamily(String value) {
-        return value == null ? null : DocumentFamily.valueOf(value);
-    }
-
-    private DocumentVisibility parseVisibility(String value) {
-        return value == null ? null : DocumentVisibility.valueOf(value);
+        return repository.search(
+                context,
+                properties.getSearch().getTrigramCandidateLimit(),
+                properties.getSearch().getTrigramSimilarityThreshold());
     }
 }

@@ -5,6 +5,7 @@ import at.procon.dip.search.api.SearchExecutionPlan;
 import at.procon.dip.search.dto.SearchEngineType;
 import at.procon.dip.search.dto.SearchMode;
 import java.util.ArrayList;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Set;
 import org.springframework.stereotype.Component;
@@ -15,17 +16,23 @@ public class DefaultSearchPlanner implements SearchPlanner {
     @Override
     public SearchExecutionPlan plan(SearchExecutionContext context) {
         Set<SearchMode> modes = context.getRequest().getModes();
-        List<SearchEngineType> engines = new ArrayList<>();
-        if (modes == null || modes.isEmpty() || modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.FULLTEXT)) {
+        if (modes == null || modes.isEmpty()) {
+            modes = Set.of(SearchMode.HYBRID);
+        }
+        Set<SearchEngineType> engines = new LinkedHashSet<>();
+        if (modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.FULLTEXT)) {
             engines.add(SearchEngineType.POSTGRES_FULLTEXT);
         }
-        if (modes == null || modes.isEmpty() || modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.TRIGRAM)) {
+        if (modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.TRIGRAM)) {
             engines.add(SearchEngineType.POSTGRES_TRIGRAM);
         }
+        if (modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.SEMANTIC)) {
+            engines.add(SearchEngineType.PGVECTOR_SEMANTIC);
+        }
         return SearchExecutionPlan.builder()
-                .engines(engines)
+                .engines(new ArrayList<>(engines))
                .collapseByDocument(context.getRequest().isCollapseByDocument())
                .sortMode(context.getRequest().getSortMode())
                .build();
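Worked example of the planning rules above: modes = [SEMANTIC] plans only PGVECTOR_SEMANTIC; modes = [FULLTEXT, TRIGRAM] plans the two Postgres engines; a null or empty set is treated as HYBRID, which plans all three engines in the LinkedHashSet insertion order (fulltext, trigram, semantic).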

@@ -5,103 +5,120 @@ import at.procon.dip.search.api.SearchExecutionPlan;
 import at.procon.dip.search.dto.SearchEngineType;
 import at.procon.dip.search.dto.SearchHit;
 import at.procon.dip.search.dto.SearchResponse;
+import at.procon.dip.search.dto.SearchSortMode;
+import at.procon.ted.config.TedProcessorProperties;
 import java.util.ArrayList;
 import java.util.Comparator;
+import java.util.EnumMap;
 import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.UUID;
+import lombok.RequiredArgsConstructor;
 import org.springframework.stereotype.Component;

 @Component
+@RequiredArgsConstructor
 public class DefaultSearchResultFusionService implements SearchResultFusionService {
-    private static final double FULLTEXT_WEIGHT = 0.60d;
-    private static final double TRIGRAM_WEIGHT = 0.40d;
-    private final SearchScoreNormalizer scoreNormalizer;
-
-    public DefaultSearchResultFusionService(SearchScoreNormalizer scoreNormalizer) {
-        this.scoreNormalizer = scoreNormalizer;
-    }
+    private final SearchScoreNormalizer normalizer;
+    private final TedProcessorProperties properties;

     @Override
     public SearchResponse fuse(SearchExecutionContext context,
                                SearchExecutionPlan plan,
                                Map<SearchEngineType, List<SearchHit>> engineResults) {
-        Map<SearchEngineType, List<SearchHit>> normalizedResults = new LinkedHashMap<>();
-        for (Map.Entry<SearchEngineType, List<SearchHit>> entry : engineResults.entrySet()) {
-            normalizedResults.put(entry.getKey(), scoreNormalizer.normalize(entry.getKey(), entry.getValue()));
-        }
-        List<SearchHit> ranked = plan.isCollapseByDocument()
-                ? collapseByDocument(normalizedResults)
-                : flatten(normalizedResults);
-        ranked.sort(Comparator
-                .comparingDouble(SearchHit::getFinalScore).reversed()
-                .thenComparing(SearchHit::getUpdatedAt, Comparator.nullsLast(Comparator.reverseOrder())));
-        int totalHits = ranked.size();
-        int fromIndex = Math.min(context.getPage() * context.getSize(), ranked.size());
-        int toIndex = Math.min(fromIndex + context.getSize(), ranked.size());
-        List<SearchHit> pageHits = ranked.subList(fromIndex, toIndex);
+        Map<SearchEngineType, List<SearchHit>> normalized = new EnumMap<>(SearchEngineType.class);
+        engineResults.forEach((engine, hits) -> normalized.put(engine, normalizer.normalize(engine, hits)));
+        List<SearchHit> fused = plan.isCollapseByDocument()
+                ? collapse(normalized)
+                : mergeWithoutCollapse(normalized);
+        sort(fused, plan.getSortMode());
+        long total = fused.size();
+        int fromIndex = Math.min(context.getPage() * context.getSize(), fused.size());
+        int toIndex = Math.min(fromIndex + context.getSize(), fused.size());
+        List<SearchHit> paged = fromIndex >= toIndex ? List.of() : fused.subList(fromIndex, toIndex);
         return SearchResponse.builder()
-                .hits(new ArrayList<>(pageHits))
+                .hits(paged)
                 .page(context.getPage())
                 .size(context.getSize())
-                .totalHits(totalHits)
-                .truncated(toIndex < totalHits)
-                .enginesUsed(new LinkedHashSet<>(normalizedResults.keySet()))
+                .totalHits(total)
+                .truncated(total > toIndex)
+                .enginesUsed(engineResults.keySet())
                 .build();
     }

-    private List<SearchHit> flatten(Map<SearchEngineType, List<SearchHit>> normalizedResults) {
-        List<SearchHit> merged = new ArrayList<>();
-        for (Map.Entry<SearchEngineType, List<SearchHit>> entry : normalizedResults.entrySet()) {
-            for (SearchHit hit : entry.getValue()) {
-                merged.add(hit.toBuilder().finalScore(weight(entry.getKey()) * hit.getNormalizedScore()).build());
-            }
-        }
-        return merged;
-    }
-
-    private List<SearchHit> collapseByDocument(Map<SearchEngineType, List<SearchHit>> normalizedResults) {
-        Map<UUID, SearchHit> collapsed = new LinkedHashMap<>();
-        Map<UUID, Double> accumulatedScores = new LinkedHashMap<>();
-        Set<UUID> docOrder = new LinkedHashSet<>();
-        for (Map.Entry<SearchEngineType, List<SearchHit>> entry : normalizedResults.entrySet()) {
-            double weight = weight(entry.getKey());
-            for (SearchHit hit : entry.getValue()) {
-                docOrder.add(hit.getDocumentId());
-                double contribution = weight * hit.getNormalizedScore();
-                accumulatedScores.merge(hit.getDocumentId(), contribution, Double::sum);
-                SearchHit existing = collapsed.get(hit.getDocumentId());
-                if (existing == null || hit.getNormalizedScore() > existing.getNormalizedScore()) {
-                    collapsed.put(hit.getDocumentId(), hit);
-                }
-            }
-        }
-        List<SearchHit> results = new ArrayList<>(docOrder.size());
-        for (UUID documentId : docOrder) {
-            SearchHit base = collapsed.get(documentId);
-            if (base != null) {
-                results.add(base.toBuilder().finalScore(accumulatedScores.getOrDefault(documentId, 0.0d)).build());
-            }
-        }
-        return results;
-    }
-
-    private double weight(SearchEngineType engineType) {
-        return switch (engineType) {
-            case POSTGRES_FULLTEXT -> FULLTEXT_WEIGHT;
-            case POSTGRES_TRIGRAM -> TRIGRAM_WEIGHT;
-        };
+    private List<SearchHit> collapse(Map<SearchEngineType, List<SearchHit>> normalized) {
+        Map<UUID, Aggregate> aggregates = new LinkedHashMap<>();
+        normalized.forEach((engine, hits) -> {
+            for (SearchHit hit : hits) {
+                Aggregate aggregate = aggregates.computeIfAbsent(hit.getDocumentId(), id -> new Aggregate());
+                aggregate.bestByEngine.put(engine, hit);
+                if (aggregate.representative == null || hit.getNormalizedScore() > aggregate.representative.getNormalizedScore()) {
+                    aggregate.representative = hit;
+                }
+            }
+        });
+        List<SearchHit> fused = new ArrayList<>();
+        for (Aggregate aggregate : aggregates.values()) {
+            SearchHit representative = aggregate.representative;
+            double finalScore = weight(SearchEngineType.POSTGRES_FULLTEXT, aggregate)
+                    + weight(SearchEngineType.POSTGRES_TRIGRAM, aggregate)
+                    + weight(SearchEngineType.PGVECTOR_SEMANTIC, aggregate);
+            fused.add(representative.toBuilder().finalScore(finalScore).build());
+        }
+        return fused;
+    }
+
+    private double weight(SearchEngineType engineType, Aggregate aggregate) {
+        SearchHit hit = aggregate.bestByEngine.get(engineType);
+        if (hit == null) {
+            return 0.0d;
+        }
+        TedProcessorProperties.SearchProperties search = properties.getSearch();
+        return switch (engineType) {
+            case POSTGRES_FULLTEXT -> hit.getNormalizedScore() * search.getFulltextWeight();
+            case POSTGRES_TRIGRAM -> hit.getNormalizedScore() * search.getTrigramWeight();
+            case PGVECTOR_SEMANTIC -> hit.getNormalizedScore() * search.getSemanticWeight();
+        };
+    }
+
+    private List<SearchHit> mergeWithoutCollapse(Map<SearchEngineType, List<SearchHit>> normalized) {
+        List<SearchHit> merged = new ArrayList<>();
+        normalized.forEach((engine, hits) -> {
+            for (SearchHit hit : hits) {
+                double finalScore = switch (engine) {
+                    case POSTGRES_FULLTEXT -> hit.getNormalizedScore() * properties.getSearch().getFulltextWeight();
+                    case POSTGRES_TRIGRAM -> hit.getNormalizedScore() * properties.getSearch().getTrigramWeight();
+                    case PGVECTOR_SEMANTIC -> hit.getNormalizedScore() * properties.getSearch().getSemanticWeight();
+                };
+                merged.add(hit.toBuilder().finalScore(finalScore).build());
+            }
+        });
+        return merged;
+    }
+
+    private void sort(List<SearchHit> hits, SearchSortMode sortMode) {
+        Comparator<SearchHit> comparator = switch (sortMode) {
+            case CREATED_AT_DESC -> Comparator.comparing(SearchHit::getCreatedAt,
+                    Comparator.nullsLast(Comparator.reverseOrder()));
+            case TITLE_ASC -> Comparator.comparing(hit -> hit.getTitle() == null ? "" : hit.getTitle(),
+                    String.CASE_INSENSITIVE_ORDER);
+            case SCORE_DESC -> Comparator.comparingDouble(SearchHit::getFinalScore).reversed();
+        };
+        if (sortMode != SearchSortMode.SCORE_DESC) {
+            comparator = comparator.thenComparing(Comparator.comparingDouble(SearchHit::getFinalScore).reversed());
+        }
+        hits.sort(comparator);
+    }
+
+    private static final class Aggregate {
+        private final Map<SearchEngineType, SearchHit> bestByEngine = new EnumMap<>(SearchEngineType.class);
+        private SearchHit representative;
     }
 }
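For intuition, a minimal self-contained sketch of the collapse()/weight() arithmetic above. The 0.5/0.3/0.2 weights are assumptions for the demo; the real values come from TedProcessorProperties.getSearch(), whose defaults are not visible in this diff:

    // FusionWeightDemo.java - illustrative only; weights are assumed.
    public class FusionWeightDemo {
        public static void main(String[] args) {
            // Best normalized score per engine for one document; an engine with
            // no hit contributes 0.0, mirroring weight(...) returning 0.0d.
            double fulltext = 0.80, trigram = 0.00, semantic = 0.90;
            double finalScore = fulltext * 0.5 + trigram * 0.3 + semantic * 0.2;
            System.out.printf("finalScore = %.2f%n", finalScore); // prints 0.58
        }
    }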

@@ -2,7 +2,6 @@ package at.procon.dip.search.rank;
 import at.procon.dip.search.dto.SearchEngineType;
 import at.procon.dip.search.dto.SearchHit;
-import java.util.ArrayList;
 import java.util.List;
 import org.springframework.stereotype.Component;
@@ -14,15 +13,22 @@ public class DefaultSearchScoreNormalizer implements SearchScoreNormalizer {
         if (hits == null || hits.isEmpty()) {
             return List.of();
         }
-        double max = hits.stream().mapToDouble(SearchHit::getRawScore).max().orElse(0.0d);
-        if (max <= 0.0d) {
-            max = 1.0d;
-        }
-        List<SearchHit> normalized = new ArrayList<>(hits.size());
-        for (SearchHit hit : hits) {
-            double score = Math.max(0.0d, Math.min(1.0d, hit.getRawScore() / max));
-            normalized.add(hit.toBuilder().normalizedScore(score).build());
-        }
-        return normalized;
+        double max = hits.stream().mapToDouble(SearchHit::getRawScore).max().orElse(1.0d);
+        double divisor = max > 0.0d ? max : 1.0d;
+        return hits.stream()
+                .map(hit -> hit.toBuilder()
+                        .normalizedScore(clamp(hit.getRawScore() / divisor))
+                        .build())
+                .toList();
+    }
+
+    private double clamp(double value) {
+        if (value < 0.0d) {
+            return 0.0d;
+        }
+        if (value > 1.0d) {
+            return 1.0d;
+        }
+        return value;
     }
 }
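A standalone sketch of the max-normalization above, with made-up raw scores; each engine's scores are divided by that engine's own maximum and clamped into [0, 1]:

    // NormalizeDemo.java - illustrative only.
    public class NormalizeDemo {
        public static void main(String[] args) {
            double[] raw = {0.08, 0.02, 0.0};
            double max = 0.08;                      // highest raw score in the batch
            double divisor = max > 0.0 ? max : 1.0; // guard for an all-zero batch
            for (double r : raw) {
                double normalized = Math.max(0.0, Math.min(1.0, r / divisor));
                System.out.println(r + " -> " + normalized); // 1.0, 0.25, 0.0
            }
        }
    }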

@@ -9,9 +9,7 @@ import java.util.List;
 import java.util.Map;

 public interface SearchResultFusionService {
-    SearchResponse fuse(
-            SearchExecutionContext context,
-            SearchExecutionPlan plan,
-            Map<SearchEngineType, List<SearchHit>> engineResults
-    );
+    SearchResponse fuse(SearchExecutionContext context,
+                        SearchExecutionPlan plan,
+                        Map<SearchEngineType, List<SearchHit>> engineResults);
 }

@@ -1,8 +1,10 @@
 package at.procon.dip.search.repository;
 import at.procon.dip.search.api.SearchExecutionContext;
+import at.procon.dip.search.dto.SearchHit;
 import java.util.List;

 public interface DocumentFullTextSearchRepository {
-    List<FullTextSearchRow> search(SearchExecutionContext context);
+    List<SearchHit> search(SearchExecutionContext context, int limit);
 }

@@ -1,72 +1,53 @@
 package at.procon.dip.search.repository;
 import at.procon.dip.search.api.SearchExecutionContext;
-import jakarta.persistence.EntityManager;
-import jakarta.persistence.PersistenceContext;
-import jakarta.persistence.Query;
-import java.util.ArrayList;
+import at.procon.dip.search.dto.SearchEngineType;
+import at.procon.dip.search.dto.SearchHit;
+import at.procon.dip.search.dto.SearchMatchField;
 import java.util.List;
-import java.util.Map;
+import lombok.RequiredArgsConstructor;
+import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
+import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
 import org.springframework.stereotype.Repository;

 @Repository
-public class DocumentFullTextSearchRepositoryImpl extends AbstractNativeSearchRepository implements DocumentFullTextSearchRepository {
-    @PersistenceContext
-    private EntityManager entityManager;
+@RequiredArgsConstructor
+public class DocumentFullTextSearchRepositoryImpl implements DocumentFullTextSearchRepository {
+    private final NamedParameterJdbcTemplate jdbcTemplate;

     @Override
-    public List<FullTextSearchRow> search(SearchExecutionContext context) {
+    public List<SearchHit> search(SearchExecutionContext context, int limit) {
         StringBuilder sql = new StringBuilder("""
                 SELECT
                     d.id AS document_id,
                     dtr.id AS representation_id,
+                    CAST(d.document_type AS text) AS document_type,
+                    CAST(d.document_family AS text) AS document_family,
+                    CAST(d.visibility AS text) AS visibility,
                     d.title AS title,
                     d.summary AS summary,
                     COALESCE(dtr.language_code, d.language_code) AS language_code,
                     d.mime_type AS mime_type,
-                    d.document_type AS document_type,
-                    d.document_family AS document_family,
-                    d.visibility AS visibility,
                     d.created_at AS created_at,
                     d.updated_at AS updated_at,
-                    ts_headline('simple', COALESCE(dtr.text_body, ''), websearch_to_tsquery('simple', :queryText)) AS snippet,
+                    ts_headline('simple', COALESCE(dtr.text_body, ''), websearch_to_tsquery('simple', :queryText),
+                        'MaxFragments=2, MinWords=5, MaxWords=20') AS snippet,
                     ts_rank_cd(dtr.search_vector, websearch_to_tsquery('simple', :queryText)) AS score
-                FROM DOC.doc_text_representation dtr
-                JOIN DOC.doc_document d ON d.id = dtr.document_id
-                LEFT JOIN DOC.doc_tenant dt ON dt.id = d.owner_tenant_id
-                WHERE dtr.search_vector @@ websearch_to_tsquery('simple', :queryText)
+                FROM doc.doc_text_representation dtr
+                JOIN doc.doc_document d ON d.id = dtr.document_id
+                LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
+                WHERE dtr.search_vector IS NOT NULL
+                  AND dtr.search_vector @@ websearch_to_tsquery('simple', :queryText)
                 """);
-        Map<String, Object> params = newParams();
-        params.put("queryText", context.getRequest().getQueryText().trim());
-        appendGenericFilters(sql, params, context);
+        MapSqlParameterSource params = new MapSqlParameterSource();
+        params.addValue("queryText", context.getRequest().getQueryText());
+        SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
         sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
-        params.put("limit", engineLimit(context));
-        Query query = entityManager.createNativeQuery(sql.toString());
-        bindParameters(query, params);
-        List<?> rows = query.getResultList();
-        List<FullTextSearchRow> results = new ArrayList<>(rows.size());
-        for (Object row : rows) {
-            Object[] cols = (Object[]) row;
-            results.add(new FullTextSearchRow(
-                    asUuid(cols[0]),
-                    asUuid(cols[1]),
-                    asString(cols[2]),
-                    asString(cols[3]),
-                    asString(cols[4]),
-                    asString(cols[5]),
-                    asString(cols[6]),
-                    asString(cols[7]),
-                    asString(cols[8]),
-                    asOffsetDateTime(cols[9]),
-                    asOffsetDateTime(cols[10]),
-                    asString(cols[11]),
-                    asDouble(cols[12])
-            ));
-        }
-        return results;
+        params.addValue("limit", limit);
+        return jdbcTemplate.query(sql.toString(), params,
+                new SearchHitRowMapper(SearchEngineType.POSTGRES_FULLTEXT, SearchMatchField.REPRESENTATION_TEXT));
     }
 }
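Note on the query syntax used here: websearch_to_tsquery accepts web-style input, so a queryText like procurement "framework agreement" -draft ANDs the bare word, requires the quoted phrase, and negates draft. The new ts_headline options cap the snippet at up to two fragments of roughly 5-20 words each instead of the single default headline.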

@@ -0,0 +1,61 @@
+package at.procon.dip.search.repository;
+
+import at.procon.dip.search.api.SearchExecutionContext;
+import at.procon.dip.search.dto.SearchEngineType;
+import at.procon.dip.search.dto.SearchHit;
+import at.procon.dip.search.dto.SearchMatchField;
+import java.util.List;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
+import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
+import org.springframework.stereotype.Repository;
+
+@Repository
+@RequiredArgsConstructor
+public class DocumentSemanticSearchRepository {
+    private final NamedParameterJdbcTemplate jdbcTemplate;
+
+    public List<SearchHit> search(SearchExecutionContext context,
+                                  UUID modelId,
+                                  String queryVector,
+                                  int limit,
+                                  double threshold) {
+        StringBuilder sql = new StringBuilder("""
+                SELECT
+                    d.id AS document_id,
+                    dtr.id AS representation_id,
+                    CAST(d.document_type AS text) AS document_type,
+                    CAST(d.document_family AS text) AS document_family,
+                    CAST(d.visibility AS text) AS visibility,
+                    d.title AS title,
+                    d.summary AS summary,
+                    COALESCE(dtr.language_code, d.language_code) AS language_code,
+                    d.mime_type AS mime_type,
+                    d.created_at AS created_at,
+                    d.updated_at AS updated_at,
+                    LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
+                    (1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) AS score
+                FROM doc.doc_embedding de
+                JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id
+                JOIN doc.doc_document d ON d.id = de.document_id
+                LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
+                WHERE de.embedding_status = 'COMPLETED'
+                  AND de.embedding_vector IS NOT NULL
+                  AND de.model_id = :modelId
+                  AND (1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) >= :threshold
+                """);
+        MapSqlParameterSource params = new MapSqlParameterSource();
+        params.addValue("queryVector", queryVector);
+        params.addValue("modelId", modelId);
+        params.addValue("threshold", threshold);
+        SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
+        sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
+        params.addValue("limit", limit);
+        return jdbcTemplate.query(sql.toString(), params,
+                new SearchHitRowMapper(SearchEngineType.PGVECTOR_SEMANTIC, SearchMatchField.REPRESENTATION_TEXT));
+    }
+}
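Worked example of the score expression: pgvector's <=> operator is cosine distance, so score = 1 - distance. A distance of 0.25 yields a score of 0.75, which clears the default similarityThreshold of 0.7; a distance of 0.4 (score 0.6) is filtered out by the WHERE clause before LIMIT applies.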

@@ -1,8 +1,10 @@
 package at.procon.dip.search.repository;
 import at.procon.dip.search.api.SearchExecutionContext;
+import at.procon.dip.search.dto.SearchHit;
 import java.util.List;

 public interface DocumentTrigramSearchRepository {
-    List<TrigramSearchRow> search(SearchExecutionContext context);
+    List<SearchHit> search(SearchExecutionContext context, int limit, double threshold);
 }

@@ -1,102 +1,60 @@
 package at.procon.dip.search.repository;
 import at.procon.dip.search.api.SearchExecutionContext;
-import jakarta.persistence.EntityManager;
-import jakarta.persistence.PersistenceContext;
-import jakarta.persistence.Query;
-import java.util.ArrayList;
+import at.procon.dip.search.dto.SearchEngineType;
+import at.procon.dip.search.dto.SearchHit;
+import at.procon.dip.search.dto.SearchMatchField;
 import java.util.List;
-import java.util.Map;
+import lombok.RequiredArgsConstructor;
+import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
+import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
 import org.springframework.stereotype.Repository;

 @Repository
-public class DocumentTrigramSearchRepositoryImpl extends AbstractNativeSearchRepository implements DocumentTrigramSearchRepository {
-    @PersistenceContext
-    private EntityManager entityManager;
+@RequiredArgsConstructor
+public class DocumentTrigramSearchRepositoryImpl implements DocumentTrigramSearchRepository {
+    private final NamedParameterJdbcTemplate jdbcTemplate;

     @Override
-    public List<TrigramSearchRow> search(SearchExecutionContext context) {
-        StringBuilder sql = new StringBuilder("""
-                SELECT
-                    d.id AS document_id,
-                    dtr.id AS representation_id,
-                    d.title AS title,
-                    d.summary AS summary,
-                    COALESCE(dtr.language_code, d.language_code) AS language_code,
-                    d.mime_type AS mime_type,
-                    d.document_type AS document_type,
-                    d.document_family AS document_family,
-                    d.visibility AS visibility,
-                    d.created_at AS created_at,
-                    d.updated_at AS updated_at,
-                    CASE
-                        WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText)
-                            AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
-                            THEN COALESCE(d.title, '')
-                        WHEN similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
-                            THEN COALESCE(d.summary, '')
-                        ELSE LEFT(COALESCE(dtr.text_body, ''), 400)
-                    END AS snippet,
-                    GREATEST(
-                        similarity(COALESCE(d.title, ''), :queryText),
-                        similarity(COALESCE(d.summary, ''), :queryText),
-                        similarity(COALESCE(dtr.text_body, ''), :queryText)
-                    ) AS score,
-                    CASE
-                        WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText)
-                            AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
-                            THEN 'DOCUMENT_TITLE'
-                        WHEN similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
-                            THEN 'DOCUMENT_SUMMARY'
-                        ELSE 'REPRESENTATION_TEXT'
-                    END AS matched_field
-                FROM DOC.doc_text_representation dtr
-                JOIN DOC.doc_document d ON d.id = dtr.document_id
-                LEFT JOIN DOC.doc_tenant dt ON dt.id = d.owner_tenant_id
-                WHERE (
-                    COALESCE(d.title, '') % :queryText
-                    OR COALESCE(d.summary, '') % :queryText
-                    OR COALESCE(dtr.text_body, '') % :queryText
-                )
-                """);
-        Map<String, Object> params = newParams();
-        params.put("queryText", context.getRequest().getQueryText().trim());
-        appendGenericFilters(sql, params, context);
-        sql.append(" AND GREATEST(")
-                .append(" similarity(COALESCE(d.title, ''), :queryText),")
-                .append(" similarity(COALESCE(d.summary, ''), :queryText),")
-                .append(" similarity(COALESCE(dtr.text_body, ''), :queryText)")
-                .append(") >= :minSimilarity");
-        sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
-        params.put("minSimilarity", 0.10d);
-        params.put("limit", engineLimit(context));
-        Query query = entityManager.createNativeQuery(sql.toString());
-        bindParameters(query, params);
-        List<?> rows = query.getResultList();
-        List<TrigramSearchRow> results = new ArrayList<>(rows.size());
-        for (Object row : rows) {
-            Object[] cols = (Object[]) row;
-            results.add(new TrigramSearchRow(
-                    asUuid(cols[0]),
-                    asUuid(cols[1]),
-                    asString(cols[2]),
-                    asString(cols[3]),
-                    asString(cols[4]),
-                    asString(cols[5]),
-                    asString(cols[6]),
-                    asString(cols[7]),
-                    asString(cols[8]),
-                    asOffsetDateTime(cols[9]),
-                    asOffsetDateTime(cols[10]),
-                    asString(cols[11]),
-                    asDouble(cols[12]),
-                    asString(cols[13])
-            ));
-        }
-        return results;
+    public List<SearchHit> search(SearchExecutionContext context, int limit, double threshold) {
+        String scoreExpr = "GREATEST(" +
+                "similarity(COALESCE(d.title, ''), :queryText), " +
+                "similarity(COALESCE(d.summary, ''), :queryText), " +
+                "similarity(COALESCE(dtr.text_body, ''), :queryText))";
+        StringBuilder sql = new StringBuilder("SELECT " +
+                "d.id AS document_id, " +
+                "dtr.id AS representation_id, " +
+                "CAST(d.document_type AS text) AS document_type, " +
+                "CAST(d.document_family AS text) AS document_family, " +
+                "CAST(d.visibility AS text) AS visibility, " +
+                "d.title AS title, " +
+                "d.summary AS summary, " +
+                "COALESCE(dtr.language_code, d.language_code) AS language_code, " +
+                "d.mime_type AS mime_type, " +
+                "d.created_at AS created_at, " +
+                "d.updated_at AS updated_at, " +
+                "LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet, " +
+                scoreExpr + " AS score, " +
+                "CASE " +
+                "WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText) " +
+                "  AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_TITLE' " +
+                "WHEN similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_SUMMARY' " +
+                "ELSE 'REPRESENTATION_TEXT' END AS matched_field " +
+                "FROM doc.doc_text_representation dtr " +
+                "JOIN doc.doc_document d ON d.id = dtr.document_id " +
+                "LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id " +
+                "WHERE " + scoreExpr + " >= :threshold");
+        MapSqlParameterSource params = new MapSqlParameterSource();
+        params.addValue("queryText", context.getRequest().getQueryText());
+        params.addValue("threshold", threshold);
+        SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
+        sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
+        params.addValue("limit", limit);
+        return jdbcTemplate.query(sql.toString(), params,
+                new SearchHitRowMapper(SearchEngineType.POSTGRES_TRIGRAM, SearchMatchField.REPRESENTATION_TEXT));
     }
 }
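For reference, pg_trgm's similarity() is the number of shared three-character trigrams divided by the total number of distinct trigrams in both strings, so near-duplicates such as 'contract' vs. 'contracts' score high while unrelated words land near 0. GREATEST(...) takes the best of title, summary, and body, and rows whose best score falls below the configured trigramSimilarityThreshold (0.12 by default) are discarded.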

@@ -0,0 +1,54 @@
+package at.procon.dip.search.repository;
+
+import at.procon.dip.domain.access.DocumentVisibility;
+import at.procon.dip.domain.document.DocumentFamily;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.search.dto.SearchEngineType;
+import at.procon.dip.search.dto.SearchHit;
+import at.procon.dip.search.dto.SearchMatchField;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import org.springframework.jdbc.core.RowMapper;
+
+final class SearchHitRowMapper implements RowMapper<SearchHit> {
+    private final SearchEngineType engineType;
+    private final SearchMatchField defaultField;
+
+    SearchHitRowMapper(SearchEngineType engineType, SearchMatchField defaultField) {
+        this.engineType = engineType;
+        this.defaultField = defaultField;
+    }
+
+    @Override
+    public SearchHit mapRow(ResultSet rs, int rowNum) throws SQLException {
+        String matchedField = safeGetString(rs, "matched_field");
+        return SearchHit.builder()
+                .documentId(rs.getObject("document_id", java.util.UUID.class))
+                .representationId(rs.getObject("representation_id", java.util.UUID.class))
+                .documentType(DocumentType.valueOf(rs.getString("document_type")))
+                .documentFamily(DocumentFamily.valueOf(rs.getString("document_family")))
+                .visibility(DocumentVisibility.valueOf(rs.getString("visibility")))
+                .title(safeGetString(rs, "title"))
+                .summary(safeGetString(rs, "summary"))
+                .languageCode(safeGetString(rs, "language_code"))
+                .mimeType(safeGetString(rs, "mime_type"))
+                .primaryEngine(engineType)
+                .matchedField(matchedField == null || matchedField.isBlank()
+                        ? defaultField
+                        : SearchMatchField.valueOf(matchedField))
+                .snippet(safeGetString(rs, "snippet"))
+                .rawScore(rs.getDouble("score"))
+                .createdAt(rs.getObject("created_at", java.time.OffsetDateTime.class))
+                .updatedAt(rs.getObject("updated_at", java.time.OffsetDateTime.class))
+                .build();
+    }
+
+    private String safeGetString(ResultSet rs, String column) {
+        try {
+            return rs.getString(column);
+        } catch (SQLException ignore) {
+            return null;
+        }
+    }
+}

@@ -0,0 +1,84 @@
+package at.procon.dip.search.repository;
+
+import at.procon.dip.domain.access.DocumentVisibility;
+import at.procon.dip.domain.document.DocumentFamily;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.domain.document.RepresentationType;
+import at.procon.dip.search.api.SearchExecutionContext;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
+import org.springframework.util.CollectionUtils;
+
+final class SearchSqlFilterSupport {
+
+    private SearchSqlFilterSupport() {
+    }
+
+    static void appendCommonFilters(StringBuilder sql,
+                                    MapSqlParameterSource params,
+                                    SearchExecutionContext context,
+                                    String documentAlias,
+                                    String representationAlias,
+                                    boolean tenantJoinPresent) {
+        Set<DocumentType> documentTypes = firstNonEmpty(context.getRequest().getDocumentTypes(), context.getScope().documentTypes());
+        if (!CollectionUtils.isEmpty(documentTypes)) {
+            sql.append(" AND CAST(").append(documentAlias).append(".document_type AS text) IN (:documentTypes)");
+            params.addValue("documentTypes", enumNames(documentTypes));
+        }
+        Set<DocumentFamily> documentFamilies = firstNonEmpty(context.getRequest().getDocumentFamilies(), context.getScope().documentFamilies());
+        if (!CollectionUtils.isEmpty(documentFamilies)) {
+            sql.append(" AND CAST(").append(documentAlias).append(".document_family AS text) IN (:documentFamilies)");
+            params.addValue("documentFamilies", enumNames(documentFamilies));
+        }
+        Set<DocumentVisibility> visibilities = firstNonEmpty(context.getRequest().getVisibilities(), context.getScope().visibilities());
+        if (!CollectionUtils.isEmpty(visibilities)) {
+            sql.append(" AND CAST(").append(documentAlias).append(".visibility AS text) IN (:visibilities)");
+            params.addValue("visibilities", enumNames(visibilities));
+        }
+        Set<String> languageCodes = context.getRequest().getLanguageCodes();
+        if (CollectionUtils.isEmpty(languageCodes) && context.getScope().languageCode() != null && !context.getScope().languageCode().isBlank()) {
+            languageCodes = Set.of(context.getScope().languageCode());
+        }
+        if (!CollectionUtils.isEmpty(languageCodes)) {
+            sql.append(" AND COALESCE(").append(representationAlias).append(".language_code, ")
+                    .append(documentAlias).append(".language_code, '') IN (:languageCodes)");
+            params.addValue("languageCodes", languageCodes);
+        }
+        Set<RepresentationType> representationTypes = context.getRequest().getRepresentationTypes();
+        if (!CollectionUtils.isEmpty(representationTypes)) {
+            sql.append(" AND CAST(").append(representationAlias).append(".representation_type AS text) IN (:representationTypes)");
+            params.addValue("representationTypes", enumNames(representationTypes));
+        } else {
+            sql.append(" AND ").append(representationAlias).append(".is_primary = true");
+        }
+        if (context.getRequest().getCreatedFrom() != null) {
+            sql.append(" AND ").append(documentAlias).append(".created_at >= :createdFrom");
+            params.addValue("createdFrom", context.getRequest().getCreatedFrom());
+        }
+        if (context.getRequest().getCreatedTo() != null) {
+            sql.append(" AND ").append(documentAlias).append(".created_at <= :createdTo");
+            params.addValue("createdTo", context.getRequest().getCreatedTo());
+        }
+        if (tenantJoinPresent && !CollectionUtils.isEmpty(context.getScope().ownerTenantKeys())) {
+            sql.append(" AND dt.tenant_key IN (:ownerTenantKeys)");
+            params.addValue("ownerTenantKeys", context.getScope().ownerTenantKeys());
+        }
+    }
+
+    private static <T> Set<T> firstNonEmpty(Set<T> primary, Set<T> fallback) {
+        return !CollectionUtils.isEmpty(primary) ? primary : fallback;
+    }
+
+    private static List<String> enumNames(Collection<? extends Enum<?>> values) {
+        return values.stream().map(Enum::name).collect(Collectors.toList());
+    }
+}
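Usage sketch, assuming a hypothetical DocumentType.CONTRACT constant: with request.documentTypes = {CONTRACT} and aliases "d"/"dtr", appendCommonFilters appends " AND CAST(d.document_type AS text) IN (:documentTypes)" and binds :documentTypes to ["CONTRACT"]; because no representationTypes are set, it also appends " AND dtr.is_primary = true", so only primary representations are searched.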

@@ -2,6 +2,8 @@ package at.procon.dip.search.service;
 import at.procon.dip.search.api.SearchExecutionContext;
 import at.procon.dip.search.api.SearchExecutionPlan;
+import at.procon.dip.search.dto.SearchDebugResponse;
+import at.procon.dip.search.dto.SearchEngineDebugResult;
 import at.procon.dip.search.dto.SearchEngineType;
 import at.procon.dip.search.dto.SearchHit;
 import at.procon.dip.search.dto.SearchRequest;
@@ -10,6 +12,8 @@ import at.procon.dip.search.engine.SearchEngine;
 import at.procon.dip.search.plan.SearchPlanner;
 import at.procon.dip.search.rank.SearchResultFusionService;
 import at.procon.dip.search.spi.SearchDocumentScope;
+import at.procon.ted.config.TedProcessorProperties;
+import java.util.ArrayList;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -20,28 +24,66 @@ import org.springframework.stereotype.Service;
 @RequiredArgsConstructor
 public class DefaultSearchOrchestrator implements SearchOrchestrator {
+    private final TedProcessorProperties properties;
     private final SearchPlanner planner;
     private final List<SearchEngine> engines;
     private final SearchResultFusionService fusionService;

     @Override
     public SearchResponse search(SearchRequest request, SearchDocumentScope scope) {
+        SearchExecution execution = executeInternal(request, scope);
+        return fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
+    }
+
+    @Override
+    public SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope) {
+        SearchExecution execution = executeInternal(request, scope);
+        SearchResponse fused = fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
+        List<SearchEngineDebugResult> debugResults = new ArrayList<>();
+        int topLimit = properties.getSearch().getDebugTopHitsPerEngine();
+        execution.engineResults().forEach((engine, hits) -> debugResults.add(SearchEngineDebugResult.builder()
+                .engineType(engine)
+                .hitCount(hits.size())
+                .topHits(hits.stream().limit(topLimit).toList())
+                .build()));
+        return SearchDebugResponse.builder()
+                .request(request)
+                .plan(execution.plan())
+                .engineResults(debugResults)
+                .fusedResponse(fused)
+                .build();
+    }
+
+    private SearchExecution executeInternal(SearchRequest request, SearchDocumentScope scope) {
+        int page = request.getPage() == null || request.getPage() < 0 ? 0 : request.getPage();
+        int requestedSize = request.getSize() == null || request.getSize() <= 0
+                ? properties.getSearch().getDefaultPageSize()
+                : request.getSize();
+        int size = Math.min(requestedSize, properties.getSearch().getMaxPageSize());
         SearchExecutionContext context = SearchExecutionContext.builder()
                 .request(request)
                 .scope(scope)
-                .page(request.getPage() == null ? 0 : request.getPage())
-                .size(request.getSize() == null ? 20 : request.getSize())
+                .page(page)
+                .size(size)
                 .build();
         SearchExecutionPlan plan = planner.plan(context);
         Map<SearchEngineType, List<SearchHit>> engineResults = new LinkedHashMap<>();
         for (SearchEngine engine : engines) {
             if (plan.getEngines().contains(engine.type()) && engine.supports(context)) {
                 engineResults.put(engine.type(), engine.execute(context));
             }
         }
-        return fusionService.fuse(context, plan, engineResults);
+        return new SearchExecution(context, plan, engineResults);
+    }
+
+    private record SearchExecution(
+            SearchExecutionContext context,
+            SearchExecutionPlan plan,
+            Map<SearchEngineType, List<SearchHit>> engineResults
+    ) {
     }
 }
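Worked example of the paging guards in executeInternal(): page = null or page = -3 becomes 0; size = null or size = 0 falls back to defaultPageSize; a size larger than maxPageSize is clamped down to maxPageSize before the execution context is built.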

@@ -1,27 +1,45 @@
 package at.procon.dip.search.service;
-import jakarta.persistence.EntityManager;
-import jakarta.persistence.PersistenceContext;
-import jakarta.transaction.Transactional;
+import java.util.List;
 import java.util.UUID;
+import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
+import org.springframework.jdbc.core.JdbcTemplate;
+import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
+import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
 import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;

 @Service
-@Transactional
+@RequiredArgsConstructor
 @Slf4j
 public class DocumentLexicalIndexService {
-    @PersistenceContext
-    private EntityManager entityManager;
+    private final NamedParameterJdbcTemplate namedParameterJdbcTemplate;
+    private final JdbcTemplate jdbcTemplate;

+    /**
+     * New Slice 2 name kept for current code.
+     */
+    @Transactional
+    public void indexRepresentation(UUID representationId) {
+        refreshRepresentationLexicalIndex(representationId);
+    }
+
+    /**
+     * Backward-compatible Slice 1 method name.
+     */
+    @Transactional
     public void refreshRepresentationLexicalIndex(UUID representationId) {
         if (!isLexicalSearchSchemaAvailable()) {
-            log.debug("Skipping lexical index refresh for representation {} because search columns are not available yet", representationId);
+            log.debug("Skipping lexical indexing for representation {} because search_vector columns are not present yet", representationId);
             return;
         }
-        entityManager.createNativeQuery("""
-                UPDATE DOC.doc_text_representation
+        MapSqlParameterSource params = new MapSqlParameterSource();
+        params.addValue("representationId", representationId);
+        namedParameterJdbcTemplate.update("""
+                UPDATE doc.doc_text_representation
                 SET search_config = CASE
                     WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'
                     WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'
@@ -36,18 +54,39 @@ public class DocumentLexicalIndexService {
                     coalesce(text_body, '')
                 )
                 WHERE id = :representationId
-                """)
-                .setParameter("representationId", representationId)
-                .executeUpdate();
+                """, params);
     }

+    /**
+     * New Slice 2 method kept for current startup runner.
+     */
+    @Transactional
+    public int backfillMissingVectors(int limit) {
+        if (!isLexicalSearchSchemaAvailable()) {
+            return 0;
+        }
+        List<UUID> ids = jdbcTemplate.query("""
+                SELECT id
+                FROM doc.doc_text_representation
+                WHERE search_vector IS NULL
+                ORDER BY created_at ASC
+                LIMIT ?
+                """, (rs, rowNum) -> rs.getObject(1, UUID.class), limit);
+        ids.forEach(this::refreshRepresentationLexicalIndex);
+        return ids.size();
+    }
+
+    /**
+     * Backward-compatible Slice 1 method name.
+     */
+    @Transactional
     public void refreshAllMissingLexicalIndexes() {
         if (!isLexicalSearchSchemaAvailable()) {
             log.info("Lexical search columns are not available yet. Skipping startup backfill for DOC lexical indexes.");
             return;
         }
-        entityManager.createNativeQuery("""
-                UPDATE DOC.doc_text_representation
+        jdbcTemplate.update("""
+                UPDATE doc.doc_text_representation
                 SET search_config = CASE
                     WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'
                     WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'
@@ -62,19 +101,27 @@ public class DocumentLexicalIndexService {
                     coalesce(text_body, '')
                 )
                 WHERE search_vector IS NULL
-                """)
-                .executeUpdate();
+                """);
     }

-    private boolean isLexicalSearchSchemaAvailable() {
-        Number count = (Number) entityManager.createNativeQuery("""
+    /**
+     * New Slice 2 name kept for current code.
+     */
+    public boolean searchVectorColumnsPresent() {
+        return isLexicalSearchSchemaAvailable();
+    }
+
+    /**
+     * Backward-compatible Slice 1 method name.
+     */
+    public boolean isLexicalSearchSchemaAvailable() {
+        Integer count = jdbcTemplate.queryForObject("""
                 SELECT COUNT(*)
                 FROM information_schema.columns
                 WHERE table_schema = 'doc'
                 AND table_name = 'doc_text_representation'
-                AND column_name IN ('search_config', 'search_vector')
-                """)
-                .getSingleResult();
-        return count != null && count.intValue() >= 2;
+                AND column_name IN ('search_vector', 'search_config')
+                """, Integer.class);
+        return count != null && count >= 2;
     }
 }
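Example of the backfill path: backfillMissingVectors(500) selects up to the 500 oldest representations whose search_vector is still NULL and reindexes each one through refreshRepresentationLexicalIndex, returning the row count so the startup runner below can log it.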

@@ -0,0 +1,25 @@
+package at.procon.dip.search.service;
+
+import at.procon.ted.config.TedProcessorProperties;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.boot.ApplicationArguments;
+import org.springframework.boot.ApplicationRunner;
+import org.springframework.stereotype.Component;
+
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class SearchLexicalIndexStartupRunner implements ApplicationRunner {
+    private final TedProcessorProperties properties;
+    private final DocumentLexicalIndexService lexicalIndexService;
+
+    @Override
+    public void run(ApplicationArguments args) {
+        int updated = lexicalIndexService.backfillMissingVectors(properties.getSearch().getStartupLexicalBackfillLimit());
+        if (updated > 0) {
+            log.info("Search lexical index startup backfill updated {} representations", updated);
+        }
+    }
+}

@ -1,9 +1,11 @@
package at.procon.dip.search.service; package at.procon.dip.search.service;
import at.procon.dip.search.dto.SearchDebugResponse;
import at.procon.dip.search.dto.SearchRequest; import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.search.dto.SearchResponse; import at.procon.dip.search.dto.SearchResponse;
import at.procon.dip.search.spi.SearchDocumentScope; import at.procon.dip.search.spi.SearchDocumentScope;
public interface SearchOrchestrator { public interface SearchOrchestrator {
SearchResponse search(SearchRequest request, SearchDocumentScope scope); SearchResponse search(SearchRequest request, SearchDocumentScope scope);
SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope);
} }
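For orientation, a concrete orchestrator's debug() might assemble the new DTO roughly as follows; resolvePlan and runEngine are hypothetical helpers, and the overall shape is a sketch, not the actual implementation:

// Sketch of a possible debug() implementation; resolvePlan/runEngine are hypothetical helpers.
@Override
public SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope) {
    SearchExecutionPlan plan = resolvePlan(request);
    List<SearchEngineDebugResult> engineResults = plan.getEngines().stream()
            .map(engine -> runEngine(engine, request, scope))   // one debug result per engine
            .toList();
    return SearchDebugResponse.builder()
            .request(request)
            .plan(plan)
            .engineResults(engineResults)
            .fusedResponse(search(request, scope))              // reuse the normal fused search
            .build();
}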
@ -0,0 +1,39 @@
package at.procon.dip.search.service;
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
import at.procon.ted.config.TedProcessorProperties;
import at.procon.ted.service.VectorizationService;
import java.util.Optional;
import java.util.UUID;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
@Service
@RequiredArgsConstructor
@Slf4j
public class SemanticQueryEmbeddingService {
private final TedProcessorProperties properties;
private final DocumentEmbeddingService documentEmbeddingService;
private final VectorizationService vectorizationService;
public Optional<QueryEmbedding> buildQueryEmbedding(String queryText) {
if (!properties.getVectorization().isEnabled()) {
return Optional.empty();
}
try {
DocumentEmbeddingModel model = documentEmbeddingService.findActiveModelByKey(
properties.getVectorization().getModelName());
float[] vector = vectorizationService.generateQueryEmbedding(queryText);
return Optional.of(new QueryEmbedding(model.getId(), vectorizationService.floatArrayToVectorString(vector)));
} catch (Exception e) {
log.warn("Failed to generate semantic query embedding: {}", e.getMessage());
return Optional.empty();
}
}
public record QueryEmbedding(UUID modelId, String vectorString) {
}
}
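A consumer of QueryEmbedding would typically feed the vector string into a pgvector distance query. A hedged sketch, assuming a doc_embedding table with embedding and embedding_model_id columns (names invented for illustration; the actual semantic engine SQL is not in this change):

// Illustrative pgvector lookup; table and column names are assumptions for this sketch.
semanticQueryEmbeddingService.buildQueryEmbedding(queryText).ifPresent(qe ->
        jdbcTemplate.query("""
                SELECT document_id, 1 - (embedding <=> ?::vector) AS similarity
                FROM doc.doc_embedding
                WHERE embedding_model_id = ?
                ORDER BY embedding <=> ?::vector
                LIMIT ?
                """,
                (rs, rowNum) -> rs.getString("document_id"),    // map hits as needed
                qe.vectorString(), qe.modelId(), qe.vectorString(), limit));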
@ -1,5 +1,6 @@
package at.procon.dip.search.web; package at.procon.dip.search.web;
import at.procon.dip.search.dto.SearchDebugResponse;
import at.procon.dip.search.dto.SearchRequest; import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.search.dto.SearchResponse; import at.procon.dip.search.dto.SearchResponse;
import at.procon.dip.search.service.SearchOrchestrator; import at.procon.dip.search.service.SearchOrchestrator;
@ -21,15 +22,24 @@ public class GenericSearchController {
@PostMapping @PostMapping
public SearchResponse search(@Valid @RequestBody SearchRequest request) { public SearchResponse search(@Valid @RequestBody SearchRequest request) {
SearchDocumentScope scope = new SearchDocumentScope( return searchOrchestrator.search(request, buildScope(request));
}
@PostMapping("/debug")
public SearchDebugResponse debug(@Valid @RequestBody SearchRequest request) {
return searchOrchestrator.debug(request, buildScope(request));
}
private SearchDocumentScope buildScope(SearchRequest request) {
String scopeLanguage = (request.getLanguageCodes() == null || request.getLanguageCodes().isEmpty())
? null
: request.getLanguageCodes().iterator().next();
return new SearchDocumentScope(
Set.of(), Set.of(),
request.getDocumentTypes(), request.getDocumentTypes(),
request.getDocumentFamilies(), request.getDocumentFamilies(),
request.getVisibilities(), request.getVisibilities(),
request.getLanguageCodes() == null || request.getLanguageCodes().isEmpty() scopeLanguage
? null
: request.getLanguageCodes().iterator().next()
); );
return searchOrchestrator.search(request, scope);
} }
} }
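One subtlety in buildScope: only the first language code narrows the scope, and iterator().next() on an unordered Set makes that pick nondeterministic. A quick illustration (no-args construction and the setter are assumed for the sketch):

// Sketch: only the first language code reaches the scope; the rest are dropped here.
SearchRequest request = new SearchRequest();                        // construction assumed
request.setLanguageCodes(new LinkedHashSet<>(List.of("de", "en"))); // ordered set for a stable pick
SearchDocumentScope scope = buildScope(request);                    // scope language is "de"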
@ -209,6 +209,42 @@ public class TedProcessorProperties {
* Similarity threshold for vector search (0.0 - 1.0). * Similarity threshold for vector search (0.0 - 1.0).
*/ */
private double similarityThreshold = 0.7; private double similarityThreshold = 0.7;
/**
* Minimum trigram similarity for fuzzy lexical matches.
*/
private double trigramSimilarityThreshold = 0.12;
/**
* Candidate limits per search engine before fusion/collapse.
*/
@Positive
private int fulltextCandidateLimit = 120;
@Positive
private int trigramCandidateLimit = 120;
@Positive
private int semanticCandidateLimit = 120;
/**
* Hybrid fusion weights.
*/
private double fulltextWeight = 0.35;
private double trigramWeight = 0.20;
private double semanticWeight = 0.45;
/**
* Startup backfill limit for missing DOC lexical vectors.
*/
@Positive
private int startupLexicalBackfillLimit = 500;
/**
* Number of hits per engine returned by the debug endpoint.
*/
@Positive
private int debugTopHitsPerEngine = 10;
} }
/** /**
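The three weights above drive hybrid score fusion. A minimal sketch of how weighted fusion could combine per-engine scores under these defaults (the method shape, the Search inner-class name, and the pre-normalization assumption are illustrative; the actual fusion code is not in this hunk):

// Sketch: combine per-engine scores into one hybrid score using the configured weights.
// Assumes each engine score has already been normalized to [0, 1].
double fuse(double fulltextScore, double trigramScore, double semanticScore,
            TedProcessorProperties.Search search) {
    return search.getFulltextWeight() * fulltextScore    // default 0.35
         + search.getTrigramWeight() * trigramScore      // default 0.20
         + search.getSemanticWeight() * semanticScore;   // default 0.45
}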
@ -124,11 +124,25 @@ ted:
max-page-size: 100 max-page-size: 100
# Similarity threshold for vector search (0.0 - 1.0) # Similarity threshold for vector search (0.0 - 1.0)
similarity-threshold: 0.7 similarity-threshold: 0.7
# Minimum trigram similarity for fuzzy lexical matches
trigram-similarity-threshold: 0.12
# Candidate limits per engine before fusion/collapse
fulltext-candidate-limit: 120
trigram-candidate-limit: 120
semantic-candidate-limit: 120
# Hybrid fusion weights
fulltext-weight: 0.35
trigram-weight: 0.20
semantic-weight: 0.45
# Startup backfill limit for missing lexical vectors
startup-lexical-backfill-limit: 500
# Number of top hits per engine returned by /search/debug
debug-top-hits-per-engine: 10
# TED Daily Package Download configuration # TED Daily Package Download configuration
download: download:
# Enable/disable automatic package download # Enable/disable automatic package download
enabled: true enabled: false
# User service-based camel route # User service-based camel route
use-service-based: false use-service-based: false
# Base URL for TED Daily Packages # Base URL for TED Daily Packages
@ -142,7 +156,7 @@ ted:
# Max consecutive 404 errors before stopping # Max consecutive 404 errors before stopping
max-consecutive-404: 4 max-consecutive-404: 4
# Polling interval (milliseconds) - 1 hour # Polling interval (milliseconds) - 30 minutes
poll-interval: 3600000 poll-interval: 1800000
# Retry interval for tail NOT_FOUND packages - 6 hours # Retry interval for tail NOT_FOUND packages - 6 hours
not-found-retry-interval: 21600000 not-found-retry-interval: 21600000
# Grace period after year end before a previous-year tail 404 is treated as final # Grace period after year end before a previous-year tail 404 is treated as final
@ -163,7 +177,7 @@ ted:
# IMAP Mail configuration # IMAP Mail configuration
mail: mail:
# Enable/disable mail processing # Enable/disable mail processing
enabled: false enabled: true
# IMAP server hostname # IMAP server hostname
host: mail.mymagenta.business host: mail.mymagenta.business
# IMAP server port (993 for IMAPS) # IMAP server port (993 for IMAPS)
@ -185,7 +199,7 @@ ted:
# Polling delay in milliseconds (1 minute) # Polling delay in milliseconds (1 minute)
delay: 60000 delay: 60000
# Max messages per poll # Max messages per poll
max-messages-per-poll: 10 max-messages-per-poll: 100
# Output directory for processed attachments # Output directory for processed attachments
attachment-output-directory: /ted.europe/mail-attachments attachment-output-directory: /ted.europe/mail-attachments
# Enable/disable MIME file input processing # Enable/disable MIME file input processing
@ -195,7 +209,7 @@ ted:
# File pattern for MIME files (regex) # File pattern for MIME files (regex)
mime-input-pattern: .*\\.eml mime-input-pattern: .*\\.eml
# Polling interval for MIME input directory (milliseconds) # Polling interval for MIME input directory (milliseconds)
mime-input-poll-interval: 10000 mime-input-poll-interval: 1000000
# Phase 3 TED projection configuration # Phase 3 TED projection configuration
projection: projection:
@ -225,7 +239,7 @@ ted:
# Polling interval for the generic route # Polling interval for the generic route
poll-interval: 15000 poll-interval: 15000
# Maximum files per poll # Maximum files per poll
max-messages-per-poll: 10 max-messages-per-poll: 200
# Optional default owner tenant; leave empty for PUBLIC docs like TED or public knowledge docs # Optional default owner tenant; leave empty for PUBLIC docs like TED or public knowledge docs
default-owner-tenant-key: default-owner-tenant-key:
# Default visibility when no explicit access context is provided # Default visibility when no explicit access context is provided
@ -247,7 +261,7 @@ ted:
# Enable Phase 4.1 TED package adapter on top of the generic DOC ingestion SPI # Enable Phase 4.1 TED package adapter on top of the generic DOC ingestion SPI
ted-package-adapter-enabled: true ted-package-adapter-enabled: true
# Enable Phase 4.1 mail/document adapter on top of the generic DOC ingestion SPI # Enable Phase 4.1 mail/document adapter on top of the generic DOC ingestion SPI
mail-adapter-enabled: false mail-adapter-enabled: true
# Optional dedicated mail owner tenant, falls back to default-owner-tenant-key # Optional dedicated mail owner tenant, falls back to default-owner-tenant-key
mail-default-owner-tenant-key: mail-default-owner-tenant-key:
# Visibility for imported mail messages and attachments # Visibility for imported mail messages and attachments
@ -0,0 +1,26 @@
-- Slice 1 + Slice 2 generic search support for DOC documents.
-- Adds lexical-search support columns/indexes and the pg_trgm extension.
CREATE EXTENSION IF NOT EXISTS pg_trgm;
ALTER TABLE DOC.doc_text_representation
ADD COLUMN IF NOT EXISTS search_config VARCHAR(64);
ALTER TABLE DOC.doc_text_representation
ADD COLUMN IF NOT EXISTS search_vector tsvector;
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector
ON DOC.doc_text_representation
USING GIN (search_vector);
CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm
ON DOC.doc_document
USING GIN (title gin_trgm_ops);
CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm
ON DOC.doc_document
USING GIN (summary gin_trgm_ops);
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm
ON DOC.doc_text_representation
USING GIN (text_body gin_trgm_ops);
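For reference, the kinds of lookups these columns and indexes are built to serve look roughly like this (illustrative only; the engine SQL is assumed, not taken from this change):

// Illustrative sketch of queries served by the new indexes; parameter wiring is assumed.
jdbcTemplate.execute("SET pg_trgm.similarity_threshold = 0.12");   // mirrors trigram-similarity-threshold

List<UUID> lexicalHits = jdbcTemplate.query("""
        SELECT id
        FROM doc.doc_text_representation
        WHERE search_vector @@ plainto_tsquery(coalesce(search_config, 'simple')::regconfig, ?)
        """,
        (rs, rowNum) -> rs.getObject("id", UUID.class), queryText);

List<UUID> fuzzyTitleHits = jdbcTemplate.query("""
        SELECT id
        FROM doc.doc_document
        WHERE title % ?                        -- trigram match, served by idx_doc_document_title_trgm
        ORDER BY similarity(title, ?) DESC
        """,
        (rs, rowNum) -> rs.getObject("id", UUID.class), queryText, queryText);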