Refactor phases 5 - semantic search - slice 2
This commit is contained in:
parent
47894257a4
commit
039b5a5f0a
|
|
@ -3,8 +3,8 @@ package at.procon.dip.domain.document.service;
|
|||
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
|
||||
import at.procon.dip.search.service.DocumentLexicalIndexService;
|
||||
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
|
@ -37,7 +37,7 @@ public class DocumentRepresentationService {
|
|||
.textBody(command.textBody())
|
||||
.build();
|
||||
DocumentTextRepresentation saved = representationRepository.save(representation);
|
||||
lexicalIndexService.refreshRepresentationLexicalIndex(saved.getId());
|
||||
lexicalIndexService.indexRepresentation(saved.getId());
|
||||
return saved;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -5,10 +5,9 @@ import at.procon.dip.search.spi.SearchDocumentScope;
|
|||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
|
||||
@Getter
|
||||
@Builder
|
||||
@Getter
|
||||
public class SearchExecutionContext {
|
||||
|
||||
private final SearchRequest request;
|
||||
private final SearchDocumentScope scope;
|
||||
private final int page;
|
||||
|
|
|
|||
|
|
@ -6,10 +6,9 @@ import java.util.List;
|
|||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
|
||||
@Getter
|
||||
@Builder
|
||||
@Getter
|
||||
public class SearchExecutionPlan {
|
||||
|
||||
private final List<SearchEngineType> engines;
|
||||
private final boolean collapseByDocument;
|
||||
private final SearchSortMode sortMode;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,19 @@
|
|||
package at.procon.dip.search.dto;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionPlan;
|
||||
import java.util.List;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * Debug payload for a single search execution.
 *
 * <p>Groups the search request, the execution plan, a list of per-engine debug
 * results and the fused response in one object. Lombok generates the accessors,
 * builder and constructors. How the fields are populated is not visible here;
 * presumably a debug endpoint or service fills them - confirm against the producer.
 */
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class SearchDebugResponse {
|
||||
// The search request this debug output refers to.
private SearchRequest request;
|
||||
// The execution plan (engines, collapse flag, sort mode) associated with the request.
private SearchExecutionPlan plan;
|
||||
// Debug results, one entry type per engine (see SearchEngineDebugResult).
private List<SearchEngineDebugResult> engineResults;
|
||||
// The regular search response; presumably the output of result fusion - confirm.
private SearchResponse fusedResponse;
|
||||
}
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
package at.procon.dip.search.dto;
|
||||
|
||||
import java.util.List;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * Debug information about what one search engine contributed to a search.
 *
 * <p>Plain Lombok DTO; accessors, builder and constructors are generated.
 */
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class SearchEngineDebugResult {
|
||||
// Engine these numbers belong to.
private SearchEngineType engineType;
|
||||
// Number of hits for this engine; presumably the full count before any truncation - confirm.
private int hitCount;
|
||||
// Hits of this engine; presumably only the leading (best) ones - confirm against the producer.
private List<SearchHit> topHits;
|
||||
}
|
||||
|
|
@ -2,5 +2,6 @@ package at.procon.dip.search.dto;
|
|||
|
||||
public enum SearchEngineType {
|
||||
POSTGRES_FULLTEXT,
|
||||
POSTGRES_TRIGRAM
|
||||
POSTGRES_TRIGRAM,
|
||||
PGVECTOR_SEMANTIC
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,7 +15,6 @@ import lombok.NoArgsConstructor;
|
|||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class SearchHit {
|
||||
|
||||
private UUID documentId;
|
||||
private UUID representationId;
|
||||
|
||||
|
|
|
|||
|
|
@ -3,5 +3,6 @@ package at.procon.dip.search.dto;
|
|||
/**
 * Search modes a request may ask for. DefaultSearchPlanner translates the
 * requested modes into the set of search engines to run.
 */
public enum SearchMode {
|
||||
// Selects the PostgreSQL full-text engine.
FULLTEXT,
|
||||
// Selects the PostgreSQL trigram engine.
TRIGRAM,
|
||||
// Selects the pgvector semantic engine.
SEMANTIC,
|
||||
// Selects all of the engines above; also the planner's fallback when no mode is given.
HYBRID
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,10 +4,8 @@ import at.procon.dip.domain.access.DocumentVisibility;
|
|||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import jakarta.validation.constraints.Min;
|
||||
import jakarta.validation.constraints.NotBlank;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Set;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
|
|
@ -24,7 +22,7 @@ public class SearchRequest {
|
|||
private String queryText;
|
||||
|
||||
@Builder.Default
|
||||
private Set<SearchMode> modes = new LinkedHashSet<>(Set.of(SearchMode.HYBRID));
|
||||
private Set<SearchMode> modes = Set.of(SearchMode.HYBRID);
|
||||
|
||||
private Set<DocumentType> documentTypes;
|
||||
private Set<DocumentFamily> documentFamilies;
|
||||
|
|
@ -34,10 +32,7 @@ public class SearchRequest {
|
|||
private OffsetDateTime createdFrom;
|
||||
private OffsetDateTime createdTo;
|
||||
|
||||
@Min(0)
|
||||
private Integer page;
|
||||
|
||||
@Min(1)
|
||||
private Integer size;
|
||||
|
||||
@Builder.Default
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ import lombok.NoArgsConstructor;
|
|||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class SearchResponse {
|
||||
|
||||
private List<SearchHit> hits;
|
||||
private int page;
|
||||
private int size;
|
||||
|
|
|
|||
|
|
@ -1,15 +1,11 @@
|
|||
package at.procon.dip.search.engine.fulltext;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchMatchField;
|
||||
import at.procon.dip.search.engine.SearchEngine;
|
||||
import at.procon.dip.search.repository.DocumentFullTextSearchRepository;
|
||||
import at.procon.dip.search.repository.FullTextSearchRow;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import java.util.List;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
|
@ -19,6 +15,7 @@ import org.springframework.stereotype.Component;
|
|||
public class PostgresFullTextSearchEngine implements SearchEngine {
|
||||
|
||||
private final DocumentFullTextSearchRepository repository;
|
||||
private final TedProcessorProperties properties;
|
||||
|
||||
@Override
|
||||
public SearchEngineType type() {
|
||||
|
|
@ -32,40 +29,6 @@ public class PostgresFullTextSearchEngine implements SearchEngine {
|
|||
|
||||
@Override
|
||||
public List<SearchHit> execute(SearchExecutionContext context) {
|
||||
return repository.search(context).stream()
|
||||
.map(this::mapRow)
|
||||
.toList();
|
||||
}
|
||||
|
||||
private SearchHit mapRow(FullTextSearchRow row) {
|
||||
return SearchHit.builder()
|
||||
.documentId(row.documentId())
|
||||
.representationId(row.representationId())
|
||||
.documentType(parseDocumentType(row.documentType()))
|
||||
.documentFamily(parseDocumentFamily(row.documentFamily()))
|
||||
.visibility(parseVisibility(row.visibility()))
|
||||
.title(row.title())
|
||||
.summary(row.summary())
|
||||
.languageCode(row.languageCode())
|
||||
.mimeType(row.mimeType())
|
||||
.primaryEngine(SearchEngineType.POSTGRES_FULLTEXT)
|
||||
.matchedField(SearchMatchField.REPRESENTATION_TEXT)
|
||||
.snippet(row.snippet())
|
||||
.rawScore(row.score() == null ? 0.0d : row.score())
|
||||
.createdAt(row.createdAt())
|
||||
.updatedAt(row.updatedAt())
|
||||
.build();
|
||||
}
|
||||
|
||||
private DocumentType parseDocumentType(String value) {
|
||||
return value == null ? null : DocumentType.valueOf(value);
|
||||
}
|
||||
|
||||
private DocumentFamily parseDocumentFamily(String value) {
|
||||
return value == null ? null : DocumentFamily.valueOf(value);
|
||||
}
|
||||
|
||||
private DocumentVisibility parseVisibility(String value) {
|
||||
return value == null ? null : DocumentVisibility.valueOf(value);
|
||||
return repository.search(context, properties.getSearch().getFulltextCandidateLimit());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,45 @@
|
|||
package at.procon.dip.search.engine.semantic;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.engine.SearchEngine;
|
||||
import at.procon.dip.search.repository.DocumentSemanticSearchRepository;
|
||||
import at.procon.dip.search.service.SemanticQueryEmbeddingService;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import java.util.List;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
public class PgVectorSemanticSearchEngine implements SearchEngine {
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
private final SemanticQueryEmbeddingService queryEmbeddingService;
|
||||
private final DocumentSemanticSearchRepository repository;
|
||||
|
||||
@Override
|
||||
public SearchEngineType type() {
|
||||
return SearchEngineType.PGVECTOR_SEMANTIC;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean supports(SearchExecutionContext context) {
|
||||
return properties.getVectorization().isEnabled()
|
||||
&& context.getRequest().getQueryText() != null
|
||||
&& !context.getRequest().getQueryText().isBlank();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<SearchHit> execute(SearchExecutionContext context) {
|
||||
return queryEmbeddingService.buildQueryEmbedding(context.getRequest().getQueryText())
|
||||
.map(query -> repository.search(
|
||||
context,
|
||||
query.modelId(),
|
||||
query.vectorString(),
|
||||
properties.getSearch().getSemanticCandidateLimit(),
|
||||
properties.getSearch().getSimilarityThreshold()))
|
||||
.orElse(List.of());
|
||||
}
|
||||
}
|
||||
|
|
@ -1,15 +1,11 @@
|
|||
package at.procon.dip.search.engine.trigram;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchMatchField;
|
||||
import at.procon.dip.search.engine.SearchEngine;
|
||||
import at.procon.dip.search.repository.DocumentTrigramSearchRepository;
|
||||
import at.procon.dip.search.repository.TrigramSearchRow;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import java.util.List;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
|
@ -19,6 +15,7 @@ import org.springframework.stereotype.Component;
|
|||
public class PostgresTrigramSearchEngine implements SearchEngine {
|
||||
|
||||
private final DocumentTrigramSearchRepository repository;
|
||||
private final TedProcessorProperties properties;
|
||||
|
||||
@Override
|
||||
public SearchEngineType type() {
|
||||
|
|
@ -32,44 +29,9 @@ public class PostgresTrigramSearchEngine implements SearchEngine {
|
|||
|
||||
@Override
|
||||
public List<SearchHit> execute(SearchExecutionContext context) {
|
||||
return repository.search(context).stream()
|
||||
.map(this::mapRow)
|
||||
.toList();
|
||||
}
|
||||
|
||||
private SearchHit mapRow(TrigramSearchRow row) {
|
||||
return SearchHit.builder()
|
||||
.documentId(row.documentId())
|
||||
.representationId(row.representationId())
|
||||
.documentType(parseDocumentType(row.documentType()))
|
||||
.documentFamily(parseDocumentFamily(row.documentFamily()))
|
||||
.visibility(parseVisibility(row.visibility()))
|
||||
.title(row.title())
|
||||
.summary(row.summary())
|
||||
.languageCode(row.languageCode())
|
||||
.mimeType(row.mimeType())
|
||||
.primaryEngine(SearchEngineType.POSTGRES_TRIGRAM)
|
||||
.matchedField(parseMatchField(row.matchedField()))
|
||||
.snippet(row.snippet())
|
||||
.rawScore(row.score() == null ? 0.0d : row.score())
|
||||
.createdAt(row.createdAt())
|
||||
.updatedAt(row.updatedAt())
|
||||
.build();
|
||||
}
|
||||
|
||||
private SearchMatchField parseMatchField(String value) {
|
||||
return value == null ? SearchMatchField.REPRESENTATION_TEXT : SearchMatchField.valueOf(value);
|
||||
}
|
||||
|
||||
private DocumentType parseDocumentType(String value) {
|
||||
return value == null ? null : DocumentType.valueOf(value);
|
||||
}
|
||||
|
||||
private DocumentFamily parseDocumentFamily(String value) {
|
||||
return value == null ? null : DocumentFamily.valueOf(value);
|
||||
}
|
||||
|
||||
private DocumentVisibility parseVisibility(String value) {
|
||||
return value == null ? null : DocumentVisibility.valueOf(value);
|
||||
return repository.search(
|
||||
context,
|
||||
properties.getSearch().getTrigramCandidateLimit(),
|
||||
properties.getSearch().getTrigramSimilarityThreshold());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import at.procon.dip.search.api.SearchExecutionPlan;
|
|||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchMode;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
|
@ -15,17 +16,23 @@ public class DefaultSearchPlanner implements SearchPlanner {
|
|||
@Override
|
||||
public SearchExecutionPlan plan(SearchExecutionContext context) {
|
||||
Set<SearchMode> modes = context.getRequest().getModes();
|
||||
List<SearchEngineType> engines = new ArrayList<>();
|
||||
if (modes == null || modes.isEmpty()) {
|
||||
modes = Set.of(SearchMode.HYBRID);
|
||||
}
|
||||
|
||||
if (modes == null || modes.isEmpty() || modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.FULLTEXT)) {
|
||||
Set<SearchEngineType> engines = new LinkedHashSet<>();
|
||||
if (modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.FULLTEXT)) {
|
||||
engines.add(SearchEngineType.POSTGRES_FULLTEXT);
|
||||
}
|
||||
if (modes == null || modes.isEmpty() || modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.TRIGRAM)) {
|
||||
if (modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.TRIGRAM)) {
|
||||
engines.add(SearchEngineType.POSTGRES_TRIGRAM);
|
||||
}
|
||||
if (modes.contains(SearchMode.HYBRID) || modes.contains(SearchMode.SEMANTIC)) {
|
||||
engines.add(SearchEngineType.PGVECTOR_SEMANTIC);
|
||||
}
|
||||
|
||||
return SearchExecutionPlan.builder()
|
||||
.engines(engines)
|
||||
.engines(new ArrayList<>(engines))
|
||||
.collapseByDocument(context.getRequest().isCollapseByDocument())
|
||||
.sortMode(context.getRequest().getSortMode())
|
||||
.build();
|
||||
|
|
|
|||
|
|
@ -5,103 +5,120 @@ import at.procon.dip.search.api.SearchExecutionPlan;
|
|||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchResponse;
|
||||
import at.procon.dip.search.dto.SearchSortMode;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.EnumMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
public class DefaultSearchResultFusionService implements SearchResultFusionService {
|
||||
|
||||
private static final double FULLTEXT_WEIGHT = 0.60d;
|
||||
private static final double TRIGRAM_WEIGHT = 0.40d;
|
||||
|
||||
private final SearchScoreNormalizer scoreNormalizer;
|
||||
|
||||
public DefaultSearchResultFusionService(SearchScoreNormalizer scoreNormalizer) {
|
||||
this.scoreNormalizer = scoreNormalizer;
|
||||
}
|
||||
private final SearchScoreNormalizer normalizer;
|
||||
private final TedProcessorProperties properties;
|
||||
|
||||
@Override
|
||||
public SearchResponse fuse(SearchExecutionContext context,
|
||||
SearchExecutionPlan plan,
|
||||
Map<SearchEngineType, List<SearchHit>> engineResults) {
|
||||
Map<SearchEngineType, List<SearchHit>> normalizedResults = new LinkedHashMap<>();
|
||||
for (Map.Entry<SearchEngineType, List<SearchHit>> entry : engineResults.entrySet()) {
|
||||
normalizedResults.put(entry.getKey(), scoreNormalizer.normalize(entry.getKey(), entry.getValue()));
|
||||
}
|
||||
Map<SearchEngineType, List<SearchHit>> normalized = new EnumMap<>(SearchEngineType.class);
|
||||
engineResults.forEach((engine, hits) -> normalized.put(engine, normalizer.normalize(engine, hits)));
|
||||
|
||||
List<SearchHit> ranked = plan.isCollapseByDocument()
|
||||
? collapseByDocument(normalizedResults)
|
||||
: flatten(normalizedResults);
|
||||
List<SearchHit> fused = plan.isCollapseByDocument()
|
||||
? collapse(normalized)
|
||||
: mergeWithoutCollapse(normalized);
|
||||
|
||||
ranked.sort(Comparator
|
||||
.comparingDouble(SearchHit::getFinalScore).reversed()
|
||||
.thenComparing(SearchHit::getUpdatedAt, Comparator.nullsLast(Comparator.reverseOrder())));
|
||||
|
||||
int totalHits = ranked.size();
|
||||
int fromIndex = Math.min(context.getPage() * context.getSize(), ranked.size());
|
||||
int toIndex = Math.min(fromIndex + context.getSize(), ranked.size());
|
||||
List<SearchHit> pageHits = ranked.subList(fromIndex, toIndex);
|
||||
sort(fused, plan.getSortMode());
|
||||
long total = fused.size();
|
||||
int fromIndex = Math.min(context.getPage() * context.getSize(), fused.size());
|
||||
int toIndex = Math.min(fromIndex + context.getSize(), fused.size());
|
||||
List<SearchHit> paged = fromIndex >= toIndex ? List.of() : fused.subList(fromIndex, toIndex);
|
||||
|
||||
return SearchResponse.builder()
|
||||
.hits(new ArrayList<>(pageHits))
|
||||
.hits(paged)
|
||||
.page(context.getPage())
|
||||
.size(context.getSize())
|
||||
.totalHits(totalHits)
|
||||
.truncated(toIndex < totalHits)
|
||||
.enginesUsed(new LinkedHashSet<>(normalizedResults.keySet()))
|
||||
.totalHits(total)
|
||||
.truncated(total > toIndex)
|
||||
.enginesUsed(engineResults.keySet())
|
||||
.build();
|
||||
}
|
||||
|
||||
private List<SearchHit> flatten(Map<SearchEngineType, List<SearchHit>> normalizedResults) {
|
||||
private List<SearchHit> collapse(Map<SearchEngineType, List<SearchHit>> normalized) {
|
||||
Map<UUID, Aggregate> aggregates = new LinkedHashMap<>();
|
||||
normalized.forEach((engine, hits) -> {
|
||||
for (SearchHit hit : hits) {
|
||||
Aggregate aggregate = aggregates.computeIfAbsent(hit.getDocumentId(), id -> new Aggregate());
|
||||
aggregate.bestByEngine.put(engine, hit);
|
||||
if (aggregate.representative == null || hit.getNormalizedScore() > aggregate.representative.getNormalizedScore()) {
|
||||
aggregate.representative = hit;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
List<SearchHit> fused = new ArrayList<>();
|
||||
for (Aggregate aggregate : aggregates.values()) {
|
||||
SearchHit representative = aggregate.representative;
|
||||
double finalScore = weight(SearchEngineType.POSTGRES_FULLTEXT, aggregate) +
|
||||
weight(SearchEngineType.POSTGRES_TRIGRAM, aggregate) +
|
||||
weight(SearchEngineType.PGVECTOR_SEMANTIC, aggregate);
|
||||
fused.add(representative.toBuilder().finalScore(finalScore).build());
|
||||
}
|
||||
return fused;
|
||||
}
|
||||
|
||||
private double weight(SearchEngineType engineType, Aggregate aggregate) {
|
||||
SearchHit hit = aggregate.bestByEngine.get(engineType);
|
||||
if (hit == null) {
|
||||
return 0.0d;
|
||||
}
|
||||
TedProcessorProperties.SearchProperties search = properties.getSearch();
|
||||
return switch (engineType) {
|
||||
case POSTGRES_FULLTEXT -> hit.getNormalizedScore() * search.getFulltextWeight();
|
||||
case POSTGRES_TRIGRAM -> hit.getNormalizedScore() * search.getTrigramWeight();
|
||||
case PGVECTOR_SEMANTIC -> hit.getNormalizedScore() * search.getSemanticWeight();
|
||||
};
|
||||
}
|
||||
|
||||
private List<SearchHit> mergeWithoutCollapse(Map<SearchEngineType, List<SearchHit>> normalized) {
|
||||
List<SearchHit> merged = new ArrayList<>();
|
||||
for (Map.Entry<SearchEngineType, List<SearchHit>> entry : normalizedResults.entrySet()) {
|
||||
for (SearchHit hit : entry.getValue()) {
|
||||
merged.add(hit.toBuilder().finalScore(weight(entry.getKey()) * hit.getNormalizedScore()).build());
|
||||
}
|
||||
normalized.forEach((engine, hits) -> {
|
||||
for (SearchHit hit : hits) {
|
||||
double finalScore = switch (engine) {
|
||||
case POSTGRES_FULLTEXT -> hit.getNormalizedScore() * properties.getSearch().getFulltextWeight();
|
||||
case POSTGRES_TRIGRAM -> hit.getNormalizedScore() * properties.getSearch().getTrigramWeight();
|
||||
case PGVECTOR_SEMANTIC -> hit.getNormalizedScore() * properties.getSearch().getSemanticWeight();
|
||||
};
|
||||
merged.add(hit.toBuilder().finalScore(finalScore).build());
|
||||
}
|
||||
});
|
||||
return merged;
|
||||
}
|
||||
|
||||
private List<SearchHit> collapseByDocument(Map<SearchEngineType, List<SearchHit>> normalizedResults) {
|
||||
Map<UUID, SearchHit> collapsed = new LinkedHashMap<>();
|
||||
Map<UUID, Double> accumulatedScores = new LinkedHashMap<>();
|
||||
Set<UUID> docOrder = new LinkedHashSet<>();
|
||||
|
||||
for (Map.Entry<SearchEngineType, List<SearchHit>> entry : normalizedResults.entrySet()) {
|
||||
double weight = weight(entry.getKey());
|
||||
for (SearchHit hit : entry.getValue()) {
|
||||
docOrder.add(hit.getDocumentId());
|
||||
double contribution = weight * hit.getNormalizedScore();
|
||||
accumulatedScores.merge(hit.getDocumentId(), contribution, Double::sum);
|
||||
|
||||
SearchHit existing = collapsed.get(hit.getDocumentId());
|
||||
if (existing == null || hit.getNormalizedScore() > existing.getNormalizedScore()) {
|
||||
collapsed.put(hit.getDocumentId(), hit);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
List<SearchHit> results = new ArrayList<>(docOrder.size());
|
||||
for (UUID documentId : docOrder) {
|
||||
SearchHit base = collapsed.get(documentId);
|
||||
if (base != null) {
|
||||
results.add(base.toBuilder().finalScore(accumulatedScores.getOrDefault(documentId, 0.0d)).build());
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
private double weight(SearchEngineType engineType) {
|
||||
return switch (engineType) {
|
||||
case POSTGRES_FULLTEXT -> FULLTEXT_WEIGHT;
|
||||
case POSTGRES_TRIGRAM -> TRIGRAM_WEIGHT;
|
||||
private void sort(List<SearchHit> hits, SearchSortMode sortMode) {
|
||||
Comparator<SearchHit> comparator = switch (sortMode) {
|
||||
case CREATED_AT_DESC -> Comparator.comparing(SearchHit::getCreatedAt,
|
||||
Comparator.nullsLast(Comparator.reverseOrder()));
|
||||
case TITLE_ASC -> Comparator.comparing(hit -> hit.getTitle() == null ? "" : hit.getTitle(),
|
||||
String.CASE_INSENSITIVE_ORDER);
|
||||
case SCORE_DESC -> Comparator.comparingDouble(SearchHit::getFinalScore).reversed();
|
||||
};
|
||||
if (sortMode != SearchSortMode.SCORE_DESC) {
|
||||
comparator = comparator.thenComparing(Comparator.comparingDouble(SearchHit::getFinalScore).reversed());
|
||||
}
|
||||
hits.sort(comparator);
|
||||
}
|
||||
|
||||
/**
 * Mutable per-document accumulator used while collapsing engine results to one
 * hit per document.
 */
private static final class Aggregate {
|
||||
// Hit retained for each engine that returned this document; filled by collapse, read by weight.
private final Map<SearchEngineType, SearchHit> bestByEngine = new EnumMap<>(SearchEngineType.class);
|
||||
// Hit with the highest normalized score seen for this document; the fused result is built from it.
private SearchHit representative;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@ package at.procon.dip.search.rank;
|
|||
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
|
|
@ -14,15 +13,22 @@ public class DefaultSearchScoreNormalizer implements SearchScoreNormalizer {
|
|||
if (hits == null || hits.isEmpty()) {
|
||||
return List.of();
|
||||
}
|
||||
double max = hits.stream().mapToDouble(SearchHit::getRawScore).max().orElse(0.0d);
|
||||
if (max <= 0.0d) {
|
||||
max = 1.0d;
|
||||
double max = hits.stream().mapToDouble(SearchHit::getRawScore).max().orElse(1.0d);
|
||||
double divisor = max > 0.0d ? max : 1.0d;
|
||||
return hits.stream()
|
||||
.map(hit -> hit.toBuilder()
|
||||
.normalizedScore(clamp(hit.getRawScore() / divisor))
|
||||
.build())
|
||||
.toList();
|
||||
}
|
||||
List<SearchHit> normalized = new ArrayList<>(hits.size());
|
||||
for (SearchHit hit : hits) {
|
||||
double score = Math.max(0.0d, Math.min(1.0d, hit.getRawScore() / max));
|
||||
normalized.add(hit.toBuilder().normalizedScore(score).build());
|
||||
|
||||
private double clamp(double value) {
|
||||
if (value < 0.0d) {
|
||||
return 0.0d;
|
||||
}
|
||||
return normalized;
|
||||
if (value > 1.0d) {
|
||||
return 1.0d;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9,9 +9,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
public interface SearchResultFusionService {
|
||||
SearchResponse fuse(
|
||||
SearchExecutionContext context,
|
||||
SearchResponse fuse(SearchExecutionContext context,
|
||||
SearchExecutionPlan plan,
|
||||
Map<SearchEngineType, List<SearchHit>> engineResults
|
||||
);
|
||||
Map<SearchEngineType, List<SearchHit>> engineResults);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,10 @@
|
|||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import java.util.List;
|
||||
|
||||
public interface DocumentFullTextSearchRepository {
|
||||
List<FullTextSearchRow> search(SearchExecutionContext context);
|
||||
|
||||
List<SearchHit> search(SearchExecutionContext context, int limit);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,72 +1,53 @@
|
|||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import jakarta.persistence.EntityManager;
|
||||
import jakarta.persistence.PersistenceContext;
|
||||
import jakarta.persistence.Query;
|
||||
import java.util.ArrayList;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchMatchField;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
|
||||
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
@Repository
|
||||
public class DocumentFullTextSearchRepositoryImpl extends AbstractNativeSearchRepository implements DocumentFullTextSearchRepository {
|
||||
@RequiredArgsConstructor
|
||||
public class DocumentFullTextSearchRepositoryImpl implements DocumentFullTextSearchRepository {
|
||||
|
||||
@PersistenceContext
|
||||
private EntityManager entityManager;
|
||||
private final NamedParameterJdbcTemplate jdbcTemplate;
|
||||
|
||||
@Override
|
||||
public List<FullTextSearchRow> search(SearchExecutionContext context) {
|
||||
public List<SearchHit> search(SearchExecutionContext context, int limit) {
|
||||
StringBuilder sql = new StringBuilder("""
|
||||
SELECT
|
||||
d.id AS document_id,
|
||||
dtr.id AS representation_id,
|
||||
CAST(d.document_type AS text) AS document_type,
|
||||
CAST(d.document_family AS text) AS document_family,
|
||||
CAST(d.visibility AS text) AS visibility,
|
||||
d.title AS title,
|
||||
d.summary AS summary,
|
||||
COALESCE(dtr.language_code, d.language_code) AS language_code,
|
||||
d.mime_type AS mime_type,
|
||||
d.document_type AS document_type,
|
||||
d.document_family AS document_family,
|
||||
d.visibility AS visibility,
|
||||
d.created_at AS created_at,
|
||||
d.updated_at AS updated_at,
|
||||
ts_headline('simple', COALESCE(dtr.text_body, ''), websearch_to_tsquery('simple', :queryText)) AS snippet,
|
||||
ts_headline('simple', COALESCE(dtr.text_body, ''), websearch_to_tsquery('simple', :queryText),
|
||||
'MaxFragments=2, MinWords=5, MaxWords=20') AS snippet,
|
||||
ts_rank_cd(dtr.search_vector, websearch_to_tsquery('simple', :queryText)) AS score
|
||||
FROM DOC.doc_text_representation dtr
|
||||
JOIN DOC.doc_document d ON d.id = dtr.document_id
|
||||
LEFT JOIN DOC.doc_tenant dt ON dt.id = d.owner_tenant_id
|
||||
WHERE dtr.search_vector @@ websearch_to_tsquery('simple', :queryText)
|
||||
FROM doc.doc_text_representation dtr
|
||||
JOIN doc.doc_document d ON d.id = dtr.document_id
|
||||
LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
|
||||
WHERE dtr.search_vector IS NOT NULL
|
||||
AND dtr.search_vector @@ websearch_to_tsquery('simple', :queryText)
|
||||
""");
|
||||
|
||||
Map<String, Object> params = newParams();
|
||||
params.put("queryText", context.getRequest().getQueryText().trim());
|
||||
appendGenericFilters(sql, params, context);
|
||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
params.addValue("queryText", context.getRequest().getQueryText());
|
||||
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
||||
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
|
||||
params.put("limit", engineLimit(context));
|
||||
params.addValue("limit", limit);
|
||||
|
||||
Query query = entityManager.createNativeQuery(sql.toString());
|
||||
bindParameters(query, params);
|
||||
|
||||
List<?> rows = query.getResultList();
|
||||
List<FullTextSearchRow> results = new ArrayList<>(rows.size());
|
||||
for (Object row : rows) {
|
||||
Object[] cols = (Object[]) row;
|
||||
results.add(new FullTextSearchRow(
|
||||
asUuid(cols[0]),
|
||||
asUuid(cols[1]),
|
||||
asString(cols[2]),
|
||||
asString(cols[3]),
|
||||
asString(cols[4]),
|
||||
asString(cols[5]),
|
||||
asString(cols[6]),
|
||||
asString(cols[7]),
|
||||
asString(cols[8]),
|
||||
asOffsetDateTime(cols[9]),
|
||||
asOffsetDateTime(cols[10]),
|
||||
asString(cols[11]),
|
||||
asDouble(cols[12])
|
||||
));
|
||||
}
|
||||
return results;
|
||||
return jdbcTemplate.query(sql.toString(), params,
|
||||
new SearchHitRowMapper(SearchEngineType.POSTGRES_FULLTEXT, SearchMatchField.REPRESENTATION_TEXT));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,61 @@
|
|||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchMatchField;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
|
||||
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
/**
 * Native-SQL repository for semantic (vector) search.
 *
 * <p>Compares stored embeddings in {@code doc.doc_embedding} with a query vector and
 * returns the matching text representations together with their document metadata.
 * The score is {@code 1 - (embedding_vector <=> queryVector)}; with pgvector,
 * {@code <=>} is the cosine-distance operator, so the score is the cosine similarity.
 */
@Repository
|
||||
@RequiredArgsConstructor
|
||||
public class DocumentSemanticSearchRepository {
|
||||
|
||||
private final NamedParameterJdbcTemplate jdbcTemplate;
|
||||
|
||||
/**
 * Runs the vector similarity query.
 *
 * <p>Only embeddings with status {@code COMPLETED}, a non-null vector and the given
 * model id are considered, and only rows whose score is at least {@code threshold}
 * are kept. Rows are ordered by score (descending), then by the document's
 * {@code updated_at} (descending), and cut off at {@code limit}. The snippet is
 * the first 400 characters of the representation text, falling back to the
 * document summary, then title.
 *
 * @param context     search context handed to the common filter support
 * @param modelId     embedding model whose vectors are compared
 * @param queryVector query embedding as text; cast to {@code vector} inside the SQL
 * @param limit       maximum number of rows to return
 * @param threshold   minimum score (inclusive) a row must reach
 * @return hits produced by {@code SearchHitRowMapper}, which is constructed with
 *         {@code PGVECTOR_SEMANTIC} and {@code REPRESENTATION_TEXT}
 */
public List<SearchHit> search(SearchExecutionContext context,
|
||||
UUID modelId,
|
||||
String queryVector,
|
||||
int limit,
|
||||
double threshold) {
|
||||
StringBuilder sql = new StringBuilder("""
|
||||
SELECT
|
||||
d.id AS document_id,
|
||||
dtr.id AS representation_id,
|
||||
CAST(d.document_type AS text) AS document_type,
|
||||
CAST(d.document_family AS text) AS document_family,
|
||||
CAST(d.visibility AS text) AS visibility,
|
||||
d.title AS title,
|
||||
d.summary AS summary,
|
||||
COALESCE(dtr.language_code, d.language_code) AS language_code,
|
||||
d.mime_type AS mime_type,
|
||||
d.created_at AS created_at,
|
||||
d.updated_at AS updated_at,
|
||||
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
|
||||
(1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) AS score
|
||||
FROM doc.doc_embedding de
|
||||
JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id
|
||||
JOIN doc.doc_document d ON d.id = de.document_id
|
||||
LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
|
||||
WHERE de.embedding_status = 'COMPLETED'
|
||||
AND de.embedding_vector IS NOT NULL
|
||||
AND de.model_id = :modelId
|
||||
AND (1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) >= :threshold
|
||||
""");
|
||||
|
||||
// All user-supplied values are bound as named parameters, never concatenated into the SQL.
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
params.addValue("queryVector", queryVector);
|
||||
params.addValue("modelId", modelId);
|
||||
params.addValue("threshold", threshold);
|
||||
// Appends further WHERE conditions derived from the context for aliases "d"/"dtr".
// NOTE(review): the exact filters and the meaning of the trailing 'true' live in SearchSqlFilterSupport - confirm there.
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
||||
// ORDER BY / LIMIT must come after the appended filters.
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
|
||||
params.addValue("limit", limit);
|
||||
|
||||
return jdbcTemplate.query(sql.toString(), params,
|
||||
new SearchHitRowMapper(SearchEngineType.PGVECTOR_SEMANTIC, SearchMatchField.REPRESENTATION_TEXT));
|
||||
}
|
||||
}
|
||||
|
|
@ -1,8 +1,10 @@
|
|||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import java.util.List;
|
||||
|
||||
public interface DocumentTrigramSearchRepository {
|
||||
List<TrigramSearchRow> search(SearchExecutionContext context);
|
||||
|
||||
List<SearchHit> search(SearchExecutionContext context, int limit, double threshold);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,102 +1,60 @@
|
|||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import jakarta.persistence.EntityManager;
|
||||
import jakarta.persistence.PersistenceContext;
|
||||
import jakarta.persistence.Query;
|
||||
import java.util.ArrayList;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchMatchField;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
|
||||
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
@Repository
|
||||
public class DocumentTrigramSearchRepositoryImpl extends AbstractNativeSearchRepository implements DocumentTrigramSearchRepository {
|
||||
@RequiredArgsConstructor
|
||||
public class DocumentTrigramSearchRepositoryImpl implements DocumentTrigramSearchRepository {
|
||||
|
||||
@PersistenceContext
|
||||
private EntityManager entityManager;
|
||||
private final NamedParameterJdbcTemplate jdbcTemplate;
|
||||
|
||||
@Override
|
||||
public List<TrigramSearchRow> search(SearchExecutionContext context) {
|
||||
StringBuilder sql = new StringBuilder("""
|
||||
SELECT
|
||||
d.id AS document_id,
|
||||
dtr.id AS representation_id,
|
||||
d.title AS title,
|
||||
d.summary AS summary,
|
||||
COALESCE(dtr.language_code, d.language_code) AS language_code,
|
||||
d.mime_type AS mime_type,
|
||||
d.document_type AS document_type,
|
||||
d.document_family AS document_family,
|
||||
d.visibility AS visibility,
|
||||
d.created_at AS created_at,
|
||||
d.updated_at AS updated_at,
|
||||
CASE
|
||||
WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText)
|
||||
AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
|
||||
THEN COALESCE(d.title, '')
|
||||
WHEN similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
|
||||
THEN COALESCE(d.summary, '')
|
||||
ELSE LEFT(COALESCE(dtr.text_body, ''), 400)
|
||||
END AS snippet,
|
||||
GREATEST(
|
||||
similarity(COALESCE(d.title, ''), :queryText),
|
||||
similarity(COALESCE(d.summary, ''), :queryText),
|
||||
similarity(COALESCE(dtr.text_body, ''), :queryText)
|
||||
) AS score,
|
||||
CASE
|
||||
WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText)
|
||||
AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
|
||||
THEN 'DOCUMENT_TITLE'
|
||||
WHEN similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
|
||||
THEN 'DOCUMENT_SUMMARY'
|
||||
ELSE 'REPRESENTATION_TEXT'
|
||||
END AS matched_field
|
||||
FROM DOC.doc_text_representation dtr
|
||||
JOIN DOC.doc_document d ON d.id = dtr.document_id
|
||||
LEFT JOIN DOC.doc_tenant dt ON dt.id = d.owner_tenant_id
|
||||
WHERE (
|
||||
COALESCE(d.title, '') % :queryText
|
||||
OR COALESCE(d.summary, '') % :queryText
|
||||
OR COALESCE(dtr.text_body, '') % :queryText
|
||||
)
|
||||
""");
|
||||
public List<SearchHit> search(SearchExecutionContext context, int limit, double threshold) {
|
||||
String scoreExpr = "GREATEST(" +
|
||||
"similarity(COALESCE(d.title, ''), :queryText), " +
|
||||
"similarity(COALESCE(d.summary, ''), :queryText), " +
|
||||
"similarity(COALESCE(dtr.text_body, ''), :queryText))";
|
||||
|
||||
Map<String, Object> params = newParams();
|
||||
params.put("queryText", context.getRequest().getQueryText().trim());
|
||||
appendGenericFilters(sql, params, context);
|
||||
sql.append(" AND GREATEST(")
|
||||
.append(" similarity(COALESCE(d.title, ''), :queryText),")
|
||||
.append(" similarity(COALESCE(d.summary, ''), :queryText),")
|
||||
.append(" similarity(COALESCE(dtr.text_body, ''), :queryText)")
|
||||
.append(") >= :minSimilarity");
|
||||
StringBuilder sql = new StringBuilder("SELECT " +
|
||||
"d.id AS document_id, " +
|
||||
"dtr.id AS representation_id, " +
|
||||
"CAST(d.document_type AS text) AS document_type, " +
|
||||
"CAST(d.document_family AS text) AS document_family, " +
|
||||
"CAST(d.visibility AS text) AS visibility, " +
|
||||
"d.title AS title, " +
|
||||
"d.summary AS summary, " +
|
||||
"COALESCE(dtr.language_code, d.language_code) AS language_code, " +
|
||||
"d.mime_type AS mime_type, " +
|
||||
"d.created_at AS created_at, " +
|
||||
"d.updated_at AS updated_at, " +
|
||||
"LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet, " +
|
||||
scoreExpr + " AS score, " +
|
||||
"CASE " +
|
||||
"WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText) " +
|
||||
" AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_TITLE' " +
|
||||
"WHEN similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_SUMMARY' " +
|
||||
"ELSE 'REPRESENTATION_TEXT' END AS matched_field " +
|
||||
"FROM doc.doc_text_representation dtr " +
|
||||
"JOIN doc.doc_document d ON d.id = dtr.document_id " +
|
||||
"LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id " +
|
||||
"WHERE " + scoreExpr + " >= :threshold");
|
||||
|
||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
params.addValue("queryText", context.getRequest().getQueryText());
|
||||
params.addValue("threshold", threshold);
|
||||
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
||||
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
|
||||
params.put("minSimilarity", 0.10d);
|
||||
params.put("limit", engineLimit(context));
|
||||
params.addValue("limit", limit);
|
||||
|
||||
Query query = entityManager.createNativeQuery(sql.toString());
|
||||
bindParameters(query, params);
|
||||
|
||||
List<?> rows = query.getResultList();
|
||||
List<TrigramSearchRow> results = new ArrayList<>(rows.size());
|
||||
for (Object row : rows) {
|
||||
Object[] cols = (Object[]) row;
|
||||
results.add(new TrigramSearchRow(
|
||||
asUuid(cols[0]),
|
||||
asUuid(cols[1]),
|
||||
asString(cols[2]),
|
||||
asString(cols[3]),
|
||||
asString(cols[4]),
|
||||
asString(cols[5]),
|
||||
asString(cols[6]),
|
||||
asString(cols[7]),
|
||||
asString(cols[8]),
|
||||
asOffsetDateTime(cols[9]),
|
||||
asOffsetDateTime(cols[10]),
|
||||
asString(cols[11]),
|
||||
asDouble(cols[12]),
|
||||
asString(cols[13])
|
||||
));
|
||||
}
|
||||
return results;
|
||||
return jdbcTemplate.query(sql.toString(), params,
|
||||
new SearchHitRowMapper(SearchEngineType.POSTGRES_TRIGRAM, SearchMatchField.REPRESENTATION_TEXT));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,54 @@
|
|||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchMatchField;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import org.springframework.jdbc.core.RowMapper;
|
||||
|
||||
final class SearchHitRowMapper implements RowMapper<SearchHit> {
|
||||
|
||||
private final SearchEngineType engineType;
|
||||
private final SearchMatchField defaultField;
|
||||
|
||||
SearchHitRowMapper(SearchEngineType engineType, SearchMatchField defaultField) {
|
||||
this.engineType = engineType;
|
||||
this.defaultField = defaultField;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SearchHit mapRow(ResultSet rs, int rowNum) throws SQLException {
|
||||
String matchedField = safeGetString(rs, "matched_field");
|
||||
return SearchHit.builder()
|
||||
.documentId(rs.getObject("document_id", java.util.UUID.class))
|
||||
.representationId(rs.getObject("representation_id", java.util.UUID.class))
|
||||
.documentType(DocumentType.valueOf(rs.getString("document_type")))
|
||||
.documentFamily(DocumentFamily.valueOf(rs.getString("document_family")))
|
||||
.visibility(DocumentVisibility.valueOf(rs.getString("visibility")))
|
||||
.title(safeGetString(rs, "title"))
|
||||
.summary(safeGetString(rs, "summary"))
|
||||
.languageCode(safeGetString(rs, "language_code"))
|
||||
.mimeType(safeGetString(rs, "mime_type"))
|
||||
.primaryEngine(engineType)
|
||||
.matchedField(matchedField == null || matchedField.isBlank()
|
||||
? defaultField
|
||||
: SearchMatchField.valueOf(matchedField))
|
||||
.snippet(safeGetString(rs, "snippet"))
|
||||
.rawScore(rs.getDouble("score"))
|
||||
.createdAt(rs.getObject("created_at", java.time.OffsetDateTime.class))
|
||||
.updatedAt(rs.getObject("updated_at", java.time.OffsetDateTime.class))
|
||||
.build();
|
||||
}
|
||||
|
||||
private String safeGetString(ResultSet rs, String column) {
|
||||
try {
|
||||
return rs.getString(column);
|
||||
} catch (SQLException ignore) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,84 @@
|
|||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
|
||||
final class SearchSqlFilterSupport {
|
||||
|
||||
private SearchSqlFilterSupport() {
|
||||
}
|
||||
|
||||
static void appendCommonFilters(StringBuilder sql,
|
||||
MapSqlParameterSource params,
|
||||
SearchExecutionContext context,
|
||||
String documentAlias,
|
||||
String representationAlias,
|
||||
boolean tenantJoinPresent) {
|
||||
Set<DocumentType> documentTypes = firstNonEmpty(context.getRequest().getDocumentTypes(), context.getScope().documentTypes());
|
||||
if (!CollectionUtils.isEmpty(documentTypes)) {
|
||||
sql.append(" AND CAST(").append(documentAlias).append(".document_type AS text) IN (:documentTypes)");
|
||||
params.addValue("documentTypes", enumNames(documentTypes));
|
||||
}
|
||||
|
||||
Set<DocumentFamily> documentFamilies = firstNonEmpty(context.getRequest().getDocumentFamilies(), context.getScope().documentFamilies());
|
||||
if (!CollectionUtils.isEmpty(documentFamilies)) {
|
||||
sql.append(" AND CAST(").append(documentAlias).append(".document_family AS text) IN (:documentFamilies)");
|
||||
params.addValue("documentFamilies", enumNames(documentFamilies));
|
||||
}
|
||||
|
||||
Set<DocumentVisibility> visibilities = firstNonEmpty(context.getRequest().getVisibilities(), context.getScope().visibilities());
|
||||
if (!CollectionUtils.isEmpty(visibilities)) {
|
||||
sql.append(" AND CAST(").append(documentAlias).append(".visibility AS text) IN (:visibilities)");
|
||||
params.addValue("visibilities", enumNames(visibilities));
|
||||
}
|
||||
|
||||
Set<String> languageCodes = context.getRequest().getLanguageCodes();
|
||||
if (CollectionUtils.isEmpty(languageCodes) && context.getScope().languageCode() != null && !context.getScope().languageCode().isBlank()) {
|
||||
languageCodes = Set.of(context.getScope().languageCode());
|
||||
}
|
||||
if (!CollectionUtils.isEmpty(languageCodes)) {
|
||||
sql.append(" AND COALESCE(").append(representationAlias).append(".language_code, ")
|
||||
.append(documentAlias).append(".language_code, '') IN (:languageCodes)");
|
||||
params.addValue("languageCodes", languageCodes);
|
||||
}
|
||||
|
||||
Set<RepresentationType> representationTypes = context.getRequest().getRepresentationTypes();
|
||||
if (!CollectionUtils.isEmpty(representationTypes)) {
|
||||
sql.append(" AND CAST(").append(representationAlias).append(".representation_type AS text) IN (:representationTypes)");
|
||||
params.addValue("representationTypes", enumNames(representationTypes));
|
||||
} else {
|
||||
sql.append(" AND ").append(representationAlias).append(".is_primary = true");
|
||||
}
|
||||
|
||||
if (context.getRequest().getCreatedFrom() != null) {
|
||||
sql.append(" AND ").append(documentAlias).append(".created_at >= :createdFrom");
|
||||
params.addValue("createdFrom", context.getRequest().getCreatedFrom());
|
||||
}
|
||||
if (context.getRequest().getCreatedTo() != null) {
|
||||
sql.append(" AND ").append(documentAlias).append(".created_at <= :createdTo");
|
||||
params.addValue("createdTo", context.getRequest().getCreatedTo());
|
||||
}
|
||||
|
||||
if (tenantJoinPresent && !CollectionUtils.isEmpty(context.getScope().ownerTenantKeys())) {
|
||||
sql.append(" AND dt.tenant_key IN (:ownerTenantKeys)");
|
||||
params.addValue("ownerTenantKeys", context.getScope().ownerTenantKeys());
|
||||
}
|
||||
}
|
||||
|
||||
private static <T> Set<T> firstNonEmpty(Set<T> primary, Set<T> fallback) {
|
||||
return !CollectionUtils.isEmpty(primary) ? primary : fallback;
|
||||
}
|
||||
|
||||
private static List<String> enumNames(Collection<? extends Enum<?>> values) {
|
||||
return values.stream().map(Enum::name).collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
|
|
@ -2,6 +2,8 @@ package at.procon.dip.search.service;
|
|||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.api.SearchExecutionPlan;
|
||||
import at.procon.dip.search.dto.SearchDebugResponse;
|
||||
import at.procon.dip.search.dto.SearchEngineDebugResult;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchRequest;
|
||||
|
|
@ -10,6 +12,8 @@ import at.procon.dip.search.engine.SearchEngine;
|
|||
import at.procon.dip.search.plan.SearchPlanner;
|
||||
import at.procon.dip.search.rank.SearchResultFusionService;
|
||||
import at.procon.dip.search.spi.SearchDocumentScope;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
|
@ -20,28 +24,66 @@ import org.springframework.stereotype.Service;
|
|||
@RequiredArgsConstructor
|
||||
public class DefaultSearchOrchestrator implements SearchOrchestrator {
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
private final SearchPlanner planner;
|
||||
private final List<SearchEngine> engines;
|
||||
private final SearchResultFusionService fusionService;
|
||||
|
||||
@Override
|
||||
public SearchResponse search(SearchRequest request, SearchDocumentScope scope) {
|
||||
SearchExecution execution = executeInternal(request, scope);
|
||||
return fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
|
||||
}
|
||||
|
||||
@Override
|
||||
public SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope) {
|
||||
SearchExecution execution = executeInternal(request, scope);
|
||||
SearchResponse fused = fusionService.fuse(execution.context(), execution.plan(), execution.engineResults());
|
||||
|
||||
List<SearchEngineDebugResult> debugResults = new ArrayList<>();
|
||||
int topLimit = properties.getSearch().getDebugTopHitsPerEngine();
|
||||
execution.engineResults().forEach((engine, hits) -> debugResults.add(SearchEngineDebugResult.builder()
|
||||
.engineType(engine)
|
||||
.hitCount(hits.size())
|
||||
.topHits(hits.stream().limit(topLimit).toList())
|
||||
.build()));
|
||||
|
||||
return SearchDebugResponse.builder()
|
||||
.request(request)
|
||||
.plan(execution.plan())
|
||||
.engineResults(debugResults)
|
||||
.fusedResponse(fused)
|
||||
.build();
|
||||
}
|
||||
|
||||
private SearchExecution executeInternal(SearchRequest request, SearchDocumentScope scope) {
|
||||
int page = request.getPage() == null || request.getPage() < 0 ? 0 : request.getPage();
|
||||
int requestedSize = request.getSize() == null || request.getSize() <= 0
|
||||
? properties.getSearch().getDefaultPageSize()
|
||||
: request.getSize();
|
||||
int size = Math.min(requestedSize, properties.getSearch().getMaxPageSize());
|
||||
|
||||
SearchExecutionContext context = SearchExecutionContext.builder()
|
||||
.request(request)
|
||||
.scope(scope)
|
||||
.page(request.getPage() == null ? 0 : request.getPage())
|
||||
.size(request.getSize() == null ? 20 : request.getSize())
|
||||
.page(page)
|
||||
.size(size)
|
||||
.build();
|
||||
|
||||
SearchExecutionPlan plan = planner.plan(context);
|
||||
|
||||
Map<SearchEngineType, List<SearchHit>> engineResults = new LinkedHashMap<>();
|
||||
for (SearchEngine engine : engines) {
|
||||
if (plan.getEngines().contains(engine.type()) && engine.supports(context)) {
|
||||
engineResults.put(engine.type(), engine.execute(context));
|
||||
}
|
||||
}
|
||||
return new SearchExecution(context, plan, engineResults);
|
||||
}
|
||||
|
||||
return fusionService.fuse(context, plan, engineResults);
|
||||
private record SearchExecution(
|
||||
SearchExecutionContext context,
|
||||
SearchExecutionPlan plan,
|
||||
Map<SearchEngineType, List<SearchHit>> engineResults
|
||||
) {
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,27 +1,45 @@
|
|||
package at.procon.dip.search.service;
|
||||
|
||||
import jakarta.persistence.EntityManager;
|
||||
import jakarta.persistence.PersistenceContext;
|
||||
import jakarta.transaction.Transactional;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.jdbc.core.JdbcTemplate;
|
||||
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
|
||||
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
@Service
|
||||
@Transactional
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class DocumentLexicalIndexService {
|
||||
|
||||
@PersistenceContext
|
||||
private EntityManager entityManager;
|
||||
private final NamedParameterJdbcTemplate namedParameterJdbcTemplate;
|
||||
private final JdbcTemplate jdbcTemplate;
|
||||
|
||||
/**
|
||||
* New Slice 2 name kept for current code.
|
||||
*/
|
||||
@Transactional
|
||||
public void indexRepresentation(UUID representationId) {
|
||||
refreshRepresentationLexicalIndex(representationId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Backward-compatible Slice 1 method name.
|
||||
*/
|
||||
@Transactional
|
||||
public void refreshRepresentationLexicalIndex(UUID representationId) {
|
||||
if (!isLexicalSearchSchemaAvailable()) {
|
||||
log.debug("Skipping lexical index refresh for representation {} because search columns are not available yet", representationId);
|
||||
log.debug("Skipping lexical indexing for representation {} because search_vector columns are not present yet", representationId);
|
||||
return;
|
||||
}
|
||||
entityManager.createNativeQuery("""
|
||||
UPDATE DOC.doc_text_representation
|
||||
|
||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
params.addValue("representationId", representationId);
|
||||
namedParameterJdbcTemplate.update("""
|
||||
UPDATE doc.doc_text_representation
|
||||
SET search_config = CASE
|
||||
WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'
|
||||
WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'
|
||||
|
|
@ -36,18 +54,39 @@ public class DocumentLexicalIndexService {
|
|||
coalesce(text_body, '')
|
||||
)
|
||||
WHERE id = :representationId
|
||||
""")
|
||||
.setParameter("representationId", representationId)
|
||||
.executeUpdate();
|
||||
""", params);
|
||||
}
|
||||
|
||||
/**
|
||||
* New Slice 2 method kept for current startup runner.
|
||||
*/
|
||||
@Transactional
|
||||
public int backfillMissingVectors(int limit) {
|
||||
if (!isLexicalSearchSchemaAvailable()) {
|
||||
return 0;
|
||||
}
|
||||
List<UUID> ids = jdbcTemplate.query("""
|
||||
SELECT id
|
||||
FROM doc.doc_text_representation
|
||||
WHERE search_vector IS NULL
|
||||
ORDER BY created_at ASC
|
||||
LIMIT ?
|
||||
""", (rs, rowNum) -> rs.getObject(1, UUID.class), limit);
|
||||
ids.forEach(this::refreshRepresentationLexicalIndex);
|
||||
return ids.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Backward-compatible Slice 1 method name.
|
||||
*/
|
||||
@Transactional
|
||||
public void refreshAllMissingLexicalIndexes() {
|
||||
if (!isLexicalSearchSchemaAvailable()) {
|
||||
log.info("Lexical search columns are not available yet. Skipping startup backfill for DOC lexical indexes.");
|
||||
return;
|
||||
}
|
||||
entityManager.createNativeQuery("""
|
||||
UPDATE DOC.doc_text_representation
|
||||
jdbcTemplate.update("""
|
||||
UPDATE doc.doc_text_representation
|
||||
SET search_config = CASE
|
||||
WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'
|
||||
WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'
|
||||
|
|
@ -62,19 +101,27 @@ public class DocumentLexicalIndexService {
|
|||
coalesce(text_body, '')
|
||||
)
|
||||
WHERE search_vector IS NULL
|
||||
""")
|
||||
.executeUpdate();
|
||||
""");
|
||||
}
|
||||
|
||||
private boolean isLexicalSearchSchemaAvailable() {
|
||||
Number count = (Number) entityManager.createNativeQuery("""
|
||||
/**
|
||||
* New Slice 2 name kept for current code.
|
||||
*/
|
||||
public boolean searchVectorColumnsPresent() {
|
||||
return isLexicalSearchSchemaAvailable();
|
||||
}
|
||||
|
||||
/**
|
||||
* Backward-compatible Slice 1 method name.
|
||||
*/
|
||||
public boolean isLexicalSearchSchemaAvailable() {
|
||||
Integer count = jdbcTemplate.queryForObject("""
|
||||
SELECT COUNT(*)
|
||||
FROM information_schema.columns
|
||||
WHERE table_schema = 'doc'
|
||||
AND table_name = 'doc_text_representation'
|
||||
AND column_name IN ('search_config', 'search_vector')
|
||||
""")
|
||||
.getSingleResult();
|
||||
return count != null && count.intValue() >= 2;
|
||||
AND column_name IN ('search_vector', 'search_config')
|
||||
""", Integer.class);
|
||||
return count != null && count >= 2;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,25 @@
|
|||
package at.procon.dip.search.service;
|
||||
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.boot.ApplicationArguments;
|
||||
import org.springframework.boot.ApplicationRunner;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class SearchLexicalIndexStartupRunner implements ApplicationRunner {
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
private final DocumentLexicalIndexService lexicalIndexService;
|
||||
|
||||
@Override
|
||||
public void run(ApplicationArguments args) {
|
||||
int updated = lexicalIndexService.backfillMissingVectors(properties.getSearch().getStartupLexicalBackfillLimit());
|
||||
if (updated > 0) {
|
||||
log.info("Search lexical index startup backfill updated {} representations", updated);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,9 +1,11 @@
|
|||
package at.procon.dip.search.service;
|
||||
|
||||
import at.procon.dip.search.dto.SearchDebugResponse;
|
||||
import at.procon.dip.search.dto.SearchRequest;
|
||||
import at.procon.dip.search.dto.SearchResponse;
|
||||
import at.procon.dip.search.spi.SearchDocumentScope;
|
||||
|
||||
public interface SearchOrchestrator {
|
||||
SearchResponse search(SearchRequest request, SearchDocumentScope scope);
|
||||
SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,39 @@
|
|||
package at.procon.dip.search.service;
|
||||
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
|
||||
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import at.procon.ted.service.VectorizationService;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class SemanticQueryEmbeddingService {
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
private final DocumentEmbeddingService documentEmbeddingService;
|
||||
private final VectorizationService vectorizationService;
|
||||
|
||||
public Optional<QueryEmbedding> buildQueryEmbedding(String queryText) {
|
||||
if (!properties.getVectorization().isEnabled()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
try {
|
||||
DocumentEmbeddingModel model = documentEmbeddingService.findActiveModelByKey(
|
||||
properties.getVectorization().getModelName());
|
||||
float[] vector = vectorizationService.generateQueryEmbedding(queryText);
|
||||
return Optional.of(new QueryEmbedding(model.getId(), vectorizationService.floatArrayToVectorString(vector)));
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to generate semantic query embedding: {}", e.getMessage());
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
public record QueryEmbedding(UUID modelId, String vectorString) {
|
||||
}
|
||||
}
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
package at.procon.dip.search.web;
|
||||
|
||||
import at.procon.dip.search.dto.SearchDebugResponse;
|
||||
import at.procon.dip.search.dto.SearchRequest;
|
||||
import at.procon.dip.search.dto.SearchResponse;
|
||||
import at.procon.dip.search.service.SearchOrchestrator;
|
||||
|
|
@ -21,15 +22,24 @@ public class GenericSearchController {
|
|||
|
||||
@PostMapping
|
||||
public SearchResponse search(@Valid @RequestBody SearchRequest request) {
|
||||
SearchDocumentScope scope = new SearchDocumentScope(
|
||||
return searchOrchestrator.search(request, buildScope(request));
|
||||
}
|
||||
|
||||
@PostMapping("/debug")
|
||||
public SearchDebugResponse debug(@Valid @RequestBody SearchRequest request) {
|
||||
return searchOrchestrator.debug(request, buildScope(request));
|
||||
}
|
||||
|
||||
private SearchDocumentScope buildScope(SearchRequest request) {
|
||||
String scopeLanguage = (request.getLanguageCodes() == null || request.getLanguageCodes().isEmpty())
|
||||
? null
|
||||
: request.getLanguageCodes().iterator().next();
|
||||
return new SearchDocumentScope(
|
||||
Set.of(),
|
||||
request.getDocumentTypes(),
|
||||
request.getDocumentFamilies(),
|
||||
request.getVisibilities(),
|
||||
request.getLanguageCodes() == null || request.getLanguageCodes().isEmpty()
|
||||
? null
|
||||
: request.getLanguageCodes().iterator().next()
|
||||
scopeLanguage
|
||||
);
|
||||
return searchOrchestrator.search(request, scope);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -209,6 +209,42 @@ public class TedProcessorProperties {
|
|||
* Similarity threshold for vector search (0.0 - 1.0).
|
||||
*/
|
||||
private double similarityThreshold = 0.7;
|
||||
|
||||
/**
|
||||
* Minimum trigram similarity for fuzzy lexical matches.
|
||||
*/
|
||||
private double trigramSimilarityThreshold = 0.12;
|
||||
|
||||
/**
|
||||
* Candidate limits per search engine before fusion/collapse.
|
||||
*/
|
||||
@Positive
|
||||
private int fulltextCandidateLimit = 120;
|
||||
|
||||
@Positive
|
||||
private int trigramCandidateLimit = 120;
|
||||
|
||||
@Positive
|
||||
private int semanticCandidateLimit = 120;
|
||||
|
||||
/**
|
||||
* Hybrid fusion weights.
|
||||
*/
|
||||
private double fulltextWeight = 0.35;
|
||||
private double trigramWeight = 0.20;
|
||||
private double semanticWeight = 0.45;
|
||||
|
||||
/**
|
||||
* Startup backfill limit for missing DOC lexical vectors.
|
||||
*/
|
||||
@Positive
|
||||
private int startupLexicalBackfillLimit = 500;
|
||||
|
||||
/**
|
||||
* Number of hits per engine returned by the debug endpoint.
|
||||
*/
|
||||
@Positive
|
||||
private int debugTopHitsPerEngine = 10;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -124,11 +124,25 @@ ted:
|
|||
max-page-size: 100
|
||||
# Similarity threshold for vector search (0.0 - 1.0)
|
||||
similarity-threshold: 0.7
|
||||
# Minimum trigram similarity for fuzzy lexical matches
|
||||
trigram-similarity-threshold: 0.12
|
||||
# Candidate limits per engine before fusion/collapse
|
||||
fulltext-candidate-limit: 120
|
||||
trigram-candidate-limit: 120
|
||||
semantic-candidate-limit: 120
|
||||
# Hybrid fusion weights
|
||||
fulltext-weight: 0.35
|
||||
trigram-weight: 0.20
|
||||
semantic-weight: 0.45
|
||||
# Startup backfill limit for missing lexical vectors
|
||||
startup-lexical-backfill-limit: 500
|
||||
# Number of top hits per engine returned by /search/debug
|
||||
debug-top-hits-per-engine: 10
|
||||
|
||||
# TED Daily Package Download configuration
|
||||
download:
|
||||
# Enable/disable automatic package download
|
||||
enabled: true
|
||||
enabled: false
|
||||
# Use the service-based Camel route
|
||||
use-service-based: false
|
||||
# Base URL for TED Daily Packages
|
||||
|
|
@ -142,7 +156,7 @@ ted:
|
|||
# Max consecutive 404 errors before stopping
|
||||
max-consecutive-404: 4
|
||||
# Polling interval (milliseconds) - 30 minutes
|
||||
poll-interval: 3600000
|
||||
poll-interval: 1800000
|
||||
# Retry interval for tail NOT_FOUND packages - 6 hours
|
||||
not-found-retry-interval: 21600000
|
||||
# Grace period after year end before a previous-year tail 404 is treated as final
|
||||
|
|
@ -163,7 +177,7 @@ ted:
|
|||
# IMAP Mail configuration
|
||||
mail:
|
||||
# Enable/disable mail processing
|
||||
enabled: false
|
||||
enabled: true
|
||||
# IMAP server hostname
|
||||
host: mail.mymagenta.business
|
||||
# IMAP server port (993 for IMAPS)
|
||||
|
|
@ -185,7 +199,7 @@ ted:
|
|||
# Polling delay in milliseconds (1 minute)
|
||||
delay: 60000
|
||||
# Max messages per poll
|
||||
max-messages-per-poll: 10
|
||||
max-messages-per-poll: 100
|
||||
# Output directory for processed attachments
|
||||
attachment-output-directory: /ted.europe/mail-attachments
|
||||
# Enable/disable MIME file input processing
|
||||
|
|
@ -195,7 +209,7 @@ ted:
|
|||
# File pattern for MIME files (regex)
|
||||
mime-input-pattern: .*\\.eml
|
||||
# Polling interval for MIME input directory (milliseconds)
|
||||
mime-input-poll-interval: 10000
|
||||
mime-input-poll-interval: 1000000
|
||||
|
||||
# Phase 3 TED projection configuration
|
||||
projection:
|
||||
|
|
@ -225,7 +239,7 @@ ted:
|
|||
# Polling interval for the generic route
|
||||
poll-interval: 15000
|
||||
# Maximum files per poll
|
||||
max-messages-per-poll: 10
|
||||
max-messages-per-poll: 200
|
||||
# Optional default owner tenant; leave empty for PUBLIC docs like TED or public knowledge docs
|
||||
default-owner-tenant-key:
|
||||
# Default visibility when no explicit access context is provided
|
||||
|
|
@ -247,7 +261,7 @@ ted:
|
|||
# Enable Phase 4.1 TED package adapter on top of the generic DOC ingestion SPI
|
||||
ted-package-adapter-enabled: true
|
||||
# Enable Phase 4.1 mail/document adapter on top of the generic DOC ingestion SPI
|
||||
mail-adapter-enabled: false
|
||||
mail-adapter-enabled: true
|
||||
# Optional dedicated mail owner tenant, falls back to default-owner-tenant-key
|
||||
mail-default-owner-tenant-key:
|
||||
# Visibility for imported mail messages and attachments
|
||||
|
|
|
|||
|
|
@ -0,0 +1,26 @@
|
|||
-- Slice 1 + Slice 2 generic search support for DOC documents.
|
||||
-- Adds lexical-search support columns/indexes and pg_trgm extension.
|
||||
|
||||
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||||
|
||||
ALTER TABLE DOC.doc_text_representation
|
||||
ADD COLUMN IF NOT EXISTS search_config VARCHAR(64);
|
||||
|
||||
ALTER TABLE DOC.doc_text_representation
|
||||
ADD COLUMN IF NOT EXISTS search_vector tsvector;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector
|
||||
ON DOC.doc_text_representation
|
||||
USING GIN (search_vector);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm
|
||||
ON DOC.doc_document
|
||||
USING GIN (title gin_trgm_ops);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm
|
||||
ON DOC.doc_document
|
||||
USING GIN (summary gin_trgm_ops);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm
|
||||
ON DOC.doc_text_representation
|
||||
USING GIN (text_body gin_trgm_ops);
|
||||
Loading…
Reference in New Issue