Refactor phase 5 - search - slice 1

master
trifonovt 1 month ago
parent 90093ab98d
commit 47894257a4

@ -0,0 +1,16 @@
Slice 1 generic search patch
Included changes:
- Generic search DTOs, planner, orchestrator, engine SPI
- PostgreSQL full-text engine and repository
- PostgreSQL trigram engine and repository
- Score normalization and result fusion
- Generic /search endpoint
- Lexical index maintenance service and startup backfill runner
- DOC lexical search migration (V9)
- Modified DOC representation write path to refresh search vectors
Important notes:
- Full-text search requires V9__doc_search_slice1_support.sql to be applied.
- The lexical index service is guarded and will no-op if the search columns are not yet present.
- Because Flyway is currently disabled in application.yml, apply the migration manually or enable Flyway before using the new search endpoint.

@ -84,6 +84,9 @@ public class DocumentTextRepresentation {
// Extracted/normalized text payload of this representation; source of the search vector.
@Column(name = "text_body", columnDefinition = "TEXT", nullable = false)
private String textBody;
// PostgreSQL text-search configuration used when building the search vector
// (e.g. 'german', 'english', 'simple'); populated by the lexical index service.
@Column(name = "search_config", length = 64)
private String searchConfig;
// Creation timestamp; assigned once at construction and never updated afterwards.
@Builder.Default
@Column(name = "created_at", nullable = false, updatable = false)
private OffsetDateTime createdAt = OffsetDateTime.now();

@ -4,6 +4,7 @@ import at.procon.dip.domain.document.entity.DocumentContent;
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
import at.procon.dip.search.service.DocumentLexicalIndexService;
import java.util.List;
import java.util.UUID;
import lombok.RequiredArgsConstructor;
@ -18,6 +19,7 @@ public class DocumentRepresentationService {
private final DocumentService documentService;
private final DocumentContentService contentService;
private final DocumentTextRepresentationRepository representationRepository;
private final DocumentLexicalIndexService lexicalIndexService;
public DocumentTextRepresentation addRepresentation(AddDocumentTextRepresentationCommand command) {
DocumentContent content = command.contentId() == null ? null : contentService.getRequired(command.contentId());
@ -34,7 +36,9 @@ public class DocumentRepresentationService {
.primaryRepresentation(command.primaryRepresentation())
.textBody(command.textBody())
.build();
return representationRepository.save(representation);
DocumentTextRepresentation saved = representationRepository.save(representation);
lexicalIndexService.refreshRepresentationLexicalIndex(saved.getId());
return saved;
}
@Transactional(readOnly = true)

@ -41,6 +41,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
String semantic = buildSemanticText(title, summary, request.detectionResult().documentType());
List<TextRepresentationDraft> drafts = new ArrayList<>();
/*
drafts.add(new TextRepresentationDraft(
RepresentationType.FULLTEXT,
BUILDER_KEY,
@ -51,6 +52,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
ContentRole.NORMALIZED_TEXT,
Boolean.TRUE
));
*/
drafts.add(new TextRepresentationDraft(
RepresentationType.SEMANTIC_TEXT,
BUILDER_KEY,
@ -61,6 +63,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
ContentRole.NORMALIZED_TEXT,
Boolean.TRUE
));
/*
if (StringUtils.hasText(title)) {
drafts.add(new TextRepresentationDraft(
RepresentationType.TITLE_ABSTRACT,
@ -73,6 +76,7 @@ public class DefaultGenericTextRepresentationBuilder implements TextRepresentati
Boolean.FALSE
));
}
*/
return drafts;
}

@ -0,0 +1,16 @@
package at.procon.dip.search.api;
import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.search.spi.SearchDocumentScope;
import lombok.Builder;
import lombok.Getter;
/**
 * Immutable per-request execution state shared by planner, engines and result
 * fusion: the incoming request, the caller's document scope and the resolved
 * paging window.
 */
@Getter
@Builder
public class SearchExecutionContext {
// The search request as received from the API layer.
private final SearchRequest request;
// Access scope of the calling principal (visibilities, tenant keys, language); may be null.
private final SearchDocumentScope scope;
// Zero-based page index, already resolved from request defaults by the orchestrator.
private final int page;
// Page size, already resolved from request defaults by the orchestrator.
private final int size;
}

@ -0,0 +1,16 @@
package at.procon.dip.search.api;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchSortMode;
import java.util.List;
import lombok.Builder;
import lombok.Getter;
/**
 * Output of the search planner: which engines to run and how the fused result
 * list should be collapsed and sorted.
 */
@Getter
@Builder
public class SearchExecutionPlan {
// Engines to execute, in planner-decided order.
private final List<SearchEngineType> engines;
// When true, hits from the same document are merged into a single result.
private final boolean collapseByDocument;
// Requested ordering of the fused result list.
private final SearchSortMode sortMode;
}

@ -0,0 +1,6 @@
package at.procon.dip.search.dto;
/** Backing engines available to the generic search orchestrator. */
public enum SearchEngineType {
// PostgreSQL full-text search over tsvector/tsquery.
POSTGRES_FULLTEXT,
// PostgreSQL pg_trgm similarity search over title, summary and text body.
POSTGRES_TRIGRAM
}

@ -0,0 +1,41 @@
package at.procon.dip.search.dto;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import java.time.OffsetDateTime;
import java.util.UUID;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * A single search result row, enriched step by step through the pipeline:
 * engines fill the raw fields, the score normalizer fills {@code normalizedScore}
 * and result fusion fills {@code finalScore}.
 */
@Data
@Builder(toBuilder = true)
@NoArgsConstructor
@AllArgsConstructor
public class SearchHit {
// Owning document.
private UUID documentId;
// Text representation that produced the match.
private UUID representationId;
private DocumentType documentType;
private DocumentFamily documentFamily;
private DocumentVisibility visibility;
private String title;
private String summary;
private String languageCode;
private String mimeType;
// Engine that produced this hit.
private SearchEngineType primaryEngine;
// Field in which the match occurred (title, summary or representation text).
private SearchMatchField matchedField;
// Engine-specific excerpt around the match.
private String snippet;
// Engine-native score, not comparable across engines.
private double rawScore;
// Score scaled into [0, 1] within one engine's result batch.
private double normalizedScore;
// Weighted, fused score used for ranking.
private double finalScore;
private OffsetDateTime createdAt;
private OffsetDateTime updatedAt;
}

@ -0,0 +1,7 @@
package at.procon.dip.search.dto;
/** Which document field produced the best match for a hit. */
public enum SearchMatchField {
DOCUMENT_TITLE,
DOCUMENT_SUMMARY,
REPRESENTATION_TEXT
}

@ -0,0 +1,7 @@
package at.procon.dip.search.dto;
/** Client-facing search modes; HYBRID enables every available engine. */
public enum SearchMode {
FULLTEXT,
TRIGRAM,
HYBRID
}

@ -0,0 +1,48 @@
package at.procon.dip.search.dto;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import jakarta.validation.constraints.Min;
import jakarta.validation.constraints.NotBlank;
import java.time.OffsetDateTime;
import java.util.LinkedHashSet;
import java.util.Set;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Input payload of the generic /search endpoint. All filters are optional;
 * an empty filter set means "no restriction" for that dimension.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SearchRequest {
// Free-text query; must not be blank.
@NotBlank
private String queryText;
// Requested engines; defaults to HYBRID (all engines).
@Builder.Default
private Set<SearchMode> modes = new LinkedHashSet<>(Set.of(SearchMode.HYBRID));
private Set<DocumentType> documentTypes;
private Set<DocumentFamily> documentFamilies;
private Set<DocumentVisibility> visibilities;
private Set<String> languageCodes;
private Set<RepresentationType> representationTypes;
// Inclusive created-at range filter.
private OffsetDateTime createdFrom;
private OffsetDateTime createdTo;
// Zero-based page index; null means "use server default".
@Min(0)
private Integer page;
// Page size; null means "use server default".
@Min(1)
private Integer size;
@Builder.Default
private SearchSortMode sortMode = SearchSortMode.SCORE_DESC;
// When true (default), multiple hits of one document collapse into a single result.
@Builder.Default
private boolean collapseByDocument = true;
}

@ -0,0 +1,22 @@
package at.procon.dip.search.dto;
import java.util.List;
import java.util.Set;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/** Paged, fused result of a search round trip. */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SearchResponse {
// Hits of the requested page, already ranked.
private List<SearchHit> hits;
private int page;
private int size;
// Total number of fused hits across all pages (before paging).
private long totalHits;
// True when more hits exist beyond the returned page.
private boolean truncated;
// Engines that actually contributed results to this response.
private Set<SearchEngineType> enginesUsed;
}

@ -0,0 +1,7 @@
package at.procon.dip.search.dto;
/** Supported orderings of the fused result list. */
public enum SearchSortMode {
SCORE_DESC,
CREATED_AT_DESC,
TITLE_ASC
}

@ -0,0 +1,12 @@
package at.procon.dip.search.engine;
import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchHit;
import java.util.List;
/**
 * SPI for a single search backend. Implementations are discovered as Spring
 * beans and invoked by the orchestrator when the execution plan selects them.
 */
public interface SearchEngine {
/** Identifies this engine within execution plans. */
SearchEngineType type();
/** Whether this engine can serve the given context (e.g. a usable query string exists). */
boolean supports(SearchExecutionContext context);
/** Runs the engine and returns its raw (un-normalized) hits. */
List<SearchHit> execute(SearchExecutionContext context);
}

@ -0,0 +1,71 @@
package at.procon.dip.search.engine.fulltext;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchHit;
import at.procon.dip.search.dto.SearchMatchField;
import at.procon.dip.search.engine.SearchEngine;
import at.procon.dip.search.repository.DocumentFullTextSearchRepository;
import at.procon.dip.search.repository.FullTextSearchRow;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
/**
 * Search engine backed by PostgreSQL full-text search (tsvector/tsquery).
 * Converts raw repository rows into {@link SearchHit} instances tagged with
 * the POSTGRES_FULLTEXT engine type.
 */
@Component
@RequiredArgsConstructor
public class PostgresFullTextSearchEngine implements SearchEngine {

    private final DocumentFullTextSearchRepository repository;

    /** Identifies this engine within execution plans. */
    @Override
    public SearchEngineType type() {
        return SearchEngineType.POSTGRES_FULLTEXT;
    }

    /** Full-text search is applicable only when a non-blank query string is present. */
    @Override
    public boolean supports(SearchExecutionContext context) {
        String queryText = context.getRequest().getQueryText();
        return queryText != null && !queryText.isBlank();
    }

    /** Executes the native full-text query and maps every row to a hit. */
    @Override
    public List<SearchHit> execute(SearchExecutionContext context) {
        return repository.search(context).stream().map(this::toHit).toList();
    }

    /** Translates one raw SQL row into the shared SearchHit DTO. */
    private SearchHit toHit(FullTextSearchRow row) {
        Double score = row.score();
        return SearchHit.builder()
                .documentId(row.documentId())
                .representationId(row.representationId())
                .documentType(parseEnum(DocumentType.class, row.documentType()))
                .documentFamily(parseEnum(DocumentFamily.class, row.documentFamily()))
                .visibility(parseEnum(DocumentVisibility.class, row.visibility()))
                .title(row.title())
                .summary(row.summary())
                .languageCode(row.languageCode())
                .mimeType(row.mimeType())
                .primaryEngine(SearchEngineType.POSTGRES_FULLTEXT)
                .matchedField(SearchMatchField.REPRESENTATION_TEXT)
                .snippet(row.snippet())
                .rawScore(score != null ? score : 0.0d)
                .createdAt(row.createdAt())
                .updatedAt(row.updatedAt())
                .build();
    }

    /** Null-tolerant parsing of enum values read back as text from the database. */
    private <E extends Enum<E>> E parseEnum(Class<E> enumClass, String value) {
        return value == null ? null : Enum.valueOf(enumClass, value);
    }
}

@ -0,0 +1,75 @@
package at.procon.dip.search.engine.trigram;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchHit;
import at.procon.dip.search.dto.SearchMatchField;
import at.procon.dip.search.engine.SearchEngine;
import at.procon.dip.search.repository.DocumentTrigramSearchRepository;
import at.procon.dip.search.repository.TrigramSearchRow;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
/**
 * Search engine backed by PostgreSQL pg_trgm similarity matching.
 * Converts raw repository rows into {@link SearchHit} instances tagged with
 * the POSTGRES_TRIGRAM engine type.
 */
@Component
@RequiredArgsConstructor
public class PostgresTrigramSearchEngine implements SearchEngine {

    private final DocumentTrigramSearchRepository repository;

    /** Identifies this engine within execution plans. */
    @Override
    public SearchEngineType type() {
        return SearchEngineType.POSTGRES_TRIGRAM;
    }

    /** Trigram search is applicable only when a non-blank query string is present. */
    @Override
    public boolean supports(SearchExecutionContext context) {
        String queryText = context.getRequest().getQueryText();
        return queryText != null && !queryText.isBlank();
    }

    /** Executes the native trigram query and maps every row to a hit. */
    @Override
    public List<SearchHit> execute(SearchExecutionContext context) {
        return repository.search(context).stream().map(this::toHit).toList();
    }

    /** Translates one raw SQL row into the shared SearchHit DTO. */
    private SearchHit toHit(TrigramSearchRow row) {
        Double score = row.score();
        return SearchHit.builder()
                .documentId(row.documentId())
                .representationId(row.representationId())
                .documentType(parseEnum(DocumentType.class, row.documentType()))
                .documentFamily(parseEnum(DocumentFamily.class, row.documentFamily()))
                .visibility(parseEnum(DocumentVisibility.class, row.visibility()))
                .title(row.title())
                .summary(row.summary())
                .languageCode(row.languageCode())
                .mimeType(row.mimeType())
                .primaryEngine(SearchEngineType.POSTGRES_TRIGRAM)
                .matchedField(parseMatchField(row.matchedField()))
                .snippet(row.snippet())
                .rawScore(score != null ? score : 0.0d)
                .createdAt(row.createdAt())
                .updatedAt(row.updatedAt())
                .build();
    }

    /** The matched-field marker is computed in SQL; a missing value defaults to the text body. */
    private SearchMatchField parseMatchField(String value) {
        return value != null ? SearchMatchField.valueOf(value) : SearchMatchField.REPRESENTATION_TEXT;
    }

    /** Null-tolerant parsing of enum values read back as text from the database. */
    private <E extends Enum<E>> E parseEnum(Class<E> enumClass, String value) {
        return value == null ? null : Enum.valueOf(enumClass, value);
    }
}

@ -0,0 +1,33 @@
package at.procon.dip.search.plan;
import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.api.SearchExecutionPlan;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchMode;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.springframework.stereotype.Component;
/**
 * Default planning strategy: selects engines from the requested search modes.
 * A missing/empty mode set or HYBRID activates every engine; otherwise only
 * the engines matching the explicitly requested modes run.
 */
@Component
public class DefaultSearchPlanner implements SearchPlanner {

    @Override
    public SearchExecutionPlan plan(SearchExecutionContext context) {
        Set<SearchMode> modes = context.getRequest().getModes();
        List<SearchEngineType> engines = new ArrayList<>();
        if (wantsMode(modes, SearchMode.FULLTEXT)) {
            engines.add(SearchEngineType.POSTGRES_FULLTEXT);
        }
        if (wantsMode(modes, SearchMode.TRIGRAM)) {
            engines.add(SearchEngineType.POSTGRES_TRIGRAM);
        }
        return SearchExecutionPlan.builder()
                .engines(engines)
                .collapseByDocument(context.getRequest().isCollapseByDocument())
                .sortMode(context.getRequest().getSortMode())
                .build();
    }

    /** A mode counts as requested when none were given, HYBRID is present, or the mode itself is present. */
    private boolean wantsMode(Set<SearchMode> modes, SearchMode mode) {
        return modes == null || modes.isEmpty() || modes.contains(SearchMode.HYBRID) || modes.contains(mode);
    }
}

@ -0,0 +1,8 @@
package at.procon.dip.search.plan;
import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.api.SearchExecutionPlan;
/** Decides which engines run for a given search context and how results are combined. */
public interface SearchPlanner {
/** Builds the execution plan (engine set, collapse flag, sort mode) for the context. */
SearchExecutionPlan plan(SearchExecutionContext context);
}

@ -0,0 +1,107 @@
package at.procon.dip.search.rank;
import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.api.SearchExecutionPlan;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchHit;
import at.procon.dip.search.dto.SearchResponse;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import org.springframework.stereotype.Component;
/**
 * Default fusion strategy: normalizes per-engine scores into [0, 1], applies
 * fixed engine weights (full-text 0.60 / trigram 0.40), optionally collapses
 * hits by document, sorts according to the plan's sort mode and returns the
 * requested page of the fused list.
 */
@Component
public class DefaultSearchResultFusionService implements SearchResultFusionService {

    // Relative contribution of each engine to the fused score; weights sum to 1.0.
    private static final double FULLTEXT_WEIGHT = 0.60d;
    private static final double TRIGRAM_WEIGHT = 0.40d;

    private final SearchScoreNormalizer scoreNormalizer;

    public DefaultSearchResultFusionService(SearchScoreNormalizer scoreNormalizer) {
        this.scoreNormalizer = scoreNormalizer;
    }

    @Override
    public SearchResponse fuse(SearchExecutionContext context,
                               SearchExecutionPlan plan,
                               Map<SearchEngineType, List<SearchHit>> engineResults) {
        // Normalize raw engine scores so that weights are comparable across engines.
        Map<SearchEngineType, List<SearchHit>> normalizedResults = new LinkedHashMap<>();
        for (Map.Entry<SearchEngineType, List<SearchHit>> entry : engineResults.entrySet()) {
            normalizedResults.put(entry.getKey(), scoreNormalizer.normalize(entry.getKey(), entry.getValue()));
        }
        List<SearchHit> ranked = plan.isCollapseByDocument()
                ? collapseByDocument(normalizedResults)
                : flatten(normalizedResults);
        // FIX: the plan's sort mode was previously ignored and results were always
        // sorted by score; CREATED_AT_DESC and TITLE_ASC are now honored as well.
        ranked.sort(comparatorFor(plan));
        int totalHits = ranked.size();
        int fromIndex = Math.min(context.getPage() * context.getSize(), ranked.size());
        int toIndex = Math.min(fromIndex + context.getSize(), ranked.size());
        List<SearchHit> pageHits = ranked.subList(fromIndex, toIndex);
        return SearchResponse.builder()
                .hits(new ArrayList<>(pageHits))
                .page(context.getPage())
                .size(context.getSize())
                .totalHits(totalHits)
                .truncated(toIndex < totalHits)
                .enginesUsed(new LinkedHashSet<>(normalizedResults.keySet()))
                .build();
    }

    /** Resolves the comparator for the plan's sort mode; defaults to score descending. */
    private Comparator<SearchHit> comparatorFor(SearchExecutionPlan plan) {
        Comparator<SearchHit> byScoreDesc = Comparator
                .comparingDouble(SearchHit::getFinalScore).reversed()
                .thenComparing(SearchHit::getUpdatedAt, Comparator.nullsLast(Comparator.reverseOrder()));
        if (plan.getSortMode() == null) {
            return byScoreDesc;
        }
        return switch (plan.getSortMode()) {
            case SCORE_DESC -> byScoreDesc;
            case CREATED_AT_DESC -> Comparator
                    .comparing(SearchHit::getCreatedAt, Comparator.nullsLast(Comparator.reverseOrder()))
                    .thenComparing(byScoreDesc);
            case TITLE_ASC -> Comparator
                    .comparing(SearchHit::getTitle, Comparator.nullsLast(Comparator.naturalOrder()))
                    .thenComparing(byScoreDesc);
        };
    }

    /**
     * No collapsing: every engine hit stays a separate result; each hit's final
     * score is its engine weight times its normalized score.
     */
    private List<SearchHit> flatten(Map<SearchEngineType, List<SearchHit>> normalizedResults) {
        List<SearchHit> merged = new ArrayList<>();
        for (Map.Entry<SearchEngineType, List<SearchHit>> entry : normalizedResults.entrySet()) {
            for (SearchHit hit : entry.getValue()) {
                merged.add(hit.toBuilder().finalScore(weight(entry.getKey()) * hit.getNormalizedScore()).build());
            }
        }
        return merged;
    }

    /**
     * Collapses hits per document: the representative hit is the one with the
     * highest normalized score; its final score is the weighted sum of ALL
     * contributions for that document across engines. First-seen order is kept.
     */
    private List<SearchHit> collapseByDocument(Map<SearchEngineType, List<SearchHit>> normalizedResults) {
        Map<UUID, SearchHit> collapsed = new LinkedHashMap<>();
        Map<UUID, Double> accumulatedScores = new LinkedHashMap<>();
        Set<UUID> docOrder = new LinkedHashSet<>();
        for (Map.Entry<SearchEngineType, List<SearchHit>> entry : normalizedResults.entrySet()) {
            double weight = weight(entry.getKey());
            for (SearchHit hit : entry.getValue()) {
                docOrder.add(hit.getDocumentId());
                double contribution = weight * hit.getNormalizedScore();
                accumulatedScores.merge(hit.getDocumentId(), contribution, Double::sum);
                SearchHit existing = collapsed.get(hit.getDocumentId());
                if (existing == null || hit.getNormalizedScore() > existing.getNormalizedScore()) {
                    collapsed.put(hit.getDocumentId(), hit);
                }
            }
        }
        List<SearchHit> results = new ArrayList<>(docOrder.size());
        for (UUID documentId : docOrder) {
            SearchHit base = collapsed.get(documentId);
            if (base != null) {
                results.add(base.toBuilder().finalScore(accumulatedScores.getOrDefault(documentId, 0.0d)).build());
            }
        }
        return results;
    }

    /** Exhaustive weight lookup; the compiler enforces coverage of new engine types. */
    private double weight(SearchEngineType engineType) {
        return switch (engineType) {
            case POSTGRES_FULLTEXT -> FULLTEXT_WEIGHT;
            case POSTGRES_TRIGRAM -> TRIGRAM_WEIGHT;
        };
    }
}

@ -0,0 +1,28 @@
package at.procon.dip.search.rank;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchHit;
import java.util.ArrayList;
import java.util.List;
import org.springframework.stereotype.Component;
/**
 * Max-normalization: scales raw engine scores into [0, 1] by dividing through
 * the batch maximum. A non-positive maximum falls back to 1.0 so the division
 * is always safe; results are additionally clamped into the unit interval.
 */
@Component
public class DefaultSearchScoreNormalizer implements SearchScoreNormalizer {

    @Override
    public List<SearchHit> normalize(SearchEngineType engineType, List<SearchHit> hits) {
        if (hits == null || hits.isEmpty()) {
            return List.of();
        }
        double batchMax = hits.stream().mapToDouble(SearchHit::getRawScore).max().orElse(0.0d);
        final double divisor = batchMax > 0.0d ? batchMax : 1.0d;
        List<SearchHit> scaled = new ArrayList<>(hits.size());
        hits.forEach(hit -> scaled.add(
                hit.toBuilder()
                        .normalizedScore(clampToUnit(hit.getRawScore() / divisor))
                        .build()));
        return scaled;
    }

    /** Clamps a value into the closed interval [0, 1]. */
    private double clampToUnit(double value) {
        return Math.max(0.0d, Math.min(1.0d, value));
    }
}

@ -0,0 +1,17 @@
package at.procon.dip.search.rank;
import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.api.SearchExecutionPlan;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchHit;
import at.procon.dip.search.dto.SearchResponse;
import java.util.List;
import java.util.Map;
/** Merges per-engine hit lists into one ranked, paged response. */
public interface SearchResultFusionService {
/**
 * Fuses the (raw-scored) hits of each executed engine according to the plan's
 * collapse and sort settings and applies the context's paging window.
 */
SearchResponse fuse(
SearchExecutionContext context,
SearchExecutionPlan plan,
Map<SearchEngineType, List<SearchHit>> engineResults
);
}

@ -0,0 +1,9 @@
package at.procon.dip.search.rank;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchHit;
import java.util.List;
/** Scales engine-native scores into a range comparable across engines. */
public interface SearchScoreNormalizer {
/** Returns hits with {@code normalizedScore} populated; input hits are not mutated. */
List<SearchHit> normalize(SearchEngineType engineType, List<SearchHit> hits);
}

@ -0,0 +1,140 @@
package at.procon.dip.search.repository;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.dto.SearchRequest;
import jakarta.persistence.Query;
import java.sql.Timestamp;
import java.time.LocalDateTime;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.UUID;
/**
 * Shared plumbing for the native-SQL search repositories: generic filter clause
 * generation, named-parameter binding and defensive conversion of raw JDBC
 * column values into Java types.
 */
abstract class AbstractNativeSearchRepository {

    /**
     * Upper bound of rows fetched from a single engine: three times the requested
     * window (but at least 50 rows) so that fusion and document collapsing still
     * have enough candidates after merging several engines' results.
     */
    protected int engineLimit(SearchExecutionContext context) {
        return Math.max(50, (context.getPage() + 1) * context.getSize() * 3);
    }

    /**
     * Appends the filters shared by all engines (document type/family, visibility,
     * tenant, language, representation type, created-at range) as additional
     * {@code AND} clauses and registers the matching bind parameters.
     */
    protected void appendGenericFilters(StringBuilder sql, Map<String, Object> params, SearchExecutionContext context) {
        SearchRequest request = context.getRequest();
        appendEnumInClause(sql, params, "d.document_type::text", "documentType", request.getDocumentTypes());
        appendEnumInClause(sql, params, "d.document_family::text", "documentFamily", request.getDocumentFamilies());
        // When the request does not restrict visibility, fall back to the caller's access scope.
        Collection<DocumentVisibility> visibilities = request.getVisibilities();
        if ((visibilities == null || visibilities.isEmpty()) && context.getScope() != null) {
            visibilities = context.getScope().visibilities();
        }
        appendEnumInClause(sql, params, "d.visibility::text", "visibility", visibilities);
        Collection<String> ownerTenantKeys = context.getScope() == null ? null : context.getScope().ownerTenantKeys();
        if (ownerTenantKeys != null && !ownerTenantKeys.isEmpty()) {
            appendStringInClause(sql, params, "COALESCE(dt.tenant_key, '')", "tenantKey", ownerTenantKeys);
        }
        // Same fallback pattern for the language filter: request first, then scope.
        Collection<String> languageCodes = request.getLanguageCodes();
        if ((languageCodes == null || languageCodes.isEmpty()) && context.getScope() != null && context.getScope().languageCode() != null) {
            languageCodes = java.util.List.of(context.getScope().languageCode());
        }
        appendStringInClause(sql, params, "COALESCE(dtr.language_code, d.language_code, '')", "languageCode", languageCodes);
        appendEnumInClause(sql, params, "dtr.representation_type::text", "representationType", request.getRepresentationTypes());
        if (request.getCreatedFrom() != null) {
            sql.append(" AND d.created_at >= :createdFrom");
            params.put("createdFrom", request.getCreatedFrom());
        }
        if (request.getCreatedTo() != null) {
            sql.append(" AND d.created_at <= :createdTo");
            params.put("createdTo", request.getCreatedTo());
        }
    }

    /** Binds every accumulated named parameter onto the JPA query. */
    protected void bindParameters(Query query, Map<String, Object> params) {
        for (Map.Entry<String, Object> entry : params.entrySet()) {
            query.setParameter(entry.getKey(), entry.getValue());
        }
    }

    /** Insertion-ordered parameter map so generated SQL and bindings stay aligned. */
    protected Map<String, Object> newParams() {
        return new LinkedHashMap<>();
    }

    /**
     * Appends {@code AND <expression> IN (:p0, :p1, ...)} binding each value's
     * {@code toString()} result (enum name). FIX: previously duplicated the whole
     * clause-building loop of {@link #appendStringInClause}; now delegates to it.
     */
    protected void appendEnumInClause(StringBuilder sql, Map<String, Object> params, String expression, String baseParam, Collection<?> values) {
        if (values == null || values.isEmpty()) {
            return;
        }
        var names = values.stream().map(Object::toString).toList();
        appendStringInClause(sql, params, expression, baseParam, names);
    }

    /**
     * Appends {@code AND <expression> IN (:p0, :p1, ...)} for string values,
     * generating one named parameter per value. No-ops on a null/empty collection.
     */
    protected void appendStringInClause(StringBuilder sql, Map<String, Object> params, String expression, String baseParam, Collection<String> values) {
        if (values == null || values.isEmpty()) {
            return;
        }
        sql.append(" AND ").append(expression).append(" IN (");
        int i = 0;
        for (String value : values) {
            String param = baseParam + i++;
            if (i > 1) {
                sql.append(", ");
            }
            sql.append(':').append(param);
            params.put(param, value);
        }
        sql.append(')');
    }

    /** Converts a raw column value to a UUID; accepts UUID or any value whose toString() parses. */
    protected UUID asUuid(Object value) {
        if (value == null) {
            return null;
        }
        if (value instanceof UUID uuid) {
            return uuid;
        }
        return UUID.fromString(value.toString());
    }

    /**
     * Converts a raw timestamp column value to OffsetDateTime. Timestamp and
     * LocalDateTime values are interpreted as UTC; any other type is rejected.
     */
    protected OffsetDateTime asOffsetDateTime(Object value) {
        if (value == null) {
            return null;
        }
        if (value instanceof OffsetDateTime odt) {
            return odt;
        }
        if (value instanceof Timestamp timestamp) {
            return timestamp.toInstant().atOffset(ZoneOffset.UTC);
        }
        if (value instanceof LocalDateTime ldt) {
            return ldt.atOffset(ZoneOffset.UTC);
        }
        throw new IllegalArgumentException("Unsupported timestamp value: " + value.getClass());
    }

    /** Null-safe toString() conversion. */
    protected String asString(Object value) {
        return value == null ? null : value.toString();
    }

    /** Converts a raw numeric column value to Double; accepts Number or a parseable string. */
    protected Double asDouble(Object value) {
        if (value == null) {
            return null;
        }
        if (value instanceof Number number) {
            return number.doubleValue();
        }
        return Double.parseDouble(value.toString());
    }
}

@ -0,0 +1,8 @@
package at.procon.dip.search.repository;
import at.procon.dip.search.api.SearchExecutionContext;
import java.util.List;
/** Native full-text (tsvector) search access; requires the V9 search columns. */
public interface DocumentFullTextSearchRepository {
/** Runs the full-text query for the context and returns raw, score-ordered rows. */
List<FullTextSearchRow> search(SearchExecutionContext context);
}

@ -0,0 +1,72 @@
package at.procon.dip.search.repository;
import at.procon.dip.search.api.SearchExecutionContext;
import jakarta.persistence.EntityManager;
import jakarta.persistence.PersistenceContext;
import jakarta.persistence.Query;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Repository;
/**
 * Full-text search over doc_text_representation.search_vector using
 * websearch_to_tsquery with the 'simple' configuration, ranked by ts_rank_cd.
 * Generic filters and the engine row limit come from the shared base class.
 */
@Repository
public class DocumentFullTextSearchRepositoryImpl extends AbstractNativeSearchRepository implements DocumentFullTextSearchRepository {
@PersistenceContext
private EntityManager entityManager;
@Override
public List<FullTextSearchRow> search(SearchExecutionContext context) {
// NOTE(review): headline/rank use the 'simple' config here while the lexical index
// service builds vectors with language-specific configs — confirm this mismatch is intended.
StringBuilder sql = new StringBuilder("""
SELECT
d.id AS document_id,
dtr.id AS representation_id,
d.title AS title,
d.summary AS summary,
COALESCE(dtr.language_code, d.language_code) AS language_code,
d.mime_type AS mime_type,
d.document_type AS document_type,
d.document_family AS document_family,
d.visibility AS visibility,
d.created_at AS created_at,
d.updated_at AS updated_at,
ts_headline('simple', COALESCE(dtr.text_body, ''), websearch_to_tsquery('simple', :queryText)) AS snippet,
ts_rank_cd(dtr.search_vector, websearch_to_tsquery('simple', :queryText)) AS score
FROM DOC.doc_text_representation dtr
JOIN DOC.doc_document d ON d.id = dtr.document_id
LEFT JOIN DOC.doc_tenant dt ON dt.id = d.owner_tenant_id
WHERE dtr.search_vector @@ websearch_to_tsquery('simple', :queryText)
""");
Map<String, Object> params = newParams();
params.put("queryText", context.getRequest().getQueryText().trim());
// Shared request/scope filters (type, family, visibility, tenant, language, dates).
appendGenericFilters(sql, params, context);
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
params.put("limit", engineLimit(context));
Query query = entityManager.createNativeQuery(sql.toString());
bindParameters(query, params);
List<?> rows = query.getResultList();
// Map positional columns (0..12) onto the row record in SELECT order.
List<FullTextSearchRow> results = new ArrayList<>(rows.size());
for (Object row : rows) {
Object[] cols = (Object[]) row;
results.add(new FullTextSearchRow(
asUuid(cols[0]),
asUuid(cols[1]),
asString(cols[2]),
asString(cols[3]),
asString(cols[4]),
asString(cols[5]),
asString(cols[6]),
asString(cols[7]),
asString(cols[8]),
asOffsetDateTime(cols[9]),
asOffsetDateTime(cols[10]),
asString(cols[11]),
asDouble(cols[12])
));
}
return results;
}
}

@ -0,0 +1,8 @@
package at.procon.dip.search.repository;
import at.procon.dip.search.api.SearchExecutionContext;
import java.util.List;
/** Native pg_trgm similarity search access over title, summary and text body. */
public interface DocumentTrigramSearchRepository {
/** Runs the trigram query for the context and returns raw, score-ordered rows. */
List<TrigramSearchRow> search(SearchExecutionContext context);
}

@ -0,0 +1,102 @@
package at.procon.dip.search.repository;
import at.procon.dip.search.api.SearchExecutionContext;
import jakarta.persistence.EntityManager;
import jakarta.persistence.PersistenceContext;
import jakarta.persistence.Query;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Repository;
/**
 * Trigram (pg_trgm) similarity search over title, summary and representation
 * text. The score is the GREATEST similarity of the three fields; the snippet
 * and matched-field marker are derived from whichever field scored highest.
 */
@Repository
public class DocumentTrigramSearchRepositoryImpl extends AbstractNativeSearchRepository implements DocumentTrigramSearchRepository {
@PersistenceContext
private EntityManager entityManager;
@Override
public List<TrigramSearchRow> search(SearchExecutionContext context) {
// The CASE chains below mirror each other: the first picks the snippet text,
// the second the matched-field label, both by comparing the same similarities.
StringBuilder sql = new StringBuilder("""
SELECT
d.id AS document_id,
dtr.id AS representation_id,
d.title AS title,
d.summary AS summary,
COALESCE(dtr.language_code, d.language_code) AS language_code,
d.mime_type AS mime_type,
d.document_type AS document_type,
d.document_family AS document_family,
d.visibility AS visibility,
d.created_at AS created_at,
d.updated_at AS updated_at,
CASE
WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText)
AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
THEN COALESCE(d.title, '')
WHEN similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
THEN COALESCE(d.summary, '')
ELSE LEFT(COALESCE(dtr.text_body, ''), 400)
END AS snippet,
GREATEST(
similarity(COALESCE(d.title, ''), :queryText),
similarity(COALESCE(d.summary, ''), :queryText),
similarity(COALESCE(dtr.text_body, ''), :queryText)
) AS score,
CASE
WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText)
AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
THEN 'DOCUMENT_TITLE'
WHEN similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
THEN 'DOCUMENT_SUMMARY'
ELSE 'REPRESENTATION_TEXT'
END AS matched_field
FROM DOC.doc_text_representation dtr
JOIN DOC.doc_document d ON d.id = dtr.document_id
LEFT JOIN DOC.doc_tenant dt ON dt.id = d.owner_tenant_id
WHERE (
COALESCE(d.title, '') % :queryText
OR COALESCE(d.summary, '') % :queryText
OR COALESCE(dtr.text_body, '') % :queryText
)
""");
Map<String, Object> params = newParams();
params.put("queryText", context.getRequest().getQueryText().trim());
// Shared request/scope filters (type, family, visibility, tenant, language, dates).
appendGenericFilters(sql, params, context);
// Additional similarity floor: the % operator uses pg_trgm's global threshold,
// this clause enforces an explicit minimum of 0.10 on the combined score.
sql.append(" AND GREATEST(")
.append(" similarity(COALESCE(d.title, ''), :queryText),")
.append(" similarity(COALESCE(d.summary, ''), :queryText),")
.append(" similarity(COALESCE(dtr.text_body, ''), :queryText)")
.append(") >= :minSimilarity");
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
params.put("minSimilarity", 0.10d);
params.put("limit", engineLimit(context));
Query query = entityManager.createNativeQuery(sql.toString());
bindParameters(query, params);
List<?> rows = query.getResultList();
// Map positional columns (0..13) onto the row record in SELECT order.
List<TrigramSearchRow> results = new ArrayList<>(rows.size());
for (Object row : rows) {
Object[] cols = (Object[]) row;
results.add(new TrigramSearchRow(
asUuid(cols[0]),
asUuid(cols[1]),
asString(cols[2]),
asString(cols[3]),
asString(cols[4]),
asString(cols[5]),
asString(cols[6]),
asString(cols[7]),
asString(cols[8]),
asOffsetDateTime(cols[9]),
asOffsetDateTime(cols[10]),
asString(cols[11]),
asDouble(cols[12]),
asString(cols[13])
));
}
return results;
}
}

@ -0,0 +1,21 @@
package at.procon.dip.search.repository;
import java.time.OffsetDateTime;
import java.util.UUID;
/**
 * Raw projection of one full-text search result row. Enum-typed columns
 * (documentType, documentFamily, visibility) are carried as their text values
 * and parsed by the engine layer.
 */
public record FullTextSearchRow(
UUID documentId,
UUID representationId,
String title,
String summary,
String languageCode,
String mimeType,
String documentType,
String documentFamily,
String visibility,
OffsetDateTime createdAt,
OffsetDateTime updatedAt,
String snippet,
Double score
) {
}

@ -0,0 +1,22 @@
package at.procon.dip.search.repository;
import java.time.OffsetDateTime;
import java.util.UUID;
/**
 * Raw projection of one trigram search result row. Like FullTextSearchRow but
 * with the SQL-computed matchedField label (DOCUMENT_TITLE, DOCUMENT_SUMMARY
 * or REPRESENTATION_TEXT) identifying the best-scoring field.
 */
public record TrigramSearchRow(
UUID documentId,
UUID representationId,
String title,
String summary,
String languageCode,
String mimeType,
String documentType,
String documentFamily,
String visibility,
OffsetDateTime createdAt,
OffsetDateTime updatedAt,
String snippet,
Double score,
String matchedField
) {
}

@ -0,0 +1,47 @@
package at.procon.dip.search.service;
import at.procon.dip.search.api.SearchExecutionContext;
import at.procon.dip.search.api.SearchExecutionPlan;
import at.procon.dip.search.dto.SearchEngineType;
import at.procon.dip.search.dto.SearchHit;
import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.search.dto.SearchResponse;
import at.procon.dip.search.engine.SearchEngine;
import at.procon.dip.search.plan.SearchPlanner;
import at.procon.dip.search.rank.SearchResultFusionService;
import at.procon.dip.search.spi.SearchDocumentScope;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
/**
 * Coordinates a search round trip: builds the execution context with resolved
 * paging defaults, asks the planner for the engine set, runs each planned and
 * supporting engine in registration order, then fuses the per-engine hits into
 * one ranked response.
 */
@Service
@RequiredArgsConstructor
public class DefaultSearchOrchestrator implements SearchOrchestrator {

    // Paging defaults applied when the request leaves page/size unset.
    private static final int DEFAULT_PAGE = 0;
    private static final int DEFAULT_SIZE = 20;

    private final SearchPlanner planner;
    private final List<SearchEngine> engines;
    private final SearchResultFusionService fusionService;

    @Override
    public SearchResponse search(SearchRequest request, SearchDocumentScope scope) {
        SearchExecutionContext context = SearchExecutionContext.builder()
                .request(request)
                .scope(scope)
                .page(request.getPage() != null ? request.getPage() : DEFAULT_PAGE)
                .size(request.getSize() != null ? request.getSize() : DEFAULT_SIZE)
                .build();
        SearchExecutionPlan plan = planner.plan(context);
        // Insertion-ordered map keeps engine result order stable for fusion.
        Map<SearchEngineType, List<SearchHit>> engineResults = new LinkedHashMap<>();
        for (SearchEngine engine : engines) {
            boolean planned = plan.getEngines().contains(engine.type());
            if (planned && engine.supports(context)) {
                engineResults.put(engine.type(), engine.execute(context));
            }
        }
        return fusionService.fuse(context, plan, engineResults);
    }
}

@ -0,0 +1,80 @@
package at.procon.dip.search.service;
import jakarta.persistence.EntityManager;
import jakarta.persistence.PersistenceContext;
import jakarta.transaction.Transactional;
import java.util.UUID;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
/**
 * Maintains the PostgreSQL lexical (full-text) search data on
 * DOC.doc_text_representation: derives a text search configuration from the
 * row's language_code and recomputes the stored search_vector from text_body.
 *
 * <p>Every operation is guarded by a schema probe so the service safely
 * no-ops when the V9 search migration has not been applied yet (Flyway is
 * currently disabled, so the columns may be absent at runtime).
 */
@Service
@Transactional
@Slf4j
public class DocumentLexicalIndexService {

    /**
     * Shared SET clause used by both refresh queries (previously duplicated):
     * maps language_code to a text search configuration ('de' -> german,
     * 'en' -> english, anything else -> simple) and rebuilds search_vector
     * from text_body with that configuration.
     */
    private static final String LEXICAL_REFRESH_SET_CLAUSE = """
            SET search_config = CASE
                    WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'
                    WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'
                    ELSE 'simple'
                END,
                search_vector = to_tsvector(
                    CASE
                        WHEN lower(coalesce(language_code, '')) = 'de' THEN 'german'::regconfig
                        WHEN lower(coalesce(language_code, '')) = 'en' THEN 'english'::regconfig
                        ELSE 'simple'::regconfig
                    END,
                    coalesce(text_body, '')
                )
            """;

    @PersistenceContext
    private EntityManager entityManager;

    /**
     * Recomputes search_config and search_vector for a single text
     * representation. No-ops (with a debug log) while the search columns are
     * not present yet.
     *
     * @param representationId primary key of the DOC.doc_text_representation row
     */
    public void refreshRepresentationLexicalIndex(UUID representationId) {
        if (!isLexicalSearchSchemaAvailable()) {
            log.debug("Skipping lexical index refresh for representation {} because search columns are not available yet", representationId);
            return;
        }
        entityManager
                .createNativeQuery("UPDATE DOC.doc_text_representation "
                        + LEXICAL_REFRESH_SET_CLAUSE
                        + " WHERE id = :representationId")
                .setParameter("representationId", representationId)
                .executeUpdate();
    }

    /**
     * Backfills search data for every representation whose search_vector is
     * still NULL (rows written before the V9 migration was applied). Invoked
     * at startup by LexicalSearchStartupRunner; no-ops while the search
     * columns are not present yet.
     */
    public void refreshAllMissingLexicalIndexes() {
        if (!isLexicalSearchSchemaAvailable()) {
            log.info("Lexical search columns are not available yet. Skipping startup backfill for DOC lexical indexes.");
            return;
        }
        entityManager
                .createNativeQuery("UPDATE DOC.doc_text_representation "
                        + LEXICAL_REFRESH_SET_CLAUSE
                        + " WHERE search_vector IS NULL")
                .executeUpdate();
    }

    /**
     * Probes information_schema for both search columns added by V9.
     * Unquoted identifiers fold to lowercase in PostgreSQL, hence
     * table_schema = 'doc' even though the DDL writes DOC.doc_text_representation.
     *
     * @return true only when both search_config and search_vector exist
     */
    private boolean isLexicalSearchSchemaAvailable() {
        Number count = (Number) entityManager.createNativeQuery("""
                SELECT COUNT(*)
                FROM information_schema.columns
                WHERE table_schema = 'doc'
                  AND table_name = 'doc_text_representation'
                  AND column_name IN ('search_config', 'search_vector')
                """)
                .getSingleResult();
        return count != null && count.intValue() >= 2;
    }
}

@ -0,0 +1,9 @@
package at.procon.dip.search.service;
import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.search.dto.SearchResponse;
import at.procon.dip.search.spi.SearchDocumentScope;
/**
 * Entry point of the generic search pipeline: takes a caller-supplied request
 * plus a document scope (pre-built filter set) and returns a fused response.
 * The default implementation plans, fans out to the registered engines and
 * merges their results.
 */
public interface SearchOrchestrator {
SearchResponse search(SearchRequest request, SearchDocumentScope scope);
}

@ -0,0 +1,21 @@
package at.procon.dip.search.startup;
import at.procon.dip.search.service.DocumentLexicalIndexService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;
/**
 * Startup hook that backfills lexical search vectors for DOC text
 * representations written before the V9 search migration. Delegates to
 * {@link DocumentLexicalIndexService}, which no-ops when the search columns
 * are not present yet, so this runner is safe even with the migration
 * unapplied. Runs on every application start; the service only touches rows
 * whose vector is still missing.
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class LexicalSearchStartupRunner implements CommandLineRunner {
private final DocumentLexicalIndexService lexicalIndexService;
@Override
public void run(String... args) {
log.info("Refreshing missing lexical search vectors for DOC text representations");
lexicalIndexService.refreshAllMissingLexicalIndexes();
}
}

@ -0,0 +1,35 @@
package at.procon.dip.search.web;
import at.procon.dip.search.dto.SearchRequest;
import at.procon.dip.search.dto.SearchResponse;
import at.procon.dip.search.service.SearchOrchestrator;
import at.procon.dip.search.spi.SearchDocumentScope;
import jakarta.validation.Valid;
import java.util.Set;
import lombok.RequiredArgsConstructor;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
/**
 * Generic search endpoint. Accepts a validated {@code SearchRequest} via
 * {@code POST /search}, translates its filters into a
 * {@link SearchDocumentScope} and delegates to the orchestrator.
 */
@RestController
@RequestMapping("/search")
@RequiredArgsConstructor
public class GenericSearchController {

    private final SearchOrchestrator searchOrchestrator;

    /**
     * Executes a generic document search.
     *
     * @param request validated search request body
     * @return fused search response from the orchestrator
     */
    @PostMapping
    public SearchResponse search(@Valid @RequestBody SearchRequest request) {
        return searchOrchestrator.search(request, buildScope(request));
    }

    /**
     * Maps request filters onto the engine-facing document scope. The first
     * scope argument (explicit document ids) is intentionally empty for the
     * generic endpoint.
     *
     * <p>NOTE(review): {@code SearchDocumentScope} accepts only a single
     * language code, so any additional requested languages are silently
     * dropped here — and since {@code getLanguageCodes()} is a {@code Set},
     * which entry survives is non-deterministic. Consider widening the scope
     * SPI to carry the full set.
     */
    private SearchDocumentScope buildScope(SearchRequest request) {
        boolean noLanguageRequested =
                request.getLanguageCodes() == null || request.getLanguageCodes().isEmpty();
        return new SearchDocumentScope(
                Set.of(),
                request.getDocumentTypes(),
                request.getDocumentFamilies(),
                request.getVisibilities(),
                noLanguageRequested ? null : request.getLanguageCodes().iterator().next());
    }
}

@ -0,0 +1,26 @@
-- Slice 1 generic lexical search support.
-- Adds PostgreSQL full-text and trigram search infrastructure for DOC-side search.

-- pg_trgm supplies the gin_trgm_ops operator class used by the trigram indexes below.
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- Per-row text search configuration name (german/english/simple); written by
-- DocumentLexicalIndexService together with search_vector.
ALTER TABLE DOC.doc_text_representation
ADD COLUMN IF NOT EXISTS search_config VARCHAR(64);

-- Precomputed tsvector over text_body. Populated by the application
-- (DocumentLexicalIndexService), not by a database trigger, so rows inserted
-- outside the app need the startup backfill to become searchable.
ALTER TABLE DOC.doc_text_representation
ADD COLUMN IF NOT EXISTS search_vector tsvector;

-- GIN index backing full-text (@@) queries against search_vector.
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector
ON DOC.doc_text_representation
USING GIN (search_vector);

-- Trigram GIN indexes backing fuzzy/substring matching on title, summary and raw text.
CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm
ON DOC.doc_document
USING GIN (title gin_trgm_ops);
CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm
ON DOC.doc_document
USING GIN (summary gin_trgm_ops);
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm
ON DOC.doc_text_representation
USING GIN (text_body gin_trgm_ops);
Loading…
Cancel
Save