Refactor phases 5 - semantic search - slice 2
parent
47894257a4
commit
039b5a5f0a
@ -0,0 +1,19 @@
|
||||
package at.procon.dip.search.dto;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionPlan;
|
||||
import java.util.List;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class SearchDebugResponse {
|
||||
private SearchRequest request;
|
||||
private SearchExecutionPlan plan;
|
||||
private List<SearchEngineDebugResult> engineResults;
|
||||
private SearchResponse fusedResponse;
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
package at.procon.dip.search.dto;
|
||||
|
||||
import java.util.List;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class SearchEngineDebugResult {
|
||||
private SearchEngineType engineType;
|
||||
private int hitCount;
|
||||
private List<SearchHit> topHits;
|
||||
}
|
||||
@ -0,0 +1,45 @@
|
||||
package at.procon.dip.search.engine.semantic;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.engine.SearchEngine;
|
||||
import at.procon.dip.search.repository.DocumentSemanticSearchRepository;
|
||||
import at.procon.dip.search.service.SemanticQueryEmbeddingService;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import java.util.List;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
public class PgVectorSemanticSearchEngine implements SearchEngine {
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
private final SemanticQueryEmbeddingService queryEmbeddingService;
|
||||
private final DocumentSemanticSearchRepository repository;
|
||||
|
||||
@Override
|
||||
public SearchEngineType type() {
|
||||
return SearchEngineType.PGVECTOR_SEMANTIC;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean supports(SearchExecutionContext context) {
|
||||
return properties.getVectorization().isEnabled()
|
||||
&& context.getRequest().getQueryText() != null
|
||||
&& !context.getRequest().getQueryText().isBlank();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<SearchHit> execute(SearchExecutionContext context) {
|
||||
return queryEmbeddingService.buildQueryEmbedding(context.getRequest().getQueryText())
|
||||
.map(query -> repository.search(
|
||||
context,
|
||||
query.modelId(),
|
||||
query.vectorString(),
|
||||
properties.getSearch().getSemanticCandidateLimit(),
|
||||
properties.getSearch().getSimilarityThreshold()))
|
||||
.orElse(List.of());
|
||||
}
|
||||
}
|
||||
@ -1,8 +1,10 @@
|
||||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import java.util.List;
|
||||
|
||||
public interface DocumentFullTextSearchRepository {
|
||||
List<FullTextSearchRow> search(SearchExecutionContext context);
|
||||
|
||||
List<SearchHit> search(SearchExecutionContext context, int limit);
|
||||
}
|
||||
|
||||
@ -1,72 +1,53 @@
|
||||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import jakarta.persistence.EntityManager;
|
||||
import jakarta.persistence.PersistenceContext;
|
||||
import jakarta.persistence.Query;
|
||||
import java.util.ArrayList;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchMatchField;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
|
||||
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
@Repository
|
||||
public class DocumentFullTextSearchRepositoryImpl extends AbstractNativeSearchRepository implements DocumentFullTextSearchRepository {
|
||||
@RequiredArgsConstructor
|
||||
public class DocumentFullTextSearchRepositoryImpl implements DocumentFullTextSearchRepository {
|
||||
|
||||
@PersistenceContext
|
||||
private EntityManager entityManager;
|
||||
private final NamedParameterJdbcTemplate jdbcTemplate;
|
||||
|
||||
@Override
|
||||
public List<FullTextSearchRow> search(SearchExecutionContext context) {
|
||||
public List<SearchHit> search(SearchExecutionContext context, int limit) {
|
||||
StringBuilder sql = new StringBuilder("""
|
||||
SELECT
|
||||
d.id AS document_id,
|
||||
dtr.id AS representation_id,
|
||||
CAST(d.document_type AS text) AS document_type,
|
||||
CAST(d.document_family AS text) AS document_family,
|
||||
CAST(d.visibility AS text) AS visibility,
|
||||
d.title AS title,
|
||||
d.summary AS summary,
|
||||
COALESCE(dtr.language_code, d.language_code) AS language_code,
|
||||
d.mime_type AS mime_type,
|
||||
d.document_type AS document_type,
|
||||
d.document_family AS document_family,
|
||||
d.visibility AS visibility,
|
||||
d.created_at AS created_at,
|
||||
d.updated_at AS updated_at,
|
||||
ts_headline('simple', COALESCE(dtr.text_body, ''), websearch_to_tsquery('simple', :queryText)) AS snippet,
|
||||
ts_headline('simple', COALESCE(dtr.text_body, ''), websearch_to_tsquery('simple', :queryText),
|
||||
'MaxFragments=2, MinWords=5, MaxWords=20') AS snippet,
|
||||
ts_rank_cd(dtr.search_vector, websearch_to_tsquery('simple', :queryText)) AS score
|
||||
FROM DOC.doc_text_representation dtr
|
||||
JOIN DOC.doc_document d ON d.id = dtr.document_id
|
||||
LEFT JOIN DOC.doc_tenant dt ON dt.id = d.owner_tenant_id
|
||||
WHERE dtr.search_vector @@ websearch_to_tsquery('simple', :queryText)
|
||||
FROM doc.doc_text_representation dtr
|
||||
JOIN doc.doc_document d ON d.id = dtr.document_id
|
||||
LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
|
||||
WHERE dtr.search_vector IS NOT NULL
|
||||
AND dtr.search_vector @@ websearch_to_tsquery('simple', :queryText)
|
||||
""");
|
||||
|
||||
Map<String, Object> params = newParams();
|
||||
params.put("queryText", context.getRequest().getQueryText().trim());
|
||||
appendGenericFilters(sql, params, context);
|
||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
params.addValue("queryText", context.getRequest().getQueryText());
|
||||
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
||||
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
|
||||
params.put("limit", engineLimit(context));
|
||||
params.addValue("limit", limit);
|
||||
|
||||
Query query = entityManager.createNativeQuery(sql.toString());
|
||||
bindParameters(query, params);
|
||||
|
||||
List<?> rows = query.getResultList();
|
||||
List<FullTextSearchRow> results = new ArrayList<>(rows.size());
|
||||
for (Object row : rows) {
|
||||
Object[] cols = (Object[]) row;
|
||||
results.add(new FullTextSearchRow(
|
||||
asUuid(cols[0]),
|
||||
asUuid(cols[1]),
|
||||
asString(cols[2]),
|
||||
asString(cols[3]),
|
||||
asString(cols[4]),
|
||||
asString(cols[5]),
|
||||
asString(cols[6]),
|
||||
asString(cols[7]),
|
||||
asString(cols[8]),
|
||||
asOffsetDateTime(cols[9]),
|
||||
asOffsetDateTime(cols[10]),
|
||||
asString(cols[11]),
|
||||
asDouble(cols[12])
|
||||
));
|
||||
}
|
||||
return results;
|
||||
return jdbcTemplate.query(sql.toString(), params,
|
||||
new SearchHitRowMapper(SearchEngineType.POSTGRES_FULLTEXT, SearchMatchField.REPRESENTATION_TEXT));
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,61 @@
|
||||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchMatchField;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
|
||||
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
@Repository
|
||||
@RequiredArgsConstructor
|
||||
public class DocumentSemanticSearchRepository {
|
||||
|
||||
private final NamedParameterJdbcTemplate jdbcTemplate;
|
||||
|
||||
public List<SearchHit> search(SearchExecutionContext context,
|
||||
UUID modelId,
|
||||
String queryVector,
|
||||
int limit,
|
||||
double threshold) {
|
||||
StringBuilder sql = new StringBuilder("""
|
||||
SELECT
|
||||
d.id AS document_id,
|
||||
dtr.id AS representation_id,
|
||||
CAST(d.document_type AS text) AS document_type,
|
||||
CAST(d.document_family AS text) AS document_family,
|
||||
CAST(d.visibility AS text) AS visibility,
|
||||
d.title AS title,
|
||||
d.summary AS summary,
|
||||
COALESCE(dtr.language_code, d.language_code) AS language_code,
|
||||
d.mime_type AS mime_type,
|
||||
d.created_at AS created_at,
|
||||
d.updated_at AS updated_at,
|
||||
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
|
||||
(1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) AS score
|
||||
FROM doc.doc_embedding de
|
||||
JOIN doc.doc_text_representation dtr ON dtr.id = de.representation_id
|
||||
JOIN doc.doc_document d ON d.id = de.document_id
|
||||
LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
|
||||
WHERE de.embedding_status = 'COMPLETED'
|
||||
AND de.embedding_vector IS NOT NULL
|
||||
AND de.model_id = :modelId
|
||||
AND (1 - (de.embedding_vector <=> CAST(:queryVector AS vector))) >= :threshold
|
||||
""");
|
||||
|
||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
params.addValue("queryVector", queryVector);
|
||||
params.addValue("modelId", modelId);
|
||||
params.addValue("threshold", threshold);
|
||||
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
||||
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
|
||||
params.addValue("limit", limit);
|
||||
|
||||
return jdbcTemplate.query(sql.toString(), params,
|
||||
new SearchHitRowMapper(SearchEngineType.PGVECTOR_SEMANTIC, SearchMatchField.REPRESENTATION_TEXT));
|
||||
}
|
||||
}
|
||||
@ -1,8 +1,10 @@
|
||||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import java.util.List;
|
||||
|
||||
public interface DocumentTrigramSearchRepository {
|
||||
List<TrigramSearchRow> search(SearchExecutionContext context);
|
||||
|
||||
List<SearchHit> search(SearchExecutionContext context, int limit, double threshold);
|
||||
}
|
||||
|
||||
@ -1,102 +1,60 @@
|
||||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import jakarta.persistence.EntityManager;
|
||||
import jakarta.persistence.PersistenceContext;
|
||||
import jakarta.persistence.Query;
|
||||
import java.util.ArrayList;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchMatchField;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
|
||||
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
@Repository
|
||||
public class DocumentTrigramSearchRepositoryImpl extends AbstractNativeSearchRepository implements DocumentTrigramSearchRepository {
|
||||
@RequiredArgsConstructor
|
||||
public class DocumentTrigramSearchRepositoryImpl implements DocumentTrigramSearchRepository {
|
||||
|
||||
@PersistenceContext
|
||||
private EntityManager entityManager;
|
||||
private final NamedParameterJdbcTemplate jdbcTemplate;
|
||||
|
||||
@Override
|
||||
public List<TrigramSearchRow> search(SearchExecutionContext context) {
|
||||
StringBuilder sql = new StringBuilder("""
|
||||
SELECT
|
||||
d.id AS document_id,
|
||||
dtr.id AS representation_id,
|
||||
d.title AS title,
|
||||
d.summary AS summary,
|
||||
COALESCE(dtr.language_code, d.language_code) AS language_code,
|
||||
d.mime_type AS mime_type,
|
||||
d.document_type AS document_type,
|
||||
d.document_family AS document_family,
|
||||
d.visibility AS visibility,
|
||||
d.created_at AS created_at,
|
||||
d.updated_at AS updated_at,
|
||||
CASE
|
||||
WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText)
|
||||
AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
|
||||
THEN COALESCE(d.title, '')
|
||||
WHEN similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
|
||||
THEN COALESCE(d.summary, '')
|
||||
ELSE LEFT(COALESCE(dtr.text_body, ''), 400)
|
||||
END AS snippet,
|
||||
GREATEST(
|
||||
similarity(COALESCE(d.title, ''), :queryText),
|
||||
similarity(COALESCE(d.summary, ''), :queryText),
|
||||
similarity(COALESCE(dtr.text_body, ''), :queryText)
|
||||
) AS score,
|
||||
CASE
|
||||
WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText)
|
||||
AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
|
||||
THEN 'DOCUMENT_TITLE'
|
||||
WHEN similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText)
|
||||
THEN 'DOCUMENT_SUMMARY'
|
||||
ELSE 'REPRESENTATION_TEXT'
|
||||
END AS matched_field
|
||||
FROM DOC.doc_text_representation dtr
|
||||
JOIN DOC.doc_document d ON d.id = dtr.document_id
|
||||
LEFT JOIN DOC.doc_tenant dt ON dt.id = d.owner_tenant_id
|
||||
WHERE (
|
||||
COALESCE(d.title, '') % :queryText
|
||||
OR COALESCE(d.summary, '') % :queryText
|
||||
OR COALESCE(dtr.text_body, '') % :queryText
|
||||
)
|
||||
""");
|
||||
public List<SearchHit> search(SearchExecutionContext context, int limit, double threshold) {
|
||||
String scoreExpr = "GREATEST(" +
|
||||
"similarity(COALESCE(d.title, ''), :queryText), " +
|
||||
"similarity(COALESCE(d.summary, ''), :queryText), " +
|
||||
"similarity(COALESCE(dtr.text_body, ''), :queryText))";
|
||||
|
||||
Map<String, Object> params = newParams();
|
||||
params.put("queryText", context.getRequest().getQueryText().trim());
|
||||
appendGenericFilters(sql, params, context);
|
||||
sql.append(" AND GREATEST(")
|
||||
.append(" similarity(COALESCE(d.title, ''), :queryText),")
|
||||
.append(" similarity(COALESCE(d.summary, ''), :queryText),")
|
||||
.append(" similarity(COALESCE(dtr.text_body, ''), :queryText)")
|
||||
.append(") >= :minSimilarity");
|
||||
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
|
||||
params.put("minSimilarity", 0.10d);
|
||||
params.put("limit", engineLimit(context));
|
||||
StringBuilder sql = new StringBuilder("SELECT " +
|
||||
"d.id AS document_id, " +
|
||||
"dtr.id AS representation_id, " +
|
||||
"CAST(d.document_type AS text) AS document_type, " +
|
||||
"CAST(d.document_family AS text) AS document_family, " +
|
||||
"CAST(d.visibility AS text) AS visibility, " +
|
||||
"d.title AS title, " +
|
||||
"d.summary AS summary, " +
|
||||
"COALESCE(dtr.language_code, d.language_code) AS language_code, " +
|
||||
"d.mime_type AS mime_type, " +
|
||||
"d.created_at AS created_at, " +
|
||||
"d.updated_at AS updated_at, " +
|
||||
"LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet, " +
|
||||
scoreExpr + " AS score, " +
|
||||
"CASE " +
|
||||
"WHEN similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(d.summary, ''), :queryText) " +
|
||||
" AND similarity(COALESCE(d.title, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_TITLE' " +
|
||||
"WHEN similarity(COALESCE(d.summary, ''), :queryText) >= similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_SUMMARY' " +
|
||||
"ELSE 'REPRESENTATION_TEXT' END AS matched_field " +
|
||||
"FROM doc.doc_text_representation dtr " +
|
||||
"JOIN doc.doc_document d ON d.id = dtr.document_id " +
|
||||
"LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id " +
|
||||
"WHERE " + scoreExpr + " >= :threshold");
|
||||
|
||||
Query query = entityManager.createNativeQuery(sql.toString());
|
||||
bindParameters(query, params);
|
||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
params.addValue("queryText", context.getRequest().getQueryText());
|
||||
params.addValue("threshold", threshold);
|
||||
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
||||
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
|
||||
params.addValue("limit", limit);
|
||||
|
||||
List<?> rows = query.getResultList();
|
||||
List<TrigramSearchRow> results = new ArrayList<>(rows.size());
|
||||
for (Object row : rows) {
|
||||
Object[] cols = (Object[]) row;
|
||||
results.add(new TrigramSearchRow(
|
||||
asUuid(cols[0]),
|
||||
asUuid(cols[1]),
|
||||
asString(cols[2]),
|
||||
asString(cols[3]),
|
||||
asString(cols[4]),
|
||||
asString(cols[5]),
|
||||
asString(cols[6]),
|
||||
asString(cols[7]),
|
||||
asString(cols[8]),
|
||||
asOffsetDateTime(cols[9]),
|
||||
asOffsetDateTime(cols[10]),
|
||||
asString(cols[11]),
|
||||
asDouble(cols[12]),
|
||||
asString(cols[13])
|
||||
));
|
||||
}
|
||||
return results;
|
||||
return jdbcTemplate.query(sql.toString(), params,
|
||||
new SearchHitRowMapper(SearchEngineType.POSTGRES_TRIGRAM, SearchMatchField.REPRESENTATION_TEXT));
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,54 @@
|
||||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.search.dto.SearchEngineType;
|
||||
import at.procon.dip.search.dto.SearchHit;
|
||||
import at.procon.dip.search.dto.SearchMatchField;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import org.springframework.jdbc.core.RowMapper;
|
||||
|
||||
final class SearchHitRowMapper implements RowMapper<SearchHit> {
|
||||
|
||||
private final SearchEngineType engineType;
|
||||
private final SearchMatchField defaultField;
|
||||
|
||||
SearchHitRowMapper(SearchEngineType engineType, SearchMatchField defaultField) {
|
||||
this.engineType = engineType;
|
||||
this.defaultField = defaultField;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SearchHit mapRow(ResultSet rs, int rowNum) throws SQLException {
|
||||
String matchedField = safeGetString(rs, "matched_field");
|
||||
return SearchHit.builder()
|
||||
.documentId(rs.getObject("document_id", java.util.UUID.class))
|
||||
.representationId(rs.getObject("representation_id", java.util.UUID.class))
|
||||
.documentType(DocumentType.valueOf(rs.getString("document_type")))
|
||||
.documentFamily(DocumentFamily.valueOf(rs.getString("document_family")))
|
||||
.visibility(DocumentVisibility.valueOf(rs.getString("visibility")))
|
||||
.title(safeGetString(rs, "title"))
|
||||
.summary(safeGetString(rs, "summary"))
|
||||
.languageCode(safeGetString(rs, "language_code"))
|
||||
.mimeType(safeGetString(rs, "mime_type"))
|
||||
.primaryEngine(engineType)
|
||||
.matchedField(matchedField == null || matchedField.isBlank()
|
||||
? defaultField
|
||||
: SearchMatchField.valueOf(matchedField))
|
||||
.snippet(safeGetString(rs, "snippet"))
|
||||
.rawScore(rs.getDouble("score"))
|
||||
.createdAt(rs.getObject("created_at", java.time.OffsetDateTime.class))
|
||||
.updatedAt(rs.getObject("updated_at", java.time.OffsetDateTime.class))
|
||||
.build();
|
||||
}
|
||||
|
||||
private String safeGetString(ResultSet rs, String column) {
|
||||
try {
|
||||
return rs.getString(column);
|
||||
} catch (SQLException ignore) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,84 @@
|
||||
package at.procon.dip.search.repository;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.search.api.SearchExecutionContext;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
|
||||
final class SearchSqlFilterSupport {
|
||||
|
||||
private SearchSqlFilterSupport() {
|
||||
}
|
||||
|
||||
static void appendCommonFilters(StringBuilder sql,
|
||||
MapSqlParameterSource params,
|
||||
SearchExecutionContext context,
|
||||
String documentAlias,
|
||||
String representationAlias,
|
||||
boolean tenantJoinPresent) {
|
||||
Set<DocumentType> documentTypes = firstNonEmpty(context.getRequest().getDocumentTypes(), context.getScope().documentTypes());
|
||||
if (!CollectionUtils.isEmpty(documentTypes)) {
|
||||
sql.append(" AND CAST(").append(documentAlias).append(".document_type AS text) IN (:documentTypes)");
|
||||
params.addValue("documentTypes", enumNames(documentTypes));
|
||||
}
|
||||
|
||||
Set<DocumentFamily> documentFamilies = firstNonEmpty(context.getRequest().getDocumentFamilies(), context.getScope().documentFamilies());
|
||||
if (!CollectionUtils.isEmpty(documentFamilies)) {
|
||||
sql.append(" AND CAST(").append(documentAlias).append(".document_family AS text) IN (:documentFamilies)");
|
||||
params.addValue("documentFamilies", enumNames(documentFamilies));
|
||||
}
|
||||
|
||||
Set<DocumentVisibility> visibilities = firstNonEmpty(context.getRequest().getVisibilities(), context.getScope().visibilities());
|
||||
if (!CollectionUtils.isEmpty(visibilities)) {
|
||||
sql.append(" AND CAST(").append(documentAlias).append(".visibility AS text) IN (:visibilities)");
|
||||
params.addValue("visibilities", enumNames(visibilities));
|
||||
}
|
||||
|
||||
Set<String> languageCodes = context.getRequest().getLanguageCodes();
|
||||
if (CollectionUtils.isEmpty(languageCodes) && context.getScope().languageCode() != null && !context.getScope().languageCode().isBlank()) {
|
||||
languageCodes = Set.of(context.getScope().languageCode());
|
||||
}
|
||||
if (!CollectionUtils.isEmpty(languageCodes)) {
|
||||
sql.append(" AND COALESCE(").append(representationAlias).append(".language_code, ")
|
||||
.append(documentAlias).append(".language_code, '') IN (:languageCodes)");
|
||||
params.addValue("languageCodes", languageCodes);
|
||||
}
|
||||
|
||||
Set<RepresentationType> representationTypes = context.getRequest().getRepresentationTypes();
|
||||
if (!CollectionUtils.isEmpty(representationTypes)) {
|
||||
sql.append(" AND CAST(").append(representationAlias).append(".representation_type AS text) IN (:representationTypes)");
|
||||
params.addValue("representationTypes", enumNames(representationTypes));
|
||||
} else {
|
||||
sql.append(" AND ").append(representationAlias).append(".is_primary = true");
|
||||
}
|
||||
|
||||
if (context.getRequest().getCreatedFrom() != null) {
|
||||
sql.append(" AND ").append(documentAlias).append(".created_at >= :createdFrom");
|
||||
params.addValue("createdFrom", context.getRequest().getCreatedFrom());
|
||||
}
|
||||
if (context.getRequest().getCreatedTo() != null) {
|
||||
sql.append(" AND ").append(documentAlias).append(".created_at <= :createdTo");
|
||||
params.addValue("createdTo", context.getRequest().getCreatedTo());
|
||||
}
|
||||
|
||||
if (tenantJoinPresent && !CollectionUtils.isEmpty(context.getScope().ownerTenantKeys())) {
|
||||
sql.append(" AND dt.tenant_key IN (:ownerTenantKeys)");
|
||||
params.addValue("ownerTenantKeys", context.getScope().ownerTenantKeys());
|
||||
}
|
||||
}
|
||||
|
||||
private static <T> Set<T> firstNonEmpty(Set<T> primary, Set<T> fallback) {
|
||||
return !CollectionUtils.isEmpty(primary) ? primary : fallback;
|
||||
}
|
||||
|
||||
private static List<String> enumNames(Collection<? extends Enum<?>> values) {
|
||||
return values.stream().map(Enum::name).collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,25 @@
|
||||
package at.procon.dip.search.service;
|
||||
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.boot.ApplicationArguments;
|
||||
import org.springframework.boot.ApplicationRunner;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class SearchLexicalIndexStartupRunner implements ApplicationRunner {
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
private final DocumentLexicalIndexService lexicalIndexService;
|
||||
|
||||
@Override
|
||||
public void run(ApplicationArguments args) {
|
||||
int updated = lexicalIndexService.backfillMissingVectors(properties.getSearch().getStartupLexicalBackfillLimit());
|
||||
if (updated > 0) {
|
||||
log.info("Search lexical index startup backfill updated {} representations", updated);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,9 +1,11 @@
|
||||
package at.procon.dip.search.service;
|
||||
|
||||
import at.procon.dip.search.dto.SearchDebugResponse;
|
||||
import at.procon.dip.search.dto.SearchRequest;
|
||||
import at.procon.dip.search.dto.SearchResponse;
|
||||
import at.procon.dip.search.spi.SearchDocumentScope;
|
||||
|
||||
public interface SearchOrchestrator {
|
||||
SearchResponse search(SearchRequest request, SearchDocumentScope scope);
|
||||
SearchDebugResponse debug(SearchRequest request, SearchDocumentScope scope);
|
||||
}
|
||||
|
||||
@ -0,0 +1,39 @@
|
||||
package at.procon.dip.search.service;
|
||||
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
|
||||
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
|
||||
import at.procon.ted.config.TedProcessorProperties;
|
||||
import at.procon.ted.service.VectorizationService;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class SemanticQueryEmbeddingService {
|
||||
|
||||
private final TedProcessorProperties properties;
|
||||
private final DocumentEmbeddingService documentEmbeddingService;
|
||||
private final VectorizationService vectorizationService;
|
||||
|
||||
public Optional<QueryEmbedding> buildQueryEmbedding(String queryText) {
|
||||
if (!properties.getVectorization().isEnabled()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
try {
|
||||
DocumentEmbeddingModel model = documentEmbeddingService.findActiveModelByKey(
|
||||
properties.getVectorization().getModelName());
|
||||
float[] vector = vectorizationService.generateQueryEmbedding(queryText);
|
||||
return Optional.of(new QueryEmbedding(model.getId(), vectorizationService.floatArrayToVectorString(vector)));
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to generate semantic query embedding: {}", e.getMessage());
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
public record QueryEmbedding(UUID modelId, String vectorString) {
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,26 @@
|
||||
-- Slice 1 + Slice 2 generic search support for DOC documents.
-- Adds lexical-search support columns/indexes and the pg_trgm extension.
-- Every statement uses IF NOT EXISTS, so re-running the script is harmless.

-- Provides gin_trgm_ops, required by the trigram indexes below.
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- Lexical search columns on the text representation.
ALTER TABLE DOC.doc_text_representation
    ADD COLUMN IF NOT EXISTS search_config VARCHAR(64),
    ADD COLUMN IF NOT EXISTS search_vector tsvector;

-- Full-text index over the tsvector column.
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_search_vector
    ON DOC.doc_text_representation USING GIN (search_vector);

-- Trigram indexes over the three text columns.
CREATE INDEX IF NOT EXISTS idx_doc_document_title_trgm
    ON DOC.doc_document USING GIN (title gin_trgm_ops);

CREATE INDEX IF NOT EXISTS idx_doc_document_summary_trgm
    ON DOC.doc_document USING GIN (summary gin_trgm_ops);

CREATE INDEX IF NOT EXISTS idx_doc_text_repr_text_trgm
    ON DOC.doc_text_representation USING GIN (text_body gin_trgm_ops);
|
||||
Loading…
Reference in New Issue