From 430885b5afa9b400c3c55dae3049d6c7ca0adb24 Mon Sep 17 00:00:00 2001
From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com>
Date: Mon, 18 May 2026 14:00:38 +0200
Subject: [PATCH] Optimize trigram search candidate selection

---
Review notes (everything between the three-dash line and the diff is
ignored by `git am` and does not become part of the commit).

What the change does
  * search() now builds one statement out of three candidate CTEs
    (title_candidates, summary_candidates, text_candidates). Each branch
    filters a single column with OPERATOR(public.%), scores it with
    public.similarity, receives the common filters through
    SearchSqlFilterSupport.appendCommonFilters, and keeps the top
    :branchLimit rows ordered by score, then d.updated_at.
  * The `ranked` CTE unions the three branches, drops rows whose score is
    below :threshold and keeps one row per representation_id, best score
    first.
  * The outer SELECT joins `ranked` back to doc_text_representation and
    doc_document for the result columns (now also representation_type,
    is_primary, chunk_index, chunk_start_offset, chunk_end_offset), orders
    by score and updated_at, and applies LIMIT :limit.
  * :branchLimit and :limit are both bound to the `limit` argument.

Fix folded into this revision
  * Inside `ranked`, every branch selects d.updated_at of the
    representation's own document, so updated_at cannot separate rows of
    the same representation. When two fields scored equally, the reported
    matched_field was therefore arbitrary. The DISTINCT ON ordering now
    prefers DOCUMENT_TITLE, then DOCUMENT_SUMMARY, then REPRESENTATION_TEXT
    on equal score, which is the precedence of the removed CASE expression.
    The change stays on one added line, so hunk and diffstat counts are
    unchanged; the `index` line still names the original post-image blob.

Open questions
  * NOTE(review): the `%` operator compares against the session setting
    pg_trgm.similarity_threshold, not against :threshold. A :threshold
    below that setting would no longer return the low-scoring rows the old
    WHERE clause returned. Confirm the configured value and the smallest
    threshold callers pass.
  * NOTE(review): the removed code called doc.similarity, the new code
    calls public.similarity and OPERATOR(public.%). Confirm pg_trgm is
    installed in schema `public` in every environment.

 .../DocumentTrigramSearchRepositoryImpl.java  | 130 ++++++++++++++----
 1 file changed, 100 insertions(+), 30 deletions(-)

diff --git a/src/main/java/at/procon/dip/search/repository/DocumentTrigramSearchRepositoryImpl.java b/src/main/java/at/procon/dip/search/repository/DocumentTrigramSearchRepositoryImpl.java
index c8375b6..e23b443 100644
--- a/src/main/java/at/procon/dip/search/repository/DocumentTrigramSearchRepositoryImpl.java
+++ b/src/main/java/at/procon/dip/search/repository/DocumentTrigramSearchRepositoryImpl.java
@@ -18,42 +18,112 @@ public class DocumentTrigramSearchRepositoryImpl implements DocumentTrigramSearc
 
     @Override
     public List search(SearchExecutionContext context, int limit, double threshold) {
-        String scoreExpr = "GREATEST(" +
-                "doc.similarity(COALESCE(d.title, ''), :queryText), " +
-                "doc.similarity(COALESCE(d.summary, ''), :queryText), " +
-                "doc.similarity(COALESCE(dtr.text_body, ''), :queryText))";
-
-        StringBuilder sql = new StringBuilder("SELECT " +
-                "d.id AS document_id, " +
-                "dtr.id AS representation_id, " +
-                "CAST(d.document_type AS text) AS document_type, " +
-                "CAST(d.document_family AS text) AS document_family, " +
-                "CAST(d.visibility AS text) AS visibility, " +
-                "d.title AS title, " +
-                "d.summary AS summary, " +
-                "COALESCE(dtr.language_code, d.language_code) AS language_code, " +
-                "d.mime_type AS mime_type, " +
-                "d.created_at AS created_at, " +
-                "d.updated_at AS updated_at, " +
-                "LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet, " +
-                scoreExpr + " AS score, " +
-                "CASE " +
-                "WHEN doc.similarity(COALESCE(d.title, ''), :queryText) >= doc.similarity(COALESCE(d.summary, ''), :queryText) " +
-                " AND doc.similarity(COALESCE(d.title, ''), :queryText) >= doc.similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_TITLE' " +
-                "WHEN doc.similarity(COALESCE(d.summary, ''), :queryText) >= doc.similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_SUMMARY' " +
-                "ELSE 'REPRESENTATION_TEXT' END AS matched_field " +
-                "FROM doc.doc_text_representation dtr " +
-                "JOIN doc.doc_document d ON d.id = dtr.document_id " +
-                "LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id " +
-                "WHERE " + scoreExpr + " >= :threshold");
+        StringBuilder sql = new StringBuilder("""
+                WITH title_candidates AS (
+                    SELECT
+                        d.id AS document_id,
+                        dtr.id AS representation_id,
+                        'DOCUMENT_TITLE' AS matched_field,
+                        public.similarity(d.title, :queryText) AS score,
+                        d.updated_at AS updated_at
+                    FROM doc.doc_text_representation dtr
+                    JOIN doc.doc_document d ON d.id = dtr.document_id
+                    LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
+                    WHERE d.title IS NOT NULL
+                      AND d.title OPERATOR(public.%) :queryText
+                """);
 
         MapSqlParameterSource params = new MapSqlParameterSource();
         params.addValue("queryText", context.getRequest().getQueryText());
         params.addValue("threshold", threshold);
-        SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
-        sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
+        params.addValue("branchLimit", limit);
         params.addValue("limit", limit);
 
+        SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
+        sql.append("""
+                    ORDER BY score DESC, d.updated_at DESC
+                    LIMIT :branchLimit
+                ),
+                summary_candidates AS (
+                    SELECT
+                        d.id AS document_id,
+                        dtr.id AS representation_id,
+                        'DOCUMENT_SUMMARY' AS matched_field,
+                        public.similarity(d.summary, :queryText) AS score,
+                        d.updated_at AS updated_at
+                    FROM doc.doc_text_representation dtr
+                    JOIN doc.doc_document d ON d.id = dtr.document_id
+                    LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
+                    WHERE d.summary IS NOT NULL
+                      AND d.summary OPERATOR(public.%) :queryText
+                """);
+        SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
+        sql.append("""
+                    ORDER BY score DESC, d.updated_at DESC
+                    LIMIT :branchLimit
+                ),
+                text_candidates AS (
+                    SELECT
+                        d.id AS document_id,
+                        dtr.id AS representation_id,
+                        'REPRESENTATION_TEXT' AS matched_field,
+                        public.similarity(dtr.text_body, :queryText) AS score,
+                        d.updated_at AS updated_at
+                    FROM doc.doc_text_representation dtr
+                    JOIN doc.doc_document d ON d.id = dtr.document_id
+                    LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
+                    WHERE dtr.text_body IS NOT NULL
+                      AND dtr.text_body OPERATOR(public.%) :queryText
+                """);
+        SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
+        sql.append("""
+                    ORDER BY score DESC, d.updated_at DESC
+                    LIMIT :branchLimit
+                ),
+                ranked AS (
+                    SELECT DISTINCT ON (representation_id)
+                        document_id,
+                        representation_id,
+                        matched_field,
+                        score,
+                        updated_at
+                    FROM (
+                        SELECT * FROM title_candidates
+                        UNION ALL
+                        SELECT * FROM summary_candidates
+                        UNION ALL
+                        SELECT * FROM text_candidates
+                    ) all_candidates
+                    WHERE score >= :threshold
+                    ORDER BY representation_id, score DESC, CASE matched_field WHEN 'DOCUMENT_TITLE' THEN 0 WHEN 'DOCUMENT_SUMMARY' THEN 1 ELSE 2 END, updated_at DESC
+                )
+                SELECT
+                    d.id AS document_id,
+                    dtr.id AS representation_id,
+                    CAST(dtr.representation_type AS text) AS representation_type,
+                    dtr.is_primary AS is_primary,
+                    dtr.chunk_index AS chunk_index,
+                    dtr.chunk_start_offset AS chunk_start_offset,
+                    dtr.chunk_end_offset AS chunk_end_offset,
+                    CAST(d.document_type AS text) AS document_type,
+                    CAST(d.document_family AS text) AS document_family,
+                    CAST(d.visibility AS text) AS visibility,
+                    d.title AS title,
+                    d.summary AS summary,
+                    COALESCE(dtr.language_code, d.language_code) AS language_code,
+                    d.mime_type AS mime_type,
+                    d.created_at AS created_at,
+                    d.updated_at AS updated_at,
+                    LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
+                    ranked.score AS score,
+                    ranked.matched_field AS matched_field
+                FROM ranked
+                JOIN doc.doc_text_representation dtr ON dtr.id = ranked.representation_id
+                JOIN doc.doc_document d ON d.id = ranked.document_id
+                ORDER BY ranked.score DESC, d.updated_at DESC
+                LIMIT :limit
+                """);
+
         return jdbcTemplate.query(sql.toString(), params,
                 new SearchHitRowMapper(SearchEngineType.POSTGRES_TRIGRAM, SearchMatchField.REPRESENTATION_TEXT));
     }