Optimize trigram search candidate selection

This commit is contained in:
trifonovt 2026-05-18 14:00:38 +02:00
parent 5c3133d19d
commit 430885b5af
1 changed file with 100 additions and 30 deletions

View File

@ -18,42 +18,112 @@ public class DocumentTrigramSearchRepositoryImpl implements DocumentTrigramSearc
@Override @Override
public List<SearchHit> search(SearchExecutionContext context, int limit, double threshold) { public List<SearchHit> search(SearchExecutionContext context, int limit, double threshold) {
String scoreExpr = "GREATEST(" + StringBuilder sql = new StringBuilder("""
"doc.similarity(COALESCE(d.title, ''), :queryText), " + WITH title_candidates AS (
"doc.similarity(COALESCE(d.summary, ''), :queryText), " + SELECT
"doc.similarity(COALESCE(dtr.text_body, ''), :queryText))"; d.id AS document_id,
dtr.id AS representation_id,
StringBuilder sql = new StringBuilder("SELECT " + 'DOCUMENT_TITLE' AS matched_field,
"d.id AS document_id, " + public.similarity(d.title, :queryText) AS score,
"dtr.id AS representation_id, " + d.updated_at AS updated_at
"CAST(d.document_type AS text) AS document_type, " + FROM doc.doc_text_representation dtr
"CAST(d.document_family AS text) AS document_family, " + JOIN doc.doc_document d ON d.id = dtr.document_id
"CAST(d.visibility AS text) AS visibility, " + LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
"d.title AS title, " + WHERE d.title IS NOT NULL
"d.summary AS summary, " + AND d.title OPERATOR(public.%) :queryText
"COALESCE(dtr.language_code, d.language_code) AS language_code, " + """);
"d.mime_type AS mime_type, " +
"d.created_at AS created_at, " +
"d.updated_at AS updated_at, " +
"LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet, " +
scoreExpr + " AS score, " +
"CASE " +
"WHEN doc.similarity(COALESCE(d.title, ''), :queryText) >= doc.similarity(COALESCE(d.summary, ''), :queryText) " +
" AND doc.similarity(COALESCE(d.title, ''), :queryText) >= doc.similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_TITLE' " +
"WHEN doc.similarity(COALESCE(d.summary, ''), :queryText) >= doc.similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_SUMMARY' " +
"ELSE 'REPRESENTATION_TEXT' END AS matched_field " +
"FROM doc.doc_text_representation dtr " +
"JOIN doc.doc_document d ON d.id = dtr.document_id " +
"LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id " +
"WHERE " + scoreExpr + " >= :threshold");
MapSqlParameterSource params = new MapSqlParameterSource(); MapSqlParameterSource params = new MapSqlParameterSource();
params.addValue("queryText", context.getRequest().getQueryText()); params.addValue("queryText", context.getRequest().getQueryText());
params.addValue("threshold", threshold); params.addValue("threshold", threshold);
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true); params.addValue("branchLimit", limit);
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
params.addValue("limit", limit); params.addValue("limit", limit);
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
sql.append("""
ORDER BY score DESC, d.updated_at DESC
LIMIT :branchLimit
),
summary_candidates AS (
SELECT
d.id AS document_id,
dtr.id AS representation_id,
'DOCUMENT_SUMMARY' AS matched_field,
public.similarity(d.summary, :queryText) AS score,
d.updated_at AS updated_at
FROM doc.doc_text_representation dtr
JOIN doc.doc_document d ON d.id = dtr.document_id
LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
WHERE d.summary IS NOT NULL
AND d.summary OPERATOR(public.%) :queryText
""");
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
sql.append("""
ORDER BY score DESC, d.updated_at DESC
LIMIT :branchLimit
),
text_candidates AS (
SELECT
d.id AS document_id,
dtr.id AS representation_id,
'REPRESENTATION_TEXT' AS matched_field,
public.similarity(dtr.text_body, :queryText) AS score,
d.updated_at AS updated_at
FROM doc.doc_text_representation dtr
JOIN doc.doc_document d ON d.id = dtr.document_id
LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
WHERE dtr.text_body IS NOT NULL
AND dtr.text_body OPERATOR(public.%) :queryText
""");
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
sql.append("""
ORDER BY score DESC, d.updated_at DESC
LIMIT :branchLimit
),
ranked AS (
SELECT DISTINCT ON (representation_id)
document_id,
representation_id,
matched_field,
score,
updated_at
FROM (
SELECT * FROM title_candidates
UNION ALL
SELECT * FROM summary_candidates
UNION ALL
SELECT * FROM text_candidates
) all_candidates
WHERE score >= :threshold
ORDER BY representation_id, score DESC, updated_at DESC
)
SELECT
d.id AS document_id,
dtr.id AS representation_id,
CAST(dtr.representation_type AS text) AS representation_type,
dtr.is_primary AS is_primary,
dtr.chunk_index AS chunk_index,
dtr.chunk_start_offset AS chunk_start_offset,
dtr.chunk_end_offset AS chunk_end_offset,
CAST(d.document_type AS text) AS document_type,
CAST(d.document_family AS text) AS document_family,
CAST(d.visibility AS text) AS visibility,
d.title AS title,
d.summary AS summary,
COALESCE(dtr.language_code, d.language_code) AS language_code,
d.mime_type AS mime_type,
d.created_at AS created_at,
d.updated_at AS updated_at,
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
ranked.score AS score,
ranked.matched_field AS matched_field
FROM ranked
JOIN doc.doc_text_representation dtr ON dtr.id = ranked.representation_id
JOIN doc.doc_document d ON d.id = ranked.document_id
ORDER BY ranked.score DESC, d.updated_at DESC
LIMIT :limit
""");
return jdbcTemplate.query(sql.toString(), params, return jdbcTemplate.query(sql.toString(), params,
new SearchHitRowMapper(SearchEngineType.POSTGRES_TRIGRAM, SearchMatchField.REPRESENTATION_TEXT)); new SearchHitRowMapper(SearchEngineType.POSTGRES_TRIGRAM, SearchMatchField.REPRESENTATION_TEXT));
} }