Optimize trigram search candidate selection
This commit is contained in:
parent
5c3133d19d
commit
430885b5af
|
|
@ -18,42 +18,112 @@ public class DocumentTrigramSearchRepositoryImpl implements DocumentTrigramSearc
|
|||
|
||||
/**
 * Runs a trigram-similarity search over document titles, summaries and
 * representation text bodies and returns the rows mapped to {@link SearchHit}s.
 *
 * <p>NOTE(review): this text appears to be a rendered commit diff (hunk
 * "-18,42 +18,112") whose +/- markers were lost. Two query builders are
 * visible: a single-pass query built from {@code scoreExpr}, and a CTE-based
 * query built from per-field candidate sets. {@code sql} is declared twice as
 * a result, so the lines below cannot compile as one method body; presumably
 * the {@code scoreExpr} variant is the removed side - confirm against the
 * repository.
 *
 * @param context   supplies the query text (via {@code getRequest().getQueryText()})
 *                  and is handed to {@code SearchSqlFilterSupport.appendCommonFilters}
 * @param limit     bound to both {@code :limit} (final row cap) and
 *                  {@code :branchLimit} (row cap of each candidate CTE)
 * @param threshold bound to {@code :threshold}, the minimum score a row must reach
 * @return the rows produced by the query, mapped by {@code SearchHitRowMapper}
 */
@Override
|
||||
public List<SearchHit> search(SearchExecutionContext context, int limit, double threshold) {
|
||||
// Single-pass variant: score is the largest of the title, summary and
// text_body similarities, with NULL columns treated as empty strings.
String scoreExpr = "GREATEST(" +
|
||||
"doc.similarity(COALESCE(d.title, ''), :queryText), " +
|
||||
"doc.similarity(COALESCE(d.summary, ''), :queryText), " +
|
||||
"doc.similarity(COALESCE(dtr.text_body, ''), :queryText))";
|
||||
|
||||
// Single-pass variant: scoreExpr is evaluated in the SELECT list and again in
// the WHERE clause, and matched_field is derived by comparing the three
// similarities pairwise (title wins ties, then summary).
StringBuilder sql = new StringBuilder("SELECT " +
|
||||
"d.id AS document_id, " +
|
||||
"dtr.id AS representation_id, " +
|
||||
"CAST(d.document_type AS text) AS document_type, " +
|
||||
"CAST(d.document_family AS text) AS document_family, " +
|
||||
"CAST(d.visibility AS text) AS visibility, " +
|
||||
"d.title AS title, " +
|
||||
"d.summary AS summary, " +
|
||||
"COALESCE(dtr.language_code, d.language_code) AS language_code, " +
|
||||
"d.mime_type AS mime_type, " +
|
||||
"d.created_at AS created_at, " +
|
||||
"d.updated_at AS updated_at, " +
|
||||
"LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet, " +
|
||||
scoreExpr + " AS score, " +
|
||||
"CASE " +
|
||||
"WHEN doc.similarity(COALESCE(d.title, ''), :queryText) >= doc.similarity(COALESCE(d.summary, ''), :queryText) " +
|
||||
" AND doc.similarity(COALESCE(d.title, ''), :queryText) >= doc.similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_TITLE' " +
|
||||
"WHEN doc.similarity(COALESCE(d.summary, ''), :queryText) >= doc.similarity(COALESCE(dtr.text_body, ''), :queryText) THEN 'DOCUMENT_SUMMARY' " +
|
||||
"ELSE 'REPRESENTATION_TEXT' END AS matched_field " +
|
||||
"FROM doc.doc_text_representation dtr " +
|
||||
"JOIN doc.doc_document d ON d.id = dtr.document_id " +
|
||||
"LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id " +
|
||||
"WHERE " + scoreExpr + " >= :threshold");
|
||||
// CTE variant: opens the title_candidates CTE. The CTE is deliberately left
// unclosed here; filters, ORDER BY, LIMIT and the closing parenthesis are
// appended further down.
// NOTE(review): if public.% is pg_trgm's similarity operator, this pre-filter
// uses pg_trgm's configured similarity threshold rather than :threshold, which
// is only applied later in the ranked CTE - confirm that a :threshold below
// that setting is not expected to return extra rows.
StringBuilder sql = new StringBuilder("""
|
||||
WITH title_candidates AS (
|
||||
SELECT
|
||||
d.id AS document_id,
|
||||
dtr.id AS representation_id,
|
||||
'DOCUMENT_TITLE' AS matched_field,
|
||||
public.similarity(d.title, :queryText) AS score,
|
||||
d.updated_at AS updated_at
|
||||
FROM doc.doc_text_representation dtr
|
||||
JOIN doc.doc_document d ON d.id = dtr.document_id
|
||||
LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
|
||||
WHERE d.title IS NOT NULL
|
||||
AND d.title OPERATOR(public.%) :queryText
|
||||
""");
|
||||
|
||||
// Named parameters shared by every part of the statement.
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
params.addValue("queryText", context.getRequest().getQueryText());
|
||||
params.addValue("threshold", threshold);
|
||||
// Single-pass variant: append the shared filters, then order and cap the result.
// NOTE(review): appendCommonFilters is defined elsewhere; presumably it appends
// further predicates for aliases "d"/"dtr" and binds their parameters - confirm.
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
||||
sql.append(" ORDER BY score DESC, d.updated_at DESC LIMIT :limit");
|
||||
// CTE variant: each candidate CTE is capped at :branchLimit rows, which is
// bound to the same value as :limit.
params.addValue("branchLimit", limit);
|
||||
params.addValue("limit", limit);
|
||||
|
||||
// Shared filters for the still-open title_candidates CTE.
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
||||
// Orders, caps and closes title_candidates, then opens summary_candidates
// (same shape, scored on d.summary).
sql.append("""
|
||||
ORDER BY score DESC, d.updated_at DESC
|
||||
LIMIT :branchLimit
|
||||
),
|
||||
summary_candidates AS (
|
||||
SELECT
|
||||
d.id AS document_id,
|
||||
dtr.id AS representation_id,
|
||||
'DOCUMENT_SUMMARY' AS matched_field,
|
||||
public.similarity(d.summary, :queryText) AS score,
|
||||
d.updated_at AS updated_at
|
||||
FROM doc.doc_text_representation dtr
|
||||
JOIN doc.doc_document d ON d.id = dtr.document_id
|
||||
LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
|
||||
WHERE d.summary IS NOT NULL
|
||||
AND d.summary OPERATOR(public.%) :queryText
|
||||
""");
|
||||
// Shared filters for the still-open summary_candidates CTE.
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
||||
// Orders, caps and closes summary_candidates, then opens text_candidates
// (same shape, scored on dtr.text_body).
sql.append("""
|
||||
ORDER BY score DESC, d.updated_at DESC
|
||||
LIMIT :branchLimit
|
||||
),
|
||||
text_candidates AS (
|
||||
SELECT
|
||||
d.id AS document_id,
|
||||
dtr.id AS representation_id,
|
||||
'REPRESENTATION_TEXT' AS matched_field,
|
||||
public.similarity(dtr.text_body, :queryText) AS score,
|
||||
d.updated_at AS updated_at
|
||||
FROM doc.doc_text_representation dtr
|
||||
JOIN doc.doc_document d ON d.id = dtr.document_id
|
||||
LEFT JOIN doc.doc_tenant dt ON dt.id = d.owner_tenant_id
|
||||
WHERE dtr.text_body IS NOT NULL
|
||||
AND dtr.text_body OPERATOR(public.%) :queryText
|
||||
""");
|
||||
// Shared filters for the still-open text_candidates CTE.
SearchSqlFilterSupport.appendCommonFilters(sql, params, context, "d", "dtr", true);
|
||||
// Orders, caps and closes text_candidates. The ranked CTE then unions the
// three candidate sets, drops rows below :threshold and keeps one row per
// representation_id (highest score first, ties broken by updated_at). The
// final SELECT joins back to the base tables for the display columns and
// applies :limit.
sql.append("""
|
||||
ORDER BY score DESC, d.updated_at DESC
|
||||
LIMIT :branchLimit
|
||||
),
|
||||
ranked AS (
|
||||
SELECT DISTINCT ON (representation_id)
|
||||
document_id,
|
||||
representation_id,
|
||||
matched_field,
|
||||
score,
|
||||
updated_at
|
||||
FROM (
|
||||
SELECT * FROM title_candidates
|
||||
UNION ALL
|
||||
SELECT * FROM summary_candidates
|
||||
UNION ALL
|
||||
SELECT * FROM text_candidates
|
||||
) all_candidates
|
||||
WHERE score >= :threshold
|
||||
ORDER BY representation_id, score DESC, updated_at DESC
|
||||
)
|
||||
SELECT
|
||||
d.id AS document_id,
|
||||
dtr.id AS representation_id,
|
||||
CAST(dtr.representation_type AS text) AS representation_type,
|
||||
dtr.is_primary AS is_primary,
|
||||
dtr.chunk_index AS chunk_index,
|
||||
dtr.chunk_start_offset AS chunk_start_offset,
|
||||
dtr.chunk_end_offset AS chunk_end_offset,
|
||||
CAST(d.document_type AS text) AS document_type,
|
||||
CAST(d.document_family AS text) AS document_family,
|
||||
CAST(d.visibility AS text) AS visibility,
|
||||
d.title AS title,
|
||||
d.summary AS summary,
|
||||
COALESCE(dtr.language_code, d.language_code) AS language_code,
|
||||
d.mime_type AS mime_type,
|
||||
d.created_at AS created_at,
|
||||
d.updated_at AS updated_at,
|
||||
LEFT(COALESCE(dtr.text_body, COALESCE(d.summary, d.title, '')), 400) AS snippet,
|
||||
ranked.score AS score,
|
||||
ranked.matched_field AS matched_field
|
||||
FROM ranked
|
||||
JOIN doc.doc_text_representation dtr ON dtr.id = ranked.representation_id
|
||||
JOIN doc.doc_document d ON d.id = ranked.document_id
|
||||
ORDER BY ranked.score DESC, d.updated_at DESC
|
||||
LIMIT :limit
|
||||
""");
|
||||
|
||||
// Execute with the named parameters and map each row to a SearchHit.
// NOTE(review): the mapper is constructed with POSTGRES_TRIGRAM and
// REPRESENTATION_TEXT; presumably the latter is a fallback matched field when
// the matched_field column cannot be used - confirm in SearchHitRowMapper.
return jdbcTemplate.query(sql.toString(), params,
|
||||
new SearchHitRowMapper(SearchEngineType.POSTGRES_TRIGRAM, SearchMatchField.REPRESENTATION_TEXT));
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue