TED Notice Lot documents embedding

This commit is contained in:
trifonovt 2026-04-28 17:26:12 +02:00
parent 902456001e
commit 4bc503ed29
20 changed files with 463 additions and 45 deletions

View File

@ -7,10 +7,10 @@ Included changes:
- Score normalization and result fusion
- Generic /search endpoint
- Lexical index maintenance service and startup backfill runner
- DOC lexical search migration (V9)
- DOC lexical search migration (V14)
- Modified DOC representation write path to refresh search vectors
Important note:
- Full-text search requires V9__doc_search_slice1_support.sql to be applied.
- Full-text search requires V14__doc_search_slice1_support.sql to be applied.
- The lexical index service is guarded and will no-op if the search columns are not yet present.
- Because Flyway is currently disabled in application.yml, apply the migration manually or enable Flyway before using the new search endpoint.

View File

@ -1,11 +1,16 @@
package at.procon.dip.domain.document.repository;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.EmbeddingStatus;
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
import org.springframework.data.domain.Pageable;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.query.Param;
public interface DocumentTextRepresentationRepository extends JpaRepository<DocumentTextRepresentation, UUID> {
@ -20,4 +25,33 @@ public interface DocumentTextRepresentationRepository extends JpaRepository<Docu
long countByRepresentationType(RepresentationType representationType);
Optional<DocumentTextRepresentation> findFirstByDocument_IdAndPrimaryRepresentationTrue(UUID documentId);
@Query("""
SELECT r
FROM DocumentTextRepresentation r
JOIN r.document d
WHERE (:documentType IS NULL OR d.documentType = :documentType)
AND (:representationType IS NULL OR r.representationType = :representationType)
AND (:builderKey IS NULL OR r.builderKey = :builderKey)
AND (:primaryOnly = false OR r.primaryRepresentation = true)
AND r.textBody IS NOT NULL
AND r.textBody <> ''
AND (:includeCompleted = true OR NOT EXISTS (
SELECT e.id
FROM DocumentEmbedding e
WHERE e.representation.id = r.id
AND e.model.modelKey = :modelKey
AND e.embeddingStatus = :completedStatus
))
ORDER BY r.createdAt ASC, r.id ASC
""")
List<DocumentTextRepresentation> findEmbeddingCandidatesByDocumentType(
@Param("documentType") DocumentType documentType,
@Param("representationType") RepresentationType representationType,
@Param("builderKey") String builderKey,
@Param("primaryOnly") boolean primaryOnly,
@Param("modelKey") String modelKey,
@Param("completedStatus") EmbeddingStatus completedStatus,
@Param("includeCompleted") boolean includeCompleted,
Pageable pageable);
}

View File

@ -1,17 +1,23 @@
package at.procon.dip.domain.ted.config;
import jakarta.validation.constraints.Min;
import jakarta.validation.constraints.Positive;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
import org.springframework.validation.annotation.Validated;
@Configuration
@ConfigurationProperties(prefix = "dip.ted.projection")
@Validated
@Data
public class TedProjectionProperties {
private boolean enabled = true;
private boolean startupBackfillEnabled = false;
@Positive
/**
* Maximum number of legacy TED documents to backfill on startup. 0 means no limit.
*/
@Min(0)
private int startupBackfillLimit = 250;
@Positive
private int structuredSearchHybridCandidateLimit = 5000;
@ -29,7 +35,10 @@ public class TedProjectionProperties {
* Optional startup/backfill path for notices that were imported before lot documents existed.
*/
private boolean startupBackfillEnabled = false;
@Positive
/**
* Maximum number of projections to backfill on startup. 0 means no limit.
*/
@Min(0)
private int startupBackfillLimit = 1000;
/**
* Queue embeddings whenever the lot semantic text representation is created or changed.

View File

@ -37,7 +37,7 @@ import org.springframework.util.StringUtils;
@Slf4j
public class TedLotDocumentMaterializationService {
public static final String BUILDER_KEY = "ted-lot-structured-text";
public static final String BUILDER_KEY = "ted-lot-clustering-text-v1";
private final TedProjectionProperties properties;
private final TedNoticeLotRepository lotRepository;
@ -174,22 +174,20 @@ public class TedLotDocumentMaterializationService {
private String buildSemanticText(TedNoticeProjection projection, TedNoticeLot lot) {
StringBuilder sb = new StringBuilder(1024);
append(sb, "Document type", "TED procurement lot");
append(sb, "Lot title", lot.getTitle());
append(sb, "Lot description", lot.getDescription());
append(sb, "Project title", projection.getProjectTitle());
append(sb, "Entity type", "Procurement lot");
append(sb, "Procurement scope", joinedLines(lot.getTitle(), lot.getDescription()));
append(sb, "Procurement category", joined(firstNonEmpty(lot.getCpvCodes(), projection.getCpvCodes())));
append(sb, "Relevant domain", projection.getBuyerActivityType());
append(sb, "Contract type", projection.getContractNature() == null ? null : projection.getContractNature().name());
append(sb, "Geographic context", joined(firstNonEmpty(lot.getNutsCodes(), projection.getNutsCodes()), projection.getBuyerCountryCode()));
if (!StringUtils.hasText(lot.getDescription())) {
append(sb, "Project description", projection.getProjectDescription());
append(sb, "Parent notice context", joinedLines(projection.getProjectTitle(), projection.getProjectDescription()));
} else if (properties.getLotDocuments().isIncludeProjectDescription()) {
append(sb, "Project context", projection.getProjectDescription());
append(sb, "Parent notice context", joinedLines(projection.getProjectTitle(), projection.getProjectDescription()));
} else {
append(sb, "Parent notice context", projection.getProjectTitle());
}
append(sb, "Contract nature", projection.getContractNature() == null ? null : projection.getContractNature().name());
append(sb, "Buyer activity", projection.getBuyerActivityType());
append(sb, "Buyer country", projection.getBuyerCountryCode());
append(sb, "CPV codes", joined(firstNonEmpty(lot.getCpvCodes(), projection.getCpvCodes())));
append(sb, "NUTS codes", joined(firstNonEmpty(lot.getNutsCodes(), projection.getNutsCodes())));
return sb.toString().trim();
}
@ -239,6 +237,23 @@ public class TedLotDocumentMaterializationService {
.collect(Collectors.joining(", "));
}
private String joined(String[] values, String fallback) {
String joined = joined(values);
if (StringUtils.hasText(joined)) {
return joined;
}
return fallback;
}
private String joinedLines(String... values) {
String joined = Arrays.stream(values)
.filter(StringUtils::hasText)
.map(String::trim)
.distinct()
.collect(Collectors.joining("\n"));
return StringUtils.hasText(joined) ? joined : null;
}
private String sanitizeKey(String value) {
return value == null ? "unknown" : value.trim().replaceAll("\\s+", "_");
}

View File

@ -24,6 +24,8 @@ import org.springframework.stereotype.Component;
@Slf4j
public class TedProjectionStartupRunner implements ApplicationRunner {
private static final int STARTUP_BACKFILL_BATCH_SIZE = 1000;
private final TedProjectionProperties properties;
private final ProcurementDocumentRepository procurementDocumentRepository;
private final TedNoticeProjectionRepository projectionRepository;
@ -47,13 +49,22 @@ public class TedProjectionStartupRunner implements ApplicationRunner {
private void backfillNoticeProjections() {
int limit = properties.getStartupBackfillLimit();
log.info("Phase 3 startup backfill enabled - ensuring TED projections for up to {} documents", limit);
var page = procurementDocumentRepository.findAll(
PageRequest.of(0, limit, Sort.by(Sort.Direction.ASC, "createdAt")));
log.info("Phase 3 startup backfill enabled - ensuring TED projections for {} documents", describeLimit(limit));
int synced = 0;
int processed = 0;
int pageNumber = 0;
Sort sort = Sort.by(Sort.Direction.ASC, "createdAt");
while (limit <= 0 || processed < limit) {
int pageSize = pageSizeFor(limit, processed);
var page = procurementDocumentRepository.findAll(PageRequest.of(pageNumber++, pageSize, sort));
if (page.isEmpty()) {
break;
}
for (var legacyDocument : page.getContent()) {
processed++;
if (projectionRepository.existsByLegacyProcurementDocumentId(legacyDocument.getId())) {
continue;
}
@ -61,21 +72,51 @@ public class TedProjectionStartupRunner implements ApplicationRunner {
synced++;
}
if (!page.hasNext()) {
break;
}
}
log.info("Phase 3 startup backfill completed - synced {} TED projections", synced);
}
private void backfillLotDocuments() {
int limit = properties.getLotDocuments().getStartupBackfillLimit();
log.info("TED lot document startup backfill enabled - materializing lots for up to {} projections", limit);
var page = projectionRepository.findAll(
PageRequest.of(0, limit, Sort.by(Sort.Direction.ASC, "createdAt")));
log.info("TED lot document startup backfill enabled - materializing lots for {} projections", describeLimit(limit));
int lotDocuments = 0;
int processed = 0;
int pageNumber = 0;
Sort sort = Sort.by(Sort.Direction.ASC, "createdAt");
while (limit <= 0 || processed < limit) {
int pageSize = pageSizeFor(limit, processed);
var page = projectionRepository.findAll(PageRequest.of(pageNumber++, pageSize, sort));
if (page.isEmpty()) {
break;
}
for (var projection : page.getContent()) {
processed++;
lotDocuments += lotDocumentMaterializationService.materializeProjectionLots(projection.getId());
}
if (!page.hasNext()) {
break;
}
}
log.info("TED lot document startup backfill completed - materialized/updated {} lot documents", lotDocuments);
}
private int pageSizeFor(int limit, int processed) {
if (limit <= 0) {
return STARTUP_BACKFILL_BATCH_SIZE;
}
return Math.min(STARTUP_BACKFILL_BATCH_SIZE, limit - processed);
}
private String describeLimit(int limit) {
return limit > 0 ? "up to " + limit : "all available";
}
}

View File

@ -1,6 +1,8 @@
package at.procon.dip.embedding.config;
import at.procon.dip.domain.document.DistanceMetric;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.embedding.model.EmbeddingPrefixMode;
import java.time.Duration;
import java.util.LinkedHashMap;
@ -21,6 +23,7 @@ public class EmbeddingProperties {
private Map<String, ModelProperties> models = new LinkedHashMap<>();
private IndexingProperties indexing = new IndexingProperties();
private JobsProperties jobs = new JobsProperties();
private StartupProperties startup = new StartupProperties();
@Data
public static class ProviderProperties {
@ -79,4 +82,18 @@ public class EmbeddingProperties {
private Duration maxRetryDelay = Duration.ofHours(6);
private long schedulerDelayMs = 5000;
}
@Data
public static class StartupProperties {
private boolean enqueueMissingEnabled = false;
private boolean processReadyEnabled = false;
private DocumentType documentType;
private RepresentationType representationType;
private String builderKey;
private boolean primaryOnly = false;
private String modelKey;
private boolean force = false;
private int batchSize = 1000;
private long maxRepresentationsPerRun = 0;
}
}

View File

@ -0,0 +1,17 @@
package at.procon.dip.embedding.service;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
public record ScopedEmbeddingEnqueueResult(
DocumentType documentType,
RepresentationType representationType,
String builderKey,
boolean primaryOnly,
String modelKey,
boolean force,
int requestedLimit,
int matchedRepresentations,
int jobsQueuedOrAlreadyActive
) {
}

View File

@ -0,0 +1,91 @@
package at.procon.dip.embedding.service;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.EmbeddingStatus;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
import at.procon.dip.embedding.job.service.EmbeddingJobService;
import at.procon.dip.embedding.model.EmbeddingJobType;
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.StringUtils;
@Service
@RequiredArgsConstructor
public class ScopedEmbeddingEnqueueService {
private static final int MAX_LIMIT = 10_000;
private final DocumentTextRepresentationRepository representationRepository;
private final EmbeddingJobService jobService;
private final EmbeddingModelRegistry modelRegistry;
@Transactional
public ScopedEmbeddingEnqueueResult enqueueByDocumentType(DocumentType documentType,
RepresentationType representationType,
String builderKey,
boolean primaryOnly,
String modelKey,
boolean force,
int limit) {
return enqueueByDocumentType(documentType, representationType, builderKey, primaryOnly, modelKey, force, 0, limit);
}
@Transactional
public ScopedEmbeddingEnqueueResult enqueueByDocumentType(DocumentType documentType,
RepresentationType representationType,
String builderKey,
boolean primaryOnly,
String modelKey,
boolean force,
int pageNumber,
int limit) {
String effectiveModelKey = StringUtils.hasText(modelKey)
? modelKey
: modelRegistry.getRequiredDefaultDocumentModelKey();
modelRegistry.getRequired(effectiveModelKey);
int effectiveLimit = Math.max(1, Math.min(limit, MAX_LIMIT));
int effectivePageNumber = Math.max(0, pageNumber);
String normalizedBuilderKey = StringUtils.hasText(builderKey) ? builderKey.trim() : null;
List<DocumentTextRepresentation> representations = representationRepository.findEmbeddingCandidatesByDocumentType(
documentType,
representationType,
normalizedBuilderKey,
primaryOnly,
effectiveModelKey,
EmbeddingStatus.COMPLETED,
force,
PageRequest.of(effectivePageNumber, effectiveLimit)
);
int jobs = 0;
for (DocumentTextRepresentation representation : representations) {
jobService.enqueueForRepresentation(
representation.getDocument().getId(),
representation.getId(),
effectiveModelKey,
EmbeddingJobType.DOCUMENT_EMBED
);
jobs++;
}
return new ScopedEmbeddingEnqueueResult(
documentType,
representationType,
normalizedBuilderKey,
primaryOnly,
effectiveModelKey,
force,
effectiveLimit,
representations.size(),
jobs
);
}
}

View File

@ -0,0 +1,92 @@
package at.procon.dip.embedding.startup;
import at.procon.dip.embedding.config.EmbeddingProperties;
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueResult;
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueService;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.ApplicationArguments;
import org.springframework.boot.ApplicationRunner;
import org.springframework.core.Ordered;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;
@Component
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
@RequiredArgsConstructor
@Order(Ordered.LOWEST_PRECEDENCE)
@Slf4j
public class EmbeddingStartupRunner implements ApplicationRunner {
private final EmbeddingProperties properties;
private final ScopedEmbeddingEnqueueService enqueueService;
private final RepresentationEmbeddingOrchestrator orchestrator;
@Override
public void run(ApplicationArguments args) {
EmbeddingProperties.StartupProperties startup = properties.getStartup();
if (!properties.isEnabled() || startup == null) {
return;
}
if (startup.isEnqueueMissingEnabled()) {
enqueueMissing(startup);
}
if (startup.isProcessReadyEnabled()) {
processReadyJobs();
}
}
private void enqueueMissing(EmbeddingProperties.StartupProperties startup) {
int batchSize = Math.max(1, startup.getBatchSize());
long max = startup.getMaxRepresentationsPerRun();
long remaining = max > 0 ? max : Long.MAX_VALUE;
int pageNumber = 0;
long matched = 0;
long queued = 0;
log.info("Startup embedding enqueue enabled (documentType={}, representationType={}, builderKey={}, primaryOnly={}, force={}, modelKey={}, maxRepresentationsPerRun={})",
startup.getDocumentType(), startup.getRepresentationType(), startup.getBuilderKey(), startup.isPrimaryOnly(),
startup.isForce(), startup.getModelKey(), max > 0 ? max : "unbounded");
while (remaining > 0) {
int requested = (int) Math.min(batchSize, remaining);
ScopedEmbeddingEnqueueResult result = enqueueService.enqueueByDocumentType(
startup.getDocumentType(),
startup.getRepresentationType(),
startup.getBuilderKey(),
startup.isPrimaryOnly(),
startup.getModelKey(),
startup.isForce(),
pageNumber,
requested
);
matched += result.matchedRepresentations();
queued += result.jobsQueuedOrAlreadyActive();
remaining -= result.matchedRepresentations();
if (result.matchedRepresentations() < requested) {
break;
}
pageNumber++;
}
log.info("Startup embedding enqueue completed - matched {} representation(s), queued/already-active {} job(s)",
matched, queued);
}
private void processReadyJobs() {
if (!properties.getJobs().isEnabled()) {
log.warn("Startup embedding processing was requested but dip.embedding.jobs.enabled=false; queued jobs will not be processed by the NEW embedding job subsystem");
return;
}
int processed = orchestrator.processNextReadyBatch();
log.info("Startup embedding processing completed - processed {} ready job(s)", processed);
}
}

View File

@ -0,0 +1,66 @@
package at.procon.dip.embedding.web;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueResult;
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueService;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.RequiredArgsConstructor;
import org.springframework.http.HttpStatus;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.server.ResponseStatusException;
@RestController
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
@RequestMapping("/v1/dip/admin/embeddings")
@RequiredArgsConstructor
@Tag(name = "Embedding Admin", description = "Administrative operations for NEW-runtime representation embeddings")
public class EmbeddingAdminController {
private final ScopedEmbeddingEnqueueService enqueueService;
@PostMapping("/enqueue-by-document-type")
@Operation(
summary = "Queue embeddings by document type",
description = "Queues embedding jobs only for DOC text representations belonging to the requested document type."
)
public ScopedEmbeddingEnqueueResult enqueueByDocumentType(
@RequestParam String documentType,
@RequestParam(required = false) String representationType,
@RequestParam(required = false) String builderKey,
@RequestParam(required = false, defaultValue = "false") boolean primaryOnly,
@RequestParam(required = false) String modelKey,
@RequestParam(required = false, defaultValue = "false") boolean force,
@RequestParam(required = false, defaultValue = "1000") int limit) {
return enqueueService.enqueueByDocumentType(
parseEnum(DocumentType.class, documentType, "documentType"),
parseEnum(RepresentationType.class, representationType, "representationType"),
builderKey,
primaryOnly,
modelKey,
force,
limit
);
}
private <E extends Enum<E>> E parseEnum(Class<E> enumType, String value, String parameterName) {
if (value == null || value.isBlank()) {
return null;
}
String normalized = value.trim().replace('-', '_').toUpperCase(java.util.Locale.ROOT);
try {
return Enum.valueOf(enumType, normalized);
} catch (IllegalArgumentException ex) {
throw new ResponseStatusException(
HttpStatus.BAD_REQUEST,
"Invalid " + parameterName + ": " + value
);
}
}
}

View File

@ -119,7 +119,7 @@ ted:
# Mail account username (email address)
username: archiv@procon.co.at
# Mail account password
password: ${MAIL_PASSWORD:worasigg}
password: ${MAIL_PASSWORD:}
# Use SSL/TLS connection
ssl: true
# Mail folder to read from

View File

@ -43,10 +43,29 @@ dip:
enabled: true
jobs:
enabled: false
parallel-batch-count: 1
parallel-batch-count: 2
process-in-batches: true
batch-size: 16
execution-batch-size: 16
batch-size: 48
execution-batch-size: 48
startup:
# Enqueue missing DOC representation embeddings on NEW-runtime startup.
enqueue-missing-enabled: true
# Also process ready embedding jobs during startup. Requires dip.embedding.jobs.enabled=true.
process-ready-enabled: true
# Leave empty to enqueue missing embeddings for all document types, or set e.g. TED_NOTICE_LOT.
document-type: TED_NOTICE_LOT
# Optional representation filter, e.g. SEMANTIC_TEXT.
representation-type:
# Optional builder filter, e.g. ted-lot-clustering-text-v1.
builder-key:
primary-only: false
# Leave empty to use dip.embedding.default-document-model.
model-key:
# False skips representations that already have a COMPLETED embedding for the model.
force: false
batch-size: 1000
# 0 means enqueue all matching not-vectorized representations.
max-representations-per-run: 0
default-document-model: e5-default
default-query-model: e5-default
@ -258,7 +277,7 @@ dip:
# Mailbox username
username: archiv@procon.co.at
# Mailbox password
password: ${MAIL_PASSWORD:worasigg}
password: ${MAIL_PASSWORD}
# Folder/mailbox name
folder-name: INBOX
# Optional stable provider account key; falls back to username
@ -350,6 +369,17 @@ dip:
startup-backfill-limit: 250
structured-search-hybrid-candidate-limit: 5000
structured-search-facet-bucket-limit: 12
lot-documents:
# Materialize one canonical DOC document of type TED_NOTICE_LOT per TED lot.
enabled: true
# Optional startup/backfill path for notices that were imported before lot documents existed.
startup-backfill-enabled: true
# Maximum number of legacy TED lot documents to backfill during startup (0 = all)
startup-backfill-limit: 0
# Queue embeddings whenever the lot semantic text representation is created or changed.
queue-embeddings-on-change: true
# Include parent notice project description even when the lot already has its own description.
include-parent-description: false
migration:
legacy-audit:

View File

@ -14,12 +14,11 @@ spring:
name: document-intelligence-platform
datasource:
#url: jdbc:postgresql://localhost:5432/RELM
#username: ${DB_USERNAME:postgres}
#password: ${DB_PASSWORD:P54!pcd#Wi}
url: jdbc:postgresql://94.130.218.54:32333/RELM
username: ${DB_USERNAME:postgres}
password: ${DB_PASSWORD:PDmXRx0Rbk9OFOn9qO5Gm/mPCfqW8zwbZ+/YIU1lySc=}
url: jdbc:postgresql://localhost:5432/RELM
#url: jdbc:postgresql://94.130.218.54:32333/RELM
username: ${DB_USERNAME}
password: ${DB_PASSWORD}
driver-class-name: org.postgresql.Driver
hikari:
maximum-pool-size: 5
@ -27,7 +26,7 @@ spring:
connection-timeout: 30000
idle-timeout: 300000
max-lifetime: 900000
leak-detection-threshold: 120000 # 2 minutes - increased to avoid false positives with batch processing
leak-detection-threshold: 300000 # 5 minutes - increased to avoid false positives with batch processing
jpa:
hibernate:

View File

@ -0,0 +1,7 @@
-- Adds an index for the TED lot clustering-oriented semantic representation builder.
-- The original ted-lot-structured-text index is left in place for databases that
-- already materialized or queried the earlier representation shape.
CREATE INDEX IF NOT EXISTS idx_doc_text_repr_ted_lot_clustering_builder
ON DOC.doc_text_representation(builder_key)
WHERE builder_key = 'ted-lot-clustering-text-v1';