defer lexical indexing of TED text representations

This commit is contained in:
trifonovt 2026-04-23 00:07:57 +02:00
parent a501176c83
commit 06c1485df9
13 changed files with 280 additions and 6 deletions

View File

@ -37,7 +37,9 @@ public class DocumentRepresentationService {
.textBody(command.textBody()) .textBody(command.textBody())
.build(); .build();
DocumentTextRepresentation saved = representationRepository.saveAndFlush(representation); DocumentTextRepresentation saved = representationRepository.saveAndFlush(representation);
lexicalIndexService.indexRepresentation(saved.getId()); if (!command.deferLexicalIndex()) {
lexicalIndexService.indexRepresentation(saved.getId());
}
return saved; return saved;
} }

View File

@ -14,6 +14,7 @@ public record AddDocumentTextRepresentationCommand(
Integer chunkStartOffset, Integer chunkStartOffset,
Integer chunkEndOffset, Integer chunkEndOffset,
boolean primaryRepresentation, boolean primaryRepresentation,
String textBody String textBody,
boolean deferLexicalIndex
) { ) {
} }

View File

@ -76,7 +76,8 @@ public class TimeEntryRepresentationMaterializationService {
null, null,
null, null,
true, true,
projection.getSemanticText() projection.getSemanticText(),
false
))); )));
if (changed if (changed

View File

@ -54,6 +54,12 @@ public class DipIngestionProperties {
@NotBlank @NotBlank
private String tedPackageImportBatchId = "phase41-ted-package"; private String tedPackageImportBatchId = "phase41-ted-package";
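/**
* Skip synchronous lexical tsvector indexing during TED package child import.
* Missing search vectors are then backfilled asynchronously by the lexical index scheduler,
* which only does work when {@code scheduledLexicalBackfillEnabled} in DipSearchProperties is true
* (that flag has no initializer and is therefore false unless configured).
*/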
private boolean tedPackageDeferLexicalIndexing = true;
private boolean gatewayOnlyForTedPackages = false; private boolean gatewayOnlyForTedPackages = false;
@NotBlank @NotBlank

View File

@ -36,6 +36,7 @@ import at.procon.dip.ingestion.config.DipIngestionProperties;
import at.procon.dip.ingestion.dto.ImportedDocumentResult; import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.ingestion.service.IngestionInternalAttributes;
import at.procon.dip.ingestion.util.DocumentImportSupport; import at.procon.dip.ingestion.util.DocumentImportSupport;
import at.procon.dip.normalization.service.TextRepresentationBuildService; import at.procon.dip.normalization.service.TextRepresentationBuildService;
import at.procon.dip.normalization.spi.RepresentationBuildRequest; import at.procon.dip.normalization.spi.RepresentationBuildRequest;
@ -402,6 +403,8 @@ public class GenericDocumentImportService {
} }
} }
boolean deferLexicalIndex = shouldDeferLexicalIndex(sourceDescriptor);
for (TextRepresentationDraft draft : drafts) { for (TextRepresentationDraft draft : drafts) {
if (!StringUtils.hasText(draft.textBody())) { if (!StringUtils.hasText(draft.textBody())) {
continue; continue;
@ -420,7 +423,8 @@ public class GenericDocumentImportService {
draft.chunkStartOffset(), draft.chunkStartOffset(),
draft.chunkEndOffset(), draft.chunkEndOffset(),
draft.primary(), draft.primary(),
draft.textBody() draft.textBody(),
deferLexicalIndex
)); ));
if (shouldQueueEmbedding(draft, embeddingPolicy, embeddingProfile)) { if (shouldQueueEmbedding(draft, embeddingPolicy, embeddingProfile)) {
@ -434,6 +438,10 @@ public class GenericDocumentImportService {
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED); documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
} }
/**
 * Reports whether the given source asks for lexical indexing to be deferred.
 *
 * <p>The decision is delegated to {@code IngestionInternalAttributes.isTruthy}, evaluated on the
 * descriptor's attributes for the {@code DEFER_LEXICAL_INDEX} key.
 *
 * @param sourceDescriptor descriptor of the document being imported
 * @return the result of the truthiness check on the defer-lexical-index attribute
 */
private boolean shouldDeferLexicalIndex(SourceDescriptor sourceDescriptor) {
    var attributes = sourceDescriptor.attributes();
    return IngestionInternalAttributes.isTruthy(attributes, IngestionInternalAttributes.DEFER_LEXICAL_INDEX);
}
private DocumentContent resolveLinkedContent(TextRepresentationDraft draft, private DocumentContent resolveLinkedContent(TextRepresentationDraft draft,
DocumentContent originalContent, DocumentContent originalContent,
Map<ContentRole, DocumentContent> derivedContent) { Map<ContentRole, DocumentContent> derivedContent) {

View File

@ -11,6 +11,7 @@ import at.procon.dip.ingestion.service.TedPackageExpansionService.TedPackageEntr
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.ingestion.config.DipIngestionProperties; import at.procon.dip.ingestion.config.DipIngestionProperties;
import at.procon.dip.ingestion.service.IngestionInternalAttributes;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode; import at.procon.dip.runtime.config.RuntimeMode;
import java.time.OffsetDateTime; import java.time.OffsetDateTime;
@ -50,6 +51,9 @@ public class TedPackageChildImportProcessor {
childAttributes.put("archivePath", entry.archivePath()); childAttributes.put("archivePath", entry.archivePath());
childAttributes.put("title", entry.fileName()); childAttributes.put("title", entry.fileName());
childAttributes.put("importBatchId", properties.getTedPackageImportBatchId()); childAttributes.put("importBatchId", properties.getTedPackageImportBatchId());
if (properties.isTedPackageDeferLexicalIndexing()) {
childAttributes.put(IngestionInternalAttributes.DEFER_LEXICAL_INDEX, Boolean.TRUE.toString());
}
ImportedDocumentResult childResult = importService.importDocument(new SourceDescriptor( ImportedDocumentResult childResult = importService.importDocument(new SourceDescriptor(
accessContext == null ? DocumentAccessContext.publicDocument() : accessContext, accessContext == null ? DocumentAccessContext.publicDocument() : accessContext,

View File

@ -298,7 +298,8 @@ public class LegacyTedBackfillWorker {
draft.chunkStartOffset(), draft.chunkStartOffset(),
draft.chunkEndOffset(), draft.chunkEndOffset(),
draft.primary(), draft.primary(),
draft.textBody() draft.textBody(),
false
)); ));
existing.add(savedRepresentation); existing.add(savedRepresentation);
} else { } else {

View File

@ -78,4 +78,12 @@ public class DipSearchProperties {
/** Number of hits per engine returned by the debug endpoint. */ /** Number of hits per engine returned by the debug endpoint. */
@Positive @Positive
private int debugTopHitsPerEngine = 10; private int debugTopHitsPerEngine = 10;
private boolean scheduledLexicalBackfillEnabled;
@Positive
private int scheduledLexicalBackfillDelayMs = 30000;
@Positive
private int scheduledLexicalBackfillBatchSize = 200;
} }

View File

@ -0,0 +1,30 @@
package at.procon.dip.search.service;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.dip.search.config.DipSearchProperties;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
/**
 * Scheduled job that asks {@link DocumentLexicalIndexService} to backfill missing lexical vectors.
 *
 * <p>The job is triggered with a fixed delay taken from
 * {@code dip.search.scheduled-lexical-backfill-delay-ms} (30000 ms when the property is absent).
 * Each run is a no-op unless {@code DipSearchProperties.isScheduledLexicalBackfillEnabled()}
 * returns {@code true}.
 */
@Component
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
@RequiredArgsConstructor
@Slf4j
public class SearchLexicalIndexScheduler {

    // Field order defines the Lombok-generated constructor signature; keep it stable.
    private final DipSearchProperties properties;
    private final DocumentLexicalIndexService lexicalIndexService;

    /**
     * Runs one backfill pass using the configured batch size and logs the number of updated
     * representations when it is greater than zero.
     */
    @Scheduled(fixedDelayString = "${dip.search.scheduled-lexical-backfill-delay-ms:30000}")
    public void backfillMissingLexicalIndexes() {
        if (properties.isScheduledLexicalBackfillEnabled()) {
            int batchSize = properties.getScheduledLexicalBackfillBatchSize();
            int refreshedCount = lexicalIndexService.backfillMissingVectors(batchSize);
            if (refreshedCount > 0) {
                log.info("Search lexical index scheduled backfill updated {} representations", refreshedCount);
            }
        }
    }
}

View File

@ -0,0 +1,97 @@
package at.procon.dip.domain.document.service;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentContent;
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
import at.procon.dip.search.service.DocumentLexicalIndexService;
import java.util.UUID;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
/**
 * Unit tests for {@code DocumentRepresentationService.addRepresentation} covering the
 * {@code deferLexicalIndex} flag of {@code AddDocumentTextRepresentationCommand}.
 * All collaborators of the service are Mockito mocks.
 */
@ExtendWith(MockitoExtension.class)
class DocumentRepresentationServiceTest {

    @Mock
    private DocumentService documentService;

    @Mock
    private DocumentContentService contentService;

    @Mock
    private DocumentTextRepresentationRepository representationRepository;

    @Mock
    private DocumentLexicalIndexService lexicalIndexService;

    private DocumentRepresentationService service;

    @BeforeEach
    void setUp() {
        service = new DocumentRepresentationService(documentService, contentService, representationRepository, lexicalIndexService);
    }

    /** With {@code deferLexicalIndex == false} the saved representation is indexed right away. */
    @Test
    void shouldIndexLexicallyByDefault() {
        UUID docId = UUID.randomUUID();
        UUID linkedContentId = UUID.randomUUID();
        UUID persistedId = UUID.randomUUID();
        stubDocumentAndContent(docId, linkedContentId);
        // Simulate the repository assigning an id on save.
        when(representationRepository.saveAndFlush(any(DocumentTextRepresentation.class))).thenAnswer(invocation -> {
            DocumentTextRepresentation toSave = invocation.getArgument(0);
            toSave.setId(persistedId);
            return toSave;
        });

        service.addRepresentation(semanticTextCommand(docId, linkedContentId, false));

        verify(lexicalIndexService).indexRepresentation(persistedId);
    }

    /** With {@code deferLexicalIndex == true} the lexical index service is never asked to index. */
    @Test
    void shouldSkipImmediateLexicalIndexingWhenDeferred() {
        UUID docId = UUID.randomUUID();
        UUID linkedContentId = UUID.randomUUID();
        stubDocumentAndContent(docId, linkedContentId);
        when(representationRepository.saveAndFlush(any(DocumentTextRepresentation.class)))
                .thenAnswer(invocation -> invocation.getArgument(0));

        service.addRepresentation(semanticTextCommand(docId, linkedContentId, true));

        verify(lexicalIndexService, never()).indexRepresentation(any());
    }

    /** Makes the document and content lookups return minimal entities carrying the given ids. */
    private void stubDocumentAndContent(UUID docId, UUID linkedContentId) {
        when(documentService.getRequired(docId)).thenReturn(Document.builder().id(docId).build());
        when(contentService.getRequired(linkedContentId)).thenReturn(DocumentContent.builder().id(linkedContentId).build());
    }

    /** Builds the primary SEMANTIC_TEXT command used by both tests; only the defer flag varies. */
    private AddDocumentTextRepresentationCommand semanticTextCommand(UUID docId, UUID linkedContentId, boolean deferLexicalIndex) {
        return new AddDocumentTextRepresentationCommand(
                docId,
                linkedContentId,
                RepresentationType.SEMANTIC_TEXT,
                "test-builder",
                "en",
                null,
                null,
                null,
                null,
                true,
                "Hello world",
                deferLexicalIndex
        );
    }
}

View File

@ -0,0 +1,67 @@
package at.procon.dip.ingestion.service;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import at.procon.dip.domain.access.DocumentAccessContext;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.service.DocumentRelationService;
import at.procon.dip.ingestion.config.DipIngestionProperties;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.service.TedPackageExpansionService.TedPackageEntry;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.List;
import java.util.UUID;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.ArgumentCaptor;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
/**
 * Unit tests for {@code TedPackageChildImportProcessor.processChild} focusing on the
 * {@code DEFER_LEXICAL_INDEX} attribute that is passed on to the import service.
 *
 * <p>Both values of {@code tedPackageDeferLexicalIndexing} are covered. The flag is initialized to
 * {@code true} in {@code DipIngestionProperties}, so the enabled case alone cannot show that the
 * flag is actually consulted.
 */
@ExtendWith(MockitoExtension.class)
class TedPackageChildImportProcessorTest {

    @Mock
    private GenericDocumentImportService importService;

    @Mock
    private DocumentRelationService relationService;

    private DipIngestionProperties properties;
    private TedPackageChildImportProcessor processor;

    @BeforeEach
    void setUp() {
        properties = new DipIngestionProperties();
        properties.setTedPackageImportBatchId("ted-batch");
        processor = new TedPackageChildImportProcessor(importService, relationService, properties);
    }

    /** When deferral is enabled, the child descriptor carries the defer marker with value "true". */
    @Test
    void shouldMarkTedPackageChildrenForDeferredLexicalIndexingWhenEnabled() {
        properties.setTedPackageDeferLexicalIndexing(true);

        SourceDescriptor descriptor = processSingleChildAndCaptureDescriptor();

        assertThat(descriptor.attributes()).containsEntry(IngestionInternalAttributes.DEFER_LEXICAL_INDEX, "true");
    }

    /** When deferral is disabled, the child descriptor must not carry the defer marker at all. */
    @Test
    void shouldNotMarkTedPackageChildrenForDeferredLexicalIndexingWhenDisabled() {
        properties.setTedPackageDeferLexicalIndexing(false);

        SourceDescriptor descriptor = processSingleChildAndCaptureDescriptor();

        assertThat(descriptor.attributes()).doesNotContainKey(IngestionInternalAttributes.DEFER_LEXICAL_INDEX);
    }

    /**
     * Processes one XML child entry with a stubbed import result and returns the
     * {@code SourceDescriptor} that was handed to the import service.
     */
    private SourceDescriptor processSingleChildAndCaptureDescriptor() {
        when(importService.importDocument(any())).thenReturn(new ImportedDocumentResult(
                Document.builder().id(UUID.randomUUID()).build(),
                null,
                List.of(),
                false
        ));

        processor.processChild(
                UUID.randomUUID(),
                "PKG-1",
                OffsetDateTime.now(),
                DocumentAccessContext.publicDocument(),
                new TedPackageEntry("notice-1.xml", "path/notice-1.xml", "<xml/>".getBytes(StandardCharsets.UTF_8), "<xml/>"),
                1
        );

        ArgumentCaptor<SourceDescriptor> captor = ArgumentCaptor.forClass(SourceDescriptor.class);
        verify(importService).importDocument(captor.capture());
        return captor.getValue();
    }
}

View File

@ -0,0 +1,48 @@
package at.procon.dip.search.service;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.verifyNoInteractions;
import static org.mockito.Mockito.when;
import at.procon.dip.search.config.DipSearchProperties;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
/**
 * Unit tests for {@code SearchLexicalIndexScheduler.backfillMissingLexicalIndexes}, run against a
 * mocked {@code DocumentLexicalIndexService} and a real {@code DipSearchProperties} instance.
 */
@ExtendWith(MockitoExtension.class)
class SearchLexicalIndexSchedulerTest {

    /** Distinctive batch size so the test can tell it was taken from the properties. */
    private static final int BATCH_SIZE = 123;

    @Mock
    private DocumentLexicalIndexService lexicalIndexService;

    private DipSearchProperties properties;
    private SearchLexicalIndexScheduler scheduler;

    @BeforeEach
    void setUp() {
        properties = new DipSearchProperties();
        properties.setScheduledLexicalBackfillBatchSize(BATCH_SIZE);
        scheduler = new SearchLexicalIndexScheduler(properties, lexicalIndexService);
    }

    /** When enabled, the scheduler requests a backfill with the configured batch size. */
    @Test
    void shouldBackfillMissingVectorsWhenEnabled() {
        properties.setScheduledLexicalBackfillEnabled(true);
        when(lexicalIndexService.backfillMissingVectors(BATCH_SIZE)).thenReturn(5);

        scheduler.backfillMissingLexicalIndexes();

        verify(lexicalIndexService).backfillMissingVectors(BATCH_SIZE);
    }

    /** When disabled, the scheduler must not touch the lexical index service at all. */
    @Test
    void shouldSkipBackfillWhenDisabled() {
        properties.setScheduledLexicalBackfillEnabled(false);

        scheduler.backfillMissingLexicalIndexes();

        // Stronger than never().backfillMissingVectors(123): also catches a call made with any
        // other batch size or any other method on the service.
        verifyNoInteractions(lexicalIndexService);
    }
}

View File

@ -118,7 +118,8 @@ public class SearchTestDataFactory {
chunkStartOffset, chunkStartOffset,
chunkEndOffset, chunkEndOffset,
primary, primary,
text text,
false
)); ));
lexicalIndexService.refreshRepresentationLexicalIndex(representation.getId()); lexicalIndexService.refreshRepresentationLexicalIndex(representation.getId());
return representation; return representation;