diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java index c081f54..731b6c5 100644 --- a/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java +++ b/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java @@ -37,7 +37,9 @@ public class DocumentRepresentationService { .textBody(command.textBody()) .build(); DocumentTextRepresentation saved = representationRepository.saveAndFlush(representation); - lexicalIndexService.indexRepresentation(saved.getId()); + if (!command.deferLexicalIndex()) { + lexicalIndexService.indexRepresentation(saved.getId()); + } return saved; } diff --git a/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentTextRepresentationCommand.java b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentTextRepresentationCommand.java index 3106218..486bcf9 100644 --- a/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentTextRepresentationCommand.java +++ b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentTextRepresentationCommand.java @@ -14,6 +14,7 @@ public record AddDocumentTextRepresentationCommand( Integer chunkStartOffset, Integer chunkEndOffset, boolean primaryRepresentation, - String textBody + String textBody, + boolean deferLexicalIndex ) { } diff --git a/src/main/java/at/procon/dip/domain/time/service/TimeEntryRepresentationMaterializationService.java b/src/main/java/at/procon/dip/domain/time/service/TimeEntryRepresentationMaterializationService.java index 3d7a1b3..9ab5019 100644 --- a/src/main/java/at/procon/dip/domain/time/service/TimeEntryRepresentationMaterializationService.java +++ b/src/main/java/at/procon/dip/domain/time/service/TimeEntryRepresentationMaterializationService.java @@ -76,7 +76,8 @@ public class TimeEntryRepresentationMaterializationService { null, 
null, true, - projection.getSemanticText() + projection.getSemanticText(), + false ))); if (changed diff --git a/src/main/java/at/procon/dip/ingestion/config/DipIngestionProperties.java b/src/main/java/at/procon/dip/ingestion/config/DipIngestionProperties.java index 1086d38..e9e309f 100644 --- a/src/main/java/at/procon/dip/ingestion/config/DipIngestionProperties.java +++ b/src/main/java/at/procon/dip/ingestion/config/DipIngestionProperties.java @@ -54,6 +54,12 @@ public class DipIngestionProperties { @NotBlank private String tedPackageImportBatchId = "phase41-ted-package"; + /** + * Skip synchronous lexical tsvector indexing during TED package child import. + * Missing search vectors are backfilled asynchronously by the lexical index scheduler. + */ + private boolean tedPackageDeferLexicalIndexing = true; + private boolean gatewayOnlyForTedPackages = false; @NotBlank diff --git a/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java index bbcd3ce..523abfa 100644 --- a/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java +++ b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java @@ -36,6 +36,7 @@ import at.procon.dip.ingestion.config.DipIngestionProperties; import at.procon.dip.ingestion.dto.ImportedDocumentResult; import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; import at.procon.dip.ingestion.spi.SourceDescriptor; +import at.procon.dip.ingestion.service.IngestionInternalAttributes; import at.procon.dip.ingestion.util.DocumentImportSupport; import at.procon.dip.normalization.service.TextRepresentationBuildService; import at.procon.dip.normalization.spi.RepresentationBuildRequest; @@ -402,6 +403,8 @@ public class GenericDocumentImportService { } } + boolean deferLexicalIndex = shouldDeferLexicalIndex(sourceDescriptor); + for (TextRepresentationDraft draft : drafts) { if 
(!StringUtils.hasText(draft.textBody())) { continue; @@ -420,7 +423,8 @@ public class GenericDocumentImportService { draft.chunkStartOffset(), draft.chunkEndOffset(), draft.primary(), - draft.textBody() + draft.textBody(), + deferLexicalIndex )); if (shouldQueueEmbedding(draft, embeddingPolicy, embeddingProfile)) { @@ -434,6 +438,10 @@ public class GenericDocumentImportService { documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED); } + private boolean shouldDeferLexicalIndex(SourceDescriptor sourceDescriptor) { + return IngestionInternalAttributes.isTruthy(sourceDescriptor.attributes(), IngestionInternalAttributes.DEFER_LEXICAL_INDEX); + } + private DocumentContent resolveLinkedContent(TextRepresentationDraft draft, DocumentContent originalContent, Map derivedContent) { diff --git a/src/main/java/at/procon/dip/ingestion/service/TedPackageChildImportProcessor.java b/src/main/java/at/procon/dip/ingestion/service/TedPackageChildImportProcessor.java index 6f21a74..788ba8e 100644 --- a/src/main/java/at/procon/dip/ingestion/service/TedPackageChildImportProcessor.java +++ b/src/main/java/at/procon/dip/ingestion/service/TedPackageChildImportProcessor.java @@ -11,6 +11,7 @@ import at.procon.dip.ingestion.service.TedPackageExpansionService.TedPackageEntr import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.ingestion.config.DipIngestionProperties; +import at.procon.dip.ingestion.service.IngestionInternalAttributes; import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; import at.procon.dip.runtime.config.RuntimeMode; import java.time.OffsetDateTime; @@ -50,6 +51,9 @@ public class TedPackageChildImportProcessor { childAttributes.put("archivePath", entry.archivePath()); childAttributes.put("title", entry.fileName()); childAttributes.put("importBatchId", properties.getTedPackageImportBatchId()); + if (properties.isTedPackageDeferLexicalIndexing()) { + 
childAttributes.put(IngestionInternalAttributes.DEFER_LEXICAL_INDEX, Boolean.TRUE.toString()); + } ImportedDocumentResult childResult = importService.importDocument(new SourceDescriptor( accessContext == null ? DocumentAccessContext.publicDocument() : accessContext, diff --git a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java index 3b5caae..baba475 100644 --- a/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java +++ b/src/main/java/at/procon/dip/migration/service/LegacyTedBackfillWorker.java @@ -298,7 +298,8 @@ public class LegacyTedBackfillWorker { draft.chunkStartOffset(), draft.chunkEndOffset(), draft.primary(), - draft.textBody() + draft.textBody(), + false )); existing.add(savedRepresentation); } else { diff --git a/src/main/java/at/procon/dip/search/config/DipSearchProperties.java b/src/main/java/at/procon/dip/search/config/DipSearchProperties.java index e4e5e0b..0f95b46 100644 --- a/src/main/java/at/procon/dip/search/config/DipSearchProperties.java +++ b/src/main/java/at/procon/dip/search/config/DipSearchProperties.java @@ -78,4 +78,12 @@ public class DipSearchProperties { /** Number of hits per engine returned by the debug endpoint. 
*/
     @Positive
     private int debugTopHitsPerEngine = 10;
+
+    private boolean scheduledLexicalBackfillEnabled = true; // on by default: TED child imports defer lexical indexing by default
+
+    @Positive
+    private int scheduledLexicalBackfillDelayMs = 30000;
+
+    @Positive
+    private int scheduledLexicalBackfillBatchSize = 200;
 }
\ No newline at end of file
diff --git a/src/main/java/at/procon/dip/search/service/SearchLexicalIndexScheduler.java b/src/main/java/at/procon/dip/search/service/SearchLexicalIndexScheduler.java
new file mode 100644
index 0000000..4848228
--- /dev/null
+++ b/src/main/java/at/procon/dip/search/service/SearchLexicalIndexScheduler.java
@@ -0,0 +1,45 @@
+package at.procon.dip.search.service;
+
+import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
+import at.procon.dip.runtime.config.RuntimeMode;
+import at.procon.dip.search.config.DipSearchProperties;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.scheduling.annotation.Scheduled;
+import org.springframework.stereotype.Component;
+
+/**
+ * Periodically asks {@link DocumentLexicalIndexService} to backfill missing lexical
+ * search vectors.
+ *
+ * <p>TED package child imports defer synchronous lexical indexing by default, so this
+ * backfill is what makes those documents lexically searchable. It can be switched off
+ * through {@link DipSearchProperties#isScheduledLexicalBackfillEnabled()}.
+ */
+@Component
+@ConditionalOnRuntimeMode(RuntimeMode.NEW)
+@RequiredArgsConstructor
+@Slf4j
+public class SearchLexicalIndexScheduler {
+
+    private final DipSearchProperties properties;
+    private final DocumentLexicalIndexService lexicalIndexService;
+
+    /**
+     * Runs one backfill pass with the configured batch size and logs the count when
+     * anything was updated. Does nothing while the scheduled backfill is disabled.
+     *
+     * <p>The pause between runs is read from
+     * {@code dip.search.scheduled-lexical-backfill-delay-ms} (30000 ms when unset).
+     */
+    @Scheduled(fixedDelayString = "${dip.search.scheduled-lexical-backfill-delay-ms:30000}")
+    public void backfillMissingLexicalIndexes() {
+        if (!properties.isScheduledLexicalBackfillEnabled()) {
+            return;
+        }
+        int updated = lexicalIndexService.backfillMissingVectors(properties.getScheduledLexicalBackfillBatchSize());
+        if (updated > 0) {
+            log.info("Search lexical index scheduled backfill updated {} representations", updated);
+        }
+    }
+}
diff --git a/src/test/java/at/procon/dip/domain/document/service/DocumentRepresentationServiceTest.java b/src/test/java/at/procon/dip/domain/document/service/DocumentRepresentationServiceTest.java
new file mode 100644
index 0000000..c18650a
--- /dev/null
+++ 
b/src/test/java/at/procon/dip/domain/document/service/DocumentRepresentationServiceTest.java
@@ -0,0 +1,97 @@
+package at.procon.dip.domain.document.service;
+
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import at.procon.dip.domain.document.RepresentationType;
+import at.procon.dip.domain.document.entity.Document;
+import at.procon.dip.domain.document.entity.DocumentContent;
+import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
+import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
+import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
+import at.procon.dip.search.service.DocumentLexicalIndexService;
+import java.util.UUID;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.Mock;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+/**
+ * Checks that {@code addRepresentation} triggers or skips synchronous lexical indexing
+ * according to the command's {@code deferLexicalIndex} flag.
+ */
+@ExtendWith(MockitoExtension.class)
+class DocumentRepresentationServiceTest {
+
+    @Mock
+    private DocumentService documentService;
+    @Mock
+    private DocumentContentService contentService;
+    @Mock
+    private DocumentTextRepresentationRepository representationRepository;
+    @Mock
+    private DocumentLexicalIndexService lexicalIndexService;
+
+    private DocumentRepresentationService service;
+
+    @BeforeEach
+    void setUp() {
+        service = new DocumentRepresentationService(documentService, contentService, representationRepository, lexicalIndexService);
+    }
+
+    /** Without deferral, the id of the saved representation is passed to the lexical index service. */
+    @Test
+    void shouldIndexLexicallyByDefault() {
+        UUID representationId = UUID.randomUUID();
+        AddDocumentTextRepresentationCommand command = commandForStubbedDocument(false);
+        when(representationRepository.saveAndFlush(any(DocumentTextRepresentation.class))).thenAnswer(invocation -> {
+            DocumentTextRepresentation persisted = invocation.getArgument(0);
+            persisted.setId(representationId);
+            return persisted;
+        });
+
+        service.addRepresentation(command);
+
+        verify(lexicalIndexService).indexRepresentation(representationId);
+    }
+
+    /** With deferral requested, no synchronous indexing call is made. */
+    @Test
+    void shouldSkipImmediateLexicalIndexingWhenDeferred() {
+        AddDocumentTextRepresentationCommand command = commandForStubbedDocument(true);
+        when(representationRepository.saveAndFlush(any(DocumentTextRepresentation.class)))
+                .thenAnswer(invocation -> invocation.getArgument(0));
+
+        service.addRepresentation(command);
+
+        verify(lexicalIndexService, never()).indexRepresentation(any());
+    }
+
+    /**
+     * Stubs the document and content lookups for freshly generated ids and builds a
+     * primary semantic-text command that refers to them.
+     */
+    private AddDocumentTextRepresentationCommand commandForStubbedDocument(boolean deferLexicalIndex) {
+        UUID documentId = UUID.randomUUID();
+        UUID contentId = UUID.randomUUID();
+        when(documentService.getRequired(documentId)).thenReturn(Document.builder().id(documentId).build());
+        when(contentService.getRequired(contentId)).thenReturn(DocumentContent.builder().id(contentId).build());
+        return new AddDocumentTextRepresentationCommand(
+                documentId,
+                contentId,
+                RepresentationType.SEMANTIC_TEXT,
+                "test-builder",
+                "en",
+                null,
+                null,
+                null,
+                null,
+                true,
+                "Hello world",
+                deferLexicalIndex
+        );
+    }
+}
diff --git a/src/test/java/at/procon/dip/ingestion/service/TedPackageChildImportProcessorTest.java b/src/test/java/at/procon/dip/ingestion/service/TedPackageChildImportProcessorTest.java
new file mode 100644
index 0000000..35eecc6
--- /dev/null
+++ b/src/test/java/at/procon/dip/ingestion/service/TedPackageChildImportProcessorTest.java
@@ -0,0 +1,67 @@
+package at.procon.dip.ingestion.service;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import at.procon.dip.domain.access.DocumentAccessContext;
+import 
at.procon.dip.domain.document.entity.Document;
+import at.procon.dip.domain.document.service.DocumentRelationService;
+import at.procon.dip.ingestion.config.DipIngestionProperties;
+import at.procon.dip.ingestion.dto.ImportedDocumentResult;
+import at.procon.dip.ingestion.service.TedPackageExpansionService.TedPackageEntry;
+import at.procon.dip.ingestion.spi.SourceDescriptor;
+import java.nio.charset.StandardCharsets;
+import java.time.OffsetDateTime;
+import java.util.List;
+import java.util.UUID;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.ArgumentCaptor;
+import org.mockito.Mock;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+@ExtendWith(MockitoExtension.class)
+class TedPackageChildImportProcessorTest {
+
+    @Mock
+    private GenericDocumentImportService importService;
+    @Mock
+    private DocumentRelationService relationService;
+
+    private DipIngestionProperties properties;
+    private TedPackageChildImportProcessor processor;
+
+    @BeforeEach
+    void setUp() {
+        properties = new DipIngestionProperties();
+        properties.setTedPackageImportBatchId("ted-batch");
+        processor = new TedPackageChildImportProcessor(importService, relationService, properties);
+    }
+
+    @Test
+    void shouldMarkTedPackageChildrenForDeferredLexicalIndexingWhenEnabled() {
+        properties.setTedPackageDeferLexicalIndexing(true);
+        when(importService.importDocument(any())).thenReturn(new ImportedDocumentResult(
+                Document.builder().id(UUID.randomUUID()).build(),
+                null,
+                List.of(),
+                false
+        ));
+
+        processor.processChild(
+                UUID.randomUUID(),
+                "PKG-1",
+                OffsetDateTime.now(),
+                DocumentAccessContext.publicDocument(),
+                new TedPackageEntry("notice-1.xml", "path/notice-1.xml", "".getBytes(StandardCharsets.UTF_8), ""),
+                1
+        );
+
+        ArgumentCaptor<SourceDescriptor> captor = ArgumentCaptor.forClass(SourceDescriptor.class); // typed so getValue() is a SourceDescriptor
+        verify(importService).importDocument(captor.capture());
+        assertThat(captor.getValue().attributes()).containsEntry(IngestionInternalAttributes.DEFER_LEXICAL_INDEX, "true");
+    }
+}
diff --git a/src/test/java/at/procon/dip/search/service/SearchLexicalIndexSchedulerTest.java b/src/test/java/at/procon/dip/search/service/SearchLexicalIndexSchedulerTest.java
new file mode 100644
index 0000000..e445da6
--- /dev/null
+++ b/src/test/java/at/procon/dip/search/service/SearchLexicalIndexSchedulerTest.java
@@ -0,0 +1,53 @@
+package at.procon.dip.search.service;
+
+import static org.mockito.ArgumentMatchers.anyInt;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import at.procon.dip.search.config.DipSearchProperties;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.Mock;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+/** Unit tests for the enabled/disabled gating of {@link SearchLexicalIndexScheduler}. */
+@ExtendWith(MockitoExtension.class)
+class SearchLexicalIndexSchedulerTest {
+
+    @Mock
+    private DocumentLexicalIndexService lexicalIndexService;
+
+    private DipSearchProperties properties;
+    private SearchLexicalIndexScheduler scheduler;
+
+    @BeforeEach
+    void setUp() {
+        properties = new DipSearchProperties();
+        properties.setScheduledLexicalBackfillBatchSize(123);
+        scheduler = new SearchLexicalIndexScheduler(properties, lexicalIndexService);
+    }
+
+    /** When enabled, one run delegates to the index service with the configured batch size. */
+    @Test
+    void shouldBackfillMissingVectorsWhenEnabled() {
+        properties.setScheduledLexicalBackfillEnabled(true);
+        when(lexicalIndexService.backfillMissingVectors(123)).thenReturn(5);
+
+        scheduler.backfillMissingLexicalIndexes();
+
+        verify(lexicalIndexService).backfillMissingVectors(123);
+    }
+
+    /** When disabled, the index service must not be asked to backfill at all. */
+    @Test
+    void shouldSkipBackfillWhenDisabled() {
+        properties.setScheduledLexicalBackfillEnabled(false);
+
+        scheduler.backfillMissingLexicalIndexes();
+
+        // anyInt() instead of 123: a call with any other batch size must also fail the test.
+        verify(lexicalIndexService, never()).backfillMissingVectors(anyInt());
+    }
+}
diff --git a/src/test/java/at/procon/dip/testsupport/SearchTestDataFactory.java 
b/src/test/java/at/procon/dip/testsupport/SearchTestDataFactory.java index 1d5266a..e24d30e 100644 --- a/src/test/java/at/procon/dip/testsupport/SearchTestDataFactory.java +++ b/src/test/java/at/procon/dip/testsupport/SearchTestDataFactory.java @@ -118,7 +118,8 @@ public class SearchTestDataFactory { chunkStartOffset, chunkEndOffset, primary, - text + text, + false )); lexicalIndexService.refreshRepresentationLexicalIndex(representation.getId()); return representation;