defer lexical indexing of TED text representations
This commit is contained in:
parent
a501176c83
commit
06c1485df9
|
|
@ -37,7 +37,9 @@ public class DocumentRepresentationService {
|
|||
.textBody(command.textBody())
|
||||
.build();
|
||||
DocumentTextRepresentation saved = representationRepository.saveAndFlush(representation);
|
||||
if (!command.deferLexicalIndex()) {
|
||||
lexicalIndexService.indexRepresentation(saved.getId());
|
||||
}
|
||||
return saved;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ public record AddDocumentTextRepresentationCommand(
|
|||
Integer chunkStartOffset,
|
||||
Integer chunkEndOffset,
|
||||
boolean primaryRepresentation,
|
||||
String textBody
|
||||
String textBody,
|
||||
boolean deferLexicalIndex
|
||||
) {
|
||||
}
|
||||
|
|
|
|||
|
|
@ -76,7 +76,8 @@ public class TimeEntryRepresentationMaterializationService {
|
|||
null,
|
||||
null,
|
||||
true,
|
||||
projection.getSemanticText()
|
||||
projection.getSemanticText(),
|
||||
false
|
||||
)));
|
||||
|
||||
if (changed
|
||||
|
|
|
|||
|
|
@ -54,6 +54,12 @@ public class DipIngestionProperties {
|
|||
@NotBlank
|
||||
private String tedPackageImportBatchId = "phase41-ted-package";
|
||||
|
||||
/**
|
||||
* Skip synchronous lexical tsvector indexing during TED package child import.
|
||||
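* Missing search vectors are backfilled asynchronously by the lexical index scheduler.
* The scheduler does nothing unless scheduled lexical backfill is enabled in the search
* properties, so enable it whenever deferral is on; otherwise deferred representations stay unindexed.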
|
||||
*/
|
||||
private boolean tedPackageDeferLexicalIndexing = true;
|
||||
|
||||
private boolean gatewayOnlyForTedPackages = false;
|
||||
|
||||
@NotBlank
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ import at.procon.dip.ingestion.config.DipIngestionProperties;
|
|||
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
||||
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
import at.procon.dip.ingestion.service.IngestionInternalAttributes;
|
||||
import at.procon.dip.ingestion.util.DocumentImportSupport;
|
||||
import at.procon.dip.normalization.service.TextRepresentationBuildService;
|
||||
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
||||
|
|
@ -402,6 +403,8 @@ public class GenericDocumentImportService {
|
|||
}
|
||||
}
|
||||
|
||||
boolean deferLexicalIndex = shouldDeferLexicalIndex(sourceDescriptor);
|
||||
|
||||
for (TextRepresentationDraft draft : drafts) {
|
||||
if (!StringUtils.hasText(draft.textBody())) {
|
||||
continue;
|
||||
|
|
@ -420,7 +423,8 @@ public class GenericDocumentImportService {
|
|||
draft.chunkStartOffset(),
|
||||
draft.chunkEndOffset(),
|
||||
draft.primary(),
|
||||
draft.textBody()
|
||||
draft.textBody(),
|
||||
deferLexicalIndex
|
||||
));
|
||||
|
||||
if (shouldQueueEmbedding(draft, embeddingPolicy, embeddingProfile)) {
|
||||
|
|
@ -434,6 +438,10 @@ public class GenericDocumentImportService {
|
|||
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
|
||||
}
|
||||
|
||||
private boolean shouldDeferLexicalIndex(SourceDescriptor sourceDescriptor) {
|
||||
return IngestionInternalAttributes.isTruthy(sourceDescriptor.attributes(), IngestionInternalAttributes.DEFER_LEXICAL_INDEX);
|
||||
}
|
||||
|
||||
private DocumentContent resolveLinkedContent(TextRepresentationDraft draft,
|
||||
DocumentContent originalContent,
|
||||
Map<ContentRole, DocumentContent> derivedContent) {
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ import at.procon.dip.ingestion.service.TedPackageExpansionService.TedPackageEntr
|
|||
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
import at.procon.dip.ingestion.config.DipIngestionProperties;
|
||||
import at.procon.dip.ingestion.service.IngestionInternalAttributes;
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import java.time.OffsetDateTime;
|
||||
|
|
@ -50,6 +51,9 @@ public class TedPackageChildImportProcessor {
|
|||
childAttributes.put("archivePath", entry.archivePath());
|
||||
childAttributes.put("title", entry.fileName());
|
||||
childAttributes.put("importBatchId", properties.getTedPackageImportBatchId());
|
||||
if (properties.isTedPackageDeferLexicalIndexing()) {
|
||||
childAttributes.put(IngestionInternalAttributes.DEFER_LEXICAL_INDEX, Boolean.TRUE.toString());
|
||||
}
|
||||
|
||||
ImportedDocumentResult childResult = importService.importDocument(new SourceDescriptor(
|
||||
accessContext == null ? DocumentAccessContext.publicDocument() : accessContext,
|
||||
|
|
|
|||
|
|
@ -298,7 +298,8 @@ public class LegacyTedBackfillWorker {
|
|||
draft.chunkStartOffset(),
|
||||
draft.chunkEndOffset(),
|
||||
draft.primary(),
|
||||
draft.textBody()
|
||||
draft.textBody(),
|
||||
false
|
||||
));
|
||||
existing.add(savedRepresentation);
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -78,4 +78,12 @@ public class DipSearchProperties {
|
|||
/** Number of hits per engine returned by the debug endpoint. */
|
||||
@Positive
|
||||
private int debugTopHitsPerEngine = 10;
|
||||
|
||||
private boolean scheduledLexicalBackfillEnabled;
|
||||
|
||||
@Positive
|
||||
private int scheduledLexicalBackfillDelayMs = 30000;
|
||||
|
||||
@Positive
|
||||
private int scheduledLexicalBackfillBatchSize = 200;
|
||||
}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
package at.procon.dip.search.service;
|
||||
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import at.procon.dip.search.config.DipSearchProperties;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.scheduling.annotation.Scheduled;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class SearchLexicalIndexScheduler {
|
||||
|
||||
private final DipSearchProperties properties;
|
||||
private final DocumentLexicalIndexService lexicalIndexService;
|
||||
|
||||
@Scheduled(fixedDelayString = "${dip.search.scheduled-lexical-backfill-delay-ms:30000}")
|
||||
public void backfillMissingLexicalIndexes() {
|
||||
if (!properties.isScheduledLexicalBackfillEnabled()) {
|
||||
return;
|
||||
}
|
||||
int updated = lexicalIndexService.backfillMissingVectors(properties.getScheduledLexicalBackfillBatchSize());
|
||||
if (updated > 0) {
|
||||
log.info("Search lexical index scheduled backfill updated {} representations", updated);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,97 @@
|
|||
package at.procon.dip.domain.document.service;
|
||||
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.Mockito.never;
|
||||
import static org.mockito.Mockito.verify;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.domain.document.entity.Document;
|
||||
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
|
||||
import at.procon.dip.search.service.DocumentLexicalIndexService;
|
||||
import java.util.UUID;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
class DocumentRepresentationServiceTest {
|
||||
|
||||
@Mock
|
||||
private DocumentService documentService;
|
||||
@Mock
|
||||
private DocumentContentService contentService;
|
||||
@Mock
|
||||
private DocumentTextRepresentationRepository representationRepository;
|
||||
@Mock
|
||||
private DocumentLexicalIndexService lexicalIndexService;
|
||||
|
||||
private DocumentRepresentationService service;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
service = new DocumentRepresentationService(documentService, contentService, representationRepository, lexicalIndexService);
|
||||
}
|
||||
|
||||
@Test
|
||||
void shouldIndexLexicallyByDefault() {
|
||||
UUID documentId = UUID.randomUUID();
|
||||
UUID contentId = UUID.randomUUID();
|
||||
UUID representationId = UUID.randomUUID();
|
||||
when(documentService.getRequired(documentId)).thenReturn(Document.builder().id(documentId).build());
|
||||
when(contentService.getRequired(contentId)).thenReturn(DocumentContent.builder().id(contentId).build());
|
||||
when(representationRepository.saveAndFlush(any(DocumentTextRepresentation.class))).thenAnswer(inv -> {
|
||||
DocumentTextRepresentation representation = inv.getArgument(0);
|
||||
representation.setId(representationId);
|
||||
return representation;
|
||||
});
|
||||
|
||||
service.addRepresentation(new AddDocumentTextRepresentationCommand(
|
||||
documentId,
|
||||
contentId,
|
||||
RepresentationType.SEMANTIC_TEXT,
|
||||
"test-builder",
|
||||
"en",
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
true,
|
||||
"Hello world",
|
||||
false
|
||||
));
|
||||
|
||||
verify(lexicalIndexService).indexRepresentation(representationId);
|
||||
}
|
||||
|
||||
@Test
|
||||
void shouldSkipImmediateLexicalIndexingWhenDeferred() {
|
||||
UUID documentId = UUID.randomUUID();
|
||||
UUID contentId = UUID.randomUUID();
|
||||
when(documentService.getRequired(documentId)).thenReturn(Document.builder().id(documentId).build());
|
||||
when(contentService.getRequired(contentId)).thenReturn(DocumentContent.builder().id(contentId).build());
|
||||
when(representationRepository.saveAndFlush(any(DocumentTextRepresentation.class))).thenAnswer(inv -> inv.getArgument(0));
|
||||
|
||||
service.addRepresentation(new AddDocumentTextRepresentationCommand(
|
||||
documentId,
|
||||
contentId,
|
||||
RepresentationType.SEMANTIC_TEXT,
|
||||
"test-builder",
|
||||
"en",
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
true,
|
||||
"Hello world",
|
||||
true
|
||||
));
|
||||
|
||||
verify(lexicalIndexService, never()).indexRepresentation(any());
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
package at.procon.dip.ingestion.service;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.Mockito.verify;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||
import at.procon.dip.domain.document.entity.Document;
|
||||
import at.procon.dip.domain.document.service.DocumentRelationService;
|
||||
import at.procon.dip.ingestion.config.DipIngestionProperties;
|
||||
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
||||
import at.procon.dip.ingestion.service.TedPackageExpansionService.TedPackageEntry;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.ArgumentCaptor;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
class TedPackageChildImportProcessorTest {
|
||||
|
||||
@Mock
|
||||
private GenericDocumentImportService importService;
|
||||
@Mock
|
||||
private DocumentRelationService relationService;
|
||||
|
||||
private DipIngestionProperties properties;
|
||||
private TedPackageChildImportProcessor processor;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
properties = new DipIngestionProperties();
|
||||
properties.setTedPackageImportBatchId("ted-batch");
|
||||
processor = new TedPackageChildImportProcessor(importService, relationService, properties);
|
||||
}
|
||||
|
||||
@Test
|
||||
void shouldMarkTedPackageChildrenForDeferredLexicalIndexingWhenEnabled() {
|
||||
properties.setTedPackageDeferLexicalIndexing(true);
|
||||
when(importService.importDocument(any())).thenReturn(new ImportedDocumentResult(
|
||||
Document.builder().id(UUID.randomUUID()).build(),
|
||||
null,
|
||||
List.of(),
|
||||
false
|
||||
));
|
||||
|
||||
processor.processChild(
|
||||
UUID.randomUUID(),
|
||||
"PKG-1",
|
||||
OffsetDateTime.now(),
|
||||
DocumentAccessContext.publicDocument(),
|
||||
new TedPackageEntry("notice-1.xml", "path/notice-1.xml", "<xml/>".getBytes(StandardCharsets.UTF_8), "<xml/>"),
|
||||
1
|
||||
);
|
||||
|
||||
ArgumentCaptor<SourceDescriptor> captor = ArgumentCaptor.forClass(SourceDescriptor.class);
|
||||
verify(importService).importDocument(captor.capture());
|
||||
assertThat(captor.getValue().attributes()).containsEntry(IngestionInternalAttributes.DEFER_LEXICAL_INDEX, "true");
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
package at.procon.dip.search.service;
|
||||
|
||||
import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
|
||||
|
||||
import at.procon.dip.search.config.DipSearchProperties;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
class SearchLexicalIndexSchedulerTest {
|
||||
|
||||
@Mock
|
||||
private DocumentLexicalIndexService lexicalIndexService;
|
||||
|
||||
private DipSearchProperties properties;
|
||||
private SearchLexicalIndexScheduler scheduler;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
properties = new DipSearchProperties();
|
||||
properties.setScheduledLexicalBackfillBatchSize(123);
|
||||
scheduler = new SearchLexicalIndexScheduler(properties, lexicalIndexService);
|
||||
}
|
||||
|
||||
@Test
|
||||
void shouldBackfillMissingVectorsWhenEnabled() {
|
||||
properties.setScheduledLexicalBackfillEnabled(true);
|
||||
when(lexicalIndexService.backfillMissingVectors(123)).thenReturn(5);
|
||||
|
||||
scheduler.backfillMissingLexicalIndexes();
|
||||
|
||||
verify(lexicalIndexService).backfillMissingVectors(123);
|
||||
}
|
||||
|
||||
@Test
|
||||
void shouldSkipBackfillWhenDisabled() {
|
||||
properties.setScheduledLexicalBackfillEnabled(false);
|
||||
|
||||
scheduler.backfillMissingLexicalIndexes();
|
||||
|
||||
verify(lexicalIndexService, never()).backfillMissingVectors(123);
|
||||
}
|
||||
}
|
||||
|
|
@ -118,7 +118,8 @@ public class SearchTestDataFactory {
|
|||
chunkStartOffset,
|
||||
chunkEndOffset,
|
||||
primary,
|
||||
text
|
||||
text,
|
||||
false
|
||||
));
|
||||
lexicalIndexService.refreshRepresentationLexicalIndex(representation.getId());
|
||||
return representation;
|
||||
|
|
|
|||
Loading…
Reference in New Issue