defer lexical indexing of TED text representations
This commit is contained in:
parent
a501176c83
commit
06c1485df9
|
|
@ -37,7 +37,9 @@ public class DocumentRepresentationService {
|
||||||
.textBody(command.textBody())
|
.textBody(command.textBody())
|
||||||
.build();
|
.build();
|
||||||
DocumentTextRepresentation saved = representationRepository.saveAndFlush(representation);
|
DocumentTextRepresentation saved = representationRepository.saveAndFlush(representation);
|
||||||
|
if (!command.deferLexicalIndex()) {
|
||||||
lexicalIndexService.indexRepresentation(saved.getId());
|
lexicalIndexService.indexRepresentation(saved.getId());
|
||||||
|
}
|
||||||
return saved;
|
return saved;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@ public record AddDocumentTextRepresentationCommand(
|
||||||
Integer chunkStartOffset,
|
Integer chunkStartOffset,
|
||||||
Integer chunkEndOffset,
|
Integer chunkEndOffset,
|
||||||
boolean primaryRepresentation,
|
boolean primaryRepresentation,
|
||||||
String textBody
|
String textBody,
|
||||||
|
boolean deferLexicalIndex
|
||||||
) {
|
) {
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -76,7 +76,8 @@ public class TimeEntryRepresentationMaterializationService {
|
||||||
null,
|
null,
|
||||||
null,
|
null,
|
||||||
true,
|
true,
|
||||||
projection.getSemanticText()
|
projection.getSemanticText(),
|
||||||
|
false
|
||||||
)));
|
)));
|
||||||
|
|
||||||
if (changed
|
if (changed
|
||||||
|
|
|
||||||
|
|
@ -54,6 +54,12 @@ public class DipIngestionProperties {
|
||||||
@NotBlank
|
@NotBlank
|
||||||
private String tedPackageImportBatchId = "phase41-ted-package";
|
private String tedPackageImportBatchId = "phase41-ted-package";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Skip synchronous lexical tsvector indexing during TED package child import.
|
||||||
|
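* Missing search vectors are expected to be backfilled asynchronously by the lexical index scheduler, which only does work when DipSearchProperties.scheduledLexicalBackfillEnabled is true (that field has no initializer, so it is off unless configured).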
|
||||||
|
*/
|
||||||
|
private boolean tedPackageDeferLexicalIndexing = true;
|
||||||
|
|
||||||
private boolean gatewayOnlyForTedPackages = false;
|
private boolean gatewayOnlyForTedPackages = false;
|
||||||
|
|
||||||
@NotBlank
|
@NotBlank
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,7 @@ import at.procon.dip.ingestion.config.DipIngestionProperties;
|
||||||
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
||||||
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
||||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import at.procon.dip.ingestion.service.IngestionInternalAttributes;
|
||||||
import at.procon.dip.ingestion.util.DocumentImportSupport;
|
import at.procon.dip.ingestion.util.DocumentImportSupport;
|
||||||
import at.procon.dip.normalization.service.TextRepresentationBuildService;
|
import at.procon.dip.normalization.service.TextRepresentationBuildService;
|
||||||
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
||||||
|
|
@ -402,6 +403,8 @@ public class GenericDocumentImportService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
boolean deferLexicalIndex = shouldDeferLexicalIndex(sourceDescriptor);
|
||||||
|
|
||||||
for (TextRepresentationDraft draft : drafts) {
|
for (TextRepresentationDraft draft : drafts) {
|
||||||
if (!StringUtils.hasText(draft.textBody())) {
|
if (!StringUtils.hasText(draft.textBody())) {
|
||||||
continue;
|
continue;
|
||||||
|
|
@ -420,7 +423,8 @@ public class GenericDocumentImportService {
|
||||||
draft.chunkStartOffset(),
|
draft.chunkStartOffset(),
|
||||||
draft.chunkEndOffset(),
|
draft.chunkEndOffset(),
|
||||||
draft.primary(),
|
draft.primary(),
|
||||||
draft.textBody()
|
draft.textBody(),
|
||||||
|
deferLexicalIndex
|
||||||
));
|
));
|
||||||
|
|
||||||
if (shouldQueueEmbedding(draft, embeddingPolicy, embeddingProfile)) {
|
if (shouldQueueEmbedding(draft, embeddingPolicy, embeddingProfile)) {
|
||||||
|
|
@ -434,6 +438,10 @@ public class GenericDocumentImportService {
|
||||||
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
|
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean shouldDeferLexicalIndex(SourceDescriptor sourceDescriptor) {
|
||||||
|
return IngestionInternalAttributes.isTruthy(sourceDescriptor.attributes(), IngestionInternalAttributes.DEFER_LEXICAL_INDEX);
|
||||||
|
}
|
||||||
|
|
||||||
private DocumentContent resolveLinkedContent(TextRepresentationDraft draft,
|
private DocumentContent resolveLinkedContent(TextRepresentationDraft draft,
|
||||||
DocumentContent originalContent,
|
DocumentContent originalContent,
|
||||||
Map<ContentRole, DocumentContent> derivedContent) {
|
Map<ContentRole, DocumentContent> derivedContent) {
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ import at.procon.dip.ingestion.service.TedPackageExpansionService.TedPackageEntr
|
||||||
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
||||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
import at.procon.dip.ingestion.config.DipIngestionProperties;
|
import at.procon.dip.ingestion.config.DipIngestionProperties;
|
||||||
|
import at.procon.dip.ingestion.service.IngestionInternalAttributes;
|
||||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||||
import at.procon.dip.runtime.config.RuntimeMode;
|
import at.procon.dip.runtime.config.RuntimeMode;
|
||||||
import java.time.OffsetDateTime;
|
import java.time.OffsetDateTime;
|
||||||
|
|
@ -50,6 +51,9 @@ public class TedPackageChildImportProcessor {
|
||||||
childAttributes.put("archivePath", entry.archivePath());
|
childAttributes.put("archivePath", entry.archivePath());
|
||||||
childAttributes.put("title", entry.fileName());
|
childAttributes.put("title", entry.fileName());
|
||||||
childAttributes.put("importBatchId", properties.getTedPackageImportBatchId());
|
childAttributes.put("importBatchId", properties.getTedPackageImportBatchId());
|
||||||
|
if (properties.isTedPackageDeferLexicalIndexing()) {
|
||||||
|
childAttributes.put(IngestionInternalAttributes.DEFER_LEXICAL_INDEX, Boolean.TRUE.toString());
|
||||||
|
}
|
||||||
|
|
||||||
ImportedDocumentResult childResult = importService.importDocument(new SourceDescriptor(
|
ImportedDocumentResult childResult = importService.importDocument(new SourceDescriptor(
|
||||||
accessContext == null ? DocumentAccessContext.publicDocument() : accessContext,
|
accessContext == null ? DocumentAccessContext.publicDocument() : accessContext,
|
||||||
|
|
|
||||||
|
|
@ -298,7 +298,8 @@ public class LegacyTedBackfillWorker {
|
||||||
draft.chunkStartOffset(),
|
draft.chunkStartOffset(),
|
||||||
draft.chunkEndOffset(),
|
draft.chunkEndOffset(),
|
||||||
draft.primary(),
|
draft.primary(),
|
||||||
draft.textBody()
|
draft.textBody(),
|
||||||
|
false
|
||||||
));
|
));
|
||||||
existing.add(savedRepresentation);
|
existing.add(savedRepresentation);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
|
|
@ -78,4 +78,12 @@ public class DipSearchProperties {
|
||||||
/** Number of hits per engine returned by the debug endpoint. */
|
/** Number of hits per engine returned by the debug endpoint. */
|
||||||
@Positive
|
@Positive
|
||||||
private int debugTopHitsPerEngine = 10;
|
private int debugTopHitsPerEngine = 10;
|
||||||
|
|
||||||
|
private boolean scheduledLexicalBackfillEnabled;
|
||||||
|
|
||||||
|
@Positive
|
||||||
|
private int scheduledLexicalBackfillDelayMs = 30000;
|
||||||
|
|
||||||
|
@Positive
|
||||||
|
private int scheduledLexicalBackfillBatchSize = 200;
|
||||||
}
|
}
|
||||||
|
|
@ -0,0 +1,30 @@
|
||||||
|
package at.procon.dip.search.service;
|
||||||
|
|
||||||
|
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||||
|
import at.procon.dip.runtime.config.RuntimeMode;
|
||||||
|
import at.procon.dip.search.config.DipSearchProperties;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.scheduling.annotation.Scheduled;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
public class SearchLexicalIndexScheduler {
|
||||||
|
|
||||||
|
private final DipSearchProperties properties;
|
||||||
|
private final DocumentLexicalIndexService lexicalIndexService;
|
||||||
|
|
||||||
|
@Scheduled(fixedDelayString = "${dip.search.scheduled-lexical-backfill-delay-ms:30000}")
|
||||||
|
public void backfillMissingLexicalIndexes() {
|
||||||
|
if (!properties.isScheduledLexicalBackfillEnabled()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int updated = lexicalIndexService.backfillMissingVectors(properties.getScheduledLexicalBackfillBatchSize());
|
||||||
|
if (updated > 0) {
|
||||||
|
log.info("Search lexical index scheduled backfill updated {} representations", updated);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,97 @@
|
||||||
|
package at.procon.dip.domain.document.service;
|
||||||
|
|
||||||
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
|
import static org.mockito.Mockito.never;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import at.procon.dip.domain.document.entity.Document;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||||
|
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
|
||||||
|
import at.procon.dip.search.service.DocumentLexicalIndexService;
|
||||||
|
import java.util.UUID;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
class DocumentRepresentationServiceTest {
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private DocumentService documentService;
|
||||||
|
@Mock
|
||||||
|
private DocumentContentService contentService;
|
||||||
|
@Mock
|
||||||
|
private DocumentTextRepresentationRepository representationRepository;
|
||||||
|
@Mock
|
||||||
|
private DocumentLexicalIndexService lexicalIndexService;
|
||||||
|
|
||||||
|
private DocumentRepresentationService service;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
service = new DocumentRepresentationService(documentService, contentService, representationRepository, lexicalIndexService);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void shouldIndexLexicallyByDefault() {
|
||||||
|
UUID documentId = UUID.randomUUID();
|
||||||
|
UUID contentId = UUID.randomUUID();
|
||||||
|
UUID representationId = UUID.randomUUID();
|
||||||
|
when(documentService.getRequired(documentId)).thenReturn(Document.builder().id(documentId).build());
|
||||||
|
when(contentService.getRequired(contentId)).thenReturn(DocumentContent.builder().id(contentId).build());
|
||||||
|
when(representationRepository.saveAndFlush(any(DocumentTextRepresentation.class))).thenAnswer(inv -> {
|
||||||
|
DocumentTextRepresentation representation = inv.getArgument(0);
|
||||||
|
representation.setId(representationId);
|
||||||
|
return representation;
|
||||||
|
});
|
||||||
|
|
||||||
|
service.addRepresentation(new AddDocumentTextRepresentationCommand(
|
||||||
|
documentId,
|
||||||
|
contentId,
|
||||||
|
RepresentationType.SEMANTIC_TEXT,
|
||||||
|
"test-builder",
|
||||||
|
"en",
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
true,
|
||||||
|
"Hello world",
|
||||||
|
false
|
||||||
|
));
|
||||||
|
|
||||||
|
verify(lexicalIndexService).indexRepresentation(representationId);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void shouldSkipImmediateLexicalIndexingWhenDeferred() {
|
||||||
|
UUID documentId = UUID.randomUUID();
|
||||||
|
UUID contentId = UUID.randomUUID();
|
||||||
|
when(documentService.getRequired(documentId)).thenReturn(Document.builder().id(documentId).build());
|
||||||
|
when(contentService.getRequired(contentId)).thenReturn(DocumentContent.builder().id(contentId).build());
|
||||||
|
when(representationRepository.saveAndFlush(any(DocumentTextRepresentation.class))).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
|
||||||
|
service.addRepresentation(new AddDocumentTextRepresentationCommand(
|
||||||
|
documentId,
|
||||||
|
contentId,
|
||||||
|
RepresentationType.SEMANTIC_TEXT,
|
||||||
|
"test-builder",
|
||||||
|
"en",
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
true,
|
||||||
|
"Hello world",
|
||||||
|
true
|
||||||
|
));
|
||||||
|
|
||||||
|
verify(lexicalIndexService, never()).indexRepresentation(any());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,67 @@
|
||||||
|
package at.procon.dip.ingestion.service;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||||
|
import at.procon.dip.domain.document.entity.Document;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentRelationService;
|
||||||
|
import at.procon.dip.ingestion.config.DipIngestionProperties;
|
||||||
|
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
||||||
|
import at.procon.dip.ingestion.service.TedPackageExpansionService.TedPackageEntry;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.mockito.ArgumentCaptor;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
class TedPackageChildImportProcessorTest {
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private GenericDocumentImportService importService;
|
||||||
|
@Mock
|
||||||
|
private DocumentRelationService relationService;
|
||||||
|
|
||||||
|
private DipIngestionProperties properties;
|
||||||
|
private TedPackageChildImportProcessor processor;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
properties = new DipIngestionProperties();
|
||||||
|
properties.setTedPackageImportBatchId("ted-batch");
|
||||||
|
processor = new TedPackageChildImportProcessor(importService, relationService, properties);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void shouldMarkTedPackageChildrenForDeferredLexicalIndexingWhenEnabled() {
|
||||||
|
properties.setTedPackageDeferLexicalIndexing(true);
|
||||||
|
when(importService.importDocument(any())).thenReturn(new ImportedDocumentResult(
|
||||||
|
Document.builder().id(UUID.randomUUID()).build(),
|
||||||
|
null,
|
||||||
|
List.of(),
|
||||||
|
false
|
||||||
|
));
|
||||||
|
|
||||||
|
processor.processChild(
|
||||||
|
UUID.randomUUID(),
|
||||||
|
"PKG-1",
|
||||||
|
OffsetDateTime.now(),
|
||||||
|
DocumentAccessContext.publicDocument(),
|
||||||
|
new TedPackageEntry("notice-1.xml", "path/notice-1.xml", "<xml/>".getBytes(StandardCharsets.UTF_8), "<xml/>"),
|
||||||
|
1
|
||||||
|
);
|
||||||
|
|
||||||
|
ArgumentCaptor<SourceDescriptor> captor = ArgumentCaptor.forClass(SourceDescriptor.class);
|
||||||
|
verify(importService).importDocument(captor.capture());
|
||||||
|
assertThat(captor.getValue().attributes()).containsEntry(IngestionInternalAttributes.DEFER_LEXICAL_INDEX, "true");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,48 @@
|
||||||
|
package at.procon.dip.search.service;
|
||||||
|
|
||||||
|
import static org.mockito.Mockito.never;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
import at.procon.dip.search.config.DipSearchProperties;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
class SearchLexicalIndexSchedulerTest {
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private DocumentLexicalIndexService lexicalIndexService;
|
||||||
|
|
||||||
|
private DipSearchProperties properties;
|
||||||
|
private SearchLexicalIndexScheduler scheduler;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
properties = new DipSearchProperties();
|
||||||
|
properties.setScheduledLexicalBackfillBatchSize(123);
|
||||||
|
scheduler = new SearchLexicalIndexScheduler(properties, lexicalIndexService);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void shouldBackfillMissingVectorsWhenEnabled() {
|
||||||
|
properties.setScheduledLexicalBackfillEnabled(true);
|
||||||
|
when(lexicalIndexService.backfillMissingVectors(123)).thenReturn(5);
|
||||||
|
|
||||||
|
scheduler.backfillMissingLexicalIndexes();
|
||||||
|
|
||||||
|
verify(lexicalIndexService).backfillMissingVectors(123);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void shouldSkipBackfillWhenDisabled() {
|
||||||
|
properties.setScheduledLexicalBackfillEnabled(false);
|
||||||
|
|
||||||
|
scheduler.backfillMissingLexicalIndexes();
|
||||||
|
|
||||||
|
verify(lexicalIndexService, never()).backfillMissingVectors(123);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -118,7 +118,8 @@ public class SearchTestDataFactory {
|
||||||
chunkStartOffset,
|
chunkStartOffset,
|
||||||
chunkEndOffset,
|
chunkEndOffset,
|
||||||
primary,
|
primary,
|
||||||
text
|
text,
|
||||||
|
false
|
||||||
));
|
));
|
||||||
lexicalIndexService.refreshRepresentationLexicalIndex(representation.getId());
|
lexicalIndexService.refreshRepresentationLexicalIndex(representation.getId());
|
||||||
return representation;
|
return representation;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue