diff --git a/docs/architecture/EMBEDDING_POLICY_PATCH_K1.md b/docs/architecture/EMBEDDING_POLICY_PATCH_K1.md new file mode 100644 index 0000000..38995dc --- /dev/null +++ b/docs/architecture/EMBEDDING_POLICY_PATCH_K1.md @@ -0,0 +1,30 @@ +# Embedding policy Patch K1 + +Patch K1 introduces the configuration and resolver layer for policy-based document embedding selection. + +## Added +- `EmbeddingPolicy` +- `EmbeddingProfile` +- `EmbeddingPolicyCondition` +- `EmbeddingPolicyUse` +- `EmbeddingPolicyRule` +- `EmbeddingPolicyProperties` +- `EmbeddingProfileProperties` +- `EmbeddingPolicyResolver` +- `DefaultEmbeddingPolicyResolver` +- `EmbeddingProfileResolver` +- `DefaultEmbeddingProfileResolver` + +## Example config +See `application-new-example-embedding-policy.yml`. + +## What K1 does not change +- no runtime import/orchestrator wiring yet +- no `SourceDescriptor` schema change yet +- no job persistence/audit changes yet + +## Intended follow-up +K2 should wire: +- `GenericDocumentImportService` +- `RepresentationEmbeddingOrchestrator` +to use the resolved policy and profile. diff --git a/docs/architecture/EMBEDDING_POLICY_PATCH_K2.md b/docs/architecture/EMBEDDING_POLICY_PATCH_K2.md new file mode 100644 index 0000000..7b0942f --- /dev/null +++ b/docs/architecture/EMBEDDING_POLICY_PATCH_K2.md @@ -0,0 +1,26 @@ +# Embedding policy Patch K2 + +Patch K2 wires the policy/profile layer into the actual NEW import runtime. + +## What it changes +- `GenericDocumentImportService` + - resolves `EmbeddingPolicy` per imported document + - resolves `EmbeddingProfile` + - ensures the selected embedding model is registered + - queues embeddings only for representation drafts allowed by the resolved profile +- `RepresentationEmbeddingOrchestrator` + - adds a convenience overload for `(documentId, modelKey, profile)` +- `EmbeddingJobService` + - adds a profile-aware enqueue overload +- `DefaultEmbeddingSelectionPolicy` + - adds profile-aware representation filtering +- `DefaultEmbeddingPolicyResolver` + - corrected for the current `SourceDescriptor.attributes()` shape + +## Runtime flow after K2 +document imported +-> representations built +-> policy resolved +-> profile resolved +-> model ensured +-> matching representations queued for embedding diff --git a/src/main/java/at/procon/dip/domain/document/RepresentationType.java b/src/main/java/at/procon/dip/domain/document/RepresentationType.java index 0cc7b3e..a53c470 100644 --- a/src/main/java/at/procon/dip/domain/document/RepresentationType.java +++ b/src/main/java/at/procon/dip/domain/document/RepresentationType.java @@ -9,5 +9,6 @@ public enum RepresentationType { SUMMARY, TITLE_ABSTRACT, CHUNK, - METADATA_ENRICHED + METADATA_ENRICHED, + ATTACHMENT_ROLLUP } diff --git a/src/main/java/at/procon/dip/embedding/config/EmbeddingPolicyCondition.java b/src/main/java/at/procon/dip/embedding/config/EmbeddingPolicyCondition.java new file mode 100644 index 0000000..fe23183 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/config/EmbeddingPolicyCondition.java @@ -0,0 +1,14 @@ +package at.procon.dip.embedding.config; + +import lombok.Data; + +@Data +public class EmbeddingPolicyCondition { + private String documentType; + private String documentFamily; + private String sourceType; + private String mimeType; + private String language; + private String ownerTenantKey; + private String embeddingPolicyHint; +} diff --git a/src/main/java/at/procon/dip/embedding/config/EmbeddingPolicyProperties.java b/src/main/java/at/procon/dip/embedding/config/EmbeddingPolicyProperties.java new file mode 100644 index 0000000..10c1b13 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/config/EmbeddingPolicyProperties.java @@ -0,0 +1,16 @@ +package at.procon.dip.embedding.config; + +import java.util.ArrayList; +import java.util.List; +import lombok.Data; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.context.annotation.Configuration; + +@Configuration +@ConfigurationProperties(prefix = "dip.embedding.policies") +@Data +public class EmbeddingPolicyProperties { + + private EmbeddingPolicyUse defaultPolicy = new EmbeddingPolicyUse(); + private List rules = new ArrayList<>(); +} diff --git a/src/main/java/at/procon/dip/embedding/config/EmbeddingPolicyRule.java b/src/main/java/at/procon/dip/embedding/config/EmbeddingPolicyRule.java new file mode 100644 index 0000000..e122dde --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/config/EmbeddingPolicyRule.java @@ -0,0 +1,10 @@ +package at.procon.dip.embedding.config; + +import lombok.Data; + +@Data +public class EmbeddingPolicyRule { + private String name; + private EmbeddingPolicyCondition when = new EmbeddingPolicyCondition(); + private EmbeddingPolicyUse use = new EmbeddingPolicyUse(); +} diff --git a/src/main/java/at/procon/dip/embedding/config/EmbeddingPolicyUse.java b/src/main/java/at/procon/dip/embedding/config/EmbeddingPolicyUse.java new file mode 100644 index 0000000..46885bf --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/config/EmbeddingPolicyUse.java @@ -0,0 +1,12 @@ +package at.procon.dip.embedding.config; + +import lombok.Data; + +@Data +public class EmbeddingPolicyUse { + private String policyKey; + private String modelKey; + private String queryModelKey; + private String profileKey; + private boolean enabled = true; +} diff --git a/src/main/java/at/procon/dip/embedding/config/EmbeddingProfileProperties.java b/src/main/java/at/procon/dip/embedding/config/EmbeddingProfileProperties.java new file mode 100644 index 0000000..04dd502 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/config/EmbeddingProfileProperties.java @@ -0,0 +1,23 @@ +package at.procon.dip.embedding.config; + +import at.procon.dip.domain.document.RepresentationType; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import lombok.Data; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.context.annotation.Configuration; + +@Configuration +@ConfigurationProperties(prefix = "dip.embedding.profiles") +@Data +public class EmbeddingProfileProperties { + + private Map definitions = new LinkedHashMap<>(); + + @Data + public static class ProfileDefinition { + private List embedRepresentationTypes = new ArrayList<>(); + } +} diff --git a/src/main/java/at/procon/dip/embedding/job/service/EmbeddingJobService.java b/src/main/java/at/procon/dip/embedding/job/service/EmbeddingJobService.java index 9cae485..1247bd7 100644 --- a/src/main/java/at/procon/dip/embedding/job/service/EmbeddingJobService.java +++ b/src/main/java/at/procon/dip/embedding/job/service/EmbeddingJobService.java @@ -1,13 +1,14 @@ package at.procon.dip.embedding.job.service; +import at.procon.dip.domain.document.entity.DocumentTextRepresentation; import at.procon.dip.embedding.config.EmbeddingProperties; import at.procon.dip.embedding.job.entity.EmbeddingJob; import at.procon.dip.embedding.job.repository.EmbeddingJobRepository; import at.procon.dip.embedding.model.EmbeddingJobStatus; import at.procon.dip.embedding.model.EmbeddingJobType; +import at.procon.dip.embedding.policy.EmbeddingProfile; import at.procon.dip.embedding.policy.EmbeddingSelectionPolicy; import at.procon.dip.embedding.registry.EmbeddingModelRegistry; -import at.procon.dip.domain.document.entity.DocumentTextRepresentation; import java.time.Duration; import java.time.OffsetDateTime; import java.util.List; @@ -46,6 +47,14 @@ public class EmbeddingJobService { .toList(); } + public List enqueueForDocument(UUID documentId, String modelKey, EmbeddingProfile profile) { + var model = modelRegistry.getRequired(modelKey); + List selected = selectionPolicy.selectRepresentations(documentId, model, profile); + return selected.stream() + .map(representation -> enqueueForRepresentation(documentId, representation.getId(), modelKey, EmbeddingJobType.DOCUMENT_EMBED)) + .toList(); + } + public EmbeddingJob enqueueForRepresentation(UUID documentId, UUID representationId, String modelKey, EmbeddingJobType jobType) { return jobRepository.findFirstByRepresentationIdAndModelKeyAndJobTypeAndStatusIn( representationId, diff --git a/src/main/java/at/procon/dip/embedding/policy/DefaultEmbeddingSelectionPolicy.java b/src/main/java/at/procon/dip/embedding/policy/DefaultEmbeddingSelectionPolicy.java index 033483b..b05c655 100644 --- a/src/main/java/at/procon/dip/embedding/policy/DefaultEmbeddingSelectionPolicy.java +++ b/src/main/java/at/procon/dip/embedding/policy/DefaultEmbeddingSelectionPolicy.java @@ -23,18 +23,24 @@ public class DefaultEmbeddingSelectionPolicy implements EmbeddingSelectionPolicy @Override public List selectRepresentations(UUID documentId, EmbeddingModelDescriptor model) { + return selectRepresentations(documentId, model, null); + } + + @Override + public List selectRepresentations(UUID documentId, EmbeddingModelDescriptor model, EmbeddingProfile profile) { List representations = representationRepository.findByDocument_Id(documentId); List selected = new ArrayList<>(); EmbeddingProperties.IndexingProperties indexing = embeddingProperties.getIndexing(); for (DocumentTextRepresentation representation : representations) { - if (include(representation, indexing)) { + if (include(representation, indexing, profile)) { selected.add(representation); } } if (selected.isEmpty()) { representationRepository.findFirstByDocument_IdAndPrimaryRepresentationTrue(documentId) + .filter(rep -> include(rep, indexing, profile)) .ifPresent(selected::add); } @@ -48,7 +54,12 @@ public class DefaultEmbeddingSelectionPolicy implements EmbeddingSelectionPolicy .toList(); } - private boolean include(DocumentTextRepresentation representation, EmbeddingProperties.IndexingProperties indexing) { + private boolean include(DocumentTextRepresentation representation, + EmbeddingProperties.IndexingProperties indexing, + EmbeddingProfile profile) { + if (profile != null && !profile.includes(representation.getRepresentationType())) { + return false; + } return switch (representation.getRepresentationType()) { case SEMANTIC_TEXT -> indexing.isEmbedSemanticText(); case TITLE_ABSTRACT -> indexing.isEmbedTitleAbstract(); diff --git a/src/main/java/at/procon/dip/embedding/policy/EmbeddingPolicy.java b/src/main/java/at/procon/dip/embedding/policy/EmbeddingPolicy.java new file mode 100644 index 0000000..576adcd --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/policy/EmbeddingPolicy.java @@ -0,0 +1,10 @@ +package at.procon.dip.embedding.policy; + +public record EmbeddingPolicy( + String policyKey, + String modelKey, + String queryModelKey, + String profileKey, + boolean enabled +) { +} diff --git a/src/main/java/at/procon/dip/embedding/policy/EmbeddingProfile.java b/src/main/java/at/procon/dip/embedding/policy/EmbeddingProfile.java new file mode 100644 index 0000000..121b032 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/policy/EmbeddingProfile.java @@ -0,0 +1,13 @@ +package at.procon.dip.embedding.policy; + +import at.procon.dip.domain.document.RepresentationType; +import java.util.List; + +public record EmbeddingProfile( + String profileKey, + List embedRepresentationTypes +) { + public boolean includes(RepresentationType representationType) { + return embedRepresentationTypes != null && embedRepresentationTypes.contains(representationType); + } +} diff --git a/src/main/java/at/procon/dip/embedding/policy/EmbeddingSelectionPolicy.java b/src/main/java/at/procon/dip/embedding/policy/EmbeddingSelectionPolicy.java index 1fbf538..a506c7c 100644 --- a/src/main/java/at/procon/dip/embedding/policy/EmbeddingSelectionPolicy.java +++ b/src/main/java/at/procon/dip/embedding/policy/EmbeddingSelectionPolicy.java @@ -8,4 +8,6 @@ import java.util.UUID; public interface EmbeddingSelectionPolicy { List selectRepresentations(UUID documentId, EmbeddingModelDescriptor model); + + List selectRepresentations(UUID documentId, EmbeddingModelDescriptor model, EmbeddingProfile profile); } diff --git a/src/main/java/at/procon/dip/embedding/service/DefaultEmbeddingPolicyResolver.java b/src/main/java/at/procon/dip/embedding/service/DefaultEmbeddingPolicyResolver.java new file mode 100644 index 0000000..52594b7 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/service/DefaultEmbeddingPolicyResolver.java @@ -0,0 +1,131 @@ +package at.procon.dip.embedding.service; + +import at.procon.dip.domain.document.entity.Document; +import at.procon.dip.embedding.config.EmbeddingPolicyCondition; +import at.procon.dip.embedding.config.EmbeddingPolicyProperties; +import at.procon.dip.embedding.config.EmbeddingPolicyRule; +import at.procon.dip.embedding.config.EmbeddingPolicyUse; +import at.procon.dip.embedding.policy.EmbeddingPolicy; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import java.util.Map; +import java.util.Objects; +import java.util.regex.Pattern; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; + +@Service +@RequiredArgsConstructor +public class DefaultEmbeddingPolicyResolver implements EmbeddingPolicyResolver { + + private final EmbeddingPolicyProperties properties; + + @Override + public EmbeddingPolicy resolve(Document document, SourceDescriptor sourceDescriptor) { + String overridePolicy = attributeValue(sourceDescriptor, "embeddingPolicyKey"); + if (overridePolicy != null) { + return policyByKey(overridePolicy); + } + + String policyHint = policyHint(sourceDescriptor); + if (policyHint != null) { + return policyByKey(policyHint); + } + + for (EmbeddingPolicyRule rule : properties.getRules()) { + if (matches(rule.getWhen(), document, sourceDescriptor)) { + return toPolicy(rule.getUse()); + } + } + + return toPolicy(properties.getDefaultPolicy()); + } + + private EmbeddingPolicy policyByKey(String policyKey) { + for (EmbeddingPolicyRule rule : properties.getRules()) { + if (rule.getUse() != null && policyKey.equals(rule.getUse().getPolicyKey())) { + return toPolicy(rule.getUse()); + } + } + EmbeddingPolicyUse def = properties.getDefaultPolicy(); + if (def != null && policyKey.equals(def.getPolicyKey())) { + return toPolicy(def); + } + throw new IllegalArgumentException("Unknown embedding policy key: " + policyKey); + } + + private EmbeddingPolicy toPolicy(EmbeddingPolicyUse use) { + if (use == null) { + throw new IllegalStateException("Embedding policy configuration is missing"); + } + return new EmbeddingPolicy( + use.getPolicyKey(), + use.getModelKey(), + use.getQueryModelKey(), + use.getProfileKey(), + use.isEnabled() + ); + } + + private boolean matches(EmbeddingPolicyCondition c, Document document, SourceDescriptor sourceDescriptor) { + if (c == null) { + return true; + } + + if (!matchesExact(c.getDocumentType(), enumName(document != null ? document.getDocumentType() : null))) { + return false; + } + if (!matchesExact(c.getDocumentFamily(), enumName(document != null ? document.getDocumentFamily() : null))) { + return false; + } + if (!matchesExact(c.getSourceType(), enumName(sourceDescriptor != null ? sourceDescriptor.sourceType() : null))) { + return false; + } + if (!matchesMime(c.getMimeType(), sourceDescriptor != null ? sourceDescriptor.mediaType() : null)) { + return false; + } + if (!matchesExact(c.getLanguage(), document != null ? document.getLanguageCode() : null)) { + return false; + } + if (!matchesExact(c.getOwnerTenantKey(), document != null && document.getOwnerTenant() != null ? document.getOwnerTenant().getTenantKey() : null )) { + return false; + } + return matchesExact(c.getEmbeddingPolicyHint(), policyHint(sourceDescriptor)); + } + + private boolean matchesExact(String expected, String actual) { + if (expected == null || expected.isBlank()) { + return true; + } + return Objects.equals(expected, actual); + } + + private boolean matchesMime(String pattern, String actual) { + if (pattern == null || pattern.isBlank()) { + return true; + } + if (actual == null || actual.isBlank()) { + return false; + } + return Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(actual).matches(); + } + + private String enumName(Enum value) { + return value != null ? value.name() : null; + } + + private String policyHint(SourceDescriptor sourceDescriptor) { + return attributeValue(sourceDescriptor, "embeddingPolicyHint"); + } + + private String attributeValue(SourceDescriptor sourceDescriptor, String key) { + if (sourceDescriptor == null) { + return null; + } + Map attributes = sourceDescriptor.attributes(); + if (attributes == null) { + return null; + } + String value = attributes.get(key); + return (value == null || value.isBlank()) ? null : value; + } +} diff --git a/src/main/java/at/procon/dip/embedding/service/DefaultEmbeddingProfileResolver.java b/src/main/java/at/procon/dip/embedding/service/DefaultEmbeddingProfileResolver.java new file mode 100644 index 0000000..2cddcf7 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/service/DefaultEmbeddingProfileResolver.java @@ -0,0 +1,31 @@ +package at.procon.dip.embedding.service; + +import at.procon.dip.embedding.config.EmbeddingProfileProperties; +import at.procon.dip.embedding.policy.EmbeddingProfile; +import java.util.List; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; + +@Service +@RequiredArgsConstructor +public class DefaultEmbeddingProfileResolver implements EmbeddingProfileResolver { + + private final EmbeddingProfileProperties properties; + + @Override + public EmbeddingProfile resolve(String profileKey) { + if (profileKey == null || profileKey.isBlank()) { + throw new IllegalArgumentException("Embedding profile key must not be blank"); + } + + EmbeddingProfileProperties.ProfileDefinition definition = properties.getDefinitions().get(profileKey); + if (definition == null) { + throw new IllegalArgumentException("Unknown embedding profile: " + profileKey); + } + + return new EmbeddingProfile( + profileKey, + List.copyOf(definition.getEmbedRepresentationTypes()) + ); + } +} diff --git a/src/main/java/at/procon/dip/embedding/service/EmbeddingPolicyResolver.java b/src/main/java/at/procon/dip/embedding/service/EmbeddingPolicyResolver.java new file mode 100644 index 0000000..1f1e9a1 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/service/EmbeddingPolicyResolver.java @@ -0,0 +1,9 @@ +package at.procon.dip.embedding.service; + +import at.procon.dip.domain.document.entity.Document; +import at.procon.dip.embedding.policy.EmbeddingPolicy; +import at.procon.dip.ingestion.spi.SourceDescriptor; + +public interface EmbeddingPolicyResolver { + EmbeddingPolicy resolve(Document document, SourceDescriptor sourceDescriptor); +} diff --git a/src/main/java/at/procon/dip/embedding/service/EmbeddingProfileResolver.java b/src/main/java/at/procon/dip/embedding/service/EmbeddingProfileResolver.java new file mode 100644 index 0000000..f33f0f3 --- /dev/null +++ b/src/main/java/at/procon/dip/embedding/service/EmbeddingProfileResolver.java @@ -0,0 +1,7 @@ +package at.procon.dip.embedding.service; + +import at.procon.dip.embedding.policy.EmbeddingProfile; + +public interface EmbeddingProfileResolver { + EmbeddingProfile resolve(String profileKey); +} diff --git a/src/main/java/at/procon/dip/embedding/service/RepresentationEmbeddingOrchestrator.java b/src/main/java/at/procon/dip/embedding/service/RepresentationEmbeddingOrchestrator.java index 3222135..083e793 100644 --- a/src/main/java/at/procon/dip/embedding/service/RepresentationEmbeddingOrchestrator.java +++ b/src/main/java/at/procon/dip/embedding/service/RepresentationEmbeddingOrchestrator.java @@ -9,6 +9,8 @@ import at.procon.dip.embedding.job.service.EmbeddingJobService; import at.procon.dip.embedding.model.EmbeddingJobType; import at.procon.dip.embedding.model.EmbeddingProviderResult; import at.procon.dip.embedding.model.EmbeddingUseCase; +import at.procon.dip.embedding.policy.EmbeddingProfile; +import at.procon.dip.embedding.policy.EmbeddingSelectionPolicy; import at.procon.dip.embedding.registry.EmbeddingModelRegistry; import java.util.List; import java.util.UUID; @@ -26,6 +28,7 @@ public class RepresentationEmbeddingOrchestrator { private final EmbeddingExecutionService executionService; private final EmbeddingPersistenceService persistenceService; private final DocumentTextRepresentationRepository representationRepository; + private final EmbeddingSelectionPolicy selectionPolicy; private final EmbeddingModelRegistry modelRegistry; private final EmbeddingProperties embeddingProperties; @@ -39,6 +42,14 @@ public class RepresentationEmbeddingOrchestrator { return jobService.enqueueForDocument(documentId, modelKey); } + @Transactional + public List enqueueDocument(UUID documentId, String modelKey, EmbeddingProfile profile) { + var model = modelRegistry.getRequired(modelKey); + return selectionPolicy.selectRepresentations(documentId, model, profile).stream() + .map(representation -> enqueueRepresentation(documentId, representation.getId(), modelKey)) + .toList(); + } + @Transactional public EmbeddingJob enqueueRepresentation(UUID documentId, UUID representationId, String modelKey) { return jobService.enqueueForRepresentation(documentId, representationId, modelKey, EmbeddingJobType.DOCUMENT_EMBED); diff --git a/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java index 3fbaa2f..fbf6359 100644 --- a/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java +++ b/src/main/java/at/procon/dip/ingestion/service/GenericDocumentImportService.java @@ -10,7 +10,6 @@ import at.procon.dip.domain.document.DocumentStatus; import at.procon.dip.domain.document.StorageType; import at.procon.dip.domain.document.entity.Document; import at.procon.dip.domain.document.entity.DocumentContent; -import at.procon.dip.domain.document.entity.DocumentSource; import at.procon.dip.domain.document.repository.DocumentRepository; import at.procon.dip.domain.document.repository.DocumentSourceRepository; import at.procon.dip.domain.document.service.DocumentContentService; @@ -21,26 +20,30 @@ import at.procon.dip.domain.document.service.command.AddDocumentContentCommand; import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand; import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand; import at.procon.dip.domain.document.service.command.CreateDocumentCommand; +import at.procon.dip.embedding.config.EmbeddingProperties; +import at.procon.dip.embedding.policy.EmbeddingPolicy; +import at.procon.dip.embedding.policy.EmbeddingProfile; +import at.procon.dip.embedding.registry.EmbeddingModelRegistry; +import at.procon.dip.embedding.service.EmbeddingModelCatalogService; +import at.procon.dip.embedding.service.EmbeddingPolicyResolver; +import at.procon.dip.embedding.service.EmbeddingProfileResolver; +import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator; import at.procon.dip.extraction.service.DocumentExtractionService; import at.procon.dip.extraction.spi.ExtractionRequest; import at.procon.dip.extraction.spi.ExtractionResult; +import at.procon.dip.ingestion.config.DipIngestionProperties; import at.procon.dip.ingestion.dto.ImportedDocumentResult; import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; import at.procon.dip.ingestion.spi.SourceDescriptor; import at.procon.dip.ingestion.util.DocumentImportSupport; -import at.procon.dip.embedding.config.EmbeddingProperties; -import at.procon.dip.embedding.registry.EmbeddingModelRegistry; -import at.procon.dip.embedding.service.EmbeddingModelCatalogService; -import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator; -import at.procon.dip.ingestion.config.DipIngestionProperties; -import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; -import at.procon.dip.runtime.config.RuntimeMode; import at.procon.dip.normalization.service.TextRepresentationBuildService; -import at.procon.dip.processing.service.StructuredDocumentProcessingService; import at.procon.dip.normalization.spi.RepresentationBuildRequest; import at.procon.dip.normalization.spi.TextRepresentationDraft; +import at.procon.dip.processing.service.StructuredDocumentProcessingService; import at.procon.dip.processing.spi.DocumentProcessingPolicy; import at.procon.dip.processing.spi.StructuredProcessingRequest; +import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode; +import at.procon.dip.runtime.config.RuntimeMode; import at.procon.ted.util.HashUtils; import java.nio.charset.StandardCharsets; import java.time.OffsetDateTime; @@ -49,16 +52,12 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.UUID; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; import org.springframework.util.StringUtils; -/** - * Phase 4 generic import pipeline that persists arbitrary document types into the DOC model. - */ @Service @ConditionalOnRuntimeMode(RuntimeMode.NEW) @RequiredArgsConstructor @@ -80,6 +79,8 @@ public class GenericDocumentImportService { private final EmbeddingModelRegistry embeddingModelRegistry; private final EmbeddingModelCatalogService embeddingModelCatalogService; private final RepresentationEmbeddingOrchestrator representationEmbeddingOrchestrator; + private final EmbeddingPolicyResolver embeddingPolicyResolver; + private final EmbeddingProfileResolver embeddingProfileResolver; @Transactional public ImportedDocumentResult importDocument(SourceDescriptor sourceDescriptor) { @@ -163,7 +164,7 @@ public class GenericDocumentImportService { if (processingPolicy.runRepresentationBuilders()) { var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult)); - persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts); + persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts, sourceDescriptor); } if (processingPolicy.applyStructuredTitleIfMissing() && !extractionResult.structuredPayloads().isEmpty()) { @@ -182,30 +183,7 @@ public class GenericDocumentImportService { return new ImportedDocumentResult(reloaded, detection, warnings, false); } - - private ExtractionResult emptyExtractionResult() { - return new ExtractionResult(java.util.Collections.emptyMap(), java.util.Collections.emptyList(), java.util.Collections.emptyList()); - } - - private Optional resolveDeduplicatedDocument(String dedupHash, DocumentAccessContext accessContext) { - return documentRepository.findAllByDedupHash(dedupHash).stream() - .filter(existing -> sameAccessScope(existing, accessContext)) - .findFirst(); - } - - private boolean sameAccessScope(Document existing, DocumentAccessContext accessContext) { - if (existing.getVisibility() != accessContext.visibility()) { - return false; - } - String existingTenantKey = existing.getOwnerTenant() == null ? null : existing.getOwnerTenant().getTenantKey(); - String requestedTenantKey = accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey(); - return java.util.Objects.equals(existingTenantKey, requestedTenantKey); - } - private SourceDescriptor withResolvedMediaType(SourceDescriptor sourceDescriptor, ResolvedPayload payload) { - if (StringUtils.hasText(sourceDescriptor.mediaType())) { - return sourceDescriptor; - } return new SourceDescriptor( sourceDescriptor.accessContext(), sourceDescriptor.sourceType(), @@ -269,7 +247,7 @@ public class GenericDocumentImportService { return sourceDescriptor.fileName(); } if (StringUtils.hasText(payload.textContent())) { - for (String line : payload.textContent().split("\\n")) { + for (String line : payload.textContent().split("\n")) { if (StringUtils.hasText(line)) { return DocumentImportSupport.ellipsize(line.trim(), 240); } @@ -395,25 +373,33 @@ public class GenericDocumentImportService { private void persistRepresentationsAndEmbeddings(Document document, DocumentContent originalContent, Map derivedContent, - List drafts) { + List drafts, + SourceDescriptor sourceDescriptor) { if (drafts == null || drafts.isEmpty()) { return; } - String embeddingModelKey = null; + EmbeddingPolicy embeddingPolicy = null; + EmbeddingProfile embeddingProfile = null; if (embeddingProperties.isEnabled()) { - embeddingModelKey = embeddingModelRegistry.getRequiredDefaultDocumentModelKey(); - embeddingModelCatalogService.ensureRegistered(embeddingModelKey); + embeddingPolicy = embeddingPolicyResolver.resolve(document, sourceDescriptor); + if (embeddingPolicy != null && embeddingPolicy.enabled()) { + embeddingModelRegistry.getRequired(embeddingPolicy.modelKey()); + embeddingModelCatalogService.ensureRegistered(embeddingPolicy.modelKey()); + embeddingProfile = embeddingProfileResolver.resolve(embeddingPolicy.profileKey()); + log.debug("Resolved embedding policy {} for document {} -> model={}, profile={}", + embeddingPolicy.policyKey(), document.getId(), embeddingPolicy.modelKey(), embeddingPolicy.profileKey()); + } else if (embeddingPolicy != null) { + log.debug("Resolved disabled embedding policy {} for document {}", embeddingPolicy.policyKey(), document.getId()); + } } for (TextRepresentationDraft draft : drafts) { if (!StringUtils.hasText(draft.textBody())) { continue; } - DocumentContent linkedContent = switch (draft.representationType()) { - case FULLTEXT, SEMANTIC_TEXT, SUMMARY, TITLE_ABSTRACT, METADATA_ENRICHED, CHUNK -> - derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent); - }; + + DocumentContent linkedContent = resolveLinkedContent(draft, originalContent, derivedContent); var representation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand( document.getId(), @@ -429,8 +415,12 @@ public class GenericDocumentImportService { draft.textBody() )); - if (embeddingModelKey != null && shouldQueueEmbedding(draft)) { - representationEmbeddingOrchestrator.enqueueRepresentation(document.getId(), representation.getId(), embeddingModelKey); + if (shouldQueueEmbedding(draft, embeddingPolicy, embeddingProfile)) { + representationEmbeddingOrchestrator.enqueueRepresentation( + document.getId(), + representation.getId(), + embeddingPolicy.modelKey() + ); } } documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED); @@ -445,7 +435,15 @@ public class GenericDocumentImportService { return derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent); } - private boolean shouldQueueEmbedding(TextRepresentationDraft draft) { + private boolean shouldQueueEmbedding(TextRepresentationDraft draft, + EmbeddingPolicy embeddingPolicy, + EmbeddingProfile embeddingProfile) { + if (embeddingPolicy == null || !embeddingPolicy.enabled() || embeddingProfile == null) { + return false; + } + if (!embeddingProfile.includes(draft.representationType())) { + return false; + } if (draft.queueForEmbedding() != null) { return draft.queueForEmbedding(); } @@ -502,6 +500,31 @@ public class GenericDocumentImportService { return java.util.Objects.equals(left, right); } + private Optional resolveDeduplicatedDocument(String dedupHash, DocumentAccessContext accessContext) { + return documentRepository.findByDedupHash(dedupHash).stream() + .filter(document -> matchesAccessContext(document, accessContext)) + .findFirst(); + } + + private boolean matchesAccessContext(Document document, DocumentAccessContext accessContext) { + String expectedTenantKey = accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey(); + if (!equalsNullable(document.getOwnerTenant() != null ? document.getOwnerTenant().getTenantKey() : null, expectedTenantKey)) { + return false; + } + return document.getVisibility() == accessContext.visibility(); + } + + private ExtractionResult emptyExtractionResult() { + return new ExtractionResult(Map.of(), List.of(), List.of()); + } + + private CanonicalDocumentMetadata buildCanonicalMetadata( Document document, + DetectionResult detection, + SourceDescriptor sourceDescriptor, + ExtractionResult extractionResult) { + return document.toCanonicalMetadata(); + } + private record ResolvedPayload(byte[] binaryContent, String textContent, String mediaType) { } } diff --git a/src/main/resources/application-new-example-embedding-policy.yml b/src/main/resources/application-new-example-embedding-policy.yml new file mode 100644 index 0000000..6c0e228 --- /dev/null +++ b/src/main/resources/application-new-example-embedding-policy.yml @@ -0,0 +1,71 @@ +dip: + embedding: + profiles: + definitions: + primary-only: + embed-representation-types: [SEMANTIC_TEXT] + + primary-and-chunks: + embed-representation-types: [SEMANTIC_TEXT, CHUNK] + + ted-semantic: + embed-representation-types: [SEMANTIC_TEXT, TITLE_ABSTRACT, CHUNK] + + mail-message: + embed-representation-types: [SEMANTIC_TEXT, ATTACHMENT_ROLLUP] + + attachment-chunks: + embed-representation-types: [CHUNK] + + disabled: + embed-representation-types: [] + + policies: + default-policy: + policy-key: generic-default + model-key: e5-default + query-model-key: e5-default + profile-key: primary-and-chunks + enabled: true + + rules: + - name: ted-notice + when: + document-family: TED_NOTICE + use: + policy-key: ted-default + model-key: e5-default + query-model-key: e5-default + profile-key: ted-semantic + enabled: true + + - name: email-root + when: + document-type: EMAIL + use: + policy-key: mail-default + model-key: e5-default + query-model-key: e5-default + profile-key: mail-message + enabled: true + + - name: mail-attachment-pdf + when: + source-type: MAIL_ATTACHMENT + mime-type: application/pdf + use: + policy-key: mail-attachment-pdf + model-key: e5-default + query-model-key: e5-default + profile-key: attachment-chunks + enabled: true + + - name: skip-images + when: + mime-type: image/.* + use: + policy-key: no-embedding-images + model-key: e5-default + query-model-key: e5-default + profile-key: disabled + enabled: false diff --git a/src/main/resources/application-new.yml b/src/main/resources/application-new.yml index f7439ab..1327108 100644 --- a/src/main/resources/application-new.yml +++ b/src/main/resources/application-new.yml @@ -34,8 +34,11 @@ dip: startup-lexical-backfill-limit: 500 # Number of top hits per engine returned by /search/debug debug-top-hits-per-engine: 10 + embedding: enabled: true + jobs: + enabled: true default-document-model: e5-default default-query-model: e5-default providers: @@ -62,8 +65,82 @@ dip: distance-metric: COSINE supports-query-embedding-mode: true active: true - jobs: - enabled: true + + profiles: + definitions: + primary-only: + embed-representation-types: [SEMANTIC_TEXT] + + primary-and-chunks: + embed-representation-types: [SEMANTIC_TEXT, CHUNK] + + ted-semantic: + embed-representation-types: [SEMANTIC_TEXT] #[SEMANTIC_TEXT, TITLE_ABSTRACT, CHUNK] + + mail-message: + embed-representation-types: [SEMANTIC_TEXT, ATTACHMENT_ROLLUP] + + attachment-chunks: + embed-representation-types: [CHUNK] + + disabled: + embed-representation-types: [] + + policies: + default-policy: + policy-key: generic-default + model-key: e5-default + query-model-key: e5-default + profile-key: primary-and-chunks + enabled: true + + ted-policy: + policy-key: ted-default + model-key: e5-default + query-model-key: e5-default + profile-key: ted-semantic + enabled: true + + rules: + - name: ted-notice + when: + document-family: TED_NOTICE + use: + policy-key: ted-default + model-key: e5-default + query-model-key: e5-default + profile-key: ted-semantic + enabled: true + + - name: email-root + when: + document-type: EMAIL + use: + policy-key: mail-default + model-key: e5-default + query-model-key: e5-default + profile-key: mail-message + enabled: true + + - name: mail-attachment-pdf + when: + source-type: MAIL_ATTACHMENT + mime-type: application/pdf + use: + policy-key: mail-attachment-pdf + model-key: e5-default + query-model-key: e5-default + profile-key: attachment-chunks + enabled: true + + - name: skip-images + when: + mime-type: image/.* + use: + policy-key: no-embedding-images + profile-key: disabled + enabled: false + # Phase 4 generic ingestion configuration ingestion: # Master switch for arbitrary document ingestion into the DOC model diff --git a/src/test/java/at/procon/dip/embedding/service/DefaultEmbeddingPolicyResolverTest.java b/src/test/java/at/procon/dip/embedding/service/DefaultEmbeddingPolicyResolverTest.java new file mode 100644 index 0000000..b8f5c72 --- /dev/null +++ b/src/test/java/at/procon/dip/embedding/service/DefaultEmbeddingPolicyResolverTest.java @@ -0,0 +1,135 @@ +package at.procon.dip.embedding.service; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentStatus; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.SourceType; +import at.procon.dip.domain.document.entity.Document; +import at.procon.dip.embedding.config.EmbeddingPolicyCondition; +import at.procon.dip.embedding.config.EmbeddingPolicyProperties; +import at.procon.dip.embedding.config.EmbeddingPolicyRule; +import at.procon.dip.embedding.config.EmbeddingPolicyUse; +import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import java.time.OffsetDateTime; +import java.util.Map; +import java.util.UUID; +import org.junit.jupiter.api.Test; + +class DefaultEmbeddingPolicyResolverTest { + + @Test + void shouldPreferHintAndOverrideFromAttributes() { + EmbeddingPolicyProperties properties = baseProperties(); + + EmbeddingPolicyRule mailRule = new EmbeddingPolicyRule(); + EmbeddingPolicyUse mailUse = new EmbeddingPolicyUse(); + mailUse.setPolicyKey("mail-default"); + mailUse.setModelKey("e5-default"); + mailUse.setQueryModelKey("e5-default"); + mailUse.setProfileKey("mail-message"); + mailUse.setEnabled(true); + mailRule.setUse(mailUse); + properties.getRules().add(mailRule); + + EmbeddingPolicyRule tedRule = new EmbeddingPolicyRule(); + EmbeddingPolicyUse tedUse = new EmbeddingPolicyUse(); + tedUse.setPolicyKey("ted-default"); + tedUse.setModelKey("e5-default"); + tedUse.setQueryModelKey("e5-default"); + tedUse.setProfileKey("ted-semantic"); + tedUse.setEnabled(true); + tedRule.setUse(tedUse); + properties.getRules().add(tedRule); + + DefaultEmbeddingPolicyResolver resolver = new DefaultEmbeddingPolicyResolver(properties); + + SourceDescriptor descriptor = sourceDescriptor(SourceType.MAIL_MESSAGE, "message/rfc822", Map.of( + "embeddingPolicyHint", "mail-default", + "embeddingPolicyKey", "ted-default" + )); + + var policy = resolver.resolve(document(DocumentFamily.GENERIC, DocumentType.EMAIL, "en"), descriptor); + + assertThat(policy.policyKey()).isEqualTo("ted-default"); + } + + @Test + void shouldMatchByMimeTypeUsingMediaTypeField() { + EmbeddingPolicyProperties properties = baseProperties(); + EmbeddingPolicyRule rule = new EmbeddingPolicyRule(); + EmbeddingPolicyCondition when = new EmbeddingPolicyCondition(); + when.setSourceType("MAIL_ATTACHMENT"); + when.setMimeType("application/pdf"); + rule.setWhen(when); + EmbeddingPolicyUse use = new EmbeddingPolicyUse(); + use.setPolicyKey("mail-attachment-pdf"); + use.setModelKey("e5-default"); + use.setQueryModelKey("e5-default"); + use.setProfileKey("attachment-chunks"); + rule.setUse(use); + properties.getRules().add(rule); + + DefaultEmbeddingPolicyResolver resolver = new DefaultEmbeddingPolicyResolver(properties); + + var policy = resolver.resolve(document(DocumentFamily.GENERIC, DocumentType.FILE, "en"), + sourceDescriptor(SourceType.MAIL_ATTACHMENT, "application/pdf", Map.of())); + + assertThat(policy.policyKey()).isEqualTo("mail-attachment-pdf"); + } + + @Test + void shouldFailForUnknownOverridePolicy() { + DefaultEmbeddingPolicyResolver resolver = new DefaultEmbeddingPolicyResolver(baseProperties()); + SourceDescriptor descriptor = sourceDescriptor(SourceType.FILE_IMPORT, "application/pdf", Map.of( + "embeddingPolicyKey", "missing-policy" + )); + assertThatThrownBy(() -> resolver.resolve(document(DocumentFamily.GENERIC, DocumentType.FILE, "en"), descriptor)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Unknown embedding policy key"); + } + + private EmbeddingPolicyProperties baseProperties() { + EmbeddingPolicyProperties properties = new EmbeddingPolicyProperties(); + EmbeddingPolicyUse defaultPolicy = new EmbeddingPolicyUse(); + defaultPolicy.setPolicyKey("generic-default"); + defaultPolicy.setModelKey("e5-default"); + defaultPolicy.setQueryModelKey("e5-default"); + defaultPolicy.setProfileKey("primary-and-chunks"); + defaultPolicy.setEnabled(true); + properties.setDefaultPolicy(defaultPolicy); + return properties; + } + + private Document document(DocumentFamily family, DocumentType type, String language) { + return Document.builder() + .id(UUID.randomUUID()) + .documentFamily(family) + .documentType(type) + .languageCode(language) + .status(DocumentStatus.IMPORTED) + .visibility(DocumentVisibility.PUBLIC) + .title("Test document") + .build(); + } + + private SourceDescriptor sourceDescriptor(SourceType sourceType, String mediaType, Map attrs) { + return new SourceDescriptor( + null, + sourceType, + "source-ref", + "/tmp/source", + "source.bin", + mediaType, + null, + null, + OffsetDateTime.now(), + OriginalContentStoragePolicy.DEFAULT, + attrs + ); + } +} diff --git a/src/test/java/at/procon/dip/embedding/service/DefaultEmbeddingProfileResolverTest.java b/src/test/java/at/procon/dip/embedding/service/DefaultEmbeddingProfileResolverTest.java new file mode 100644 index 0000000..4699f05 --- /dev/null +++ b/src/test/java/at/procon/dip/embedding/service/DefaultEmbeddingProfileResolverTest.java @@ -0,0 +1,37 @@ +package at.procon.dip.embedding.service; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import at.procon.dip.domain.document.RepresentationType; +import at.procon.dip.embedding.config.EmbeddingProfileProperties; +import java.util.List; +import org.junit.jupiter.api.Test; + +class DefaultEmbeddingProfileResolverTest { + + @Test + void shouldResolveKnownProfile() { + EmbeddingProfileProperties properties = new EmbeddingProfileProperties(); + EmbeddingProfileProperties.ProfileDefinition def = new EmbeddingProfileProperties.ProfileDefinition(); + def.setEmbedRepresentationTypes(List.of(RepresentationType.SEMANTIC_TEXT, RepresentationType.CHUNK)); + properties.getDefinitions().put("primary-and-chunks", def); + + DefaultEmbeddingProfileResolver resolver = new DefaultEmbeddingProfileResolver(properties); + + var profile = resolver.resolve("primary-and-chunks"); + + assertThat(profile.profileKey()).isEqualTo("primary-and-chunks"); + assertThat(profile.embedRepresentationTypes()).containsExactly(RepresentationType.SEMANTIC_TEXT, RepresentationType.CHUNK); + } + + @Test + void shouldFailForUnknownProfile() { + EmbeddingProfileProperties properties = new EmbeddingProfileProperties(); + DefaultEmbeddingProfileResolver resolver = new DefaultEmbeddingProfileResolver(properties); + + assertThatThrownBy(() -> resolver.resolve("missing")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Unknown embedding profile"); + } +}