vectorization profiles per document type

master
trifonovt 4 weeks ago
parent fbd249e56b
commit 177c61803e

@ -0,0 +1,30 @@
# Embedding policy Patch K1
Patch K1 introduces the configuration and resolver layer for policy-based document embedding selection.
## Added
- `EmbeddingPolicy`
- `EmbeddingProfile`
- `EmbeddingPolicyCondition`
- `EmbeddingPolicyUse`
- `EmbeddingPolicyRule`
- `EmbeddingPolicyProperties`
- `EmbeddingProfileProperties`
- `EmbeddingPolicyResolver`
- `DefaultEmbeddingPolicyResolver`
- `EmbeddingProfileResolver`
- `DefaultEmbeddingProfileResolver`
## Example config
See `application-new-example-embedding-policy.yml`.
## What K1 does not change
- no runtime import/orchestrator wiring yet
- no `SourceDescriptor` schema change yet
- no job persistence/audit changes yet
## Intended follow-up
K2 should wire:
- `GenericDocumentImportService`
- `RepresentationEmbeddingOrchestrator`
to use the resolved policy and profile.

@ -0,0 +1,26 @@
# Embedding policy Patch K2
Patch K2 wires the policy/profile layer into the actual NEW import runtime.
## What it changes
- `GenericDocumentImportService`
- resolves `EmbeddingPolicy` per imported document
- resolves `EmbeddingProfile`
- ensures the selected embedding model is registered
- queues embeddings only for representation drafts allowed by the resolved profile
- `RepresentationEmbeddingOrchestrator`
- adds a convenience overload for `(documentId, modelKey, profile)`
- `EmbeddingJobService`
- adds a profile-aware enqueue overload
- `DefaultEmbeddingSelectionPolicy`
- adds profile-aware representation filtering
- `DefaultEmbeddingPolicyResolver`
- corrected for the current `SourceDescriptor.attributes()` shape
## Runtime flow after K2
document imported
-> representations built
-> policy resolved
-> profile resolved
-> model ensured
-> matching representations queued for embedding

@ -9,5 +9,6 @@ public enum RepresentationType {
SUMMARY,
TITLE_ABSTRACT,
CHUNK,
METADATA_ENRICHED
METADATA_ENRICHED,
ATTACHMENT_ROLLUP
}

@ -0,0 +1,14 @@
package at.procon.dip.embedding.config;
import lombok.Data;
@Data
public class EmbeddingPolicyCondition {
private String documentType;
private String documentFamily;
private String sourceType;
private String mimeType;
private String language;
private String ownerTenantKey;
private String embeddingPolicyHint;
}

@ -0,0 +1,16 @@
package at.procon.dip.embedding.config;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
@Configuration
@ConfigurationProperties(prefix = "dip.embedding.policies")
@Data
public class EmbeddingPolicyProperties {
private EmbeddingPolicyUse defaultPolicy = new EmbeddingPolicyUse();
private List<EmbeddingPolicyRule> rules = new ArrayList<>();
}

@ -0,0 +1,10 @@
package at.procon.dip.embedding.config;
import lombok.Data;
@Data
public class EmbeddingPolicyRule {
private String name;
private EmbeddingPolicyCondition when = new EmbeddingPolicyCondition();
private EmbeddingPolicyUse use = new EmbeddingPolicyUse();
}

@ -0,0 +1,12 @@
package at.procon.dip.embedding.config;
import lombok.Data;
@Data
public class EmbeddingPolicyUse {
private String policyKey;
private String modelKey;
private String queryModelKey;
private String profileKey;
private boolean enabled = true;
}

@ -0,0 +1,23 @@
package at.procon.dip.embedding.config;
import at.procon.dip.domain.document.RepresentationType;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
@Configuration
@ConfigurationProperties(prefix = "dip.embedding.profiles")
@Data
public class EmbeddingProfileProperties {
private Map<String, ProfileDefinition> definitions = new LinkedHashMap<>();
@Data
public static class ProfileDefinition {
private List<RepresentationType> embedRepresentationTypes = new ArrayList<>();
}
}

@ -1,13 +1,14 @@
package at.procon.dip.embedding.job.service;
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
import at.procon.dip.embedding.config.EmbeddingProperties;
import at.procon.dip.embedding.job.entity.EmbeddingJob;
import at.procon.dip.embedding.job.repository.EmbeddingJobRepository;
import at.procon.dip.embedding.model.EmbeddingJobStatus;
import at.procon.dip.embedding.model.EmbeddingJobType;
import at.procon.dip.embedding.policy.EmbeddingProfile;
import at.procon.dip.embedding.policy.EmbeddingSelectionPolicy;
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
import java.time.Duration;
import java.time.OffsetDateTime;
import java.util.List;
@ -46,6 +47,14 @@ public class EmbeddingJobService {
.toList();
}
public List<EmbeddingJob> enqueueForDocument(UUID documentId, String modelKey, EmbeddingProfile profile) {
var model = modelRegistry.getRequired(modelKey);
List<DocumentTextRepresentation> selected = selectionPolicy.selectRepresentations(documentId, model, profile);
return selected.stream()
.map(representation -> enqueueForRepresentation(documentId, representation.getId(), modelKey, EmbeddingJobType.DOCUMENT_EMBED))
.toList();
}
public EmbeddingJob enqueueForRepresentation(UUID documentId, UUID representationId, String modelKey, EmbeddingJobType jobType) {
return jobRepository.findFirstByRepresentationIdAndModelKeyAndJobTypeAndStatusIn(
representationId,

@ -23,18 +23,24 @@ public class DefaultEmbeddingSelectionPolicy implements EmbeddingSelectionPolicy
@Override
public List<DocumentTextRepresentation> selectRepresentations(UUID documentId, EmbeddingModelDescriptor model) {
return selectRepresentations(documentId, model, null);
}
@Override
public List<DocumentTextRepresentation> selectRepresentations(UUID documentId, EmbeddingModelDescriptor model, EmbeddingProfile profile) {
List<DocumentTextRepresentation> representations = representationRepository.findByDocument_Id(documentId);
List<DocumentTextRepresentation> selected = new ArrayList<>();
EmbeddingProperties.IndexingProperties indexing = embeddingProperties.getIndexing();
for (DocumentTextRepresentation representation : representations) {
if (include(representation, indexing)) {
if (include(representation, indexing, profile)) {
selected.add(representation);
}
}
if (selected.isEmpty()) {
representationRepository.findFirstByDocument_IdAndPrimaryRepresentationTrue(documentId)
.filter(rep -> include(rep, indexing, profile))
.ifPresent(selected::add);
}
@ -48,7 +54,12 @@ public class DefaultEmbeddingSelectionPolicy implements EmbeddingSelectionPolicy
.toList();
}
private boolean include(DocumentTextRepresentation representation, EmbeddingProperties.IndexingProperties indexing) {
private boolean include(DocumentTextRepresentation representation,
EmbeddingProperties.IndexingProperties indexing,
EmbeddingProfile profile) {
if (profile != null && !profile.includes(representation.getRepresentationType())) {
return false;
}
return switch (representation.getRepresentationType()) {
case SEMANTIC_TEXT -> indexing.isEmbedSemanticText();
case TITLE_ABSTRACT -> indexing.isEmbedTitleAbstract();

@ -0,0 +1,10 @@
package at.procon.dip.embedding.policy;
public record EmbeddingPolicy(
String policyKey,
String modelKey,
String queryModelKey,
String profileKey,
boolean enabled
) {
}

@ -0,0 +1,13 @@
package at.procon.dip.embedding.policy;
import at.procon.dip.domain.document.RepresentationType;
import java.util.List;
public record EmbeddingProfile(
String profileKey,
List<RepresentationType> embedRepresentationTypes
) {
public boolean includes(RepresentationType representationType) {
return embedRepresentationTypes != null && embedRepresentationTypes.contains(representationType);
}
}

@ -8,4 +8,6 @@ import java.util.UUID;
public interface EmbeddingSelectionPolicy {
List<DocumentTextRepresentation> selectRepresentations(UUID documentId, EmbeddingModelDescriptor model);
List<DocumentTextRepresentation> selectRepresentations(UUID documentId, EmbeddingModelDescriptor model, EmbeddingProfile profile);
}

@ -0,0 +1,131 @@
package at.procon.dip.embedding.service;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.embedding.config.EmbeddingPolicyCondition;
import at.procon.dip.embedding.config.EmbeddingPolicyProperties;
import at.procon.dip.embedding.config.EmbeddingPolicyRule;
import at.procon.dip.embedding.config.EmbeddingPolicyUse;
import at.procon.dip.embedding.policy.EmbeddingPolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Pattern;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
@Service
@RequiredArgsConstructor
public class DefaultEmbeddingPolicyResolver implements EmbeddingPolicyResolver {
private final EmbeddingPolicyProperties properties;
@Override
public EmbeddingPolicy resolve(Document document, SourceDescriptor sourceDescriptor) {
String overridePolicy = attributeValue(sourceDescriptor, "embeddingPolicyKey");
if (overridePolicy != null) {
return policyByKey(overridePolicy);
}
String policyHint = policyHint(sourceDescriptor);
if (policyHint != null) {
return policyByKey(policyHint);
}
for (EmbeddingPolicyRule rule : properties.getRules()) {
if (matches(rule.getWhen(), document, sourceDescriptor)) {
return toPolicy(rule.getUse());
}
}
return toPolicy(properties.getDefaultPolicy());
}
private EmbeddingPolicy policyByKey(String policyKey) {
for (EmbeddingPolicyRule rule : properties.getRules()) {
if (rule.getUse() != null && policyKey.equals(rule.getUse().getPolicyKey())) {
return toPolicy(rule.getUse());
}
}
EmbeddingPolicyUse def = properties.getDefaultPolicy();
if (def != null && policyKey.equals(def.getPolicyKey())) {
return toPolicy(def);
}
throw new IllegalArgumentException("Unknown embedding policy key: " + policyKey);
}
private EmbeddingPolicy toPolicy(EmbeddingPolicyUse use) {
if (use == null) {
throw new IllegalStateException("Embedding policy configuration is missing");
}
return new EmbeddingPolicy(
use.getPolicyKey(),
use.getModelKey(),
use.getQueryModelKey(),
use.getProfileKey(),
use.isEnabled()
);
}
private boolean matches(EmbeddingPolicyCondition c, Document document, SourceDescriptor sourceDescriptor) {
if (c == null) {
return true;
}
if (!matchesExact(c.getDocumentType(), enumName(document != null ? document.getDocumentType() : null))) {
return false;
}
if (!matchesExact(c.getDocumentFamily(), enumName(document != null ? document.getDocumentFamily() : null))) {
return false;
}
if (!matchesExact(c.getSourceType(), enumName(sourceDescriptor != null ? sourceDescriptor.sourceType() : null))) {
return false;
}
if (!matchesMime(c.getMimeType(), sourceDescriptor != null ? sourceDescriptor.mediaType() : null)) {
return false;
}
if (!matchesExact(c.getLanguage(), document != null ? document.getLanguageCode() : null)) {
return false;
}
if (!matchesExact(c.getOwnerTenantKey(), document != null && document.getOwnerTenant() != null ? document.getOwnerTenant().getTenantKey() : null )) {
return false;
}
return matchesExact(c.getEmbeddingPolicyHint(), policyHint(sourceDescriptor));
}
private boolean matchesExact(String expected, String actual) {
if (expected == null || expected.isBlank()) {
return true;
}
return Objects.equals(expected, actual);
}
private boolean matchesMime(String pattern, String actual) {
if (pattern == null || pattern.isBlank()) {
return true;
}
if (actual == null || actual.isBlank()) {
return false;
}
return Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(actual).matches();
}
private String enumName(Enum<?> value) {
return value != null ? value.name() : null;
}
private String policyHint(SourceDescriptor sourceDescriptor) {
return attributeValue(sourceDescriptor, "embeddingPolicyHint");
}
private String attributeValue(SourceDescriptor sourceDescriptor, String key) {
if (sourceDescriptor == null) {
return null;
}
Map<String, String> attributes = sourceDescriptor.attributes();
if (attributes == null) {
return null;
}
String value = attributes.get(key);
return (value == null || value.isBlank()) ? null : value;
}
}

@ -0,0 +1,31 @@
package at.procon.dip.embedding.service;
import at.procon.dip.embedding.config.EmbeddingProfileProperties;
import at.procon.dip.embedding.policy.EmbeddingProfile;
import java.util.List;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
@Service
@RequiredArgsConstructor
public class DefaultEmbeddingProfileResolver implements EmbeddingProfileResolver {
private final EmbeddingProfileProperties properties;
@Override
public EmbeddingProfile resolve(String profileKey) {
if (profileKey == null || profileKey.isBlank()) {
throw new IllegalArgumentException("Embedding profile key must not be blank");
}
EmbeddingProfileProperties.ProfileDefinition definition = properties.getDefinitions().get(profileKey);
if (definition == null) {
throw new IllegalArgumentException("Unknown embedding profile: " + profileKey);
}
return new EmbeddingProfile(
profileKey,
List.copyOf(definition.getEmbedRepresentationTypes())
);
}
}

@ -0,0 +1,9 @@
package at.procon.dip.embedding.service;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.embedding.policy.EmbeddingPolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
public interface EmbeddingPolicyResolver {
EmbeddingPolicy resolve(Document document, SourceDescriptor sourceDescriptor);
}

@ -0,0 +1,7 @@
package at.procon.dip.embedding.service;
import at.procon.dip.embedding.policy.EmbeddingProfile;
public interface EmbeddingProfileResolver {
EmbeddingProfile resolve(String profileKey);
}

@ -9,6 +9,8 @@ import at.procon.dip.embedding.job.service.EmbeddingJobService;
import at.procon.dip.embedding.model.EmbeddingJobType;
import at.procon.dip.embedding.model.EmbeddingProviderResult;
import at.procon.dip.embedding.model.EmbeddingUseCase;
import at.procon.dip.embedding.policy.EmbeddingProfile;
import at.procon.dip.embedding.policy.EmbeddingSelectionPolicy;
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
import java.util.List;
import java.util.UUID;
@ -26,6 +28,7 @@ public class RepresentationEmbeddingOrchestrator {
private final EmbeddingExecutionService executionService;
private final EmbeddingPersistenceService persistenceService;
private final DocumentTextRepresentationRepository representationRepository;
private final EmbeddingSelectionPolicy selectionPolicy;
private final EmbeddingModelRegistry modelRegistry;
private final EmbeddingProperties embeddingProperties;
@ -39,6 +42,14 @@ public class RepresentationEmbeddingOrchestrator {
return jobService.enqueueForDocument(documentId, modelKey);
}
@Transactional
public List<EmbeddingJob> enqueueDocument(UUID documentId, String modelKey, EmbeddingProfile profile) {
var model = modelRegistry.getRequired(modelKey);
return selectionPolicy.selectRepresentations(documentId, model, profile).stream()
.map(representation -> enqueueRepresentation(documentId, representation.getId(), modelKey))
.toList();
}
@Transactional
public EmbeddingJob enqueueRepresentation(UUID documentId, UUID representationId, String modelKey) {
return jobService.enqueueForRepresentation(documentId, representationId, modelKey, EmbeddingJobType.DOCUMENT_EMBED);

@ -10,7 +10,6 @@ import at.procon.dip.domain.document.DocumentStatus;
import at.procon.dip.domain.document.StorageType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentContent;
import at.procon.dip.domain.document.entity.DocumentSource;
import at.procon.dip.domain.document.repository.DocumentRepository;
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
import at.procon.dip.domain.document.service.DocumentContentService;
@ -21,26 +20,30 @@ import at.procon.dip.domain.document.service.command.AddDocumentContentCommand;
import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand;
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
import at.procon.dip.domain.document.service.command.CreateDocumentCommand;
import at.procon.dip.embedding.config.EmbeddingProperties;
import at.procon.dip.embedding.policy.EmbeddingPolicy;
import at.procon.dip.embedding.policy.EmbeddingProfile;
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
import at.procon.dip.embedding.service.EmbeddingModelCatalogService;
import at.procon.dip.embedding.service.EmbeddingPolicyResolver;
import at.procon.dip.embedding.service.EmbeddingProfileResolver;
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
import at.procon.dip.extraction.service.DocumentExtractionService;
import at.procon.dip.extraction.spi.ExtractionRequest;
import at.procon.dip.extraction.spi.ExtractionResult;
import at.procon.dip.ingestion.config.DipIngestionProperties;
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import at.procon.dip.ingestion.util.DocumentImportSupport;
import at.procon.dip.embedding.config.EmbeddingProperties;
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
import at.procon.dip.embedding.service.EmbeddingModelCatalogService;
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
import at.procon.dip.ingestion.config.DipIngestionProperties;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.dip.normalization.service.TextRepresentationBuildService;
import at.procon.dip.processing.service.StructuredDocumentProcessingService;
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
import at.procon.dip.normalization.spi.TextRepresentationDraft;
import at.procon.dip.processing.service.StructuredDocumentProcessingService;
import at.procon.dip.processing.spi.DocumentProcessingPolicy;
import at.procon.dip.processing.spi.StructuredProcessingRequest;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.ted.util.HashUtils;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
@ -49,16 +52,12 @@ import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.StringUtils;
/**
* Phase 4 generic import pipeline that persists arbitrary document types into the DOC model.
*/
@Service
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
@RequiredArgsConstructor
@ -80,6 +79,8 @@ public class GenericDocumentImportService {
private final EmbeddingModelRegistry embeddingModelRegistry;
private final EmbeddingModelCatalogService embeddingModelCatalogService;
private final RepresentationEmbeddingOrchestrator representationEmbeddingOrchestrator;
private final EmbeddingPolicyResolver embeddingPolicyResolver;
private final EmbeddingProfileResolver embeddingProfileResolver;
@Transactional
public ImportedDocumentResult importDocument(SourceDescriptor sourceDescriptor) {
@ -163,7 +164,7 @@ public class GenericDocumentImportService {
if (processingPolicy.runRepresentationBuilders()) {
var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult));
persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts);
persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts, sourceDescriptor);
}
if (processingPolicy.applyStructuredTitleIfMissing() && !extractionResult.structuredPayloads().isEmpty()) {
@ -182,30 +183,7 @@ public class GenericDocumentImportService {
return new ImportedDocumentResult(reloaded, detection, warnings, false);
}
private ExtractionResult emptyExtractionResult() {
return new ExtractionResult(java.util.Collections.emptyMap(), java.util.Collections.emptyList(), java.util.Collections.emptyList());
}
private Optional<Document> resolveDeduplicatedDocument(String dedupHash, DocumentAccessContext accessContext) {
return documentRepository.findAllByDedupHash(dedupHash).stream()
.filter(existing -> sameAccessScope(existing, accessContext))
.findFirst();
}
private boolean sameAccessScope(Document existing, DocumentAccessContext accessContext) {
if (existing.getVisibility() != accessContext.visibility()) {
return false;
}
String existingTenantKey = existing.getOwnerTenant() == null ? null : existing.getOwnerTenant().getTenantKey();
String requestedTenantKey = accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey();
return java.util.Objects.equals(existingTenantKey, requestedTenantKey);
}
private SourceDescriptor withResolvedMediaType(SourceDescriptor sourceDescriptor, ResolvedPayload payload) {
if (StringUtils.hasText(sourceDescriptor.mediaType())) {
return sourceDescriptor;
}
return new SourceDescriptor(
sourceDescriptor.accessContext(),
sourceDescriptor.sourceType(),
@ -269,7 +247,7 @@ public class GenericDocumentImportService {
return sourceDescriptor.fileName();
}
if (StringUtils.hasText(payload.textContent())) {
for (String line : payload.textContent().split("\\n")) {
for (String line : payload.textContent().split("\n")) {
if (StringUtils.hasText(line)) {
return DocumentImportSupport.ellipsize(line.trim(), 240);
}
@ -395,25 +373,33 @@ public class GenericDocumentImportService {
private void persistRepresentationsAndEmbeddings(Document document,
DocumentContent originalContent,
Map<ContentRole, DocumentContent> derivedContent,
List<TextRepresentationDraft> drafts) {
List<TextRepresentationDraft> drafts,
SourceDescriptor sourceDescriptor) {
if (drafts == null || drafts.isEmpty()) {
return;
}
String embeddingModelKey = null;
EmbeddingPolicy embeddingPolicy = null;
EmbeddingProfile embeddingProfile = null;
if (embeddingProperties.isEnabled()) {
embeddingModelKey = embeddingModelRegistry.getRequiredDefaultDocumentModelKey();
embeddingModelCatalogService.ensureRegistered(embeddingModelKey);
embeddingPolicy = embeddingPolicyResolver.resolve(document, sourceDescriptor);
if (embeddingPolicy != null && embeddingPolicy.enabled()) {
embeddingModelRegistry.getRequired(embeddingPolicy.modelKey());
embeddingModelCatalogService.ensureRegistered(embeddingPolicy.modelKey());
embeddingProfile = embeddingProfileResolver.resolve(embeddingPolicy.profileKey());
log.debug("Resolved embedding policy {} for document {} -> model={}, profile={}",
embeddingPolicy.policyKey(), document.getId(), embeddingPolicy.modelKey(), embeddingPolicy.profileKey());
} else if (embeddingPolicy != null) {
log.debug("Resolved disabled embedding policy {} for document {}", embeddingPolicy.policyKey(), document.getId());
}
}
for (TextRepresentationDraft draft : drafts) {
if (!StringUtils.hasText(draft.textBody())) {
continue;
}
DocumentContent linkedContent = switch (draft.representationType()) {
case FULLTEXT, SEMANTIC_TEXT, SUMMARY, TITLE_ABSTRACT, METADATA_ENRICHED, CHUNK ->
derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent);
};
DocumentContent linkedContent = resolveLinkedContent(draft, originalContent, derivedContent);
var representation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
document.getId(),
@ -429,8 +415,12 @@ public class GenericDocumentImportService {
draft.textBody()
));
if (embeddingModelKey != null && shouldQueueEmbedding(draft)) {
representationEmbeddingOrchestrator.enqueueRepresentation(document.getId(), representation.getId(), embeddingModelKey);
if (shouldQueueEmbedding(draft, embeddingPolicy, embeddingProfile)) {
representationEmbeddingOrchestrator.enqueueRepresentation(
document.getId(),
representation.getId(),
embeddingPolicy.modelKey()
);
}
}
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
@ -445,7 +435,15 @@ public class GenericDocumentImportService {
return derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent);
}
private boolean shouldQueueEmbedding(TextRepresentationDraft draft) {
private boolean shouldQueueEmbedding(TextRepresentationDraft draft,
EmbeddingPolicy embeddingPolicy,
EmbeddingProfile embeddingProfile) {
if (embeddingPolicy == null || !embeddingPolicy.enabled() || embeddingProfile == null) {
return false;
}
if (!embeddingProfile.includes(draft.representationType())) {
return false;
}
if (draft.queueForEmbedding() != null) {
return draft.queueForEmbedding();
}
@ -502,6 +500,31 @@ public class GenericDocumentImportService {
return java.util.Objects.equals(left, right);
}
private Optional<Document> resolveDeduplicatedDocument(String dedupHash, DocumentAccessContext accessContext) {
return documentRepository.findByDedupHash(dedupHash).stream()
.filter(document -> matchesAccessContext(document, accessContext))
.findFirst();
}
private boolean matchesAccessContext(Document document, DocumentAccessContext accessContext) {
String expectedTenantKey = accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey();
if (!equalsNullable(document.getOwnerTenant() != null ? document.getOwnerTenant().getTenantKey() : null, expectedTenantKey)) {
return false;
}
return document.getVisibility() == accessContext.visibility();
}
private ExtractionResult emptyExtractionResult() {
return new ExtractionResult(Map.of(), List.of(), List.of());
}
private CanonicalDocumentMetadata buildCanonicalMetadata( Document document,
DetectionResult detection,
SourceDescriptor sourceDescriptor,
ExtractionResult extractionResult) {
return document.toCanonicalMetadata();
}
private record ResolvedPayload(byte[] binaryContent, String textContent, String mediaType) {
}
}

@ -0,0 +1,71 @@
dip:
embedding:
profiles:
definitions:
primary-only:
embed-representation-types: [SEMANTIC_TEXT]
primary-and-chunks:
embed-representation-types: [SEMANTIC_TEXT, CHUNK]
ted-semantic:
embed-representation-types: [SEMANTIC_TEXT, TITLE_ABSTRACT, CHUNK]
mail-message:
embed-representation-types: [SEMANTIC_TEXT, ATTACHMENT_ROLLUP]
attachment-chunks:
embed-representation-types: [CHUNK]
disabled:
embed-representation-types: []
policies:
default-policy:
policy-key: generic-default
model-key: e5-default
query-model-key: e5-default
profile-key: primary-and-chunks
enabled: true
rules:
- name: ted-notice
when:
document-family: TED_NOTICE
use:
policy-key: ted-default
model-key: e5-default
query-model-key: e5-default
profile-key: ted-semantic
enabled: true
- name: email-root
when:
document-type: EMAIL
use:
policy-key: mail-default
model-key: e5-default
query-model-key: e5-default
profile-key: mail-message
enabled: true
- name: mail-attachment-pdf
when:
source-type: MAIL_ATTACHMENT
mime-type: application/pdf
use:
policy-key: mail-attachment-pdf
model-key: e5-default
query-model-key: e5-default
profile-key: attachment-chunks
enabled: true
- name: skip-images
when:
mime-type: image/.*
use:
policy-key: no-embedding-images
model-key: e5-default
query-model-key: e5-default
profile-key: disabled
enabled: false

@ -34,8 +34,11 @@ dip:
startup-lexical-backfill-limit: 500
# Number of top hits per engine returned by /search/debug
debug-top-hits-per-engine: 10
embedding:
enabled: true
jobs:
enabled: true
default-document-model: e5-default
default-query-model: e5-default
providers:
@ -62,8 +65,82 @@ dip:
distance-metric: COSINE
supports-query-embedding-mode: true
active: true
jobs:
profiles:
definitions:
primary-only:
embed-representation-types: [SEMANTIC_TEXT]
primary-and-chunks:
embed-representation-types: [SEMANTIC_TEXT, CHUNK]
ted-semantic:
embed-representation-types: [SEMANTIC_TEXT] #[SEMANTIC_TEXT, TITLE_ABSTRACT, CHUNK]
mail-message:
embed-representation-types: [SEMANTIC_TEXT, ATTACHMENT_ROLLUP]
attachment-chunks:
embed-representation-types: [CHUNK]
disabled:
embed-representation-types: []
policies:
default-policy:
policy-key: generic-default
model-key: e5-default
query-model-key: e5-default
profile-key: primary-and-chunks
enabled: true
ted-policy:
policy-key: ted-default
model-key: e5-default
query-model-key: e5-default
profile-key: ted-semantic
enabled: true
rules:
- name: ted-notice
when:
document-family: TED_NOTICE
use:
policy-key: ted-default
model-key: e5-default
query-model-key: e5-default
profile-key: ted-semantic
enabled: true
- name: email-root
when:
document-type: EMAIL
use:
policy-key: mail-default
model-key: e5-default
query-model-key: e5-default
profile-key: mail-message
enabled: true
- name: mail-attachment-pdf
when:
source-type: MAIL_ATTACHMENT
mime-type: application/pdf
use:
policy-key: mail-attachment-pdf
model-key: e5-default
query-model-key: e5-default
profile-key: attachment-chunks
enabled: true
- name: skip-images
when:
mime-type: image/.*
use:
policy-key: no-embedding-images
profile-key: disabled
enabled: false
# Phase 4 generic ingestion configuration
ingestion:
# Master switch for arbitrary document ingestion into the DOC model

@ -0,0 +1,135 @@
package at.procon.dip.embedding.service;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import at.procon.dip.domain.access.DocumentVisibility;
import at.procon.dip.domain.document.DocumentFamily;
import at.procon.dip.domain.document.DocumentStatus;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.SourceType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.embedding.config.EmbeddingPolicyCondition;
import at.procon.dip.embedding.config.EmbeddingPolicyProperties;
import at.procon.dip.embedding.config.EmbeddingPolicyRule;
import at.procon.dip.embedding.config.EmbeddingPolicyUse;
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
import at.procon.dip.ingestion.spi.SourceDescriptor;
import java.time.OffsetDateTime;
import java.util.Map;
import java.util.UUID;
import org.junit.jupiter.api.Test;
class DefaultEmbeddingPolicyResolverTest {
@Test
void shouldPreferHintAndOverrideFromAttributes() {
EmbeddingPolicyProperties properties = baseProperties();
EmbeddingPolicyRule mailRule = new EmbeddingPolicyRule();
EmbeddingPolicyUse mailUse = new EmbeddingPolicyUse();
mailUse.setPolicyKey("mail-default");
mailUse.setModelKey("e5-default");
mailUse.setQueryModelKey("e5-default");
mailUse.setProfileKey("mail-message");
mailUse.setEnabled(true);
mailRule.setUse(mailUse);
properties.getRules().add(mailRule);
EmbeddingPolicyRule tedRule = new EmbeddingPolicyRule();
EmbeddingPolicyUse tedUse = new EmbeddingPolicyUse();
tedUse.setPolicyKey("ted-default");
tedUse.setModelKey("e5-default");
tedUse.setQueryModelKey("e5-default");
tedUse.setProfileKey("ted-semantic");
tedUse.setEnabled(true);
tedRule.setUse(tedUse);
properties.getRules().add(tedRule);
DefaultEmbeddingPolicyResolver resolver = new DefaultEmbeddingPolicyResolver(properties);
SourceDescriptor descriptor = sourceDescriptor(SourceType.MAIL_MESSAGE, "message/rfc822", Map.of(
"embeddingPolicyHint", "mail-default",
"embeddingPolicyKey", "ted-default"
));
var policy = resolver.resolve(document(DocumentFamily.GENERIC, DocumentType.EMAIL, "en"), descriptor);
assertThat(policy.policyKey()).isEqualTo("ted-default");
}
@Test
void shouldMatchByMimeTypeUsingMediaTypeField() {
EmbeddingPolicyProperties properties = baseProperties();
EmbeddingPolicyRule rule = new EmbeddingPolicyRule();
EmbeddingPolicyCondition when = new EmbeddingPolicyCondition();
when.setSourceType("MAIL_ATTACHMENT");
when.setMimeType("application/pdf");
rule.setWhen(when);
EmbeddingPolicyUse use = new EmbeddingPolicyUse();
use.setPolicyKey("mail-attachment-pdf");
use.setModelKey("e5-default");
use.setQueryModelKey("e5-default");
use.setProfileKey("attachment-chunks");
rule.setUse(use);
properties.getRules().add(rule);
DefaultEmbeddingPolicyResolver resolver = new DefaultEmbeddingPolicyResolver(properties);
var policy = resolver.resolve(document(DocumentFamily.GENERIC, DocumentType.FILE, "en"),
sourceDescriptor(SourceType.MAIL_ATTACHMENT, "application/pdf", Map.of()));
assertThat(policy.policyKey()).isEqualTo("mail-attachment-pdf");
}
@Test
void shouldFailForUnknownOverridePolicy() {
DefaultEmbeddingPolicyResolver resolver = new DefaultEmbeddingPolicyResolver(baseProperties());
SourceDescriptor descriptor = sourceDescriptor(SourceType.FILE_IMPORT, "application/pdf", Map.of(
"embeddingPolicyKey", "missing-policy"
));
assertThatThrownBy(() -> resolver.resolve(document(DocumentFamily.GENERIC, DocumentType.FILE, "en"), descriptor))
.isInstanceOf(IllegalArgumentException.class)
.hasMessageContaining("Unknown embedding policy key");
}
private EmbeddingPolicyProperties baseProperties() {
EmbeddingPolicyProperties properties = new EmbeddingPolicyProperties();
EmbeddingPolicyUse defaultPolicy = new EmbeddingPolicyUse();
defaultPolicy.setPolicyKey("generic-default");
defaultPolicy.setModelKey("e5-default");
defaultPolicy.setQueryModelKey("e5-default");
defaultPolicy.setProfileKey("primary-and-chunks");
defaultPolicy.setEnabled(true);
properties.setDefaultPolicy(defaultPolicy);
return properties;
}
private Document document(DocumentFamily family, DocumentType type, String language) {
return Document.builder()
.id(UUID.randomUUID())
.documentFamily(family)
.documentType(type)
.languageCode(language)
.status(DocumentStatus.IMPORTED)
.visibility(DocumentVisibility.PUBLIC)
.title("Test document")
.build();
}
private SourceDescriptor sourceDescriptor(SourceType sourceType, String mediaType, Map<String, String> attrs) {
return new SourceDescriptor(
null,
sourceType,
"source-ref",
"/tmp/source",
"source.bin",
mediaType,
null,
null,
OffsetDateTime.now(),
OriginalContentStoragePolicy.DEFAULT,
attrs
);
}
}

@ -0,0 +1,37 @@
package at.procon.dip.embedding.service;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.embedding.config.EmbeddingProfileProperties;
import java.util.List;
import org.junit.jupiter.api.Test;
class DefaultEmbeddingProfileResolverTest {
@Test
void shouldResolveKnownProfile() {
EmbeddingProfileProperties properties = new EmbeddingProfileProperties();
EmbeddingProfileProperties.ProfileDefinition def = new EmbeddingProfileProperties.ProfileDefinition();
def.setEmbedRepresentationTypes(List.of(RepresentationType.SEMANTIC_TEXT, RepresentationType.CHUNK));
properties.getDefinitions().put("primary-and-chunks", def);
DefaultEmbeddingProfileResolver resolver = new DefaultEmbeddingProfileResolver(properties);
var profile = resolver.resolve("primary-and-chunks");
assertThat(profile.profileKey()).isEqualTo("primary-and-chunks");
assertThat(profile.embedRepresentationTypes()).containsExactly(RepresentationType.SEMANTIC_TEXT, RepresentationType.CHUNK);
}
@Test
void shouldFailForUnknownProfile() {
EmbeddingProfileProperties properties = new EmbeddingProfileProperties();
DefaultEmbeddingProfileResolver resolver = new DefaultEmbeddingProfileResolver(properties);
assertThatThrownBy(() -> resolver.resolve("missing"))
.isInstanceOf(IllegalArgumentException.class)
.hasMessageContaining("Unknown embedding profile");
}
}
Loading…
Cancel
Save