vectorization profiles per document type
This commit is contained in:
parent
fbd249e56b
commit
177c61803e
|
|
@ -0,0 +1,30 @@
|
|||
# Embedding policy Patch K1
|
||||
|
||||
Patch K1 introduces the configuration and resolver layer for policy-based document embedding selection.
|
||||
|
||||
## Added
|
||||
- `EmbeddingPolicy`
|
||||
- `EmbeddingProfile`
|
||||
- `EmbeddingPolicyCondition`
|
||||
- `EmbeddingPolicyUse`
|
||||
- `EmbeddingPolicyRule`
|
||||
- `EmbeddingPolicyProperties`
|
||||
- `EmbeddingProfileProperties`
|
||||
- `EmbeddingPolicyResolver`
|
||||
- `DefaultEmbeddingPolicyResolver`
|
||||
- `EmbeddingProfileResolver`
|
||||
- `DefaultEmbeddingProfileResolver`
|
||||
|
||||
## Example config
|
||||
See `application-new-example-embedding-policy.yml`.
|
||||
|
||||
## What K1 does not change
|
||||
- no runtime import/orchestrator wiring yet
|
||||
- no `SourceDescriptor` schema change yet
|
||||
- no job persistence/audit changes yet
|
||||
|
||||
## Intended follow-up
|
||||
K2 should wire:
|
||||
- `GenericDocumentImportService`
|
||||
- `RepresentationEmbeddingOrchestrator`
|
||||
to use the resolved policy and profile.
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
# Embedding policy Patch K2
|
||||
|
||||
Patch K2 wires the policy/profile layer into the document import pipeline that runs in the NEW runtime mode.
|
||||
|
||||
## What it changes
|
||||
- `GenericDocumentImportService`
|
||||
- resolves `EmbeddingPolicy` per imported document
|
||||
- resolves `EmbeddingProfile`
|
||||
- ensures the selected embedding model is registered
|
||||
- queues embeddings only for representation drafts allowed by the resolved profile
|
||||
- `RepresentationEmbeddingOrchestrator`
|
||||
- adds a convenience overload for `(documentId, modelKey, profile)`
|
||||
- `EmbeddingJobService`
|
||||
- adds a profile-aware enqueue overload
|
||||
- `DefaultEmbeddingSelectionPolicy`
|
||||
- adds profile-aware representation filtering
|
||||
- `DefaultEmbeddingPolicyResolver`
|
||||
- corrected for the current `SourceDescriptor.attributes()` shape
|
||||
|
||||
## Runtime flow after K2
|
||||
document imported
|
||||
-> representations built
|
||||
-> policy resolved
|
||||
-> profile resolved
|
||||
-> model ensured
|
||||
-> matching representations queued for embedding
|
||||
|
|
@ -9,5 +9,6 @@ public enum RepresentationType {
|
|||
SUMMARY,
|
||||
TITLE_ABSTRACT,
|
||||
CHUNK,
|
||||
METADATA_ENRICHED
|
||||
METADATA_ENRICHED,
|
||||
ATTACHMENT_ROLLUP
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,14 @@
|
|||
package at.procon.dip.embedding.config;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class EmbeddingPolicyCondition {
|
||||
private String documentType;
|
||||
private String documentFamily;
|
||||
private String sourceType;
|
||||
private String mimeType;
|
||||
private String language;
|
||||
private String ownerTenantKey;
|
||||
private String embeddingPolicyHint;
|
||||
}
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
package at.procon.dip.embedding.config;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import lombok.Data;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
@Configuration
|
||||
@ConfigurationProperties(prefix = "dip.embedding.policies")
|
||||
@Data
|
||||
public class EmbeddingPolicyProperties {
|
||||
|
||||
private EmbeddingPolicyUse defaultPolicy = new EmbeddingPolicyUse();
|
||||
private List<EmbeddingPolicyRule> rules = new ArrayList<>();
|
||||
}
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
package at.procon.dip.embedding.config;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class EmbeddingPolicyRule {
|
||||
private String name;
|
||||
private EmbeddingPolicyCondition when = new EmbeddingPolicyCondition();
|
||||
private EmbeddingPolicyUse use = new EmbeddingPolicyUse();
|
||||
}
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
package at.procon.dip.embedding.config;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class EmbeddingPolicyUse {
|
||||
private String policyKey;
|
||||
private String modelKey;
|
||||
private String queryModelKey;
|
||||
private String profileKey;
|
||||
private boolean enabled = true;
|
||||
}
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
package at.procon.dip.embedding.config;
|
||||
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import lombok.Data;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
@Configuration
|
||||
@ConfigurationProperties(prefix = "dip.embedding.profiles")
|
||||
@Data
|
||||
public class EmbeddingProfileProperties {
|
||||
|
||||
private Map<String, ProfileDefinition> definitions = new LinkedHashMap<>();
|
||||
|
||||
@Data
|
||||
public static class ProfileDefinition {
|
||||
private List<RepresentationType> embedRepresentationTypes = new ArrayList<>();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,13 +1,14 @@
|
|||
package at.procon.dip.embedding.job.service;
|
||||
|
||||
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||
import at.procon.dip.embedding.job.entity.EmbeddingJob;
|
||||
import at.procon.dip.embedding.job.repository.EmbeddingJobRepository;
|
||||
import at.procon.dip.embedding.model.EmbeddingJobStatus;
|
||||
import at.procon.dip.embedding.model.EmbeddingJobType;
|
||||
import at.procon.dip.embedding.policy.EmbeddingProfile;
|
||||
import at.procon.dip.embedding.policy.EmbeddingSelectionPolicy;
|
||||
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
|
||||
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||
import java.time.Duration;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.List;
|
||||
|
|
@ -46,6 +47,14 @@ public class EmbeddingJobService {
|
|||
.toList();
|
||||
}
|
||||
|
||||
public List<EmbeddingJob> enqueueForDocument(UUID documentId, String modelKey, EmbeddingProfile profile) {
|
||||
var model = modelRegistry.getRequired(modelKey);
|
||||
List<DocumentTextRepresentation> selected = selectionPolicy.selectRepresentations(documentId, model, profile);
|
||||
return selected.stream()
|
||||
.map(representation -> enqueueForRepresentation(documentId, representation.getId(), modelKey, EmbeddingJobType.DOCUMENT_EMBED))
|
||||
.toList();
|
||||
}
|
||||
|
||||
public EmbeddingJob enqueueForRepresentation(UUID documentId, UUID representationId, String modelKey, EmbeddingJobType jobType) {
|
||||
return jobRepository.findFirstByRepresentationIdAndModelKeyAndJobTypeAndStatusIn(
|
||||
representationId,
|
||||
|
|
|
|||
|
|
@ -23,18 +23,24 @@ public class DefaultEmbeddingSelectionPolicy implements EmbeddingSelectionPolicy
|
|||
|
||||
@Override
|
||||
public List<DocumentTextRepresentation> selectRepresentations(UUID documentId, EmbeddingModelDescriptor model) {
|
||||
return selectRepresentations(documentId, model, null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<DocumentTextRepresentation> selectRepresentations(UUID documentId, EmbeddingModelDescriptor model, EmbeddingProfile profile) {
|
||||
List<DocumentTextRepresentation> representations = representationRepository.findByDocument_Id(documentId);
|
||||
List<DocumentTextRepresentation> selected = new ArrayList<>();
|
||||
EmbeddingProperties.IndexingProperties indexing = embeddingProperties.getIndexing();
|
||||
|
||||
for (DocumentTextRepresentation representation : representations) {
|
||||
if (include(representation, indexing)) {
|
||||
if (include(representation, indexing, profile)) {
|
||||
selected.add(representation);
|
||||
}
|
||||
}
|
||||
|
||||
if (selected.isEmpty()) {
|
||||
representationRepository.findFirstByDocument_IdAndPrimaryRepresentationTrue(documentId)
|
||||
.filter(rep -> include(rep, indexing, profile))
|
||||
.ifPresent(selected::add);
|
||||
}
|
||||
|
||||
|
|
@ -48,7 +54,12 @@ public class DefaultEmbeddingSelectionPolicy implements EmbeddingSelectionPolicy
|
|||
.toList();
|
||||
}
|
||||
|
||||
private boolean include(DocumentTextRepresentation representation, EmbeddingProperties.IndexingProperties indexing) {
|
||||
private boolean include(DocumentTextRepresentation representation,
|
||||
EmbeddingProperties.IndexingProperties indexing,
|
||||
EmbeddingProfile profile) {
|
||||
if (profile != null && !profile.includes(representation.getRepresentationType())) {
|
||||
return false;
|
||||
}
|
||||
return switch (representation.getRepresentationType()) {
|
||||
case SEMANTIC_TEXT -> indexing.isEmbedSemanticText();
|
||||
case TITLE_ABSTRACT -> indexing.isEmbedTitleAbstract();
|
||||
|
|
|
|||
|
|
@ -0,0 +1,10 @@
|
|||
package at.procon.dip.embedding.policy;
|
||||
|
||||
/**
 * Immutable embedding policy resolved for a single document.
 *
 * @param policyKey     key identifying the policy
 * @param modelKey      key of the embedding model used for the document's representations
 * @param queryModelKey key of the model for query embeddings
 * @param profileKey    key of the embedding profile to resolve for this policy
 * @param enabled       {@code false} when no embeddings are to be queued under this policy
 */
public record EmbeddingPolicy(
        String policyKey, String modelKey, String queryModelKey, String profileKey, boolean enabled) {
}
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
package at.procon.dip.embedding.policy;
|
||||
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import java.util.List;
|
||||
|
||||
public record EmbeddingProfile(
|
||||
String profileKey,
|
||||
List<RepresentationType> embedRepresentationTypes
|
||||
) {
|
||||
public boolean includes(RepresentationType representationType) {
|
||||
return embedRepresentationTypes != null && embedRepresentationTypes.contains(representationType);
|
||||
}
|
||||
}
|
||||
|
|
@ -8,4 +8,6 @@ import java.util.UUID;
|
|||
public interface EmbeddingSelectionPolicy {
|
||||
|
||||
List<DocumentTextRepresentation> selectRepresentations(UUID documentId, EmbeddingModelDescriptor model);
|
||||
|
||||
List<DocumentTextRepresentation> selectRepresentations(UUID documentId, EmbeddingModelDescriptor model, EmbeddingProfile profile);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,131 @@
|
|||
package at.procon.dip.embedding.service;
|
||||
|
||||
import at.procon.dip.domain.document.entity.Document;
|
||||
import at.procon.dip.embedding.config.EmbeddingPolicyCondition;
|
||||
import at.procon.dip.embedding.config.EmbeddingPolicyProperties;
|
||||
import at.procon.dip.embedding.config.EmbeddingPolicyRule;
|
||||
import at.procon.dip.embedding.config.EmbeddingPolicyUse;
|
||||
import at.procon.dip.embedding.policy.EmbeddingPolicy;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.regex.Pattern;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class DefaultEmbeddingPolicyResolver implements EmbeddingPolicyResolver {
|
||||
|
||||
private final EmbeddingPolicyProperties properties;
|
||||
|
||||
@Override
|
||||
public EmbeddingPolicy resolve(Document document, SourceDescriptor sourceDescriptor) {
|
||||
String overridePolicy = attributeValue(sourceDescriptor, "embeddingPolicyKey");
|
||||
if (overridePolicy != null) {
|
||||
return policyByKey(overridePolicy);
|
||||
}
|
||||
|
||||
String policyHint = policyHint(sourceDescriptor);
|
||||
if (policyHint != null) {
|
||||
return policyByKey(policyHint);
|
||||
}
|
||||
|
||||
for (EmbeddingPolicyRule rule : properties.getRules()) {
|
||||
if (matches(rule.getWhen(), document, sourceDescriptor)) {
|
||||
return toPolicy(rule.getUse());
|
||||
}
|
||||
}
|
||||
|
||||
return toPolicy(properties.getDefaultPolicy());
|
||||
}
|
||||
|
||||
private EmbeddingPolicy policyByKey(String policyKey) {
|
||||
for (EmbeddingPolicyRule rule : properties.getRules()) {
|
||||
if (rule.getUse() != null && policyKey.equals(rule.getUse().getPolicyKey())) {
|
||||
return toPolicy(rule.getUse());
|
||||
}
|
||||
}
|
||||
EmbeddingPolicyUse def = properties.getDefaultPolicy();
|
||||
if (def != null && policyKey.equals(def.getPolicyKey())) {
|
||||
return toPolicy(def);
|
||||
}
|
||||
throw new IllegalArgumentException("Unknown embedding policy key: " + policyKey);
|
||||
}
|
||||
|
||||
private EmbeddingPolicy toPolicy(EmbeddingPolicyUse use) {
|
||||
if (use == null) {
|
||||
throw new IllegalStateException("Embedding policy configuration is missing");
|
||||
}
|
||||
return new EmbeddingPolicy(
|
||||
use.getPolicyKey(),
|
||||
use.getModelKey(),
|
||||
use.getQueryModelKey(),
|
||||
use.getProfileKey(),
|
||||
use.isEnabled()
|
||||
);
|
||||
}
|
||||
|
||||
private boolean matches(EmbeddingPolicyCondition c, Document document, SourceDescriptor sourceDescriptor) {
|
||||
if (c == null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!matchesExact(c.getDocumentType(), enumName(document != null ? document.getDocumentType() : null))) {
|
||||
return false;
|
||||
}
|
||||
if (!matchesExact(c.getDocumentFamily(), enumName(document != null ? document.getDocumentFamily() : null))) {
|
||||
return false;
|
||||
}
|
||||
if (!matchesExact(c.getSourceType(), enumName(sourceDescriptor != null ? sourceDescriptor.sourceType() : null))) {
|
||||
return false;
|
||||
}
|
||||
if (!matchesMime(c.getMimeType(), sourceDescriptor != null ? sourceDescriptor.mediaType() : null)) {
|
||||
return false;
|
||||
}
|
||||
if (!matchesExact(c.getLanguage(), document != null ? document.getLanguageCode() : null)) {
|
||||
return false;
|
||||
}
|
||||
if (!matchesExact(c.getOwnerTenantKey(), document != null && document.getOwnerTenant() != null ? document.getOwnerTenant().getTenantKey() : null )) {
|
||||
return false;
|
||||
}
|
||||
return matchesExact(c.getEmbeddingPolicyHint(), policyHint(sourceDescriptor));
|
||||
}
|
||||
|
||||
private boolean matchesExact(String expected, String actual) {
|
||||
if (expected == null || expected.isBlank()) {
|
||||
return true;
|
||||
}
|
||||
return Objects.equals(expected, actual);
|
||||
}
|
||||
|
||||
private boolean matchesMime(String pattern, String actual) {
|
||||
if (pattern == null || pattern.isBlank()) {
|
||||
return true;
|
||||
}
|
||||
if (actual == null || actual.isBlank()) {
|
||||
return false;
|
||||
}
|
||||
return Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(actual).matches();
|
||||
}
|
||||
|
||||
private String enumName(Enum<?> value) {
|
||||
return value != null ? value.name() : null;
|
||||
}
|
||||
|
||||
private String policyHint(SourceDescriptor sourceDescriptor) {
|
||||
return attributeValue(sourceDescriptor, "embeddingPolicyHint");
|
||||
}
|
||||
|
||||
private String attributeValue(SourceDescriptor sourceDescriptor, String key) {
|
||||
if (sourceDescriptor == null) {
|
||||
return null;
|
||||
}
|
||||
Map<String, String> attributes = sourceDescriptor.attributes();
|
||||
if (attributes == null) {
|
||||
return null;
|
||||
}
|
||||
String value = attributes.get(key);
|
||||
return (value == null || value.isBlank()) ? null : value;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
package at.procon.dip.embedding.service;
|
||||
|
||||
import at.procon.dip.embedding.config.EmbeddingProfileProperties;
|
||||
import at.procon.dip.embedding.policy.EmbeddingProfile;
|
||||
import java.util.List;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class DefaultEmbeddingProfileResolver implements EmbeddingProfileResolver {
|
||||
|
||||
private final EmbeddingProfileProperties properties;
|
||||
|
||||
@Override
|
||||
public EmbeddingProfile resolve(String profileKey) {
|
||||
if (profileKey == null || profileKey.isBlank()) {
|
||||
throw new IllegalArgumentException("Embedding profile key must not be blank");
|
||||
}
|
||||
|
||||
EmbeddingProfileProperties.ProfileDefinition definition = properties.getDefinitions().get(profileKey);
|
||||
if (definition == null) {
|
||||
throw new IllegalArgumentException("Unknown embedding profile: " + profileKey);
|
||||
}
|
||||
|
||||
return new EmbeddingProfile(
|
||||
profileKey,
|
||||
List.copyOf(definition.getEmbedRepresentationTypes())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
package at.procon.dip.embedding.service;
|
||||
|
||||
import at.procon.dip.domain.document.entity.Document;
|
||||
import at.procon.dip.embedding.policy.EmbeddingPolicy;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
|
||||
public interface EmbeddingPolicyResolver {
|
||||
EmbeddingPolicy resolve(Document document, SourceDescriptor sourceDescriptor);
|
||||
}
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
package at.procon.dip.embedding.service;
|
||||
|
||||
import at.procon.dip.embedding.policy.EmbeddingProfile;
|
||||
|
||||
public interface EmbeddingProfileResolver {
|
||||
EmbeddingProfile resolve(String profileKey);
|
||||
}
|
||||
|
|
@ -9,6 +9,8 @@ import at.procon.dip.embedding.job.service.EmbeddingJobService;
|
|||
import at.procon.dip.embedding.model.EmbeddingJobType;
|
||||
import at.procon.dip.embedding.model.EmbeddingProviderResult;
|
||||
import at.procon.dip.embedding.model.EmbeddingUseCase;
|
||||
import at.procon.dip.embedding.policy.EmbeddingProfile;
|
||||
import at.procon.dip.embedding.policy.EmbeddingSelectionPolicy;
|
||||
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
|
@ -26,6 +28,7 @@ public class RepresentationEmbeddingOrchestrator {
|
|||
private final EmbeddingExecutionService executionService;
|
||||
private final EmbeddingPersistenceService persistenceService;
|
||||
private final DocumentTextRepresentationRepository representationRepository;
|
||||
private final EmbeddingSelectionPolicy selectionPolicy;
|
||||
private final EmbeddingModelRegistry modelRegistry;
|
||||
private final EmbeddingProperties embeddingProperties;
|
||||
|
||||
|
|
@ -39,6 +42,14 @@ public class RepresentationEmbeddingOrchestrator {
|
|||
return jobService.enqueueForDocument(documentId, modelKey);
|
||||
}
|
||||
|
||||
@Transactional
|
||||
public List<EmbeddingJob> enqueueDocument(UUID documentId, String modelKey, EmbeddingProfile profile) {
|
||||
var model = modelRegistry.getRequired(modelKey);
|
||||
return selectionPolicy.selectRepresentations(documentId, model, profile).stream()
|
||||
.map(representation -> enqueueRepresentation(documentId, representation.getId(), modelKey))
|
||||
.toList();
|
||||
}
|
||||
|
||||
@Transactional
|
||||
public EmbeddingJob enqueueRepresentation(UUID documentId, UUID representationId, String modelKey) {
|
||||
return jobService.enqueueForRepresentation(documentId, representationId, modelKey, EmbeddingJobType.DOCUMENT_EMBED);
|
||||
|
|
|
|||
|
|
@ -10,7 +10,6 @@ import at.procon.dip.domain.document.DocumentStatus;
|
|||
import at.procon.dip.domain.document.StorageType;
|
||||
import at.procon.dip.domain.document.entity.Document;
|
||||
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||
import at.procon.dip.domain.document.entity.DocumentSource;
|
||||
import at.procon.dip.domain.document.repository.DocumentRepository;
|
||||
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
|
||||
import at.procon.dip.domain.document.service.DocumentContentService;
|
||||
|
|
@ -21,26 +20,30 @@ import at.procon.dip.domain.document.service.command.AddDocumentContentCommand;
|
|||
import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand;
|
||||
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
|
||||
import at.procon.dip.domain.document.service.command.CreateDocumentCommand;
|
||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||
import at.procon.dip.embedding.policy.EmbeddingPolicy;
|
||||
import at.procon.dip.embedding.policy.EmbeddingProfile;
|
||||
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
|
||||
import at.procon.dip.embedding.service.EmbeddingModelCatalogService;
|
||||
import at.procon.dip.embedding.service.EmbeddingPolicyResolver;
|
||||
import at.procon.dip.embedding.service.EmbeddingProfileResolver;
|
||||
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
||||
import at.procon.dip.extraction.service.DocumentExtractionService;
|
||||
import at.procon.dip.extraction.spi.ExtractionRequest;
|
||||
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||
import at.procon.dip.ingestion.config.DipIngestionProperties;
|
||||
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
||||
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
import at.procon.dip.ingestion.util.DocumentImportSupport;
|
||||
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
|
||||
import at.procon.dip.embedding.service.EmbeddingModelCatalogService;
|
||||
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
||||
import at.procon.dip.ingestion.config.DipIngestionProperties;
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import at.procon.dip.normalization.service.TextRepresentationBuildService;
|
||||
import at.procon.dip.processing.service.StructuredDocumentProcessingService;
|
||||
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
||||
import at.procon.dip.normalization.spi.TextRepresentationDraft;
|
||||
import at.procon.dip.processing.service.StructuredDocumentProcessingService;
|
||||
import at.procon.dip.processing.spi.DocumentProcessingPolicy;
|
||||
import at.procon.dip.processing.spi.StructuredProcessingRequest;
|
||||
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
||||
import at.procon.dip.runtime.config.RuntimeMode;
|
||||
import at.procon.ted.util.HashUtils;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.OffsetDateTime;
|
||||
|
|
@ -49,16 +52,12 @@ import java.util.LinkedHashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
import org.springframework.util.StringUtils;
|
||||
|
||||
/**
|
||||
* Phase 4 generic import pipeline that persists arbitrary document types into the DOC model.
|
||||
*/
|
||||
@Service
|
||||
@ConditionalOnRuntimeMode(RuntimeMode.NEW)
|
||||
@RequiredArgsConstructor
|
||||
|
|
@ -80,6 +79,8 @@ public class GenericDocumentImportService {
|
|||
private final EmbeddingModelRegistry embeddingModelRegistry;
|
||||
private final EmbeddingModelCatalogService embeddingModelCatalogService;
|
||||
private final RepresentationEmbeddingOrchestrator representationEmbeddingOrchestrator;
|
||||
private final EmbeddingPolicyResolver embeddingPolicyResolver;
|
||||
private final EmbeddingProfileResolver embeddingProfileResolver;
|
||||
|
||||
@Transactional
|
||||
public ImportedDocumentResult importDocument(SourceDescriptor sourceDescriptor) {
|
||||
|
|
@ -163,7 +164,7 @@ public class GenericDocumentImportService {
|
|||
|
||||
if (processingPolicy.runRepresentationBuilders()) {
|
||||
var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult));
|
||||
persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts);
|
||||
persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts, sourceDescriptor);
|
||||
}
|
||||
|
||||
if (processingPolicy.applyStructuredTitleIfMissing() && !extractionResult.structuredPayloads().isEmpty()) {
|
||||
|
|
@ -182,30 +183,7 @@ public class GenericDocumentImportService {
|
|||
return new ImportedDocumentResult(reloaded, detection, warnings, false);
|
||||
}
|
||||
|
||||
|
||||
private ExtractionResult emptyExtractionResult() {
|
||||
return new ExtractionResult(java.util.Collections.emptyMap(), java.util.Collections.emptyList(), java.util.Collections.emptyList());
|
||||
}
|
||||
|
||||
private Optional<Document> resolveDeduplicatedDocument(String dedupHash, DocumentAccessContext accessContext) {
|
||||
return documentRepository.findAllByDedupHash(dedupHash).stream()
|
||||
.filter(existing -> sameAccessScope(existing, accessContext))
|
||||
.findFirst();
|
||||
}
|
||||
|
||||
private boolean sameAccessScope(Document existing, DocumentAccessContext accessContext) {
|
||||
if (existing.getVisibility() != accessContext.visibility()) {
|
||||
return false;
|
||||
}
|
||||
String existingTenantKey = existing.getOwnerTenant() == null ? null : existing.getOwnerTenant().getTenantKey();
|
||||
String requestedTenantKey = accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey();
|
||||
return java.util.Objects.equals(existingTenantKey, requestedTenantKey);
|
||||
}
|
||||
|
||||
private SourceDescriptor withResolvedMediaType(SourceDescriptor sourceDescriptor, ResolvedPayload payload) {
|
||||
if (StringUtils.hasText(sourceDescriptor.mediaType())) {
|
||||
return sourceDescriptor;
|
||||
}
|
||||
return new SourceDescriptor(
|
||||
sourceDescriptor.accessContext(),
|
||||
sourceDescriptor.sourceType(),
|
||||
|
|
@ -269,7 +247,7 @@ public class GenericDocumentImportService {
|
|||
return sourceDescriptor.fileName();
|
||||
}
|
||||
if (StringUtils.hasText(payload.textContent())) {
|
||||
for (String line : payload.textContent().split("\\n")) {
|
||||
for (String line : payload.textContent().split("\n")) {
|
||||
if (StringUtils.hasText(line)) {
|
||||
return DocumentImportSupport.ellipsize(line.trim(), 240);
|
||||
}
|
||||
|
|
@ -395,25 +373,33 @@ public class GenericDocumentImportService {
|
|||
private void persistRepresentationsAndEmbeddings(Document document,
|
||||
DocumentContent originalContent,
|
||||
Map<ContentRole, DocumentContent> derivedContent,
|
||||
List<TextRepresentationDraft> drafts) {
|
||||
List<TextRepresentationDraft> drafts,
|
||||
SourceDescriptor sourceDescriptor) {
|
||||
if (drafts == null || drafts.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
String embeddingModelKey = null;
|
||||
EmbeddingPolicy embeddingPolicy = null;
|
||||
EmbeddingProfile embeddingProfile = null;
|
||||
if (embeddingProperties.isEnabled()) {
|
||||
embeddingModelKey = embeddingModelRegistry.getRequiredDefaultDocumentModelKey();
|
||||
embeddingModelCatalogService.ensureRegistered(embeddingModelKey);
|
||||
embeddingPolicy = embeddingPolicyResolver.resolve(document, sourceDescriptor);
|
||||
if (embeddingPolicy != null && embeddingPolicy.enabled()) {
|
||||
embeddingModelRegistry.getRequired(embeddingPolicy.modelKey());
|
||||
embeddingModelCatalogService.ensureRegistered(embeddingPolicy.modelKey());
|
||||
embeddingProfile = embeddingProfileResolver.resolve(embeddingPolicy.profileKey());
|
||||
log.debug("Resolved embedding policy {} for document {} -> model={}, profile={}",
|
||||
embeddingPolicy.policyKey(), document.getId(), embeddingPolicy.modelKey(), embeddingPolicy.profileKey());
|
||||
} else if (embeddingPolicy != null) {
|
||||
log.debug("Resolved disabled embedding policy {} for document {}", embeddingPolicy.policyKey(), document.getId());
|
||||
}
|
||||
}
|
||||
|
||||
for (TextRepresentationDraft draft : drafts) {
|
||||
if (!StringUtils.hasText(draft.textBody())) {
|
||||
continue;
|
||||
}
|
||||
DocumentContent linkedContent = switch (draft.representationType()) {
|
||||
case FULLTEXT, SEMANTIC_TEXT, SUMMARY, TITLE_ABSTRACT, METADATA_ENRICHED, CHUNK ->
|
||||
derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent);
|
||||
};
|
||||
|
||||
DocumentContent linkedContent = resolveLinkedContent(draft, originalContent, derivedContent);
|
||||
|
||||
var representation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
|
||||
document.getId(),
|
||||
|
|
@ -429,8 +415,12 @@ public class GenericDocumentImportService {
|
|||
draft.textBody()
|
||||
));
|
||||
|
||||
if (embeddingModelKey != null && shouldQueueEmbedding(draft)) {
|
||||
representationEmbeddingOrchestrator.enqueueRepresentation(document.getId(), representation.getId(), embeddingModelKey);
|
||||
if (shouldQueueEmbedding(draft, embeddingPolicy, embeddingProfile)) {
|
||||
representationEmbeddingOrchestrator.enqueueRepresentation(
|
||||
document.getId(),
|
||||
representation.getId(),
|
||||
embeddingPolicy.modelKey()
|
||||
);
|
||||
}
|
||||
}
|
||||
documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
|
||||
|
|
@ -445,7 +435,15 @@ public class GenericDocumentImportService {
|
|||
return derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent);
|
||||
}
|
||||
|
||||
private boolean shouldQueueEmbedding(TextRepresentationDraft draft) {
|
||||
private boolean shouldQueueEmbedding(TextRepresentationDraft draft,
|
||||
EmbeddingPolicy embeddingPolicy,
|
||||
EmbeddingProfile embeddingProfile) {
|
||||
if (embeddingPolicy == null || !embeddingPolicy.enabled() || embeddingProfile == null) {
|
||||
return false;
|
||||
}
|
||||
if (!embeddingProfile.includes(draft.representationType())) {
|
||||
return false;
|
||||
}
|
||||
if (draft.queueForEmbedding() != null) {
|
||||
return draft.queueForEmbedding();
|
||||
}
|
||||
|
|
@ -502,6 +500,31 @@ public class GenericDocumentImportService {
|
|||
return java.util.Objects.equals(left, right);
|
||||
}
|
||||
|
||||
private Optional<Document> resolveDeduplicatedDocument(String dedupHash, DocumentAccessContext accessContext) {
|
||||
return documentRepository.findByDedupHash(dedupHash).stream()
|
||||
.filter(document -> matchesAccessContext(document, accessContext))
|
||||
.findFirst();
|
||||
}
|
||||
|
||||
private boolean matchesAccessContext(Document document, DocumentAccessContext accessContext) {
|
||||
String expectedTenantKey = accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey();
|
||||
if (!equalsNullable(document.getOwnerTenant() != null ? document.getOwnerTenant().getTenantKey() : null, expectedTenantKey)) {
|
||||
return false;
|
||||
}
|
||||
return document.getVisibility() == accessContext.visibility();
|
||||
}
|
||||
|
||||
private ExtractionResult emptyExtractionResult() {
|
||||
return new ExtractionResult(Map.of(), List.of(), List.of());
|
||||
}
|
||||
|
||||
private CanonicalDocumentMetadata buildCanonicalMetadata( Document document,
|
||||
DetectionResult detection,
|
||||
SourceDescriptor sourceDescriptor,
|
||||
ExtractionResult extractionResult) {
|
||||
return document.toCanonicalMetadata();
|
||||
}
|
||||
|
||||
private record ResolvedPayload(byte[] binaryContent, String textContent, String mediaType) {
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,71 @@
|
|||
dip:
|
||||
embedding:
|
||||
profiles:
|
||||
definitions:
|
||||
primary-only:
|
||||
embed-representation-types: [SEMANTIC_TEXT]
|
||||
|
||||
primary-and-chunks:
|
||||
embed-representation-types: [SEMANTIC_TEXT, CHUNK]
|
||||
|
||||
ted-semantic:
|
||||
embed-representation-types: [SEMANTIC_TEXT, TITLE_ABSTRACT, CHUNK]
|
||||
|
||||
mail-message:
|
||||
embed-representation-types: [SEMANTIC_TEXT, ATTACHMENT_ROLLUP]
|
||||
|
||||
attachment-chunks:
|
||||
embed-representation-types: [CHUNK]
|
||||
|
||||
disabled:
|
||||
embed-representation-types: []
|
||||
|
||||
policies:
|
||||
default-policy:
|
||||
policy-key: generic-default
|
||||
model-key: e5-default
|
||||
query-model-key: e5-default
|
||||
profile-key: primary-and-chunks
|
||||
enabled: true
|
||||
|
||||
rules:
|
||||
- name: ted-notice
|
||||
when:
|
||||
document-family: TED_NOTICE
|
||||
use:
|
||||
policy-key: ted-default
|
||||
model-key: e5-default
|
||||
query-model-key: e5-default
|
||||
profile-key: ted-semantic
|
||||
enabled: true
|
||||
|
||||
- name: email-root
|
||||
when:
|
||||
document-type: EMAIL
|
||||
use:
|
||||
policy-key: mail-default
|
||||
model-key: e5-default
|
||||
query-model-key: e5-default
|
||||
profile-key: mail-message
|
||||
enabled: true
|
||||
|
||||
- name: mail-attachment-pdf
|
||||
when:
|
||||
source-type: MAIL_ATTACHMENT
|
||||
mime-type: application/pdf
|
||||
use:
|
||||
policy-key: mail-attachment-pdf
|
||||
model-key: e5-default
|
||||
query-model-key: e5-default
|
||||
profile-key: attachment-chunks
|
||||
enabled: true
|
||||
|
||||
- name: skip-images
|
||||
when:
|
||||
mime-type: image/.*
|
||||
use:
|
||||
policy-key: no-embedding-images
|
||||
model-key: e5-default
|
||||
query-model-key: e5-default
|
||||
profile-key: disabled
|
||||
enabled: false
|
||||
|
|
@ -34,8 +34,11 @@ dip:
|
|||
startup-lexical-backfill-limit: 500
|
||||
# Number of top hits per engine returned by /search/debug
|
||||
debug-top-hits-per-engine: 10
|
||||
|
||||
embedding:
|
||||
enabled: true
|
||||
jobs:
|
||||
enabled: true
|
||||
default-document-model: e5-default
|
||||
default-query-model: e5-default
|
||||
providers:
|
||||
|
|
@ -62,8 +65,82 @@ dip:
|
|||
distance-metric: COSINE
|
||||
supports-query-embedding-mode: true
|
||||
active: true
|
||||
jobs:
|
||||
enabled: true
|
||||
|
||||
# Embedding profiles: named sets of representation types that get embedded.
profiles:
  definitions:
    primary-only:
      embed-representation-types:
        - SEMANTIC_TEXT

    primary-and-chunks:
      embed-representation-types:
        - SEMANTIC_TEXT
        - CHUNK

    # Currently limited to SEMANTIC_TEXT; the wider set
    # [SEMANTIC_TEXT, TITLE_ABSTRACT, CHUNK] is commented out.
    ted-semantic:
      embed-representation-types:
        - SEMANTIC_TEXT

    mail-message:
      embed-representation-types:
        - SEMANTIC_TEXT
        - ATTACHMENT_ROLLUP

    attachment-chunks:
      embed-representation-types:
        - CHUNK

    # Empty list: this profile selects no representation type at all.
    disabled:
      embed-representation-types: []

# Embedding policies: a default policy plus a list of rules, each pairing a
# `when` condition with the `use` settings to apply.
policies:
  default-policy:
    policy-key: generic-default
    model-key: e5-default
    query-model-key: e5-default
    profile-key: primary-and-chunks
    enabled: true

  # NOTE(review): no rule below refers to this entry by the name `ted-policy`;
  # the ted-notice rule repeats the same values inline. Confirm this entry is bound.
  ted-policy:
    policy-key: ted-default
    model-key: e5-default
    query-model-key: e5-default
    profile-key: ted-semantic
    enabled: true

  rules:
    - name: ted-notice
      when:
        document-family: TED_NOTICE
      use:
        policy-key: ted-default
        model-key: e5-default
        query-model-key: e5-default
        profile-key: ted-semantic
        enabled: true

    - name: email-root
      when:
        document-type: EMAIL
      use:
        policy-key: mail-default
        model-key: e5-default
        query-model-key: e5-default
        profile-key: mail-message
        enabled: true

    # Both conditions are listed under one `when`.
    - name: mail-attachment-pdf
      when:
        source-type: MAIL_ATTACHMENT
        mime-type: "application/pdf"
      use:
        policy-key: mail-attachment-pdf
        model-key: e5-default
        query-model-key: e5-default
        profile-key: attachment-chunks
        enabled: true

    # This rule sets no model-key / query-model-key, unlike the others.
    # NOTE(review): the mime-type value looks like a regular expression; confirm how it is matched.
    - name: skip-images
      when:
        mime-type: "image/.*"
      use:
        policy-key: no-embedding-images
        profile-key: disabled
        enabled: false
|
||||
|
||||
# Phase 4 generic ingestion configuration
|
||||
ingestion:
|
||||
# Master switch for arbitrary document ingestion into the DOC model
|
||||
|
|
|
|||
|
|
@ -0,0 +1,135 @@
|
|||
package at.procon.dip.embedding.service;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.assertj.core.api.Assertions.assertThatThrownBy;
|
||||
|
||||
import at.procon.dip.domain.access.DocumentVisibility;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentStatus;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.SourceType;
|
||||
import at.procon.dip.domain.document.entity.Document;
|
||||
import at.procon.dip.embedding.config.EmbeddingPolicyCondition;
|
||||
import at.procon.dip.embedding.config.EmbeddingPolicyProperties;
|
||||
import at.procon.dip.embedding.config.EmbeddingPolicyRule;
|
||||
import at.procon.dip.embedding.config.EmbeddingPolicyUse;
|
||||
import at.procon.dip.ingestion.spi.OriginalContentStoragePolicy;
|
||||
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class DefaultEmbeddingPolicyResolverTest {
|
||||
|
||||
@Test
|
||||
void shouldPreferHintAndOverrideFromAttributes() {
|
||||
EmbeddingPolicyProperties properties = baseProperties();
|
||||
|
||||
EmbeddingPolicyRule mailRule = new EmbeddingPolicyRule();
|
||||
EmbeddingPolicyUse mailUse = new EmbeddingPolicyUse();
|
||||
mailUse.setPolicyKey("mail-default");
|
||||
mailUse.setModelKey("e5-default");
|
||||
mailUse.setQueryModelKey("e5-default");
|
||||
mailUse.setProfileKey("mail-message");
|
||||
mailUse.setEnabled(true);
|
||||
mailRule.setUse(mailUse);
|
||||
properties.getRules().add(mailRule);
|
||||
|
||||
EmbeddingPolicyRule tedRule = new EmbeddingPolicyRule();
|
||||
EmbeddingPolicyUse tedUse = new EmbeddingPolicyUse();
|
||||
tedUse.setPolicyKey("ted-default");
|
||||
tedUse.setModelKey("e5-default");
|
||||
tedUse.setQueryModelKey("e5-default");
|
||||
tedUse.setProfileKey("ted-semantic");
|
||||
tedUse.setEnabled(true);
|
||||
tedRule.setUse(tedUse);
|
||||
properties.getRules().add(tedRule);
|
||||
|
||||
DefaultEmbeddingPolicyResolver resolver = new DefaultEmbeddingPolicyResolver(properties);
|
||||
|
||||
SourceDescriptor descriptor = sourceDescriptor(SourceType.MAIL_MESSAGE, "message/rfc822", Map.of(
|
||||
"embeddingPolicyHint", "mail-default",
|
||||
"embeddingPolicyKey", "ted-default"
|
||||
));
|
||||
|
||||
var policy = resolver.resolve(document(DocumentFamily.GENERIC, DocumentType.EMAIL, "en"), descriptor);
|
||||
|
||||
assertThat(policy.policyKey()).isEqualTo("ted-default");
|
||||
}
|
||||
|
||||
@Test
|
||||
void shouldMatchByMimeTypeUsingMediaTypeField() {
|
||||
EmbeddingPolicyProperties properties = baseProperties();
|
||||
EmbeddingPolicyRule rule = new EmbeddingPolicyRule();
|
||||
EmbeddingPolicyCondition when = new EmbeddingPolicyCondition();
|
||||
when.setSourceType("MAIL_ATTACHMENT");
|
||||
when.setMimeType("application/pdf");
|
||||
rule.setWhen(when);
|
||||
EmbeddingPolicyUse use = new EmbeddingPolicyUse();
|
||||
use.setPolicyKey("mail-attachment-pdf");
|
||||
use.setModelKey("e5-default");
|
||||
use.setQueryModelKey("e5-default");
|
||||
use.setProfileKey("attachment-chunks");
|
||||
rule.setUse(use);
|
||||
properties.getRules().add(rule);
|
||||
|
||||
DefaultEmbeddingPolicyResolver resolver = new DefaultEmbeddingPolicyResolver(properties);
|
||||
|
||||
var policy = resolver.resolve(document(DocumentFamily.GENERIC, DocumentType.FILE, "en"),
|
||||
sourceDescriptor(SourceType.MAIL_ATTACHMENT, "application/pdf", Map.of()));
|
||||
|
||||
assertThat(policy.policyKey()).isEqualTo("mail-attachment-pdf");
|
||||
}
|
||||
|
||||
@Test
|
||||
void shouldFailForUnknownOverridePolicy() {
|
||||
DefaultEmbeddingPolicyResolver resolver = new DefaultEmbeddingPolicyResolver(baseProperties());
|
||||
SourceDescriptor descriptor = sourceDescriptor(SourceType.FILE_IMPORT, "application/pdf", Map.of(
|
||||
"embeddingPolicyKey", "missing-policy"
|
||||
));
|
||||
assertThatThrownBy(() -> resolver.resolve(document(DocumentFamily.GENERIC, DocumentType.FILE, "en"), descriptor))
|
||||
.isInstanceOf(IllegalArgumentException.class)
|
||||
.hasMessageContaining("Unknown embedding policy key");
|
||||
}
|
||||
|
||||
private EmbeddingPolicyProperties baseProperties() {
|
||||
EmbeddingPolicyProperties properties = new EmbeddingPolicyProperties();
|
||||
EmbeddingPolicyUse defaultPolicy = new EmbeddingPolicyUse();
|
||||
defaultPolicy.setPolicyKey("generic-default");
|
||||
defaultPolicy.setModelKey("e5-default");
|
||||
defaultPolicy.setQueryModelKey("e5-default");
|
||||
defaultPolicy.setProfileKey("primary-and-chunks");
|
||||
defaultPolicy.setEnabled(true);
|
||||
properties.setDefaultPolicy(defaultPolicy);
|
||||
return properties;
|
||||
}
|
||||
|
||||
private Document document(DocumentFamily family, DocumentType type, String language) {
|
||||
return Document.builder()
|
||||
.id(UUID.randomUUID())
|
||||
.documentFamily(family)
|
||||
.documentType(type)
|
||||
.languageCode(language)
|
||||
.status(DocumentStatus.IMPORTED)
|
||||
.visibility(DocumentVisibility.PUBLIC)
|
||||
.title("Test document")
|
||||
.build();
|
||||
}
|
||||
|
||||
private SourceDescriptor sourceDescriptor(SourceType sourceType, String mediaType, Map<String, String> attrs) {
|
||||
return new SourceDescriptor(
|
||||
null,
|
||||
sourceType,
|
||||
"source-ref",
|
||||
"/tmp/source",
|
||||
"source.bin",
|
||||
mediaType,
|
||||
null,
|
||||
null,
|
||||
OffsetDateTime.now(),
|
||||
OriginalContentStoragePolicy.DEFAULT,
|
||||
attrs
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
package at.procon.dip.embedding.service;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.assertj.core.api.Assertions.assertThatThrownBy;
|
||||
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.embedding.config.EmbeddingProfileProperties;
|
||||
import java.util.List;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class DefaultEmbeddingProfileResolverTest {
|
||||
|
||||
@Test
|
||||
void shouldResolveKnownProfile() {
|
||||
EmbeddingProfileProperties properties = new EmbeddingProfileProperties();
|
||||
EmbeddingProfileProperties.ProfileDefinition def = new EmbeddingProfileProperties.ProfileDefinition();
|
||||
def.setEmbedRepresentationTypes(List.of(RepresentationType.SEMANTIC_TEXT, RepresentationType.CHUNK));
|
||||
properties.getDefinitions().put("primary-and-chunks", def);
|
||||
|
||||
DefaultEmbeddingProfileResolver resolver = new DefaultEmbeddingProfileResolver(properties);
|
||||
|
||||
var profile = resolver.resolve("primary-and-chunks");
|
||||
|
||||
assertThat(profile.profileKey()).isEqualTo("primary-and-chunks");
|
||||
assertThat(profile.embedRepresentationTypes()).containsExactly(RepresentationType.SEMANTIC_TEXT, RepresentationType.CHUNK);
|
||||
}
|
||||
|
||||
@Test
|
||||
void shouldFailForUnknownProfile() {
|
||||
EmbeddingProfileProperties properties = new EmbeddingProfileProperties();
|
||||
DefaultEmbeddingProfileResolver resolver = new DefaultEmbeddingProfileResolver(properties);
|
||||
|
||||
assertThatThrownBy(() -> resolver.resolve("missing"))
|
||||
.isInstanceOf(IllegalArgumentException.class)
|
||||
.hasMessageContaining("Unknown embedding profile");
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue