From 71fb43a5ea3b43153d5d0838713103364084bf92 Mon Sep 17 00:00:00 2001 From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com> Date: Tue, 17 Mar 2026 14:41:27 +0100 Subject: [PATCH] Refactor phases 0-2 --- README.md | 6 +- .../PHASE0_ARCHITECTURE_FOUNDATION.md | 76 +++++ .../PHASE1_GENERIC_PERSISTENCE_MODEL.md | 42 +++ .../PHASE2_VECTORIZATION_DECOUPLING.md | 48 +++ pom.xml | 9 +- ...cumentIntelligencePlatformApplication.java | 28 ++ ...ntIntelligencePlatformApplication.java.bak | 28 ++ src/main/java/at/procon/dip/README_PHASE0.md | 39 +++ src/main/java/at/procon/dip/README_PHASE1.md | 27 ++ src/main/java/at/procon/dip/README_PHASE2.md | 18 ++ .../architecture/PlatformArchitecture.java | 45 +++ .../procon/dip/architecture/SchemaNames.java | 13 + .../classification/spi/DetectionResult.java | 17 ++ .../spi/DocumentTypeDetector.java | 13 + .../domain/access/DocumentAccessContext.java | 31 ++ .../dip/domain/access/DocumentVisibility.java | 11 + .../document/CanonicalDocumentMetadata.java | 23 ++ .../dip/domain/document/ContentRole.java | 14 + .../dip/domain/document/DistanceMetric.java | 10 + .../dip/domain/document/DocumentFamily.java | 12 + .../dip/domain/document/DocumentStatus.java | 14 + .../dip/domain/document/DocumentType.java | 19 ++ .../dip/domain/document/EmbeddingStatus.java | 12 + .../dip/domain/document/RelationType.java | 14 + .../domain/document/RepresentationType.java | 13 + .../dip/domain/document/SourceType.java | 15 + .../dip/domain/document/StorageType.java | 12 + .../dip/domain/document/entity/Document.java | 133 +++++++++ .../document/entity/DocumentContent.java | 86 ++++++ .../document/entity/DocumentEmbedding.java | 103 +++++++ .../entity/DocumentEmbeddingModel.java | 86 ++++++ .../document/entity/DocumentRelation.java | 72 +++++ .../document/entity/DocumentSource.java | 85 ++++++ .../entity/DocumentTextRepresentation.java | 98 ++++++ .../repository/DocumentContentRepository.java | 17 ++ .../DocumentEmbeddingModelRepository.java | 11 
+ .../DocumentEmbeddingRepository.java | 55 ++++ .../DocumentRelationRepository.java | 16 + .../repository/DocumentRepository.java | 31 ++ .../repository/DocumentSourceRepository.java | 17 ++ .../DocumentTextRepresentationRepository.java | 19 ++ .../service/DocumentContentService.java | 45 +++ .../service/DocumentEmbeddingService.java | 125 ++++++++ .../service/DocumentRelationService.java | 35 +++ .../DocumentRepresentationService.java | 50 ++++ .../document/service/DocumentService.java | 75 +++++ .../service/DocumentSourceService.java | 38 +++ .../command/AddDocumentContentCommand.java | 18 ++ .../command/AddDocumentSourceCommand.java | 17 ++ .../AddDocumentTextRepresentationCommand.java | 19 ++ .../command/CreateDocumentCommand.java | 24 ++ .../CreateDocumentRelationCommand.java | 13 + .../RegisterEmbeddingModelCommand.java | 14 + .../procon/dip/domain/tenant/TenantRef.java | 11 + .../domain/tenant/entity/DocumentTenant.java | 71 +++++ .../repository/DocumentTenantRepository.java | 13 + .../tenant/service/DocumentTenantService.java | 45 +++ .../service/command/CreateTenantCommand.java | 9 + .../dip/extraction/spi/DocumentExtractor.java | 13 + .../spi/ExtractedStructuredPayload.java | 12 + .../dip/extraction/spi/ExtractionRequest.java | 15 + .../dip/extraction/spi/ExtractionResult.java | 15 + .../spi/DocumentIngestionAdapter.java | 11 + .../dip/ingestion/spi/IngestionResult.java | 13 + .../dip/ingestion/spi/SourceDescriptor.java | 19 ++ .../dip/migration/MigrationStrategyMode.java | 12 + .../spi/RepresentationBuildRequest.java | 15 + .../spi/TextRepresentationBuilder.java | 14 + .../spi/TextRepresentationDraft.java | 15 + .../dip/processing/spi/ProcessingStage.java | 14 + .../dip/search/spi/SearchDocumentScope.java | 18 ++ .../camel/GenericVectorizationRoute.java | 211 +++++++++++++ .../DocumentEmbeddingProcessingService.java | 142 +++++++++ .../spi/EmbeddingModelDescriptor.java | 13 + .../vectorization/spi/EmbeddingProvider.java | 13 + 
.../vectorization/spi/EmbeddingResult.java | 13 + ...ConfiguredEmbeddingModelStartupRunner.java | 41 +++ .../GenericVectorizationStartupRunner.java | 60 ++++ .../TedProcurementProcessorApplication.java | 28 +- .../procon/ted/camel/VectorizationRoute.java | 4 + .../ted/config/TedProcessorProperties.java | 31 ++ .../ted/controller/AdminController.java | 54 +++- .../ted/event/VectorizationEventListener.java | 2 +- .../BatchDocumentProcessingService.java | 5 + .../service/DocumentProcessingService.java | 20 +- .../TedPhase2GenericDocumentService.java | 197 ++++++++++++ .../startup/VectorizationStartupRunner.java | 4 + src/main/resources/application.yml | 37 ++- ...__add_doc_generic_persistence_backbone.sql | 281 ++++++++++++++++++ .../V5__doc_phase2_vectorization_support.sql | 14 + 90 files changed, 3361 insertions(+), 55 deletions(-) create mode 100644 docs/architecture/PHASE0_ARCHITECTURE_FOUNDATION.md create mode 100644 docs/architecture/PHASE1_GENERIC_PERSISTENCE_MODEL.md create mode 100644 docs/architecture/PHASE2_VECTORIZATION_DECOUPLING.md create mode 100644 src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java create mode 100644 src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java.bak create mode 100644 src/main/java/at/procon/dip/README_PHASE0.md create mode 100644 src/main/java/at/procon/dip/README_PHASE1.md create mode 100644 src/main/java/at/procon/dip/README_PHASE2.md create mode 100644 src/main/java/at/procon/dip/architecture/PlatformArchitecture.java create mode 100644 src/main/java/at/procon/dip/architecture/SchemaNames.java create mode 100644 src/main/java/at/procon/dip/classification/spi/DetectionResult.java create mode 100644 src/main/java/at/procon/dip/classification/spi/DocumentTypeDetector.java create mode 100644 src/main/java/at/procon/dip/domain/access/DocumentAccessContext.java create mode 100644 src/main/java/at/procon/dip/domain/access/DocumentVisibility.java create mode 100644 
src/main/java/at/procon/dip/domain/document/CanonicalDocumentMetadata.java create mode 100644 src/main/java/at/procon/dip/domain/document/ContentRole.java create mode 100644 src/main/java/at/procon/dip/domain/document/DistanceMetric.java create mode 100644 src/main/java/at/procon/dip/domain/document/DocumentFamily.java create mode 100644 src/main/java/at/procon/dip/domain/document/DocumentStatus.java create mode 100644 src/main/java/at/procon/dip/domain/document/DocumentType.java create mode 100644 src/main/java/at/procon/dip/domain/document/EmbeddingStatus.java create mode 100644 src/main/java/at/procon/dip/domain/document/RelationType.java create mode 100644 src/main/java/at/procon/dip/domain/document/RepresentationType.java create mode 100644 src/main/java/at/procon/dip/domain/document/SourceType.java create mode 100644 src/main/java/at/procon/dip/domain/document/StorageType.java create mode 100644 src/main/java/at/procon/dip/domain/document/entity/Document.java create mode 100644 src/main/java/at/procon/dip/domain/document/entity/DocumentContent.java create mode 100644 src/main/java/at/procon/dip/domain/document/entity/DocumentEmbedding.java create mode 100644 src/main/java/at/procon/dip/domain/document/entity/DocumentEmbeddingModel.java create mode 100644 src/main/java/at/procon/dip/domain/document/entity/DocumentRelation.java create mode 100644 src/main/java/at/procon/dip/domain/document/entity/DocumentSource.java create mode 100644 src/main/java/at/procon/dip/domain/document/entity/DocumentTextRepresentation.java create mode 100644 src/main/java/at/procon/dip/domain/document/repository/DocumentContentRepository.java create mode 100644 src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingModelRepository.java create mode 100644 src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java create mode 100644 src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java create mode 100644 
src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java create mode 100644 src/main/java/at/procon/dip/domain/document/repository/DocumentSourceRepository.java create mode 100644 src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java create mode 100644 src/main/java/at/procon/dip/domain/document/service/DocumentContentService.java create mode 100644 src/main/java/at/procon/dip/domain/document/service/DocumentEmbeddingService.java create mode 100644 src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java create mode 100644 src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java create mode 100644 src/main/java/at/procon/dip/domain/document/service/DocumentService.java create mode 100644 src/main/java/at/procon/dip/domain/document/service/DocumentSourceService.java create mode 100644 src/main/java/at/procon/dip/domain/document/service/command/AddDocumentContentCommand.java create mode 100644 src/main/java/at/procon/dip/domain/document/service/command/AddDocumentSourceCommand.java create mode 100644 src/main/java/at/procon/dip/domain/document/service/command/AddDocumentTextRepresentationCommand.java create mode 100644 src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentCommand.java create mode 100644 src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentRelationCommand.java create mode 100644 src/main/java/at/procon/dip/domain/document/service/command/RegisterEmbeddingModelCommand.java create mode 100644 src/main/java/at/procon/dip/domain/tenant/TenantRef.java create mode 100644 src/main/java/at/procon/dip/domain/tenant/entity/DocumentTenant.java create mode 100644 src/main/java/at/procon/dip/domain/tenant/repository/DocumentTenantRepository.java create mode 100644 src/main/java/at/procon/dip/domain/tenant/service/DocumentTenantService.java create mode 100644 
src/main/java/at/procon/dip/domain/tenant/service/command/CreateTenantCommand.java create mode 100644 src/main/java/at/procon/dip/extraction/spi/DocumentExtractor.java create mode 100644 src/main/java/at/procon/dip/extraction/spi/ExtractedStructuredPayload.java create mode 100644 src/main/java/at/procon/dip/extraction/spi/ExtractionRequest.java create mode 100644 src/main/java/at/procon/dip/extraction/spi/ExtractionResult.java create mode 100644 src/main/java/at/procon/dip/ingestion/spi/DocumentIngestionAdapter.java create mode 100644 src/main/java/at/procon/dip/ingestion/spi/IngestionResult.java create mode 100644 src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java create mode 100644 src/main/java/at/procon/dip/migration/MigrationStrategyMode.java create mode 100644 src/main/java/at/procon/dip/normalization/spi/RepresentationBuildRequest.java create mode 100644 src/main/java/at/procon/dip/normalization/spi/TextRepresentationBuilder.java create mode 100644 src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java create mode 100644 src/main/java/at/procon/dip/processing/spi/ProcessingStage.java create mode 100644 src/main/java/at/procon/dip/search/spi/SearchDocumentScope.java create mode 100644 src/main/java/at/procon/dip/vectorization/camel/GenericVectorizationRoute.java create mode 100644 src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java create mode 100644 src/main/java/at/procon/dip/vectorization/spi/EmbeddingModelDescriptor.java create mode 100644 src/main/java/at/procon/dip/vectorization/spi/EmbeddingProvider.java create mode 100644 src/main/java/at/procon/dip/vectorization/spi/EmbeddingResult.java create mode 100644 src/main/java/at/procon/dip/vectorization/startup/ConfiguredEmbeddingModelStartupRunner.java create mode 100644 src/main/java/at/procon/dip/vectorization/startup/GenericVectorizationStartupRunner.java create mode 100644 
src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java create mode 100644 src/main/resources/db/migration/V4__add_doc_generic_persistence_backbone.sql create mode 100644 src/main/resources/db/migration/V5__doc_phase2_vectorization_support.sql diff --git a/README.md b/README.md index 3c2dc1f..eb0ac3a 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,13 @@ -# TED Procurement Document Processor +# Document Intelligence Platform -**AI-Powered Semantic Search Demonstrator for EU Public Procurement** +**Generic document ingestion, normalization and semantic search platform with TED support** A production-ready Spring Boot application showcasing advanced AI semantic search capabilities for processing and searching EU eForms public procurement notices from TED (Tenders Electronic Daily). **Author:** Martin.Schweitzer@procon.co.at and claude.ai +> Phase 0 foundation is in place: the codebase now exposes the broader platform namespace `at.procon.dip` while the existing TED runtime under `at.procon.ted` remains operational during migration. + --- ## 🎯 Demonstrator Highlights diff --git a/docs/architecture/PHASE0_ARCHITECTURE_FOUNDATION.md b/docs/architecture/PHASE0_ARCHITECTURE_FOUNDATION.md new file mode 100644 index 0000000..5b4801e --- /dev/null +++ b/docs/architecture/PHASE0_ARCHITECTURE_FOUNDATION.md @@ -0,0 +1,76 @@ +# Phase 0 – Architecture Foundation + +## New project identity +- **Project name:** Procon Document Intelligence Platform +- **Short name:** DIP +- **Base namespace:** `at.procon.dip` +- **Legacy namespace kept during transition:** `at.procon.ted` + +## Why this naming +The application is no longer only a TED notice processor. The new name reflects the broader goal: +import arbitrary document types, derive canonical searchable text, vectorize it, and run semantic +search over those representations. + +## Phase 0 decisions implemented in code +1. New Spring Boot entry point under `at.procon.dip` +2. 
Legacy TED runtime kept through explicit package scanning +3. Generic vocabulary introduced via enums in `at.procon.dip.domain.document` +4. Tenant introduced as a first-class value object in `at.procon.dip.domain.tenant` +5. Ownership and access are explicitly separated through `DocumentAccessContext` +6. Canonical document metadata and ingestion descriptors support both: + - tenant-owned documents + - public documents without tenant ownership +7. Extension-point interfaces introduced for ingestion, classification, extraction, + normalization, and vectorization +8. Target schema split documented as: + - `DOC` for generic document model + - `TED` for TED-specific projections +9. Migration strategy formalized as phased additive migration: + - additive schema + - dual write + - backfill + - cutover + - retire legacy + +## Planned package areas +- `at.procon.dip.architecture` +- `at.procon.dip.domain.access` +- `at.procon.dip.domain.document` +- `at.procon.dip.domain.tenant` +- `at.procon.dip.ingestion.spi` +- `at.procon.dip.classification.spi` +- `at.procon.dip.extraction.spi` +- `at.procon.dip.normalization.spi` +- `at.procon.dip.vectorization.spi` +- `at.procon.dip.search.spi` +- `at.procon.dip.processing.spi` +- `at.procon.dip.migration` + +## Ownership and visibility decision +A tenant represents the owner of a document, but ownership is optional. + +A public TED notice therefore does not need a fake tenant. Instead, the canonical model uses: +- optional `ownerTenant` +- mandatory `DocumentVisibility` + +Examples: +- TED notice: `ownerTenant = null`, `visibility = PUBLIC` +- customer-private document: `ownerTenant = tenantA`, `visibility = TENANT` +- explicitly shared document: `ownerTenant = tenantA`, `visibility = SHARED` + +Phase 1 now realizes this persistence direction through the additive `DOC` schema. 
The resulting +backbone uses: +- `DOC.doc_document.owner_tenant_id` nullable +- `DOC.doc_document.visibility` not null + +The complete Phase 1 persistence details are documented in `docs/architecture/PHASE1_GENERIC_PERSISTENCE_MODEL.md`. + +## Non-goals of Phase 0 +- No database schema migration yet +- No runtime behavior changes in TED processing +- No replacement of `ProcurementDocument` yet +- No semantic search refactoring yet + +## Result +The codebase now has a stable generalized namespace and contract surface for future phases without +requiring a disruptive rewrite. diff --git a/docs/architecture/PHASE1_GENERIC_PERSISTENCE_MODEL.md b/docs/architecture/PHASE1_GENERIC_PERSISTENCE_MODEL.md new file mode 100644 index 0000000..c10551c --- /dev/null +++ b/docs/architecture/PHASE1_GENERIC_PERSISTENCE_MODEL.md @@ -0,0 +1,42 @@ +# Phase 1 – Generic Persistence Model + +## Goal +Introduce the generalized persistence backbone in an additive, non-breaking way. + +## New schema +The project now contains the `DOC` schema with the following tables: +- `doc_tenant` +- `doc_document` +- `doc_source` +- `doc_content` +- `doc_text_representation` +- `doc_embedding_model` +- `doc_embedding` +- `doc_relation` + +## Design choices +### Owner tenant is optional +Public TED notices can remain unowned documents with `visibility = PUBLIC`. + +### Visibility is mandatory +Every canonical document must carry `DocumentVisibility`. + +### Vectorization is separated already +`doc_embedding` holds vectorization lifecycle and model association outside `doc_document`. +The actual vector payload column exists in the schema, but the runtime still uses the legacy TED +vectorization flow until Phase 2. + +### Content and text representation are separate +`doc_content` stores payload variants. `doc_text_representation` stores search-oriented texts. +This is the key boundary needed for arbitrary future document types. 
+ +## What is still intentionally missing +- no dual-write from TED import yet +- no generic ingestion routes yet +- no semantic search cutover yet +- no TED projection tables yet +- no historical migration yet + +## Result +The generalized platform is now backed by a real schema and service layer, which reduces the later +migration risk significantly. diff --git a/docs/architecture/PHASE2_VECTORIZATION_DECOUPLING.md b/docs/architecture/PHASE2_VECTORIZATION_DECOUPLING.md new file mode 100644 index 0000000..95109cb --- /dev/null +++ b/docs/architecture/PHASE2_VECTORIZATION_DECOUPLING.md @@ -0,0 +1,48 @@ +# Phase 2 - Representation-based vectorization and dual-write compatibility + +## Goal + +Decouple vectorization from the TED document entity so arbitrary document types can use a shared +representation-to-embedding pipeline. + +## Primary changes + +1. **Primary vectorization source** + - before: `TED.procurement_document.text_content` + - now: `DOC.doc_text_representation.text_body` + +2. **Primary vectorization target** + - before: `TED.procurement_document.content_vector` + - now: `DOC.doc_embedding.embedding_vector` + +3. **Compatibility during migration** + - completed embeddings are optionally mirrored back to the legacy TED vector columns using the + shared TED document hash (`document_hash` / `dedup_hash`) + +4. 
**TED dual-write bridge** + - fresh TED documents are projected into the generic `DOC` model immediately after persistence + +## Key services introduced + +- `TedPhase2GenericDocumentService` + - creates/refreshes generic DOC records for TED notices +- `DocumentEmbeddingProcessingService` + - processes DOC embedding lifecycle records +- `GenericVectorizationRoute` + - scheduler + worker route for asynchronous DOC embedding generation +- `ConfiguredEmbeddingModelStartupRunner` + - ensures the configured embedding model exists in `DOC.doc_embedding_model` +- `GenericVectorizationStartupRunner` + - queues pending/failed DOC embeddings on startup + +## Behavior when Phase 2 is enabled + +- legacy `VectorizationRoute` is disabled +- legacy startup queueing is disabled +- legacy event-based vectorization queueing is disabled +- generic scheduler and startup runner handle DOC embeddings instead + +## Compatibility intent + +This phase keeps the existing TED search endpoints working while the new generic indexing layer becomes +operational. The next phase can migrate search reads from the TED table to `DOC.doc_embedding`. 
diff --git a/pom.xml b/pom.xml index 0c30c06..007f25d 100644 --- a/pom.xml +++ b/pom.xml @@ -22,11 +22,11 @@ - at.procon.ted - ted-procurement-processor + at.procon.dip + document-intelligence-platform 1.0.0-SNAPSHOT - TED Procurement Processor - EU eForms TED document processor with vector search capabilities + Procon Document Intelligence Platform + Generic document ingestion, normalization, and semantic search platform with TED support 21 @@ -232,6 +232,7 @@ org.springframework.boot spring-boot-maven-plugin + at.procon.dip.DocumentIntelligencePlatformApplication org.projectlombok diff --git a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java new file mode 100644 index 0000000..ba2c495 --- /dev/null +++ b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java @@ -0,0 +1,28 @@ +package at.procon.dip; + +import at.procon.ted.config.TedProcessorProperties; +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.boot.context.properties.EnableConfigurationProperties; +import org.springframework.boot.autoconfigure.domain.EntityScan; +import org.springframework.data.jpa.repository.config.EnableJpaRepositories; +import org.springframework.scheduling.annotation.EnableAsync; + +/** + * Procon Document Intelligence Platform (DIP). + * + *

<p>Phase 0 introduces a generic platform root namespace and architecture contracts + * while keeping the existing TED-specific runtime intact. Subsequent phases can move + * modules incrementally from {@code at.procon.ted} into the broader document platform.</p>

+ */ +@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"}) +@EnableAsync +//@EnableConfigurationProperties(TedProcessorProperties.class) +@EntityScan(basePackages = {"at.procon.ted.model.entity", "at.procon.dip.domain.document.entity", "at.procon.dip.domain.tenant.entity"}) +@EnableJpaRepositories(basePackages = {"at.procon.ted.repository", "at.procon.dip.domain.document.repository", "at.procon.dip.domain.tenant.repository"}) +public class DocumentIntelligencePlatformApplication { + + public static void main(String[] args) { + SpringApplication.run(DocumentIntelligencePlatformApplication.class, args); + } +} diff --git a/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java.bak b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java.bak new file mode 100644 index 0000000..db03770 --- /dev/null +++ b/src/main/java/at/procon/dip/DocumentIntelligencePlatformApplication.java.bak @@ -0,0 +1,28 @@ +package at.procon.dip; + +import at.procon.ted.config.TedProcessorProperties; +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.boot.context.properties.EnableConfigurationProperties; +import org.springframework.boot.autoconfigure.domain.EntityScan; +import org.springframework.data.jpa.repository.config.EnableJpaRepositories; +import org.springframework.scheduling.annotation.EnableAsync; + +/** + * Procon Document Intelligence Platform (DIP). + * + *

<p>Phase 0 introduces a generic platform root namespace and architecture contracts + * while keeping the existing TED-specific runtime intact. Subsequent phases can move + * modules incrementally from {@code at.procon.ted} into the broader document platform.</p>

+ */ +@SpringBootApplication(scanBasePackages = {"at.procon.dip", "at.procon.ted"}) +@EnableAsync +//@EnableConfigurationProperties(TedProcessorProperties.class) +@EntityScan(basePackages = {"at.procon.ted.model.entity"}) +@EnableJpaRepositories(basePackages = {"at.procon.ted.repository"}) +public class DocumentIntelligencePlatformApplication { + + public static void main(String[] args) { + SpringApplication.run(DocumentIntelligencePlatformApplication.class, args); + } +} diff --git a/src/main/java/at/procon/dip/README_PHASE0.md b/src/main/java/at/procon/dip/README_PHASE0.md new file mode 100644 index 0000000..2fe29a8 --- /dev/null +++ b/src/main/java/at/procon/dip/README_PHASE0.md @@ -0,0 +1,39 @@ +# Phase 0 – Generic Platform Foundation + +This package introduces the new platform namespace `at.procon.dip` without breaking the existing +TED runtime under `at.procon.ted`. + +## New project identity +- Project name: **Procon Document Intelligence Platform** +- Short name: **DIP** +- Base namespace: `at.procon.dip` + +## Intent +Phase 0 is intentionally light-weight. It defines the canonical vocabulary and SPI contracts that +later phases will implement incrementally: +- generic document root model +- optional owner tenant + explicit document visibility/access model +- ingestion adapters +- type detection +- extraction +- text normalization +- vectorization provider abstraction +- generic search scope + +## Access model +Documents are no longer assumed to be always tenant-owned. + +Examples: +- public TED notice -> `ownerTenant = null`, `visibility = PUBLIC` +- tenant-owned private document -> `ownerTenant = tenantA`, `visibility = TENANT` + +This keeps ownership and access semantics separate from the beginning of the generalized model. 
+ +## Compatibility +The new Spring Boot entry point is `at.procon.dip.DocumentIntelligencePlatformApplication` and it +explicitly scans the legacy TED packages so the current runtime remains operational while future +phases migrate modules gradually. + + +## Phase 1 note +The additive `DOC` schema and generic persistence services are introduced in `README_PHASE1.md`. diff --git a/src/main/java/at/procon/dip/README_PHASE1.md b/src/main/java/at/procon/dip/README_PHASE1.md new file mode 100644 index 0000000..03ca163 --- /dev/null +++ b/src/main/java/at/procon/dip/README_PHASE1.md @@ -0,0 +1,27 @@ +# Phase 1 – Generic Persistence Backbone + +Phase 1 introduces the additive `DOC` schema and the first concrete persistence layer for the +new generalized platform model. + +## What is implemented +- `DOC.doc_tenant` +- `DOC.doc_document` +- `DOC.doc_source` +- `DOC.doc_content` +- `DOC.doc_text_representation` +- `DOC.doc_embedding_model` +- `DOC.doc_embedding` +- `DOC.doc_relation` + +## Intent +The generic model now exists as real JPA entities, repositories, Flyway migration, and thin +transactional services. Existing TED runtime behavior is intentionally unchanged. + +## Important limitation +The actual TED processing pipeline still writes only to the legacy TED-specific model. Dual-write +and migration come in later phases. + +## Vector storage note +`doc_embedding` already separates vectorization lifecycle from the document root. The transient +`embeddingVector` field is intentionally not wired into Hibernate yet. Writing native pgvector data +and moving the vectorization pipeline to the new table is part of Phase 2. 
diff --git a/src/main/java/at/procon/dip/README_PHASE2.md b/src/main/java/at/procon/dip/README_PHASE2.md new file mode 100644 index 0000000..49a3e15 --- /dev/null +++ b/src/main/java/at/procon/dip/README_PHASE2.md @@ -0,0 +1,18 @@ +# Phase 2 - Vectorization decoupling + +Phase 2 moves the primary vectorization pipeline from `TED.procurement_document` to the generic `DOC` +representation and embedding model introduced in Phase 1. + +Implemented in this phase: +- `DOC.doc_text_representation` is now the primary text source for embeddings +- `DOC.doc_embedding` is the primary persistence target for embedding lifecycle and vectors +- a generic Camel route processes pending/failed embeddings asynchronously +- TED imports dual-write into the generic model by creating: + - canonical `DOC.doc_document` + - original `DOC.doc_content` + - primary `DOC.doc_text_representation` + - pending `DOC.doc_embedding` +- compatibility mode keeps writing completed TED embeddings back into + `TED.procurement_document.content_vector` so the legacy semantic search continues to work + +This phase is intentionally additive and does not yet migrate TED semantic search reads away from the legacy table. diff --git a/src/main/java/at/procon/dip/architecture/PlatformArchitecture.java b/src/main/java/at/procon/dip/architecture/PlatformArchitecture.java new file mode 100644 index 0000000..0356825 --- /dev/null +++ b/src/main/java/at/procon/dip/architecture/PlatformArchitecture.java @@ -0,0 +1,45 @@ +package at.procon.dip.architecture; + +import java.util.List; + +/** + * Central architecture constants for the generalized platform. + *

<p>Phase 1 extends the package map with the additive generic persistence backbone.</p>

+ */ +public final class PlatformArchitecture { + + public static final String PLATFORM_NAME = "Procon Document Intelligence Platform"; + public static final String PLATFORM_SHORT_NAME = "DIP"; + public static final String BASE_NAMESPACE = "at.procon.dip"; + public static final String LEGACY_NAMESPACE = "at.procon.ted"; + + public static final String GENERIC_SCHEMA = "DOC"; + public static final String TED_SCHEMA = "TED"; + + public static final List GENERIC_PACKAGE_AREAS = List.of( + "at.procon.dip.architecture", + "at.procon.dip.domain.access", + "at.procon.dip.domain.document", + "at.procon.dip.domain.tenant", + "at.procon.dip.domain.document.entity", + "at.procon.dip.domain.document.repository", + "at.procon.dip.domain.document.service", + "at.procon.dip.domain.tenant.entity", + "at.procon.dip.domain.tenant.repository", + "at.procon.dip.domain.tenant.service", + "at.procon.dip.ingestion.spi", + "at.procon.dip.classification.spi", + "at.procon.dip.extraction.spi", + "at.procon.dip.normalization.spi", + "at.procon.dip.vectorization.spi", + "at.procon.dip.vectorization.service", + "at.procon.dip.vectorization.camel", + "at.procon.dip.vectorization.startup", + "at.procon.dip.search.spi", + "at.procon.dip.processing.spi", + "at.procon.dip.migration" + ); + + private PlatformArchitecture() { + } +} diff --git a/src/main/java/at/procon/dip/architecture/SchemaNames.java b/src/main/java/at/procon/dip/architecture/SchemaNames.java new file mode 100644 index 0000000..a7f2b61 --- /dev/null +++ b/src/main/java/at/procon/dip/architecture/SchemaNames.java @@ -0,0 +1,13 @@ +package at.procon.dip.architecture; + +/** + * Target schema names for the generalized model. 
+ */ +public final class SchemaNames { + + public static final String DOC = "DOC"; + public static final String TED = "TED"; + + private SchemaNames() { + } +} diff --git a/src/main/java/at/procon/dip/classification/spi/DetectionResult.java b/src/main/java/at/procon/dip/classification/spi/DetectionResult.java new file mode 100644 index 0000000..200b7d4 --- /dev/null +++ b/src/main/java/at/procon/dip/classification/spi/DetectionResult.java @@ -0,0 +1,17 @@ +package at.procon.dip.classification.spi; + +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentType; +import java.util.Map; + +/** + * Result of document type detection/classification. + */ +public record DetectionResult( + DocumentType documentType, + DocumentFamily documentFamily, + String mimeType, + String languageCode, + Map attributes +) { +} diff --git a/src/main/java/at/procon/dip/classification/spi/DocumentTypeDetector.java b/src/main/java/at/procon/dip/classification/spi/DocumentTypeDetector.java new file mode 100644 index 0000000..712938a --- /dev/null +++ b/src/main/java/at/procon/dip/classification/spi/DocumentTypeDetector.java @@ -0,0 +1,13 @@ +package at.procon.dip.classification.spi; + +import at.procon.dip.ingestion.spi.SourceDescriptor; + +/** + * Determines a canonical type/family before extraction starts. 
+ */ +public interface DocumentTypeDetector { + + boolean supports(SourceDescriptor sourceDescriptor); + + DetectionResult detect(SourceDescriptor sourceDescriptor); +} diff --git a/src/main/java/at/procon/dip/domain/access/DocumentAccessContext.java b/src/main/java/at/procon/dip/domain/access/DocumentAccessContext.java new file mode 100644 index 0000000..1e79063 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/access/DocumentAccessContext.java @@ -0,0 +1,31 @@ +package at.procon.dip.domain.access; + +import at.procon.dip.domain.tenant.TenantRef; +import java.util.Objects; + +/** + * Canonical ownership and visibility descriptor for a document. + *

+ * A document may have no owner tenant, for example public TED notices. + * Visibility is always mandatory and defines who may search/read the document. + */ +public record DocumentAccessContext( + TenantRef ownerTenant, + DocumentVisibility visibility +) { + + public DocumentAccessContext { + Objects.requireNonNull(visibility, "visibility must not be null"); + } + + public static DocumentAccessContext publicDocument() { + return new DocumentAccessContext(null, DocumentVisibility.PUBLIC); + } + + public static DocumentAccessContext tenantOwned(TenantRef ownerTenant) { + return new DocumentAccessContext( + Objects.requireNonNull(ownerTenant, "ownerTenant must not be null"), + DocumentVisibility.TENANT + ); + } +} diff --git a/src/main/java/at/procon/dip/domain/access/DocumentVisibility.java b/src/main/java/at/procon/dip/domain/access/DocumentVisibility.java new file mode 100644 index 0000000..a8ecb27 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/access/DocumentVisibility.java @@ -0,0 +1,11 @@ +package at.procon.dip.domain.access; + +/** + * Describes who may access a document independently from ownership. + */ +public enum DocumentVisibility { + PUBLIC, + TENANT, + SHARED, + RESTRICTED +} diff --git a/src/main/java/at/procon/dip/domain/document/CanonicalDocumentMetadata.java b/src/main/java/at/procon/dip/domain/document/CanonicalDocumentMetadata.java new file mode 100644 index 0000000..7cad293 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/CanonicalDocumentMetadata.java @@ -0,0 +1,23 @@ +package at.procon.dip.domain.document; + +import at.procon.dip.domain.access.DocumentAccessContext; +import java.time.OffsetDateTime; +import java.util.UUID; + +/** + * Minimal canonical document descriptor used by Phase 0 SPI contracts. 
+ */ +public record CanonicalDocumentMetadata( + UUID documentId, + DocumentAccessContext accessContext, + DocumentType documentType, + DocumentFamily documentFamily, + DocumentStatus status, + String title, + String languageCode, + String mimeType, + String dedupHash, + OffsetDateTime createdAt, + OffsetDateTime updatedAt +) { +} diff --git a/src/main/java/at/procon/dip/domain/document/ContentRole.java b/src/main/java/at/procon/dip/domain/document/ContentRole.java new file mode 100644 index 0000000..dcdfba6 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/ContentRole.java @@ -0,0 +1,14 @@ +package at.procon.dip.domain.document; + +/** + * Role of a stored content version. + */ +public enum ContentRole { + ORIGINAL, + NORMALIZED_TEXT, + OCR_TEXT, + HTML_CLEAN, + EXTRACTED_METADATA_JSON, + THUMBNAIL, + DERIVED_BINARY +} diff --git a/src/main/java/at/procon/dip/domain/document/DistanceMetric.java b/src/main/java/at/procon/dip/domain/document/DistanceMetric.java new file mode 100644 index 0000000..0bb8d68 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/DistanceMetric.java @@ -0,0 +1,10 @@ +package at.procon.dip.domain.document; + +/** + * Distance metric used by an embedding model. + */ +public enum DistanceMetric { + COSINE, + L2, + INNER_PRODUCT +} diff --git a/src/main/java/at/procon/dip/domain/document/DocumentFamily.java b/src/main/java/at/procon/dip/domain/document/DocumentFamily.java new file mode 100644 index 0000000..790fe33 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/DocumentFamily.java @@ -0,0 +1,12 @@ +package at.procon.dip.domain.document; + +/** + * Functional grouping used for broad search and routing decisions. 
+ */ +public enum DocumentFamily { + PROCUREMENT, + MAIL, + ATTACHMENT, + KNOWLEDGE, + GENERIC +} diff --git a/src/main/java/at/procon/dip/domain/document/DocumentStatus.java b/src/main/java/at/procon/dip/domain/document/DocumentStatus.java new file mode 100644 index 0000000..b6ddddd --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/DocumentStatus.java @@ -0,0 +1,14 @@ +package at.procon.dip.domain.document; + +/** + * Generic lifecycle state for a canonical document. + */ +public enum DocumentStatus { + RECEIVED, + CLASSIFIED, + EXTRACTED, + REPRESENTED, + INDEXED, + FAILED, + ARCHIVED +} diff --git a/src/main/java/at/procon/dip/domain/document/DocumentType.java b/src/main/java/at/procon/dip/domain/document/DocumentType.java new file mode 100644 index 0000000..f6a651b --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/DocumentType.java @@ -0,0 +1,19 @@ +package at.procon.dip.domain.document; + +/** + * Canonical technical document type. + */ +public enum DocumentType { + TED_NOTICE, + EMAIL, + MIME_MESSAGE, + PDF, + DOCX, + HTML, + XML_GENERIC, + TEXT, + MARKDOWN, + ZIP_ARCHIVE, + GENERIC_BINARY, + UNKNOWN +} diff --git a/src/main/java/at/procon/dip/domain/document/EmbeddingStatus.java b/src/main/java/at/procon/dip/domain/document/EmbeddingStatus.java new file mode 100644 index 0000000..894ce3a --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/EmbeddingStatus.java @@ -0,0 +1,12 @@ +package at.procon.dip.domain.document; + +/** + * Generic lifecycle state of an embedding record in the DOC schema. 
+ */ +public enum EmbeddingStatus { + PENDING, + PROCESSING, + COMPLETED, + FAILED, + SKIPPED +} diff --git a/src/main/java/at/procon/dip/domain/document/RelationType.java b/src/main/java/at/procon/dip/domain/document/RelationType.java new file mode 100644 index 0000000..1759192 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/RelationType.java @@ -0,0 +1,14 @@ +package at.procon.dip.domain.document; + +/** + * Logical relationship between canonical documents. + */ +public enum RelationType { + CONTAINS, + ATTACHMENT_OF, + EXTRACTED_FROM, + DERIVED_FROM, + PART_OF, + VERSION_OF, + RELATED_TO +} diff --git a/src/main/java/at/procon/dip/domain/document/RepresentationType.java b/src/main/java/at/procon/dip/domain/document/RepresentationType.java new file mode 100644 index 0000000..0cc7b3e --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/RepresentationType.java @@ -0,0 +1,13 @@ +package at.procon.dip.domain.document; + +/** + * Search-oriented text representation that can be embedded independently. + */ +public enum RepresentationType { + FULLTEXT, + SEMANTIC_TEXT, + SUMMARY, + TITLE_ABSTRACT, + CHUNK, + METADATA_ENRICHED +} diff --git a/src/main/java/at/procon/dip/domain/document/SourceType.java b/src/main/java/at/procon/dip/domain/document/SourceType.java new file mode 100644 index 0000000..d53b841 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/SourceType.java @@ -0,0 +1,15 @@ +package at.procon.dip.domain.document; + +/** + * Provenance of an imported document. 
+ */ +public enum SourceType { + TED_PACKAGE, + MAIL, + FILE_SYSTEM, + REST_UPLOAD, + MANUAL_UPLOAD, + ZIP_CHILD, + API, + MIGRATION +} diff --git a/src/main/java/at/procon/dip/domain/document/StorageType.java b/src/main/java/at/procon/dip/domain/document/StorageType.java new file mode 100644 index 0000000..0ee68a0 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/StorageType.java @@ -0,0 +1,12 @@ +package at.procon.dip.domain.document; + +/** + * Physical storage strategy for content. + */ +public enum StorageType { + DB_TEXT, + DB_BINARY, + FILE_PATH, + OBJECT_STORAGE, + EXTERNAL_REFERENCE +} diff --git a/src/main/java/at/procon/dip/domain/document/entity/Document.java b/src/main/java/at/procon/dip/domain/document/entity/Document.java new file mode 100644 index 0000000..2d71e94 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/entity/Document.java @@ -0,0 +1,133 @@ +package at.procon.dip.domain.document.entity; + +import at.procon.dip.architecture.SchemaNames; +import at.procon.dip.domain.access.DocumentAccessContext; +import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.domain.document.CanonicalDocumentMetadata; +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentStatus; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.tenant.entity.DocumentTenant; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.EnumType; +import jakarta.persistence.Enumerated; +import jakarta.persistence.FetchType; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.JoinColumn; +import jakarta.persistence.ManyToOne; +import jakarta.persistence.PrePersist; +import jakarta.persistence.PreUpdate; +import jakarta.persistence.Table; +import java.time.OffsetDateTime; +import java.util.UUID; 
+import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +/** + * Canonical document root entity for the generalized DOC schema. + */ +@Entity +@Table(schema = SchemaNames.DOC, name = "doc_document", indexes = { + @Index(name = "idx_doc_document_type", columnList = "document_type"), + @Index(name = "idx_doc_document_family", columnList = "document_family"), + @Index(name = "idx_doc_document_status", columnList = "status"), + @Index(name = "idx_doc_document_visibility", columnList = "visibility"), + @Index(name = "idx_doc_document_owner_tenant", columnList = "owner_tenant_id"), + @Index(name = "idx_doc_document_dedup_hash", columnList = "dedup_hash"), + @Index(name = "idx_doc_document_business_key", columnList = "business_key"), + @Index(name = "idx_doc_document_created_at", columnList = "created_at") +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class Document { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @ManyToOne(fetch = FetchType.LAZY) + @JoinColumn(name = "owner_tenant_id") + private DocumentTenant ownerTenant; + + @Enumerated(EnumType.STRING) + @Column(name = "visibility", nullable = false, length = 32) + @Builder.Default + private DocumentVisibility visibility = DocumentVisibility.PUBLIC; + + @Enumerated(EnumType.STRING) + @Column(name = "document_type", nullable = false, length = 64) + private DocumentType documentType; + + @Enumerated(EnumType.STRING) + @Column(name = "document_family", nullable = false, length = 64) + private DocumentFamily documentFamily; + + @Enumerated(EnumType.STRING) + @Column(name = "status", nullable = false, length = 32) + @Builder.Default + private DocumentStatus status = DocumentStatus.RECEIVED; + + @Column(name = "title", length = 1000) + private String title; + + @Column(name = "summary", columnDefinition = "TEXT") + private String summary; + + @Column(name = 
"language_code", length = 16) + private String languageCode; + + @Column(name = "mime_type", length = 255) + private String mimeType; + + @Column(name = "business_key", length = 255) + private String businessKey; + + @Column(name = "dedup_hash", length = 64) + private String dedupHash; + + @Builder.Default + @Column(name = "created_at", nullable = false, updatable = false) + private OffsetDateTime createdAt = OffsetDateTime.now(); + + @Builder.Default + @Column(name = "updated_at", nullable = false) + private OffsetDateTime updatedAt = OffsetDateTime.now(); + + @PrePersist + protected void onCreate() { + createdAt = OffsetDateTime.now(); + updatedAt = OffsetDateTime.now(); + } + + @PreUpdate + protected void onUpdate() { + updatedAt = OffsetDateTime.now(); + } + + public CanonicalDocumentMetadata toCanonicalMetadata() { + return new CanonicalDocumentMetadata( + id, + new DocumentAccessContext(ownerTenant == null ? null : new at.procon.dip.domain.tenant.TenantRef( + ownerTenant.getId().toString(), ownerTenant.getTenantKey(), ownerTenant.getDisplayName()), visibility), + documentType, + documentFamily, + status, + title, + languageCode, + mimeType, + dedupHash, + createdAt, + updatedAt + ); + } +} diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentContent.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentContent.java new file mode 100644 index 0000000..af746bf --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentContent.java @@ -0,0 +1,86 @@ +package at.procon.dip.domain.document.entity; + +import at.procon.dip.architecture.SchemaNames; +import at.procon.dip.domain.document.ContentRole; +import at.procon.dip.domain.document.StorageType; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.EnumType; +import jakarta.persistence.Enumerated; +import jakarta.persistence.FetchType; +import jakarta.persistence.GeneratedValue; +import 
jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.JoinColumn; +import jakarta.persistence.ManyToOne; +import jakarta.persistence.PrePersist; +import jakarta.persistence.Table; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +/** + * Stored payload variant for a canonical document. + */ +@Entity +@Table(schema = SchemaNames.DOC, name = "doc_content", indexes = { + @Index(name = "idx_doc_content_document", columnList = "document_id"), + @Index(name = "idx_doc_content_role", columnList = "content_role"), + @Index(name = "idx_doc_content_hash", columnList = "content_hash"), + @Index(name = "idx_doc_content_storage_type", columnList = "storage_type") +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class DocumentContent { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @ManyToOne(fetch = FetchType.LAZY, optional = false) + @JoinColumn(name = "document_id", nullable = false) + private Document document; + + @Enumerated(EnumType.STRING) + @Column(name = "content_role", nullable = false, length = 64) + private ContentRole contentRole; + + @Enumerated(EnumType.STRING) + @Column(name = "storage_type", nullable = false, length = 64) + private StorageType storageType; + + @Column(name = "mime_type", length = 255) + private String mimeType; + + @Column(name = "charset_name", length = 120) + private String charsetName; + + @Column(name = "text_content", columnDefinition = "TEXT") + private String textContent; + + @Column(name = "binary_ref", columnDefinition = "TEXT") + private String binaryRef; + + @Column(name = "content_hash", length = 64) + private String contentHash; + + @Column(name = "size_bytes") + private Long sizeBytes; + + @Builder.Default + @Column(name = "created_at", 
nullable = false, updatable = false) + private OffsetDateTime createdAt = OffsetDateTime.now(); + + @PrePersist + protected void onCreate() { + createdAt = OffsetDateTime.now(); + } +} diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbedding.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbedding.java new file mode 100644 index 0000000..07797d1 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbedding.java @@ -0,0 +1,103 @@ +package at.procon.dip.domain.document.entity; + +import at.procon.dip.architecture.SchemaNames; +import at.procon.dip.domain.document.EmbeddingStatus; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.EnumType; +import jakarta.persistence.Enumerated; +import jakarta.persistence.FetchType; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.JoinColumn; +import jakarta.persistence.ManyToOne; +import jakarta.persistence.PrePersist; +import jakarta.persistence.PreUpdate; +import jakarta.persistence.Table; +import jakarta.persistence.Transient; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +/** + * Generic vectorization record separated from the canonical document structure. + * <p>

+ * The actual pgvector payload is persisted in the {@code embedding_vector} column via native SQL + * in later phases. The transient field exists only as a convenient in-memory carrier. + */ +@Entity +@Table(schema = SchemaNames.DOC, name = "doc_embedding", indexes = { + @Index(name = "idx_doc_embedding_document", columnList = "document_id"), + @Index(name = "idx_doc_embedding_repr", columnList = "representation_id"), + @Index(name = "idx_doc_embedding_model", columnList = "model_id"), + @Index(name = "idx_doc_embedding_status", columnList = "embedding_status"), + @Index(name = "idx_doc_embedding_embedded_at", columnList = "embedded_at") +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class DocumentEmbedding { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @ManyToOne(fetch = FetchType.LAZY, optional = false) + @JoinColumn(name = "document_id", nullable = false) + private Document document; + + @ManyToOne(fetch = FetchType.LAZY, optional = false) + @JoinColumn(name = "representation_id", nullable = false) + private DocumentTextRepresentation representation; + + @ManyToOne(fetch = FetchType.LAZY, optional = false) + @JoinColumn(name = "model_id", nullable = false) + private DocumentEmbeddingModel model; + + @Enumerated(EnumType.STRING) + @Column(name = "embedding_status", nullable = false, length = 32) + @Builder.Default + private EmbeddingStatus embeddingStatus = EmbeddingStatus.PENDING; + + @Column(name = "token_count") + private Integer tokenCount; + + @Column(name = "embedding_dimensions") + private Integer embeddingDimensions; + + @Column(name = "error_message", columnDefinition = "TEXT") + private String errorMessage; + + @Column(name = "embedded_at") + private OffsetDateTime embeddedAt; + + @Builder.Default + @Column(name = "created_at", nullable = false, updatable = false) + private OffsetDateTime createdAt = OffsetDateTime.now(); + + @Builder.Default + @Column(name = "updated_at", nullable = 
false) + private OffsetDateTime updatedAt = OffsetDateTime.now(); + + @Transient + private float[] embeddingVector; + + @PrePersist + protected void onCreate() { + createdAt = OffsetDateTime.now(); + updatedAt = OffsetDateTime.now(); + } + + @PreUpdate + protected void onUpdate() { + updatedAt = OffsetDateTime.now(); + } +} diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbeddingModel.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbeddingModel.java new file mode 100644 index 0000000..9f49f35 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentEmbeddingModel.java @@ -0,0 +1,86 @@ +package at.procon.dip.domain.document.entity; + +import at.procon.dip.architecture.SchemaNames; +import at.procon.dip.domain.document.DistanceMetric; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.EnumType; +import jakarta.persistence.Enumerated; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.PrePersist; +import jakarta.persistence.PreUpdate; +import jakarta.persistence.Table; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +/** + * Embedding model catalog row used by generic vectorization. 
+ */ +@Entity +@Table(schema = SchemaNames.DOC, name = "doc_embedding_model", indexes = { + @Index(name = "idx_doc_embedding_model_key", columnList = "model_key", unique = true), + @Index(name = "idx_doc_embedding_model_active", columnList = "active") +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class DocumentEmbeddingModel { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @Column(name = "model_key", nullable = false, unique = true, length = 255) + private String modelKey; + + @Column(name = "provider", nullable = false, length = 120) + private String provider; + + @Column(name = "display_name", length = 255) + private String displayName; + + @Column(name = "dimensions", nullable = false) + private Integer dimensions; + + @Enumerated(EnumType.STRING) + @Column(name = "distance_metric", nullable = false, length = 32) + @Builder.Default + private DistanceMetric distanceMetric = DistanceMetric.COSINE; + + @Builder.Default + @Column(name = "query_prefix_required", nullable = false) + private boolean queryPrefixRequired = false; + + @Builder.Default + @Column(name = "active", nullable = false) + private boolean active = true; + + @Builder.Default + @Column(name = "created_at", nullable = false, updatable = false) + private OffsetDateTime createdAt = OffsetDateTime.now(); + + @Builder.Default + @Column(name = "updated_at", nullable = false) + private OffsetDateTime updatedAt = OffsetDateTime.now(); + + @PrePersist + protected void onCreate() { + createdAt = OffsetDateTime.now(); + updatedAt = OffsetDateTime.now(); + } + + @PreUpdate + protected void onUpdate() { + updatedAt = OffsetDateTime.now(); + } +} diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentRelation.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentRelation.java new file mode 100644 index 0000000..dfa6a9c --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentRelation.java @@ 
-0,0 +1,72 @@ +package at.procon.dip.domain.document.entity; + +import at.procon.dip.architecture.SchemaNames; +import at.procon.dip.domain.document.RelationType; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.EnumType; +import jakarta.persistence.Enumerated; +import jakarta.persistence.FetchType; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.JoinColumn; +import jakarta.persistence.ManyToOne; +import jakarta.persistence.PrePersist; +import jakarta.persistence.Table; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +/** + * Directed relationship between two canonical documents. + */ +@Entity +@Table(schema = SchemaNames.DOC, name = "doc_relation", indexes = { + @Index(name = "idx_doc_relation_parent", columnList = "parent_document_id"), + @Index(name = "idx_doc_relation_child", columnList = "child_document_id"), + @Index(name = "idx_doc_relation_type", columnList = "relation_type") +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class DocumentRelation { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @ManyToOne(fetch = FetchType.LAZY, optional = false) + @JoinColumn(name = "parent_document_id", nullable = false) + private Document parentDocument; + + @ManyToOne(fetch = FetchType.LAZY, optional = false) + @JoinColumn(name = "child_document_id", nullable = false) + private Document childDocument; + + @Enumerated(EnumType.STRING) + @Column(name = "relation_type", nullable = false, length = 64) + private RelationType relationType; + + @Column(name = "sort_order") + private Integer sortOrder; + + @Column(name = "relation_metadata", columnDefinition = "TEXT") + private 
String relationMetadata; + + @Builder.Default + @Column(name = "created_at", nullable = false, updatable = false) + private OffsetDateTime createdAt = OffsetDateTime.now(); + + @PrePersist + protected void onCreate() { + createdAt = OffsetDateTime.now(); + } +} diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentSource.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentSource.java new file mode 100644 index 0000000..fff1e52 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentSource.java @@ -0,0 +1,85 @@ +package at.procon.dip.domain.document.entity; + +import at.procon.dip.architecture.SchemaNames; +import at.procon.dip.domain.document.SourceType; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.EnumType; +import jakarta.persistence.Enumerated; +import jakarta.persistence.FetchType; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.JoinColumn; +import jakarta.persistence.ManyToOne; +import jakarta.persistence.PrePersist; +import jakarta.persistence.Table; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +/** + * Provenance row for a canonical document. 
+ */ +@Entity +@Table(schema = SchemaNames.DOC, name = "doc_source", indexes = { + @Index(name = "idx_doc_source_document", columnList = "document_id"), + @Index(name = "idx_doc_source_type", columnList = "source_type"), + @Index(name = "idx_doc_source_external_id", columnList = "external_source_id"), + @Index(name = "idx_doc_source_received_at", columnList = "received_at"), + @Index(name = "idx_doc_source_parent_source", columnList = "parent_source_id") +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class DocumentSource { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @ManyToOne(fetch = FetchType.LAZY, optional = false) + @JoinColumn(name = "document_id", nullable = false) + private Document document; + + @Enumerated(EnumType.STRING) + @Column(name = "source_type", nullable = false, length = 64) + private SourceType sourceType; + + @Column(name = "external_source_id", length = 500) + private String externalSourceId; + + @Column(name = "source_uri", columnDefinition = "TEXT") + private String sourceUri; + + @Column(name = "source_filename", length = 1000) + private String sourceFilename; + + @Column(name = "parent_source_id") + private UUID parentSourceId; + + @Column(name = "import_batch_id", length = 255) + private String importBatchId; + + @Column(name = "received_at") + private OffsetDateTime receivedAt; + + @Builder.Default + @Column(name = "created_at", nullable = false, updatable = false) + private OffsetDateTime createdAt = OffsetDateTime.now(); + + @PrePersist + protected void onCreate() { + createdAt = OffsetDateTime.now(); + if (receivedAt == null) { + receivedAt = createdAt; + } + } +} diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentTextRepresentation.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentTextRepresentation.java new file mode 100644 index 0000000..cfb4774 --- /dev/null +++ 
b/src/main/java/at/procon/dip/domain/document/entity/DocumentTextRepresentation.java @@ -0,0 +1,98 @@ +package at.procon.dip.domain.document.entity; + +import at.procon.dip.architecture.SchemaNames; +import at.procon.dip.domain.document.RepresentationType; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.EnumType; +import jakarta.persistence.Enumerated; +import jakarta.persistence.FetchType; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.JoinColumn; +import jakarta.persistence.ManyToOne; +import jakarta.persistence.PrePersist; +import jakarta.persistence.Table; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +/** + * Search-oriented text derived from a canonical document. 
+ */ +@Entity +@Table(schema = SchemaNames.DOC, name = "doc_text_representation", indexes = { + @Index(name = "idx_doc_text_repr_document", columnList = "document_id"), + @Index(name = "idx_doc_text_repr_content", columnList = "content_id"), + @Index(name = "idx_doc_text_repr_type", columnList = "representation_type"), + @Index(name = "idx_doc_text_repr_primary", columnList = "is_primary") +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class DocumentTextRepresentation { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @ManyToOne(fetch = FetchType.LAZY, optional = false) + @JoinColumn(name = "document_id", nullable = false) + private Document document; + + @ManyToOne(fetch = FetchType.LAZY) + @JoinColumn(name = "content_id") + private DocumentContent content; + + @Enumerated(EnumType.STRING) + @Column(name = "representation_type", nullable = false, length = 64) + private RepresentationType representationType; + + @Column(name = "builder_key", length = 255) + private String builderKey; + + @Column(name = "language_code", length = 16) + private String languageCode; + + @Column(name = "token_count") + private Integer tokenCount; + + @Column(name = "char_count") + private Integer charCount; + + @Column(name = "chunk_index") + private Integer chunkIndex; + + @Column(name = "chunk_start_offset") + private Integer chunkStartOffset; + + @Column(name = "chunk_end_offset") + private Integer chunkEndOffset; + + @Builder.Default + @Column(name = "is_primary", nullable = false) + private boolean primaryRepresentation = false; + + @Column(name = "text_body", columnDefinition = "TEXT", nullable = false) + private String textBody; + + @Builder.Default + @Column(name = "created_at", nullable = false, updatable = false) + private OffsetDateTime createdAt = OffsetDateTime.now(); + + @PrePersist + protected void onCreate() { + createdAt = OffsetDateTime.now(); + if (charCount == null && textBody != null) { + charCount = 
textBody.length(); + } + } +} diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentContentRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentContentRepository.java new file mode 100644 index 0000000..3f67cf2 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentContentRepository.java @@ -0,0 +1,17 @@ +package at.procon.dip.domain.document.repository; + +import at.procon.dip.domain.document.ContentRole; +import at.procon.dip.domain.document.entity.DocumentContent; +import java.util.List; +import java.util.Optional; +import java.util.UUID; +import org.springframework.data.jpa.repository.JpaRepository; + +public interface DocumentContentRepository extends JpaRepository<DocumentContent, UUID> { + + List<DocumentContent> findByDocument_Id(UUID documentId); + + List<DocumentContent> findByDocument_IdAndContentRole(UUID documentId, ContentRole contentRole); + + Optional<DocumentContent> findByContentHash(String contentHash); +} diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingModelRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingModelRepository.java new file mode 100644 index 0000000..833c859 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingModelRepository.java @@ -0,0 +1,11 @@ +package at.procon.dip.domain.document.repository; + +import at.procon.dip.domain.document.entity.DocumentEmbeddingModel; +import java.util.Optional; +import java.util.UUID; +import org.springframework.data.jpa.repository.JpaRepository; + +public interface DocumentEmbeddingModelRepository extends JpaRepository<DocumentEmbeddingModel, UUID> { + + Optional<DocumentEmbeddingModel> findByModelKey(String modelKey); +} diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java new file mode 100644 index 0000000..e5e1b99 --- /dev/null +++
b/src/main/java/at/procon/dip/domain/document/repository/DocumentEmbeddingRepository.java @@ -0,0 +1,55 @@ +package at.procon.dip.domain.document.repository; + +import at.procon.dip.domain.document.EmbeddingStatus; +import at.procon.dip.domain.document.entity.DocumentEmbedding; +import java.time.OffsetDateTime; +import java.util.List; +import java.util.Optional; +import java.util.UUID; +import org.springframework.data.domain.Pageable; +import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.data.jpa.repository.Modifying; +import org.springframework.data.jpa.repository.Query; +import org.springframework.data.repository.query.Param; + +public interface DocumentEmbeddingRepository extends JpaRepository { + + List findByDocument_Id(UUID documentId); + + List findByRepresentation_Id(UUID representationId); + + List findByEmbeddingStatus(EmbeddingStatus embeddingStatus); + + Optional findByRepresentation_IdAndModel_Id(UUID representationId, UUID modelId); + + @Query("SELECT e.id FROM DocumentEmbedding e WHERE e.embeddingStatus = :status ORDER BY e.createdAt ASC") + List findIdsByEmbeddingStatus(@Param("status") EmbeddingStatus status, Pageable pageable); + + @Query("SELECT e FROM DocumentEmbedding e " + + "JOIN FETCH e.document d " + + "JOIN FETCH e.representation r " + + "JOIN FETCH e.model m " + + "WHERE e.id = :embeddingId") + Optional findDetailedById(@Param("embeddingId") UUID embeddingId); + + @Modifying + @Query(value = "UPDATE doc.doc_embedding SET embedding_vector = CAST(:vectorData AS vector), " + + "embedding_status = 'COMPLETED', embedded_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP, " + + "error_message = NULL, token_count = :tokenCount, embedding_dimensions = :dimensions WHERE id = :id", + nativeQuery = true) + int updateEmbeddingVector(@Param("id") UUID id, + @Param("vectorData") String vectorData, + @Param("tokenCount") Integer tokenCount, + @Param("dimensions") Integer dimensions); + + @Modifying + 
@Query("UPDATE DocumentEmbedding e SET e.embeddingStatus = :status, e.errorMessage = :errorMessage, " + + "e.embeddedAt = :embeddedAt, e.updatedAt = CURRENT_TIMESTAMP WHERE e.id = :embeddingId") + /** JPQL bulk update of status, error message and embeddedAt for one embedding id; also sets updatedAt to the current timestamp. Returns the update count. */ int updateEmbeddingStatus(@Param("embeddingId") UUID embeddingId, + @Param("status") EmbeddingStatus status, + @Param("errorMessage") String errorMessage, + @Param("embeddedAt") OffsetDateTime embeddedAt); + + /** Aggregates embeddings per status; each result element carries the status and its COUNT, in that order. */ @Query("SELECT e.embeddingStatus, COUNT(e) FROM DocumentEmbedding e GROUP BY e.embeddingStatus") + List countByEmbeddingStatus(); +} diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java new file mode 100644 index 0000000..a039470 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentRelationRepository.java @@ -0,0 +1,16 @@ +package at.procon.dip.domain.document.repository; + +import at.procon.dip.domain.document.RelationType; +import at.procon.dip.domain.document.entity.DocumentRelation; +import java.util.List; +import java.util.UUID; +import org.springframework.data.jpa.repository.JpaRepository; + +/** Spring Data JPA repository for document relations, with derived finders over the parent and child ends of a relation. */ public interface DocumentRelationRepository extends JpaRepository { + + /** Relations in which the given document is the parent. */ List findByParentDocument_Id(UUID parentDocumentId); + + /** Relations in which the given document is the child. */ List findByChildDocument_Id(UUID childDocumentId); + + /** Relations of one type in which the given document is the parent. */ List findByParentDocument_IdAndRelationType(UUID parentDocumentId, RelationType relationType); +} diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java new file mode 100644 index 0000000..6746b75 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentRepository.java @@ -0,0 +1,31 @@ +package at.procon.dip.domain.document.repository; + +import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.domain.document.DocumentFamily; +import 
at.procon.dip.domain.document.DocumentStatus; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.entity.Document; +import java.util.Collection; +import java.util.List; +import java.util.Optional; +import java.util.UUID; +import org.springframework.data.jpa.repository.JpaRepository; + +/** Spring Data JPA repository for the document root; all finders are derived from property names (dedup hash, type, family, status, visibility, owner tenant key). */ public interface DocumentRepository extends JpaRepository { + + /** Looks a document up by its dedup hash. NOTE(review): the Optional return presumes the hash is unique - verify the constraint. */ Optional findByDedupHash(String dedupHash); + + /** Existence check on the dedup hash without loading the entity. */ boolean existsByDedupHash(String dedupHash); + + List findByDocumentType(DocumentType documentType); + + List findByDocumentFamily(DocumentFamily documentFamily); + + List findByStatus(DocumentStatus status); + + List findByVisibility(DocumentVisibility visibility); + + /** Documents whose owner tenant has exactly the given tenant key. */ List findByOwnerTenant_TenantKey(String tenantKey); + + /** Documents whose owner tenant key is contained in the given collection. */ List findByOwnerTenant_TenantKeyIn(Collection tenantKeys); +} diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentSourceRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentSourceRepository.java new file mode 100644 index 0000000..31e100d --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentSourceRepository.java @@ -0,0 +1,17 @@ +package at.procon.dip.domain.document.repository; + +import at.procon.dip.domain.document.SourceType; +import at.procon.dip.domain.document.entity.DocumentSource; +import java.util.List; +import java.util.Optional; +import java.util.UUID; +import org.springframework.data.jpa.repository.JpaRepository; + +/** Spring Data JPA repository for document source records, with derived finders by document, source type and external source id. */ public interface DocumentSourceRepository extends JpaRepository { + + List findByDocument_Id(UUID documentId); + + List findBySourceType(SourceType sourceType); + + /** Looks a source up by its external id. NOTE(review): presumes external ids are unique across source types - confirm. */ Optional findByExternalSourceId(String externalSourceId); +} diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java new file mode 100644 index 0000000..8dcbf34 --- /dev/null +++ 
b/src/main/java/at/procon/dip/domain/document/repository/DocumentTextRepresentationRepository.java @@ -0,0 +1,19 @@ +package at.procon.dip.domain.document.repository; + +import at.procon.dip.domain.document.RepresentationType; +import at.procon.dip.domain.document.entity.DocumentTextRepresentation; +import java.util.List; +import java.util.Optional; +import java.util.UUID; +import org.springframework.data.jpa.repository.JpaRepository; + +/** Spring Data JPA repository for text representations of documents; all finders are derived queries. */ public interface DocumentTextRepresentationRepository extends JpaRepository { + + List findByDocument_Id(UUID documentId); + + List findByDocument_IdAndRepresentationType(UUID documentId, RepresentationType representationType); + + /** Every representation flagged as primary, across all documents (unbounded result). */ List findByPrimaryRepresentationTrue(); + + /** The first primary representation found for the given document, if any; no ordering is specified. */ Optional findFirstByDocument_IdAndPrimaryRepresentationTrue(UUID documentId); +} diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentContentService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentContentService.java new file mode 100644 index 0000000..27ee608 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/service/DocumentContentService.java @@ -0,0 +1,45 @@ +package at.procon.dip.domain.document.service; + +import at.procon.dip.domain.document.entity.DocumentContent; +import at.procon.dip.domain.document.repository.DocumentContentRepository; +import at.procon.dip.domain.document.service.command.AddDocumentContentCommand; +import java.util.List; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; + +/** Transactional service that attaches content records to documents and reads them back. */ @Service +@RequiredArgsConstructor +@Transactional +public class DocumentContentService { + + private final DocumentService documentService; + private final DocumentContentRepository contentRepository; + + /** Builds a DocumentContent from the command, resolving the owning document through DocumentService.getRequired, and saves it. */ public DocumentContent addContent(AddDocumentContentCommand command) { + DocumentContent content = DocumentContent.builder() + 
.document(documentService.getRequired(command.documentId())) + .contentRole(command.contentRole()) + .storageType(command.storageType()) + .mimeType(command.mimeType()) + .charsetName(command.charsetName()) + .textContent(command.textContent()) + .binaryRef(command.binaryRef()) + .contentHash(command.contentHash()) + .sizeBytes(command.sizeBytes()) + .build(); + return contentRepository.save(content); + } + + /** Returns the content with the given id or throws IllegalArgumentException when none exists. */ @Transactional(readOnly = true) + public DocumentContent getRequired(UUID contentId) { + return contentRepository.findById(contentId) + .orElseThrow(() -> new IllegalArgumentException("Unknown content id: " + contentId)); + } + + /** Lists all content records of the given document. */ @Transactional(readOnly = true) + public List findByDocument(UUID documentId) { + return contentRepository.findByDocument_Id(documentId); + } +} diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentEmbeddingService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentEmbeddingService.java new file mode 100644 index 0000000..6caf4be --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/service/DocumentEmbeddingService.java @@ -0,0 +1,125 @@ +package at.procon.dip.domain.document.service; + +import at.procon.dip.domain.document.DistanceMetric; +import at.procon.dip.domain.document.EmbeddingStatus; +import at.procon.dip.domain.document.entity.DocumentEmbedding; +import at.procon.dip.domain.document.entity.DocumentEmbeddingModel; +import at.procon.dip.domain.document.repository.DocumentEmbeddingModelRepository; +import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository; +import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand; +import java.time.OffsetDateTime; +import java.util.List; +import java.util.Optional; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; + +/** Transactional service that registers embedding models and drives the status lifecycle (PENDING, PROCESSING, COMPLETED, FAILED, SKIPPED) of embeddings. */ @Service +@RequiredArgsConstructor +@Transactional +public 
class DocumentEmbeddingService { + + private final DocumentService documentService; + private final DocumentRepresentationService representationService; + private final DocumentEmbeddingRepository embeddingRepository; + private final DocumentEmbeddingModelRepository modelRepository; + + /** Upserts a model by model key: reuses the stored row when the key is known, otherwise starts from a new instance; all other fields are overwritten from the command and a null distance metric falls back to COSINE. */ public DocumentEmbeddingModel registerModel(RegisterEmbeddingModelCommand command) { + DocumentEmbeddingModel model = modelRepository.findByModelKey(command.modelKey()) + .orElseGet(DocumentEmbeddingModel::new); + model.setModelKey(command.modelKey()); + model.setProvider(command.provider()); + model.setDisplayName(command.displayName()); + model.setDimensions(command.dimensions()); + model.setDistanceMetric(command.distanceMetric() == null ? DistanceMetric.COSINE : command.distanceMetric()); + model.setQueryPrefixRequired(command.queryPrefixRequired()); + model.setActive(command.active()); + return modelRepository.save(model); + } + + /** Creates and saves a PENDING embedding for the given document, representation and model, copying the dimension count from the model. Each id must exist, otherwise the respective getRequired call throws IllegalArgumentException. */ public DocumentEmbedding createPendingEmbedding(UUID documentId, UUID representationId, UUID modelId) { + DocumentEmbeddingModel model = getRequiredModel(modelId); + DocumentEmbedding embedding = DocumentEmbedding.builder() + .document(documentService.getRequired(documentId)) + .representation(representationService.getRequired(representationId)) + .model(model) + .embeddingDimensions(model.getDimensions()) + .embeddingStatus(EmbeddingStatus.PENDING) + .build(); + return embeddingRepository.save(embedding); + } + + /** Resets the embedding already stored for the (representation, model) pair back to PENDING, or creates a new pending one when none exists. NOTE(review): the lookup followed by insert is not atomic; two concurrent callers could both insert unless a unique constraint on the pair exists - confirm. */ public DocumentEmbedding ensurePendingEmbedding(UUID documentId, UUID representationId, UUID modelId) { + Optional existing = embeddingRepository.findByRepresentation_IdAndModel_Id(representationId, modelId); + if (existing.isPresent()) { + DocumentEmbedding embedding = existing.get(); + embedding.setDocument(documentService.getRequired(documentId)); + embedding.setRepresentation(representationService.getRequired(representationId)); + embedding.setModel(getRequiredModel(modelId)); + 
embedding.setEmbeddingDimensions(embedding.getModel().getDimensions()); + embedding.setEmbeddingStatus(EmbeddingStatus.PENDING); + embedding.setErrorMessage(null); + embedding.setEmbeddedAt(null); + return embeddingRepository.save(embedding); + } + /* No stored embedding for this pair: fall through to a fresh insert. */ return createPendingEmbedding(documentId, representationId, modelId); + } + + /** Marks the embedding COMPLETED with the given token count, stamps embeddedAt with the current time and clears any error message. This method does not touch the vector itself. */ public DocumentEmbedding markCompleted(UUID embeddingId, Integer tokenCount) { + DocumentEmbedding embedding = getRequired(embeddingId); + embedding.setEmbeddingStatus(EmbeddingStatus.COMPLETED); + embedding.setTokenCount(tokenCount); + embedding.setEmbeddedAt(OffsetDateTime.now()); + embedding.setErrorMessage(null); + return embeddingRepository.save(embedding); + } + + /** Marks the embedding FAILED, records the error message and clears embeddedAt. */ public DocumentEmbedding markFailed(UUID embeddingId, String errorMessage) { + DocumentEmbedding embedding = getRequired(embeddingId); + embedding.setEmbeddingStatus(EmbeddingStatus.FAILED); + embedding.setErrorMessage(errorMessage); + embedding.setEmbeddedAt(null); + return embeddingRepository.save(embedding); + } + + /** Marks the embedding PROCESSING and clears any earlier error message; embeddedAt is left unchanged. */ public DocumentEmbedding markProcessing(UUID embeddingId) { + DocumentEmbedding embedding = getRequired(embeddingId); + embedding.setEmbeddingStatus(EmbeddingStatus.PROCESSING); + embedding.setErrorMessage(null); + return embeddingRepository.save(embedding); + } + + /** Marks the embedding SKIPPED, stores the reason in the errorMessage field and stamps embeddedAt with the current time. */ public DocumentEmbedding markSkipped(UUID embeddingId, String reason) { + DocumentEmbedding embedding = getRequired(embeddingId); + embedding.setEmbeddingStatus(EmbeddingStatus.SKIPPED); + embedding.setErrorMessage(reason); + embedding.setEmbeddedAt(OffsetDateTime.now()); + return embeddingRepository.save(embedding); + } + + /** Returns the embedding with the given id or throws IllegalArgumentException when none exists. */ @Transactional(readOnly = true) + public DocumentEmbedding getRequired(UUID embeddingId) { + return embeddingRepository.findById(embeddingId) + .orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId)); + } + + /** Returns the embedding model with the given id or throws IllegalArgumentException when none exists. */ @Transactional(readOnly = true) + public DocumentEmbeddingModel getRequiredModel(UUID modelId) { + return modelRepository.findById(modelId) + 
.orElseThrow(() -> new IllegalArgumentException("Unknown embedding model id: " + modelId)); + } + + + /** Looks a model up by its key or throws IllegalArgumentException when the key is unknown. NOTE(review): despite the method name, nothing here checks the active flag of the model, so inactive models are returned as well - confirm whether they should be rejected. */ @Transactional(readOnly = true) + public DocumentEmbeddingModel findActiveModelByKey(String modelKey) { + return modelRepository.findByModelKey(modelKey) + .orElseThrow(() -> new IllegalArgumentException("Unknown embedding model key: " + modelKey)); + } + + /** Returns every embedding currently in PENDING status; the list is not paged. */ @Transactional(readOnly = true) + public List findPendingEmbeddings() { + return embeddingRepository.findByEmbeddingStatus(EmbeddingStatus.PENDING); + } +} + diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java new file mode 100644 index 0000000..f9c1bb1 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/service/DocumentRelationService.java @@ -0,0 +1,35 @@ +package at.procon.dip.domain.document.service; + +import at.procon.dip.domain.document.entity.DocumentRelation; +import at.procon.dip.domain.document.repository.DocumentRelationRepository; +import at.procon.dip.domain.document.service.command.CreateDocumentRelationCommand; +import java.util.List; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; + +/** Transactional service that creates parent/child relations between documents and lists the children of a parent. */ @Service +@RequiredArgsConstructor +@Transactional +public class DocumentRelationService { + + private final DocumentService documentService; + private final DocumentRelationRepository relationRepository; + + /** Builds and saves a relation between two existing documents; both ids are resolved through DocumentService.getRequired, so an unknown id throws IllegalArgumentException. NOTE(review): nothing here prevents a document from being related to itself - confirm that is acceptable. */ public DocumentRelation createRelation(CreateDocumentRelationCommand command) { + DocumentRelation relation = DocumentRelation.builder() + .parentDocument(documentService.getRequired(command.parentDocumentId())) + .childDocument(documentService.getRequired(command.childDocumentId())) + .relationType(command.relationType()) + .sortOrder(command.sortOrder()) + .relationMetadata(command.relationMetadata()) + .build(); + return 
relationRepository.save(relation); + } + + /** Lists the relations in which the given document is the parent; the result elements are relations, not the child documents themselves. */ @Transactional(readOnly = true) + public List findChildren(UUID parentDocumentId) { + return relationRepository.findByParentDocument_Id(parentDocumentId); + } +} diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java new file mode 100644 index 0000000..8111e08 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/service/DocumentRepresentationService.java @@ -0,0 +1,50 @@ +package at.procon.dip.domain.document.service; + +import at.procon.dip.domain.document.entity.DocumentContent; +import at.procon.dip.domain.document.entity.DocumentTextRepresentation; +import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository; +import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand; +import java.util.List; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; + +/** Transactional service that stores text representations of documents and reads them back. */ @Service +@RequiredArgsConstructor +@Transactional +public class DocumentRepresentationService { + + private final DocumentService documentService; + private final DocumentContentService contentService; + private final DocumentTextRepresentationRepository representationRepository; + + /** Saves a text representation for the document named in the command; the linked content is resolved only when the command carries a content id, otherwise the representation is stored without one. */ public DocumentTextRepresentation addRepresentation(AddDocumentTextRepresentationCommand command) { + DocumentContent content = command.contentId() == null ? 
null : contentService.getRequired(command.contentId()); + DocumentTextRepresentation representation = DocumentTextRepresentation.builder() + .document(documentService.getRequired(command.documentId())) + .content(content) + .representationType(command.representationType()) + .builderKey(command.builderKey()) + .languageCode(command.languageCode()) + .tokenCount(command.tokenCount()) + .chunkIndex(command.chunkIndex()) + .chunkStartOffset(command.chunkStartOffset()) + .chunkEndOffset(command.chunkEndOffset()) + .primaryRepresentation(command.primaryRepresentation()) + .textBody(command.textBody()) + .build(); + return representationRepository.save(representation); + } + + /** Returns the representation with the given id or throws IllegalArgumentException when none exists. */ @Transactional(readOnly = true) + public DocumentTextRepresentation getRequired(UUID representationId) { + return representationRepository.findById(representationId) + .orElseThrow(() -> new IllegalArgumentException("Unknown representation id: " + representationId)); + } + + /** Lists all text representations of the given document. */ @Transactional(readOnly = true) + public List findByDocument(UUID documentId) { + return representationRepository.findByDocument_Id(documentId); + } +} diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentService.java new file mode 100644 index 0000000..22bfe22 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/service/DocumentService.java @@ -0,0 +1,75 @@ +package at.procon.dip.domain.document.service; + +import at.procon.dip.domain.document.CanonicalDocumentMetadata; +import at.procon.dip.domain.document.DocumentStatus; +import at.procon.dip.domain.document.entity.Document; +import at.procon.dip.domain.document.repository.DocumentRepository; +import at.procon.dip.domain.document.service.command.CreateDocumentCommand; +import at.procon.dip.domain.tenant.entity.DocumentTenant; +import at.procon.dip.domain.tenant.repository.DocumentTenantRepository; +import java.util.List; +import java.util.UUID; +import 
lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; + +/** Transactional service for the document root: creation, status updates and lookups. */ @Service +@RequiredArgsConstructor +@Transactional +public class DocumentService { + + private final DocumentRepository documentRepository; + private final DocumentTenantRepository tenantRepository; + + /** Creates and saves a document from the command; the owner tenant is resolved from its key and the status falls back to RECEIVED when the command carries none. */ public Document create(CreateDocumentCommand command) { + DocumentTenant ownerTenant = resolveOwnerTenant(command.ownerTenantKey()); + Document document = Document.builder() + .ownerTenant(ownerTenant) + .visibility(command.visibility()) + .documentType(command.documentType()) + .documentFamily(command.documentFamily()) + .status(command.status() == null ? DocumentStatus.RECEIVED : command.status()) + .title(command.title()) + .summary(command.summary()) + .languageCode(command.languageCode()) + .mimeType(command.mimeType()) + .businessKey(command.businessKey()) + .dedupHash(command.dedupHash()) + .build(); + return documentRepository.save(document); + } + + /** Saves the given document as is and returns the instance handed back by the repository. */ public Document save(Document document) { + return documentRepository.save(document); + } + + /** Loads the document, sets its status and saves it; an unknown id throws IllegalArgumentException. */ public Document updateStatus(UUID documentId, DocumentStatus status) { + Document document = getRequired(documentId); + document.setStatus(status); + return documentRepository.save(document); + } + + /** Returns the document with the given id or throws IllegalArgumentException when none exists. */ @Transactional(readOnly = true) + public Document getRequired(UUID documentId) { + return documentRepository.findById(documentId) + .orElseThrow(() -> new IllegalArgumentException("Unknown document id: " + documentId)); + } + + /** Returns all documents; the list is not paged. */ @Transactional(readOnly = true) + public List findAll() { + return documentRepository.findAll(); + } + + /** Returns the canonical metadata of the document, as produced by Document.toCanonicalMetadata. */ @Transactional(readOnly = true) + public CanonicalDocumentMetadata getMetadata(UUID documentId) { + return getRequired(documentId).toCanonicalMetadata(); + } + + /** Maps a tenant key to its tenant: a null or blank key yields null (document without owner), an unknown key throws IllegalArgumentException. */ private DocumentTenant resolveOwnerTenant(String ownerTenantKey) { + if (ownerTenantKey == null || ownerTenantKey.isBlank()) { + return null; + } + return 
tenantRepository.findByTenantKey(ownerTenantKey) + .orElseThrow(() -> new IllegalArgumentException("Unknown tenant key: " + ownerTenantKey)); + } +} diff --git a/src/main/java/at/procon/dip/domain/document/service/DocumentSourceService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentSourceService.java new file mode 100644 index 0000000..c26e226 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/service/DocumentSourceService.java @@ -0,0 +1,38 @@ +package at.procon.dip.domain.document.service; + +import at.procon.dip.domain.document.entity.DocumentSource; +import at.procon.dip.domain.document.repository.DocumentSourceRepository; +import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand; +import java.util.List; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; + +/** Transactional service that records where a document came from and lists the sources of a document. */ @Service +@RequiredArgsConstructor +@Transactional +public class DocumentSourceService { + + private final DocumentService documentService; + private final DocumentSourceRepository sourceRepository; + + /** Builds a DocumentSource from the command, resolving the owning document through DocumentService.getRequired, and saves it. The parent source id is stored as given; it is not validated here. */ public DocumentSource addSource(AddDocumentSourceCommand command) { + DocumentSource source = DocumentSource.builder() + .document(documentService.getRequired(command.documentId())) + .sourceType(command.sourceType()) + .externalSourceId(command.externalSourceId()) + .sourceUri(command.sourceUri()) + .sourceFilename(command.sourceFilename()) + .parentSourceId(command.parentSourceId()) + .importBatchId(command.importBatchId()) + .receivedAt(command.receivedAt()) + .build(); + return sourceRepository.save(source); + } + + /** Lists all source records of the given document. */ @Transactional(readOnly = true) + public List findByDocument(UUID documentId) { + return sourceRepository.findByDocument_Id(documentId); + } +} diff --git a/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentContentCommand.java 
b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentContentCommand.java new file mode 100644 index 0000000..284f9be --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentContentCommand.java @@ -0,0 +1,18 @@ +package at.procon.dip.domain.document.service.command; + +import at.procon.dip.domain.document.ContentRole; +import at.procon.dip.domain.document.StorageType; +import java.util.UUID; + +/** Input record for DocumentContentService.addContent; documentId must name an existing document, the remaining components are copied onto the content row unchanged. NOTE(review): presumably textContent and binaryRef are alternatives selected by storageType - confirm, nothing enforces it here. */ public record AddDocumentContentCommand( + UUID documentId, + ContentRole contentRole, + StorageType storageType, + String mimeType, + String charsetName, + String textContent, + String binaryRef, + String contentHash, + Long sizeBytes +) { +} diff --git a/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentSourceCommand.java b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentSourceCommand.java new file mode 100644 index 0000000..75961b8 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentSourceCommand.java @@ -0,0 +1,17 @@ +package at.procon.dip.domain.document.service.command; + +import at.procon.dip.domain.document.SourceType; +import java.time.OffsetDateTime; +import java.util.UUID; + +/** Input record for DocumentSourceService.addSource; documentId must name an existing document, the remaining components are copied onto the source row unchanged. */ public record AddDocumentSourceCommand( + UUID documentId, + SourceType sourceType, + String externalSourceId, + String sourceUri, + String sourceFilename, + UUID parentSourceId, + String importBatchId, + OffsetDateTime receivedAt +) { +} diff --git a/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentTextRepresentationCommand.java b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentTextRepresentationCommand.java new file mode 100644 index 0000000..3106218 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/service/command/AddDocumentTextRepresentationCommand.java @@ -0,0 +1,19 @@ +package at.procon.dip.domain.document.service.command; + +import at.procon.dip.domain.document.RepresentationType; +import 
java.util.UUID; + +/** Input record for DocumentRepresentationService.addRepresentation; documentId must name an existing document, while contentId may be null, in which case the representation is stored without a linked content row. NOTE(review): the chunk offsets are presumably character positions into the source text - confirm the unit. */ public record AddDocumentTextRepresentationCommand( + UUID documentId, + UUID contentId, + RepresentationType representationType, + String builderKey, + String languageCode, + Integer tokenCount, + Integer chunkIndex, + Integer chunkStartOffset, + Integer chunkEndOffset, + boolean primaryRepresentation, + String textBody +) { +} diff --git a/src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentCommand.java b/src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentCommand.java new file mode 100644 index 0000000..77345d3 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentCommand.java @@ -0,0 +1,24 @@ +package at.procon.dip.domain.document.service.command; + +import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentStatus; +import at.procon.dip.domain.document.DocumentType; + +/** + * Minimal Phase 1 command for creating the canonical document root. 
+ */ +public record CreateDocumentCommand( + /* A null or blank key creates a document without owner tenant; an unknown key is rejected by DocumentService. */ String ownerTenantKey, + DocumentVisibility visibility, + DocumentType documentType, + DocumentFamily documentFamily, + /* May be null; DocumentService.create then uses RECEIVED. */ DocumentStatus status, + String title, + String summary, + String languageCode, + String mimeType, + String businessKey, + String dedupHash +) { +} diff --git a/src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentRelationCommand.java b/src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentRelationCommand.java new file mode 100644 index 0000000..8bc0d39 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/service/command/CreateDocumentRelationCommand.java @@ -0,0 +1,13 @@ +package at.procon.dip.domain.document.service.command; + +import at.procon.dip.domain.document.RelationType; +import java.util.UUID; + +/** Input record for DocumentRelationService.createRelation; both document ids must name existing documents. NOTE(review): relationMetadata is an opaque String here - presumably serialized JSON, confirm the expected format. */ public record CreateDocumentRelationCommand( + UUID parentDocumentId, + UUID childDocumentId, + RelationType relationType, + Integer sortOrder, + String relationMetadata +) { +} diff --git a/src/main/java/at/procon/dip/domain/document/service/command/RegisterEmbeddingModelCommand.java b/src/main/java/at/procon/dip/domain/document/service/command/RegisterEmbeddingModelCommand.java new file mode 100644 index 0000000..fbb52b9 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/service/command/RegisterEmbeddingModelCommand.java @@ -0,0 +1,14 @@ +package at.procon.dip.domain.document.service.command; + +import at.procon.dip.domain.document.DistanceMetric; + +/** Input record for DocumentEmbeddingService.registerModel; modelKey identifies the model to create or update, and a null distanceMetric is replaced by COSINE in the service. */ public record RegisterEmbeddingModelCommand( + String modelKey, + String provider, + String displayName, + Integer dimensions, + DistanceMetric distanceMetric, + boolean queryPrefixRequired, + boolean active +) { +} diff --git a/src/main/java/at/procon/dip/domain/tenant/TenantRef.java b/src/main/java/at/procon/dip/domain/tenant/TenantRef.java new file mode 100644 index 0000000..0e56fc1 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/tenant/TenantRef.java @@ -0,0 +1,11 @@ +package 
at.procon.dip.domain.tenant; + +/** + * Canonical tenant reference used to express document ownership. Plain value record with three String components and no validation. NOTE(review): tenantId is a String here whereas the DocumentTenant entity uses a UUID id - confirm the intended mapping between the two. + */ +public record TenantRef( + String tenantId, + String tenantKey, + String displayName +) { +} diff --git a/src/main/java/at/procon/dip/domain/tenant/entity/DocumentTenant.java b/src/main/java/at/procon/dip/domain/tenant/entity/DocumentTenant.java new file mode 100644 index 0000000..fffc368 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/tenant/entity/DocumentTenant.java @@ -0,0 +1,71 @@ +package at.procon.dip.domain.tenant.entity; + +import at.procon.dip.architecture.SchemaNames; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.PrePersist; +import jakarta.persistence.PreUpdate; +import jakarta.persistence.Table; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +/** + * Canonical owner tenant catalog for the generalized DOC schema. 
+ */ +@Entity +@Table(schema = SchemaNames.DOC, name = "doc_tenant", indexes = { + @Index(name = "idx_doc_tenant_key", columnList = "tenant_key", unique = true), + @Index(name = "idx_doc_tenant_active", columnList = "active") +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class DocumentTenant { + + /** Primary key, generated as a UUID by the persistence provider. */ @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + /** Mandatory tenant key, at most 120 characters. NOTE(review): uniqueness is declared twice, by the unique index idx_doc_tenant_key and by unique = true on the column - confirm both are wanted. */ @Column(name = "tenant_key", nullable = false, unique = true, length = 120) + private String tenantKey; + + @Column(name = "display_name", nullable = false, length = 255) + private String displayName; + + @Column(name = "description", columnDefinition = "TEXT") + private String description; + + /** Defaults to true, both for the builder and for the no-argument constructor. */ @Builder.Default + @Column(name = "active", nullable = false) + private boolean active = true; + + /** Initialised at construction and overwritten by onCreate when the entity is first persisted; the column is not updatable afterwards. */ @Builder.Default + @Column(name = "created_at", nullable = false, updatable = false) + private OffsetDateTime createdAt = OffsetDateTime.now(); + + /** Initialised at construction and refreshed by onCreate and onUpdate. */ @Builder.Default + @Column(name = "updated_at", nullable = false) + private OffsetDateTime updatedAt = OffsetDateTime.now(); + + /** Pre-persist callback: overwrites both timestamps with the current time. NOTE(review): the two values come from separate now() calls and can therefore differ slightly. */ @PrePersist + protected void onCreate() { + createdAt = OffsetDateTime.now(); + updatedAt = OffsetDateTime.now(); + } + + /** Pre-update callback: refreshes updatedAt with the current time. */ @PreUpdate + protected void onUpdate() { + updatedAt = OffsetDateTime.now(); + } +} diff --git a/src/main/java/at/procon/dip/domain/tenant/repository/DocumentTenantRepository.java b/src/main/java/at/procon/dip/domain/tenant/repository/DocumentTenantRepository.java new file mode 100644 index 0000000..7bd8299 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/tenant/repository/DocumentTenantRepository.java @@ -0,0 +1,13 @@ +package at.procon.dip.domain.tenant.repository; + +import at.procon.dip.domain.tenant.entity.DocumentTenant; +import java.util.Optional; +import java.util.UUID; +import org.springframework.data.jpa.repository.JpaRepository; + +/** Spring Data JPA repository for tenants, addressed by id or by tenant key. */ public interface DocumentTenantRepository extends JpaRepository { + + /** Derived finder on the tenantKey property; empty when no tenant carries that key. */ Optional findByTenantKey(String tenantKey); + + boolean 
existsByTenantKey(String tenantKey); +} diff --git a/src/main/java/at/procon/dip/domain/tenant/service/DocumentTenantService.java b/src/main/java/at/procon/dip/domain/tenant/service/DocumentTenantService.java new file mode 100644 index 0000000..811e114 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/tenant/service/DocumentTenantService.java @@ -0,0 +1,45 @@ +package at.procon.dip.domain.tenant.service; + +import at.procon.dip.domain.tenant.entity.DocumentTenant; +import at.procon.dip.domain.tenant.repository.DocumentTenantRepository; +import at.procon.dip.domain.tenant.service.command.CreateTenantCommand; +import java.util.List; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; + +/** Transactional service for the tenant catalog: upsert by tenant key plus lookups by id and key. */ @Service +@RequiredArgsConstructor +@Transactional +public class DocumentTenantService { + + private final DocumentTenantRepository tenantRepository; + + /** Upserts a tenant by tenant key: reuses the stored row when the key is known, otherwise starts from a new instance; display name, description and active flag are always overwritten from the command. */ public DocumentTenant createOrUpdate(CreateTenantCommand command) { + DocumentTenant tenant = tenantRepository.findByTenantKey(command.tenantKey()) + .orElseGet(DocumentTenant::new); + tenant.setTenantKey(command.tenantKey()); + tenant.setDisplayName(command.displayName()); + tenant.setDescription(command.description()); + tenant.setActive(command.active()); + return tenantRepository.save(tenant); + } + + /** Returns the tenant with the given id or throws IllegalArgumentException when none exists. */ @Transactional(readOnly = true) + public DocumentTenant getRequiredById(UUID id) { + return tenantRepository.findById(id) + .orElseThrow(() -> new IllegalArgumentException("Unknown tenant id: " + id)); + } + + /** Returns the tenant with the given key or throws IllegalArgumentException when none exists. */ @Transactional(readOnly = true) + public DocumentTenant getRequiredByTenantKey(String tenantKey) { + return tenantRepository.findByTenantKey(tenantKey) + .orElseThrow(() -> new IllegalArgumentException("Unknown tenant key: " + tenantKey)); + } + + /** Returns all tenants, active or not; the list is not paged. */ @Transactional(readOnly = true) + public List findAll() { + return tenantRepository.findAll(); + } +} diff --git 
a/src/main/java/at/procon/dip/domain/tenant/service/command/CreateTenantCommand.java b/src/main/java/at/procon/dip/domain/tenant/service/command/CreateTenantCommand.java new file mode 100644 index 0000000..ea6fe20 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/tenant/service/command/CreateTenantCommand.java @@ -0,0 +1,9 @@ +package at.procon.dip.domain.tenant.service.command; + +/** Input record for DocumentTenantService.createOrUpdate; tenantKey selects the tenant to create or update and the other components overwrite its fields. */ public record CreateTenantCommand( + String tenantKey, + String displayName, + String description, + boolean active +) { +} diff --git a/src/main/java/at/procon/dip/extraction/spi/DocumentExtractor.java b/src/main/java/at/procon/dip/extraction/spi/DocumentExtractor.java new file mode 100644 index 0000000..934837a --- /dev/null +++ b/src/main/java/at/procon/dip/extraction/spi/DocumentExtractor.java @@ -0,0 +1,13 @@ +package at.procon.dip.extraction.spi; + +import at.procon.dip.domain.document.DocumentType; + +/** + * Type-specific extraction contract. + */ +public interface DocumentExtractor { + + /** Capability check on a document type and MIME type pair; presumably callers query this before invoking extract - confirm against the dispatching code. */ boolean supports(DocumentType documentType, String mimeType); + + /** Runs the extraction for one request and returns its result; the error contract (exception versus warnings in the result) is not defined here - TODO confirm. */ ExtractionResult extract(ExtractionRequest extractionRequest); +} diff --git a/src/main/java/at/procon/dip/extraction/spi/ExtractedStructuredPayload.java b/src/main/java/at/procon/dip/extraction/spi/ExtractedStructuredPayload.java new file mode 100644 index 0000000..5c74505 --- /dev/null +++ b/src/main/java/at/procon/dip/extraction/spi/ExtractedStructuredPayload.java @@ -0,0 +1,12 @@ +package at.procon.dip.extraction.spi; + +import java.util.Map; + +/** + * Type-specific structured payload produced by an extractor. 
+ */ +public record ExtractedStructuredPayload( + String projectionName, + Map attributes +) { +} diff --git a/src/main/java/at/procon/dip/extraction/spi/ExtractionRequest.java b/src/main/java/at/procon/dip/extraction/spi/ExtractionRequest.java new file mode 100644 index 0000000..e6a3e54 --- /dev/null +++ b/src/main/java/at/procon/dip/extraction/spi/ExtractionRequest.java @@ -0,0 +1,15 @@ +package at.procon.dip.extraction.spi; + +import at.procon.dip.classification.spi.DetectionResult; +import at.procon.dip.ingestion.spi.SourceDescriptor; + +/** + * Input to a document extractor. + */ +public record ExtractionRequest( + SourceDescriptor sourceDescriptor, + DetectionResult detectionResult, + String textContent, + byte[] binaryContent +) { +}
diff --git a/src/main/java/at/procon/dip/extraction/spi/ExtractionResult.java b/src/main/java/at/procon/dip/extraction/spi/ExtractionResult.java
new file mode 100644
index 0000000..c5c0ac1
--- /dev/null
+++ b/src/main/java/at/procon/dip/extraction/spi/ExtractionResult.java
@@ -0,0 +1,15 @@
+package at.procon.dip.extraction.spi;
+
+import at.procon.dip.domain.document.ContentRole;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Output of a document extractor before normalization and persistence.
+ */
+public record ExtractionResult(
+        Map<ContentRole, String> derivedTextByRole, // extracted text keyed by the content role it belongs to
+        List<ExtractedStructuredPayload> structuredPayloads, // type-specific structured projections produced alongside the text
+        List<String> warnings // NOTE(review): element type assumed to be String (warning messages) - confirm against callers
+) {
+}
diff --git a/src/main/java/at/procon/dip/ingestion/spi/DocumentIngestionAdapter.java b/src/main/java/at/procon/dip/ingestion/spi/DocumentIngestionAdapter.java new file mode 100644 index 0000000..4c8be89 --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/spi/DocumentIngestionAdapter.java @@ -0,0 +1,11 @@ +package at.procon.dip.ingestion.spi; + +/** + * Extension point for source-specific import adapters.
+ */ +public interface DocumentIngestionAdapter { + + boolean supports(SourceDescriptor sourceDescriptor); + + IngestionResult ingest(SourceDescriptor sourceDescriptor); +} diff --git a/src/main/java/at/procon/dip/ingestion/spi/IngestionResult.java b/src/main/java/at/procon/dip/ingestion/spi/IngestionResult.java new file mode 100644 index 0000000..fff5bc4 --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/spi/IngestionResult.java @@ -0,0 +1,13 @@ +package at.procon.dip.ingestion.spi; + +import at.procon.dip.domain.document.CanonicalDocumentMetadata; +import java.util.List; + +/** + * Result of an ingestion adapter execution. + */ +public record IngestionResult( + List documents, + List warnings +) { +} diff --git a/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java b/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java new file mode 100644 index 0000000..644bd5e --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/spi/SourceDescriptor.java @@ -0,0 +1,19 @@ +package at.procon.dip.ingestion.spi; + +import at.procon.dip.domain.access.DocumentAccessContext; +import at.procon.dip.domain.document.SourceType; +import java.util.Map; + +/** + * Describes a source object that should be ingested into the canonical document model. + */ +public record SourceDescriptor( + DocumentAccessContext accessContext, + SourceType sourceType, + String sourceIdentifier, + String sourceUri, + String fileName, + String mediaType, + Map attributes +) { +} diff --git a/src/main/java/at/procon/dip/migration/MigrationStrategyMode.java b/src/main/java/at/procon/dip/migration/MigrationStrategyMode.java new file mode 100644 index 0000000..57e5b53 --- /dev/null +++ b/src/main/java/at/procon/dip/migration/MigrationStrategyMode.java @@ -0,0 +1,12 @@ +package at.procon.dip.migration; + +/** + * Phase 0 decision for introducing the generalized model incrementally. 
+ */ +public enum MigrationStrategyMode { + ADDITIVE_SCHEMA, + DUAL_WRITE, + BACKFILL, + CUTOVER, + RETIRE_LEGACY +} diff --git a/src/main/java/at/procon/dip/normalization/spi/RepresentationBuildRequest.java b/src/main/java/at/procon/dip/normalization/spi/RepresentationBuildRequest.java new file mode 100644 index 0000000..13358e4 --- /dev/null +++ b/src/main/java/at/procon/dip/normalization/spi/RepresentationBuildRequest.java @@ -0,0 +1,15 @@ +package at.procon.dip.normalization.spi; + +import at.procon.dip.classification.spi.DetectionResult; +import at.procon.dip.extraction.spi.ExtractionResult; +import at.procon.dip.ingestion.spi.SourceDescriptor; + +/** + * Input for text-representation builders. + */ +public record RepresentationBuildRequest( + SourceDescriptor sourceDescriptor, + DetectionResult detectionResult, + ExtractionResult extractionResult +) { +} diff --git a/src/main/java/at/procon/dip/normalization/spi/TextRepresentationBuilder.java b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationBuilder.java new file mode 100644 index 0000000..c7fa594 --- /dev/null +++ b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationBuilder.java @@ -0,0 +1,14 @@ +package at.procon.dip.normalization.spi; + +import at.procon.dip.domain.document.DocumentType; +import java.util.List; + +/** + * Builds search-oriented text representations independently from raw extraction. 
+ */ +public interface TextRepresentationBuilder { + + boolean supports(DocumentType documentType); + + List build(RepresentationBuildRequest request); +} diff --git a/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java new file mode 100644 index 0000000..af1f49a --- /dev/null +++ b/src/main/java/at/procon/dip/normalization/spi/TextRepresentationDraft.java @@ -0,0 +1,15 @@ +package at.procon.dip.normalization.spi; + +import at.procon.dip.domain.document.RepresentationType; + +/** + * Candidate text representation for semantic indexing. + */ +public record TextRepresentationDraft( + RepresentationType representationType, + String languageCode, + String textBody, + boolean primary, + Integer chunkIndex +) { +} diff --git a/src/main/java/at/procon/dip/processing/spi/ProcessingStage.java b/src/main/java/at/procon/dip/processing/spi/ProcessingStage.java new file mode 100644 index 0000000..474a259 --- /dev/null +++ b/src/main/java/at/procon/dip/processing/spi/ProcessingStage.java @@ -0,0 +1,14 @@ +package at.procon.dip.processing.spi; + +/** + * Cross-cutting processing stages for generic document orchestration. + */ +public enum ProcessingStage { + INGESTION, + CLASSIFICATION, + EXTRACTION, + NORMALIZATION, + VECTORIZATION, + INDEXING, + SEARCH +} diff --git a/src/main/java/at/procon/dip/search/spi/SearchDocumentScope.java b/src/main/java/at/procon/dip/search/spi/SearchDocumentScope.java new file mode 100644 index 0000000..fd2a373 --- /dev/null +++ b/src/main/java/at/procon/dip/search/spi/SearchDocumentScope.java @@ -0,0 +1,18 @@ +package at.procon.dip.search.spi; + +import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentType; +import java.util.Set; + +/** + * Minimal generic search scope for future hybrid/semantic search services. 
+ */
+public record SearchDocumentScope(
+        Set<String> ownerTenantKeys, // tenant keys are plain strings elsewhere in this patch (findByTenantKey(String))
+        Set<DocumentType> documentTypes, // restrict by document type
+        Set<DocumentFamily> documentFamilies, // restrict by document family
+        Set<DocumentVisibility> visibilities, // restrict by visibility
+        String languageCode // NOTE(review): null/empty handling of these filters is not defined here - confirm with the search service
+) {
+}
diff --git a/src/main/java/at/procon/dip/vectorization/camel/GenericVectorizationRoute.java b/src/main/java/at/procon/dip/vectorization/camel/GenericVectorizationRoute.java new file mode 100644 index 0000000..8330325 --- /dev/null +++ b/src/main/java/at/procon/dip/vectorization/camel/GenericVectorizationRoute.java @@ -0,0 +1,211 @@ +package at.procon.dip.vectorization.camel; + +import at.procon.dip.domain.document.EmbeddingStatus; +import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository; +import at.procon.dip.vectorization.service.DocumentEmbeddingProcessingService; +import com.fasterxml.jackson.annotation.JsonProperty; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.apache.camel.Exchange; +import org.apache.camel.LoggingLevel; +import org.apache.camel.builder.RouteBuilder; +import org.apache.camel.model.dataformat.JsonLibrary; +import org.springframework.data.domain.PageRequest; +import org.springframework.stereotype.Component; + +import at.procon.ted.config.TedProcessorProperties; +import java.util.List; +import java.util.UUID; + +/** + * Phase 2 generic vectorization route. + * Uses DOC.doc_text_representation as the source text and DOC.doc_embedding as the write target.
+ */ +@Component +@RequiredArgsConstructor +@Slf4j +public class GenericVectorizationRoute extends RouteBuilder { + + private static final String ROUTE_ID_TRIGGER = "generic-vectorization-trigger"; + private static final String ROUTE_ID_PROCESSOR = "generic-vectorization-processor"; + private static final String ROUTE_ID_SCHEDULER = "generic-vectorization-scheduler"; + + private final TedProcessorProperties properties; + private final DocumentEmbeddingRepository embeddingRepository; + private final DocumentEmbeddingProcessingService processingService; + + private java.util.concurrent.ExecutorService executorService() { + return java.util.concurrent.Executors.newFixedThreadPool( + 1, + r -> { + Thread thread = new Thread(r); + thread.setName("doc-vectorization-" + thread.getId()); + thread.setDaemon(true); + thread.setPriority(Thread.MAX_PRIORITY); + return thread; + } + ); + } + + @Override + public void configure() { + if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) { + log.info("Phase 2 generic vectorization route disabled"); + return; + } + + log.info("Configuring generic vectorization routes (phase2=true, apiUrl={}, scheduler={}ms)", + properties.getVectorization().getApiUrl(), + properties.getVectorization().getGenericSchedulerPeriodMs()); + + onException(Exception.class) + .handled(true) + .process(exchange -> { + UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class); + Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class); + String error = exception != null ? 
exception.getMessage() : "Unknown vectorization error"; + if (embeddingId != null) { + try { + processingService.markAsFailed(embeddingId, error); + } catch (Exception nested) { + log.warn("Failed to mark embedding {} as failed: {}", embeddingId, nested.getMessage()); + } + } + }) + .to("log:generic-vectorization-error?level=WARN"); + + from("direct:vectorize-embedding") + .routeId(ROUTE_ID_TRIGGER) + .doTry() + .to("seda:vectorize-embedding-async?waitForTaskToComplete=Never&size=1000&blockWhenFull=true&timeout=5000") + .doCatch(Exception.class) + .log(LoggingLevel.WARN, "Failed to queue embedding ${header.embeddingId}: ${exception.message}") + .end(); + + from("seda:vectorize-embedding-async?size=1000") + .routeId(ROUTE_ID_PROCESSOR) + .threads().executorService(executorService()) + .process(exchange -> { + UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class); + DocumentEmbeddingProcessingService.EmbeddingPayload payload = + processingService.prepareEmbeddingForVectorization(embeddingId); + if (payload == null) { + exchange.setProperty("skipVectorization", true); + return; + } + + EmbedRequest request = new EmbedRequest(); + request.text = payload.textContent(); + request.isQuery = false; + + exchange.getIn().setHeader("embeddingId", payload.embeddingId()); + exchange.getIn().setHeader("documentId", payload.documentId()); + exchange.getIn().setHeader(Exchange.HTTP_METHOD, "POST"); + exchange.getIn().setHeader(Exchange.CONTENT_TYPE, "application/json"); + exchange.getIn().setBody(request); + }) + .choice() + .when(exchangeProperty("skipVectorization").isEqualTo(true)) + .log(LoggingLevel.DEBUG, "Skipping generic vectorization for ${header.embeddingId}") + .otherwise() + .marshal().json(JsonLibrary.Jackson) + .setProperty("retryCount", constant(0)) + .setProperty("maxRetries", constant(properties.getVectorization().getMaxRetries())) + .setProperty("vectorizationSuccess", constant(false)) + 
.loopDoWhile(simple("${exchangeProperty.vectorizationSuccess} == false && ${exchangeProperty.retryCount} < ${exchangeProperty.maxRetries}")) + .process(exchange -> { + Integer retryCount = exchange.getProperty("retryCount", Integer.class); + exchange.setProperty("retryCount", retryCount + 1); + if (retryCount > 0) { + long backoffMs = (long) Math.pow(2, retryCount) * 1000L; + Thread.sleep(backoffMs); + } + }) + .doTry() + .toD(properties.getVectorization().getApiUrl() + "/embed?bridgeEndpoint=true&throwExceptionOnFailure=false&connectTimeout=" + + properties.getVectorization().getConnectTimeout() + "&socketTimeout=" + + properties.getVectorization().getSocketTimeout()) + .process(exchange -> { + Integer statusCode = exchange.getIn().getHeader(Exchange.HTTP_RESPONSE_CODE, Integer.class); + if (statusCode == null || statusCode != 200) { + String body = exchange.getIn().getBody(String.class); + throw new RuntimeException("Embedding service returned HTTP " + statusCode + ": " + body); + } + }) + .unmarshal().json(JsonLibrary.Jackson, EmbedResponse.class) + .process(exchange -> { + UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class); + EmbedResponse response = exchange.getIn().getBody(EmbedResponse.class); + if (response == null || response.embedding == null) { + throw new RuntimeException("Embedding service returned null embedding response"); + } + processingService.saveEmbedding(embeddingId, response.embedding, response.tokenCount); + exchange.setProperty("vectorizationSuccess", true); + }) + .doCatch(Exception.class) + .process(exchange -> { + UUID embeddingId = exchange.getIn().getHeader("embeddingId", UUID.class); + Integer retryCount = exchange.getProperty("retryCount", Integer.class); + Integer maxRetries = exchange.getProperty("maxRetries", Integer.class); + Exception exception = exchange.getProperty(Exchange.EXCEPTION_CAUGHT, Exception.class); + String errorMsg = exception != null ? 
exception.getMessage() : "Unknown error"; + if (errorMsg != null && errorMsg.contains("Connection pool shut down")) { + log.warn("Generic vectorization aborted for {} because the application is shutting down", embeddingId); + exchange.setProperty("vectorizationSuccess", true); + return; + } + if (retryCount >= maxRetries) { + processingService.markAsFailed(embeddingId, errorMsg); + } else { + log.warn("Generic vectorization attempt #{} failed for {}: {}", retryCount, embeddingId, errorMsg); + } + }) + .end() + .end() + .end(); + + from("timer:generic-vectorization-scheduler?period=" + properties.getVectorization().getGenericSchedulerPeriodMs() + "&delay=500") + .routeId(ROUTE_ID_SCHEDULER) + .process(exchange -> { + int batchSize = properties.getVectorization().getBatchSize(); + List pending = embeddingRepository.findIdsByEmbeddingStatus(EmbeddingStatus.PENDING, PageRequest.of(0, batchSize)); + List failed = List.of(); + if (pending.isEmpty()) { + failed = embeddingRepository.findIdsByEmbeddingStatus(EmbeddingStatus.FAILED, PageRequest.of(0, batchSize)); + } + List toProcess = !pending.isEmpty() ? 
pending : failed; + if (toProcess.isEmpty()) { + exchange.setProperty("noPendingEmbeddings", true); + } else { + exchange.getIn().setBody(toProcess); + } + }) + .choice() + .when(exchangeProperty("noPendingEmbeddings").isEqualTo(true)) + .log(LoggingLevel.DEBUG, "Generic vectorization scheduler: nothing pending") + .otherwise() + .split(body()) + .process(exchange -> { + UUID embeddingId = exchange.getIn().getBody(UUID.class); + exchange.getIn().setHeader("embeddingId", embeddingId); + }) + .to("direct:vectorize-embedding") + .end() + .end(); + } + + public static class EmbedRequest { + @JsonProperty("text") + public String text; + + @JsonProperty("is_query") + public boolean isQuery; + } + + public static class EmbedResponse { + public float[] embedding; + public int dimensions; + @JsonProperty("token_count") + public int tokenCount; + } +} diff --git a/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java b/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java new file mode 100644 index 0000000..b81c022 --- /dev/null +++ b/src/main/java/at/procon/dip/vectorization/service/DocumentEmbeddingProcessingService.java @@ -0,0 +1,142 @@ +package at.procon.dip.vectorization.service; + +import at.procon.dip.domain.document.DocumentStatus; +import at.procon.dip.domain.document.EmbeddingStatus; +import at.procon.dip.domain.document.entity.DocumentEmbedding; +import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository; +import at.procon.dip.domain.document.service.DocumentService; +import at.procon.ted.config.TedProcessorProperties; +import at.procon.ted.model.entity.VectorizationStatus; +import at.procon.ted.repository.ProcurementDocumentRepository; +import at.procon.ted.service.VectorizationService; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; +import 
org.springframework.transaction.annotation.Propagation; +import org.springframework.transaction.annotation.Transactional; + +/** + * Phase 2 generic vectorization processor that works on DOC text representations and DOC embeddings. + *

+ * The service keeps the existing TED semantic search operational by optionally dual-writing completed + * embeddings back into the legacy TED procurement_document vector columns, resolved by document hash. + */ +@Service +@RequiredArgsConstructor +@Slf4j +public class DocumentEmbeddingProcessingService { + + private final DocumentEmbeddingRepository embeddingRepository; + private final DocumentService documentService; + private final VectorizationService vectorizationService; + private final TedProcessorProperties properties; + private final ProcurementDocumentRepository procurementDocumentRepository; + + @Transactional(propagation = Propagation.REQUIRES_NEW) + public EmbeddingPayload prepareEmbeddingForVectorization(UUID embeddingId) { + DocumentEmbedding embedding = embeddingRepository.findDetailedById(embeddingId) + .orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId)); + + if (embedding.getEmbeddingStatus() == EmbeddingStatus.PROCESSING) { + log.debug("Embedding {} is already PROCESSING, skipping duplicate queue entry", embeddingId); + return null; + } + + embedding.setEmbeddingStatus(EmbeddingStatus.PROCESSING); + embedding.setErrorMessage(null); + embeddingRepository.save(embedding); + + String textBody = embedding.getRepresentation().getTextBody(); + if (textBody == null || textBody.isBlank()) { + embedding.setEmbeddingStatus(EmbeddingStatus.SKIPPED); + embedding.setErrorMessage("No text representation available"); + embedding.setEmbeddedAt(OffsetDateTime.now()); + embeddingRepository.save(embedding); + documentService.updateStatus(embedding.getDocument().getId(), DocumentStatus.REPRESENTED); + return null; + } + + int maxLength = properties.getVectorization().getMaxTextLength(); + if (textBody.length() > maxLength) { + log.debug("Truncating representation {} for embedding {} from {} to {} chars", + embedding.getRepresentation().getId(), embeddingId, textBody.length(), maxLength); + textBody = textBody.substring(0, 
maxLength);
+        }
+
+        return new EmbeddingPayload(
+                embedding.getId(),
+                embedding.getDocument().getId(),
+                embedding.getDocument().getDedupHash(),
+                textBody,
+                embedding.getModel().getDimensions(),
+                embedding.getModel().isQueryPrefixRequired(),
+                embedding.getRepresentation().getId()
+        );
+    }
+
+    @Transactional(propagation = Propagation.REQUIRES_NEW)
+    public void saveEmbedding(UUID embeddingId, float[] embedding, Integer tokenCount) {
+        DocumentEmbedding loaded = embeddingRepository.findDetailedById(embeddingId)
+                .orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
+
+        int expectedDimensions = loaded.getModel().getDimensions();
+        if (embedding == null || embedding.length != expectedDimensions) {
+            throw new IllegalArgumentException("Invalid embedding dimension for " + embeddingId
+                    + ": expected " + expectedDimensions + ", got " + (embedding == null ? 0 : embedding.length));
+        }
+
+        String vectorString = vectorizationService.floatArrayToVectorString(embedding);
+        embeddingRepository.updateEmbeddingVector(embeddingId, vectorString, tokenCount, embedding.length);
+        documentService.updateStatus(loaded.getDocument().getId(), DocumentStatus.INDEXED);
+
+        if (properties.getVectorization().isDualWriteLegacyTedVectors()) {
+            dualWriteLegacyTedVector(loaded, vectorString, tokenCount);
+        }
+    }
+
+    @Transactional(propagation = Propagation.REQUIRES_NEW)
+    public void markAsFailed(UUID embeddingId, String errorMessage) { // marks the embedding and its document FAILED; mirrors the status to the legacy TED row when dual-write is on
+        DocumentEmbedding loaded = embeddingRepository.findDetailedById(embeddingId)
+                .orElseThrow(() -> new IllegalArgumentException("Unknown embedding id: " + embeddingId));
+
+        embeddingRepository.updateEmbeddingStatus(embeddingId, EmbeddingStatus.FAILED, errorMessage, null);
+        documentService.updateStatus(loaded.getDocument().getId(), DocumentStatus.FAILED);
+
+        String dedupHash = properties.getVectorization().isDualWriteLegacyTedVectors() ? loaded.getDocument().getDedupHash() : null; // read the hash once, and only when dual-write is enabled
+        if (dedupHash != null && !dedupHash.isBlank()) { // same guard as dualWriteLegacyTedVector: no hash means no legacy lookup
+            procurementDocumentRepository.findByDocumentHash(dedupHash)
+                    .ifPresent(doc -> procurementDocumentRepository.updateVectorizationStatus(
+                            doc.getId(), VectorizationStatus.FAILED, errorMessage, null));
+        }
+    }
+
+    private void dualWriteLegacyTedVector(DocumentEmbedding embedding, String vectorString, Integer tokenCount) {
+        String dedupHash = embedding.getDocument().getDedupHash();
+        if (dedupHash == null || dedupHash.isBlank()) {
+            return;
+        }
+
+        procurementDocumentRepository.findByDocumentHash(dedupHash)
+                .ifPresentOrElse(
+                        legacy -> {
+                            procurementDocumentRepository.updateContentVector(legacy.getId(), vectorString, tokenCount);
+                            log.debug("Dual-wrote embedding {} back to legacy TED document {}", embedding.getId(), legacy.getId());
+                        },
+                        () -> log.debug("No legacy TED document found for DOC embedding {} with dedup hash {}",
+                                embedding.getId(), dedupHash)
+                );
+    }
+
+    public record EmbeddingPayload(
+            UUID embeddingId,
+            UUID documentId,
+            String dedupHash,
+            String textContent,
+            Integer expectedDimensions,
+            boolean queryPrefixRequired,
+            UUID representationId
+    ) {
+    }
+}
diff --git a/src/main/java/at/procon/dip/vectorization/spi/EmbeddingModelDescriptor.java b/src/main/java/at/procon/dip/vectorization/spi/EmbeddingModelDescriptor.java new file mode 100644 index 0000000..9e31809 --- /dev/null +++ b/src/main/java/at/procon/dip/vectorization/spi/EmbeddingModelDescriptor.java @@ -0,0 +1,13 @@ +package at.procon.dip.vectorization.spi; + +/** + * Describes one embedding model registered in the platform.
+ */ +public record EmbeddingModelDescriptor( + String modelKey, + String provider, + int dimensions, + String distanceMetric, + boolean queryPrefixRequired +) { +} diff --git a/src/main/java/at/procon/dip/vectorization/spi/EmbeddingProvider.java b/src/main/java/at/procon/dip/vectorization/spi/EmbeddingProvider.java new file mode 100644 index 0000000..5d1cbff --- /dev/null +++ b/src/main/java/at/procon/dip/vectorization/spi/EmbeddingProvider.java @@ -0,0 +1,13 @@ +package at.procon.dip.vectorization.spi; + +import java.util.List; + +/** + * Provider abstraction for vectorization backends. + */ +public interface EmbeddingProvider { + + EmbeddingModelDescriptor model(); + + EmbeddingResult embed(List texts, boolean queryMode); +} diff --git a/src/main/java/at/procon/dip/vectorization/spi/EmbeddingResult.java b/src/main/java/at/procon/dip/vectorization/spi/EmbeddingResult.java new file mode 100644 index 0000000..eea6b65 --- /dev/null +++ b/src/main/java/at/procon/dip/vectorization/spi/EmbeddingResult.java @@ -0,0 +1,13 @@ +package at.procon.dip.vectorization.spi; + +import java.util.List; + +/** + * Embedding output for one or more representations. 
+ */ +public record EmbeddingResult( + EmbeddingModelDescriptor model, + List vectors, + List warnings +) { +} diff --git a/src/main/java/at/procon/dip/vectorization/startup/ConfiguredEmbeddingModelStartupRunner.java b/src/main/java/at/procon/dip/vectorization/startup/ConfiguredEmbeddingModelStartupRunner.java new file mode 100644 index 0000000..6aff6e8 --- /dev/null +++ b/src/main/java/at/procon/dip/vectorization/startup/ConfiguredEmbeddingModelStartupRunner.java @@ -0,0 +1,41 @@ +package at.procon.dip.vectorization.startup; + +import at.procon.dip.domain.document.service.DocumentEmbeddingService; +import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand; +import at.procon.ted.config.TedProcessorProperties; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.boot.ApplicationArguments; +import org.springframework.boot.ApplicationRunner; +import org.springframework.stereotype.Component; + +/** + * Ensures the configured embedding model exists in DOC.doc_embedding_model. 
+ */ +@Component +@RequiredArgsConstructor +@Slf4j +public class ConfiguredEmbeddingModelStartupRunner implements ApplicationRunner { + + private final TedProcessorProperties properties; + private final DocumentEmbeddingService embeddingService; + + @Override + public void run(ApplicationArguments args) { + if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) { + return; + } + + embeddingService.registerModel(new RegisterEmbeddingModelCommand( + properties.getVectorization().getModelName(), + properties.getVectorization().getEmbeddingProvider(), + properties.getVectorization().getModelName(), + properties.getVectorization().getDimensions(), + null, + false, + true + )); + + log.info("Phase 2 embedding model ensured: {}", properties.getVectorization().getModelName()); + } +} diff --git a/src/main/java/at/procon/dip/vectorization/startup/GenericVectorizationStartupRunner.java b/src/main/java/at/procon/dip/vectorization/startup/GenericVectorizationStartupRunner.java new file mode 100644 index 0000000..5266c62 --- /dev/null +++ b/src/main/java/at/procon/dip/vectorization/startup/GenericVectorizationStartupRunner.java @@ -0,0 +1,60 @@ +package at.procon.dip.vectorization.startup; + +import at.procon.dip.domain.document.EmbeddingStatus; +import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository; +import at.procon.ted.config.TedProcessorProperties; +import java.util.List; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.apache.camel.ProducerTemplate; +import org.springframework.boot.ApplicationArguments; +import org.springframework.boot.ApplicationRunner; +import org.springframework.data.domain.PageRequest; +import org.springframework.stereotype.Component; + +/** + * Queues pending and failed DOC embeddings immediately on startup. 
+ */
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class GenericVectorizationStartupRunner implements ApplicationRunner {
+
+    private static final int BATCH_SIZE = 1000; // page size used when reading embedding ids
+
+    private final TedProcessorProperties properties;
+    private final DocumentEmbeddingRepository embeddingRepository;
+    private final ProducerTemplate producerTemplate;
+
+    @Override
+    public void run(ApplicationArguments args) { // no-op unless vectorization and the generic pipeline are both enabled
+        if (!properties.getVectorization().isEnabled() || !properties.getVectorization().isGenericPipelineEnabled()) {
+            return;
+        }
+
+        int queued = 0;
+        queued += queueByStatus(EmbeddingStatus.PENDING, "PENDING");
+        queued += queueByStatus(EmbeddingStatus.FAILED, "FAILED");
+        log.info("Generic vectorization startup runner queued {} embedding jobs", queued);
+    }
+
+    private int queueByStatus(EmbeddingStatus status, String label) { // sends every id with the given status to direct:vectorize-embedding; returns the number of sends that did not throw
+        int queued = 0;
+        int page = 0;
+        List<UUID> ids; // typed: the loop below iterates UUIDs, which a raw List cannot supply
+        do {
+            ids = embeddingRepository.findIdsByEmbeddingStatus(status, PageRequest.of(page, BATCH_SIZE)); // NOTE(review): offset paging over a status filter; rows whose status changes while this loop runs shift later pages - confirm skipped ids are acceptable
+            for (UUID id : ids) {
+                try {
+                    producerTemplate.sendBodyAndHeader("direct:vectorize-embedding", null, "embeddingId", id);
+                    queued++; // NOTE(review): the trigger route catches and logs queueing failures itself, so this presumably also counts dropped jobs - verify
+                } catch (Exception e) {
+                    log.warn("Failed to queue {} embedding {}: {}", label, id, e.getMessage());
+                }
+            }
+            page++;
+        } while (ids.size() == BATCH_SIZE); // a short page means the last page was reached
+        return queued;
+    }
+}
diff --git a/src/main/java/at/procon/ted/TedProcurementProcessorApplication.java b/src/main/java/at/procon/ted/TedProcurementProcessorApplication.java index c86a461..b8409c9 100644 --- a/src/main/java/at/procon/ted/TedProcurementProcessorApplication.java +++ b/src/main/java/at/procon/ted/TedProcurementProcessorApplication.java @@ -1,26 +1,20 @@ package at.procon.ted; -import org.springframework.boot.SpringApplication; -import org.springframework.boot.autoconfigure.SpringBootApplication; -import org.springframework.scheduling.annotation.EnableAsync; +import at.procon.dip.DocumentIntelligencePlatformApplication; /** - * TED Procurement Document Processor Application.
- * - * Processes EU eForms public procurement notices from TED (Tenders Electronic Daily). - * Features: - * - Directory watching with Apache Camel for automated XML processing - * - PostgreSQL storage with native XML support and pgvector for semantic search - * - Asynchronous document vectorization using multilingual-e5-large model - * - REST API for structured and semantic search - * - * @author Martin.Schweitzer@procon.co.at and claude.ai + * Legacy entry point kept for backward compatibility. + * + *

<p>The platform is being generalized beyond TED-specific procurement documents. + * New runtime packaging should use {@link DocumentIntelligencePlatformApplication}.</p>

*/ -@SpringBootApplication -@EnableAsync -public class TedProcurementProcessorApplication { +@Deprecated(forRemoval = false, since = "1.1.0") +public final class TedProcurementProcessorApplication { + + private TedProcurementProcessorApplication() { + } public static void main(String[] args) { - SpringApplication.run(TedProcurementProcessorApplication.class, args); + DocumentIntelligencePlatformApplication.main(args); } } diff --git a/src/main/java/at/procon/ted/camel/VectorizationRoute.java b/src/main/java/at/procon/ted/camel/VectorizationRoute.java index 40203af..84865ee 100644 --- a/src/main/java/at/procon/ted/camel/VectorizationRoute.java +++ b/src/main/java/at/procon/ted/camel/VectorizationRoute.java @@ -68,6 +68,10 @@ public class VectorizationRoute extends RouteBuilder { log.info("Vectorization is disabled, skipping route configuration"); return; } + if (properties.getVectorization().isGenericPipelineEnabled()) { + log.info("Legacy vectorization route disabled because Phase 2 generic pipeline is enabled"); + return; + } log.info("Configuring vectorization routes (enabled=true, apiUrl={}, connectTimeout={}ms, socketTimeout={}ms, maxRetries={}, scheduler every 6s)", properties.getVectorization().getApiUrl(), diff --git a/src/main/java/at/procon/ted/config/TedProcessorProperties.java b/src/main/java/at/procon/ted/config/TedProcessorProperties.java index 0e307cd..aa434c0 100644 --- a/src/main/java/at/procon/ted/config/TedProcessorProperties.java +++ b/src/main/java/at/procon/ted/config/TedProcessorProperties.java @@ -152,6 +152,37 @@ public class TedProcessorProperties { */ @Min(0) private int maxRetries = 5; + + /** + * Enable the Phase 2 generic vectorization pipeline based on DOC text representations + * and DOC embeddings instead of the legacy TED document vector columns as the primary + * write target. 
+ */ + private boolean genericPipelineEnabled = true; + + /** + * Keep writing completed TED embeddings back to the legacy ted.procurement_document + * vector columns so the existing semantic search stays operational during migration. + */ + private boolean dualWriteLegacyTedVectors = true; + + /** + * Scheduler interval for generic embedding polling (milliseconds). + */ + @Positive + private long genericSchedulerPeriodMs = 6000; + + /** + * Builder key for the primary TED semantic representation created during Phase 2 dual-write. + */ + @NotBlank + private String primaryRepresentationBuilderKey = "ted-phase2-primary-representation"; + + /** + * Provider key used when registering the configured embedding model in DOC.doc_embedding_model. + */ + @NotBlank + private String embeddingProvider = "http-embedding-service"; } /** diff --git a/src/main/java/at/procon/ted/controller/AdminController.java b/src/main/java/at/procon/ted/controller/AdminController.java index acf5c3f..6434142 100644 --- a/src/main/java/at/procon/ted/controller/AdminController.java +++ b/src/main/java/at/procon/ted/controller/AdminController.java @@ -1,5 +1,8 @@ package at.procon.ted.controller; +import at.procon.dip.domain.document.EmbeddingStatus; +import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository; +import at.procon.ted.config.TedProcessorProperties; import at.procon.ted.model.entity.ProcessingLog; import at.procon.ted.model.entity.VectorizationStatus; import at.procon.ted.repository.ProcurementDocumentRepository; @@ -41,6 +44,9 @@ public class AdminController { private final VectorizationService vectorizationService; private final DocumentProcessingService documentProcessingService; private final ProcurementDocumentRepository documentRepository; + private final DocumentEmbeddingRepository documentEmbeddingRepository; + private final TedProcessorProperties properties; + private final at.procon.ted.service.TedPhase2GenericDocumentService tedPhase2GenericDocumentService; 
private final at.procon.ted.repository.ProcessingLogRepository logRepository; private final ProducerTemplate producerTemplate; private final at.procon.ted.service.DataCleanupService dataCleanupService; @@ -68,10 +74,17 @@ public class AdminController { public ResponseEntity> getVectorizationStatus() { Map status = new HashMap<>(); - List counts = documentRepository.countByVectorizationStatus(); Map statusCounts = new HashMap<>(); - for (Object[] row : counts) { - statusCounts.put(((VectorizationStatus) row[0]).name(), (Long) row[1]); + if (properties.getVectorization().isGenericPipelineEnabled()) { + List counts = documentEmbeddingRepository.countByEmbeddingStatus(); + for (Object[] row : counts) { + statusCounts.put(((EmbeddingStatus) row[0]).name(), (Long) row[1]); + } + } else { + List counts = documentRepository.countByVectorizationStatus(); + for (Object[] row : counts) { + statusCounts.put(((VectorizationStatus) row[0]).name(), (Long) row[1]); + } } status.put("counts", statusCounts); @@ -102,8 +115,14 @@ public class AdminController { return ResponseEntity.badRequest().body(result); } - // Trigger vectorization via Camel route - producerTemplate.sendBodyAndHeader("direct:vectorize", null, "documentId", documentId); + if (properties.getVectorization().isGenericPipelineEnabled()) { + var document = documentRepository.findById(documentId).orElseThrow(); + UUID embeddingId = tedPhase2GenericDocumentService.registerOrRefreshTedDocument(document); + producerTemplate.sendBodyAndHeader("direct:vectorize-embedding", null, "embeddingId", embeddingId); + result.put("embeddingId", embeddingId); + } else { + producerTemplate.sendBodyAndHeader("direct:vectorize", null, "documentId", documentId); + } result.put("success", true); result.put("message", "Vectorization triggered for document " + documentId); @@ -127,15 +146,24 @@ public class AdminController { return ResponseEntity.badRequest().body(result); } - var pending = documentRepository.findByVectorizationStatus( - 
VectorizationStatus.PENDING, - PageRequest.of(0, Math.min(batchSize, 500))); - int count = 0; - for (var doc : pending) { - // Trigger vectorization via Camel route - producerTemplate.sendBodyAndHeader("direct:vectorize", null, "documentId", doc.getId()); - count++; + if (properties.getVectorization().isGenericPipelineEnabled()) { + var pending = documentEmbeddingRepository.findIdsByEmbeddingStatus( + EmbeddingStatus.PENDING, + PageRequest.of(0, Math.min(batchSize, 500))); + for (UUID embeddingId : pending) { + producerTemplate.sendBodyAndHeader("direct:vectorize-embedding", null, "embeddingId", embeddingId); + count++; + } + } else { + var pending = documentRepository.findByVectorizationStatus( + VectorizationStatus.PENDING, + PageRequest.of(0, Math.min(batchSize, 500))); + + for (var doc : pending) { + producerTemplate.sendBodyAndHeader("direct:vectorize", null, "documentId", doc.getId()); + count++; + } } result.put("success", true); diff --git a/src/main/java/at/procon/ted/event/VectorizationEventListener.java b/src/main/java/at/procon/ted/event/VectorizationEventListener.java index 6823dac..0c2efd7 100644 --- a/src/main/java/at/procon/ted/event/VectorizationEventListener.java +++ b/src/main/java/at/procon/ted/event/VectorizationEventListener.java @@ -28,7 +28,7 @@ public class VectorizationEventListener { */ @TransactionalEventListener(phase = TransactionPhase.AFTER_COMMIT) public void onDocumentSaved(DocumentSavedEvent event) { - if (!properties.getVectorization().isEnabled()) { + if (!properties.getVectorization().isEnabled() || properties.getVectorization().isGenericPipelineEnabled()) { return; } diff --git a/src/main/java/at/procon/ted/service/BatchDocumentProcessingService.java b/src/main/java/at/procon/ted/service/BatchDocumentProcessingService.java index 0412b15..1526192 100644 --- a/src/main/java/at/procon/ted/service/BatchDocumentProcessingService.java +++ b/src/main/java/at/procon/ted/service/BatchDocumentProcessingService.java @@ -38,6 +38,7 @@ 
public class BatchDocumentProcessingService { private final XmlParserService xmlParserService; private final ProcurementDocumentRepository documentRepository; private final ProcessingLogService processingLogService; + private final TedPhase2GenericDocumentService tedPhase2GenericDocumentService; /** * Process a batch of XML files from a Daily Package. @@ -129,6 +130,10 @@ public class BatchDocumentProcessingService { ProcessingLog.EventStatus.SUCCESS, "Document parsed and stored successfully (batch)", null, doc.getSourceFilename(), 0); + + if (doc.getDocumentHash() != null) { + tedPhase2GenericDocumentService.registerOrRefreshTedDocument(doc); + } } log.info("Successfully inserted {} documents in batch", savedDocuments.size()); diff --git a/src/main/java/at/procon/ted/service/DocumentProcessingService.java b/src/main/java/at/procon/ted/service/DocumentProcessingService.java index 0d480fc..dd04db1 100644 --- a/src/main/java/at/procon/ted/service/DocumentProcessingService.java +++ b/src/main/java/at/procon/ted/service/DocumentProcessingService.java @@ -36,6 +36,7 @@ public class DocumentProcessingService { private final ProcessingLogService processingLogService; private final TedProcessorProperties properties; private final ApplicationEventPublisher eventPublisher; + private final TedPhase2GenericDocumentService tedPhase2GenericDocumentService; /** * Process an XML document from the file system. 
@@ -87,10 +88,15 @@ public class DocumentProcessingService { "Document parsed and stored successfully", null, filename, (int) (System.currentTimeMillis() - startTime)); - // Publish event to trigger vectorization AFTER transaction commit - // This ensures document is visible in DB and avoids transaction isolation issues - eventPublisher.publishEvent(new DocumentSavedEvent(document.getId(), document.getPublicationId())); - log.debug("Document saved successfully, vectorization event published: {}", document.getId()); + if (properties.getVectorization().isGenericPipelineEnabled()) { + tedPhase2GenericDocumentService.registerOrRefreshTedDocument(document); + log.debug("Document saved successfully, Phase 2 generic vectorization record ensured: {}", document.getId()); + } else { + // Publish event to trigger vectorization AFTER transaction commit + // This ensures document is visible in DB and avoids transaction isolation issues + eventPublisher.publishEvent(new DocumentSavedEvent(document.getId(), document.getPublicationId())); + log.debug("Document saved successfully, vectorization event published: {}", document.getId()); + } return ProcessingResult.success(document.getId(), documentHash, document.getPublicationId()); @@ -141,9 +147,11 @@ public class DocumentProcessingService { documentRepository.save(updated); - // Note: Re-vectorization will be triggered automatically by - // VectorizationRoute scheduler (checks for PENDING documents every 60s) + if (properties.getVectorization().isGenericPipelineEnabled()) { + tedPhase2GenericDocumentService.registerOrRefreshTedDocument(updated); + } + // Note: Re-vectorization will be triggered automatically by the active scheduler return updated; } catch (Exception e) { log.error("Failed to reprocess document {}: {}", publicationId, e.getMessage()); diff --git a/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java b/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java new file mode 100644 
index 0000000..9563578
--- /dev/null
+++ b/src/main/java/at/procon/ted/service/TedPhase2GenericDocumentService.java
@@ -0,0 +1,235 @@
+package at.procon.ted.service;
+
+import at.procon.dip.domain.access.DocumentVisibility;
+import at.procon.dip.domain.document.ContentRole;
+import at.procon.dip.domain.document.DocumentFamily;
+import at.procon.dip.domain.document.DocumentStatus;
+import at.procon.dip.domain.document.DocumentType;
+import at.procon.dip.domain.document.EmbeddingStatus;
+import at.procon.dip.domain.document.RepresentationType;
+import at.procon.dip.domain.document.SourceType;
+import at.procon.dip.domain.document.StorageType;
+import at.procon.dip.domain.document.entity.Document;
+import at.procon.dip.domain.document.entity.DocumentContent;
+import at.procon.dip.domain.document.entity.DocumentEmbedding;
+import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
+import at.procon.dip.domain.document.entity.DocumentSource;
+import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
+import at.procon.dip.domain.document.repository.DocumentContentRepository;
+import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
+import at.procon.dip.domain.document.repository.DocumentRepository;
+import at.procon.dip.domain.document.repository.DocumentSourceRepository;
+import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
+import at.procon.dip.domain.document.service.DocumentEmbeddingService;
+import at.procon.dip.domain.document.service.DocumentService;
+import at.procon.dip.domain.document.service.command.CreateDocumentCommand;
+import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand;
+import at.procon.ted.config.TedProcessorProperties;
+import at.procon.ted.model.entity.ProcurementDocument;
+import java.time.OffsetDateTime;
+import java.util.List;
+import java.util.UUID;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+/**
+ * Phase 2 bridge that dual-writes TED documents into the generic DOC persistence backbone.
+ *
+ * <p>For each TED document it ensures a DOC document, a source record, the original XML content,
+ * a primary semantic text representation and a pending embedding for the configured model.
+ */
+@Service
+@RequiredArgsConstructor
+@Slf4j
+public class TedPhase2GenericDocumentService {
+
+    private static final String XML_MIME_TYPE = "application/xml";
+
+    private final TedProcessorProperties properties;
+    private final DocumentRepository documentRepository;
+    private final DocumentContentRepository contentRepository;
+    private final DocumentSourceRepository sourceRepository;
+    private final DocumentTextRepresentationRepository representationRepository;
+    private final DocumentEmbeddingRepository embeddingRepository;
+    private final DocumentService documentService;
+    private final DocumentEmbeddingService embeddingService;
+
+    /**
+     * Creates or refreshes the generic DOC records for a TED document and (re)queues its embedding.
+     *
+     * @param tedDocument the TED procurement document to mirror into the DOC schema
+     * @return the id of the pending embedding, or {@code null} when the generic pipeline is disabled
+     */
+    @Transactional
+    public UUID registerOrRefreshTedDocument(ProcurementDocument tedDocument) {
+        if (!properties.getVectorization().isGenericPipelineEnabled()) {
+            return null;
+        }
+
+        Document document = documentRepository.findByDedupHash(tedDocument.getDocumentHash())
+                .orElseGet(() -> createGenericDocument(tedDocument));
+
+        document.setDocumentType(DocumentType.TED_NOTICE);
+        document.setDocumentFamily(DocumentFamily.PROCUREMENT);
+        document.setVisibility(DocumentVisibility.PUBLIC);
+        document.setStatus(DocumentStatus.REPRESENTED);
+        document.setTitle(tedDocument.getProjectTitle());
+        document.setSummary(tedDocument.getProjectDescription());
+        document.setLanguageCode(tedDocument.getLanguageCode());
+        document.setMimeType(XML_MIME_TYPE);
+        document.setBusinessKey(buildBusinessKey(tedDocument));
+        document.setDedupHash(tedDocument.getDocumentHash());
+        document = documentRepository.save(document);
+
+        ensureTedSource(document, tedDocument);
+        DocumentContent originalContent = ensureOriginalContent(document, tedDocument);
+        DocumentTextRepresentation representation = ensurePrimaryRepresentation(document, originalContent, tedDocument);
+        DocumentEmbedding embedding = ensurePendingEmbedding(document, representation);
+
+        log.debug("Phase 2 DOC bridge ensured generic TED document {} -> embedding {}", document.getId(), embedding.getId());
+        return embedding.getId();
+    }
+
+    /** Creates the DOC root document for a TED notice that has no generic counterpart yet. */
+    private Document createGenericDocument(ProcurementDocument tedDocument) {
+        return documentService.create(new CreateDocumentCommand(
+                null,
+                DocumentVisibility.PUBLIC,
+                DocumentType.TED_NOTICE,
+                DocumentFamily.PROCUREMENT,
+                DocumentStatus.REPRESENTED,
+                tedDocument.getProjectTitle(),
+                tedDocument.getProjectDescription(),
+                tedDocument.getLanguageCode(),
+                XML_MIME_TYPE,
+                buildBusinessKey(tedDocument),
+                tedDocument.getDocumentHash()
+        ));
+    }
+
+    /** Registers the TED origin as a DOC source unless one with the same external id already exists. */
+    private void ensureTedSource(Document document, ProcurementDocument tedDocument) {
+        String externalId = tedDocument.getPublicationId() != null
+                ? tedDocument.getPublicationId()
+                : tedDocument.getId().toString();
+        boolean sourceExists = sourceRepository.findByDocument_Id(document.getId()).stream()
+                .anyMatch(existing -> externalId.equals(existing.getExternalSourceId()));
+        if (sourceExists) {
+            return;
+        }
+
+        DocumentSource source = DocumentSource.builder()
+                .document(document)
+                .sourceType(SourceType.FILE_SYSTEM)
+                .externalSourceId(externalId)
+                .sourceUri(tedDocument.getSourcePath())
+                .sourceFilename(tedDocument.getSourceFilename())
+                .importBatchId("ted-phase2")
+                .receivedAt(OffsetDateTime.now())
+                .build();
+        sourceRepository.save(source);
+    }
+
+    /** Creates or overwrites the ORIGINAL content row holding the TED XML payload. */
+    private DocumentContent ensureOriginalContent(Document document, ProcurementDocument tedDocument) {
+        List<DocumentContent> existing =
+                contentRepository.findByDocument_IdAndContentRole(document.getId(), ContentRole.ORIGINAL);
+        if (!existing.isEmpty()) {
+            DocumentContent content = existing.get(0);
+            content.setMimeType(XML_MIME_TYPE);
+            content.setStorageType(StorageType.DB_TEXT);
+            content.setTextContent(tedDocument.getXmlDocument());
+            content.setContentHash(tedDocument.getDocumentHash());
+            content.setSizeBytes(tedDocument.getFileSizeBytes());
+            return contentRepository.save(content);
+        }
+
+        DocumentContent content = DocumentContent.builder()
+                .document(document)
+                .contentRole(ContentRole.ORIGINAL)
+                .storageType(StorageType.DB_TEXT)
+                .mimeType(XML_MIME_TYPE)
+                .charsetName("UTF-8")
+                .textContent(tedDocument.getXmlDocument())
+                .contentHash(tedDocument.getDocumentHash())
+                .sizeBytes(tedDocument.getFileSizeBytes())
+                .build();
+        return contentRepository.save(content);
+    }
+
+    /** Creates or updates the primary semantic text representation of the document. */
+    private DocumentTextRepresentation ensurePrimaryRepresentation(Document document,
+                                                                   DocumentContent originalContent,
+                                                                   ProcurementDocument tedDocument) {
+        DocumentTextRepresentation representation = representationRepository
+                .findFirstByDocument_IdAndPrimaryRepresentationTrue(document.getId())
+                .orElseGet(DocumentTextRepresentation::new);
+
+        representation.setDocument(document);
+        representation.setContent(originalContent);
+        representation.setRepresentationType(RepresentationType.SEMANTIC_TEXT);
+        representation.setBuilderKey(properties.getVectorization().getPrimaryRepresentationBuilderKey());
+        representation.setLanguageCode(tedDocument.getLanguageCode());
+        representation.setPrimaryRepresentation(true);
+        representation.setTextBody(resolveTextBody(tedDocument));
+        representation.setTokenCount(null);
+        representation.setChunkIndex(null);
+        representation.setChunkStartOffset(null);
+        representation.setChunkEndOffset(null);
+        return representationRepository.save(representation);
+    }
+
+    /**
+     * Picks the text to embed: the text content, else the description, else the title, else "".
+     *
+     * <p>Never returns {@code null}: DOC.doc_text_representation.text_body is declared NOT NULL,
+     * so a null body would be rejected by the database when the representation is saved.
+     */
+    private String resolveTextBody(ProcurementDocument tedDocument) {
+        if (tedDocument.getTextContent() != null) {
+            return tedDocument.getTextContent();
+        }
+        if (tedDocument.getProjectDescription() != null) {
+            return tedDocument.getProjectDescription();
+        }
+        return tedDocument.getProjectTitle() != null ? tedDocument.getProjectTitle() : "";
+    }
+
+    /** Registers the configured model and creates, or resets to PENDING, the matching embedding. */
+    private DocumentEmbedding ensurePendingEmbedding(Document document, DocumentTextRepresentation representation) {
+        DocumentEmbeddingModel model = embeddingService.registerModel(new RegisterEmbeddingModelCommand(
+                properties.getVectorization().getModelName(),
+                properties.getVectorization().getEmbeddingProvider(),
+                properties.getVectorization().getModelName(),
+                properties.getVectorization().getDimensions(),
+                null,
+                false,
+                true
+        ));
+
+        return embeddingRepository.findByRepresentation_IdAndModel_Id(representation.getId(), model.getId())
+                .map(existing -> {
+                    existing.setDocument(document);
+                    existing.setRepresentation(representation);
+                    existing.setModel(model);
+                    existing.setEmbeddingStatus(EmbeddingStatus.PENDING);
+                    existing.setErrorMessage(null);
+                    existing.setEmbeddedAt(null);
+                    return embeddingRepository.save(existing);
+                })
+                .orElseGet(() -> embeddingService.createPendingEmbedding(document.getId(), representation.getId(), model.getId()));
+    }
+
+    /** Derives the business key from the publication id, else the notice URL, else the hash. */
+    private String buildBusinessKey(ProcurementDocument tedDocument) {
+        if (tedDocument.getPublicationId() != null && !tedDocument.getPublicationId().isBlank()) {
+            return "TED:publication:" + tedDocument.getPublicationId();
+        }
+        if (tedDocument.getNoticeUrl() != null && !tedDocument.getNoticeUrl().isBlank()) {
+            return "TED:url:" + tedDocument.getNoticeUrl();
+        }
+        return "TED:hash:" + tedDocument.getDocumentHash();
+    }
+}
diff --git a/src/main/java/at/procon/ted/startup/VectorizationStartupRunner.java b/src/main/java/at/procon/ted/startup/VectorizationStartupRunner.java
index 2048f3d..b75c2be 100644
--- a/src/main/java/at/procon/ted/startup/VectorizationStartupRunner.java
+++
b/src/main/java/at/procon/ted/startup/VectorizationStartupRunner.java @@ -44,6 +44,10 @@ public class VectorizationStartupRunner implements ApplicationRunner { log.info("Vectorization is disabled, skipping startup processing"); return; } + if (properties.getVectorization().isGenericPipelineEnabled()) { + log.info("Legacy vectorization startup runner disabled because Phase 2 generic pipeline is enabled"); + return; + } log.info("Checking for pending and failed vectorizations on startup..."); diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 8edd412..15dcad8 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -1,19 +1,19 @@ -# TED Procurement Document Processor Configuration +# Document Intelligence Platform Configuration # Author: Martin.Schweitzer@procon.co.at and claude.ai server: - port: 8888 + port: 8889 servlet: context-path: /api spring: application: - name: ted-procurement-processor + name: document-intelligence-platform datasource: - url: jdbc:postgresql://94.130.218.54:32333/RELM + url: jdbc:postgresql://localhost:5432/RELM username: ${DB_USERNAME:postgres} - password: ${DB_PASSWORD:PDmXRx0Rbk9OFOn9qO5Gm/mPCfqW8zwbZ+/YIU1lySc=} + password: ${DB_PASSWORD} driver-class-name: org.postgresql.Driver hikari: maximum-pool-size: 5 @@ -25,11 +25,12 @@ spring: jpa: hibernate: - ddl-auto: none + ddl-auto: update show-sql: false open-in-view: false properties: hibernate: + dialect: org.hibernate.dialect.PostgreSQLDialect format_sql: true default_schema: TED jdbc: @@ -42,7 +43,9 @@ spring: locations: classpath:db/migration baseline-on-migrate: true create-schemas: true - schemas: TED + schemas: + - TED + - DOC default-schema: TED # Apache Camel Configuration @@ -102,6 +105,16 @@ ted: socket-timeout: 60000 # Maximum retries on connection failure max-retries: 5 + # Phase 2: use generic DOC representation/embedding pipeline as primary vectorization path + generic-pipeline-enabled:
true + # Keep legacy TED vector columns updated until semantic search is migrated + dual-write-legacy-ted-vectors: true + # Scheduler interval for generic embedding polling + generic-scheduler-period-ms: 6000 + # Builder identifier for primary TED semantic representations in DOC + primary-representation-builder-key: ted-phase2-primary-representation + # Provider key stored in DOC.doc_embedding_model + embedding-provider: http-embedding-service # Search configuration search: @@ -115,7 +128,7 @@ ted: # TED Daily Package Download configuration download: # Enable/disable automatic package download - enabled: true + enabled: false # Base URL for TED Daily Packages base-url: https://ted.europa.eu/packages/daily/ # Download directory for tar.gz files @@ -148,7 +161,7 @@ ted: # IMAP Mail configuration mail: # Enable/disable mail processing - enabled: true + enabled: false # IMAP server hostname host: mail.mymagenta.business # IMAP server port (993 for IMAPS) @@ -172,11 +185,11 @@ ted: # Max messages per poll max-messages-per-poll: 10 # Output directory for processed attachments - attachment-output-directory: D:/ted.europe/mail-attachments + attachment-output-directory: /ted.europe/mail-attachments # Enable/disable MIME file input processing mime-input-enabled: true # Input directory for MIME files (.eml) - mime-input-directory: D:/ted.europe/mime-input + mime-input-directory: /ted.europe/mime-input # File pattern for MIME files (regex) mime-input-pattern: .*\\.eml # Polling interval for MIME input directory (milliseconds) @@ -185,7 +198,7 @@ ted: # Solution Brief processing configuration solution-brief: # Enable/disable Solution Brief processing - enabled: true + enabled: false # Input directory for Solution Brief PDF files input-directory: C:/work/SolutionBrief # Output directory for Excel result files (relative to input or absolute) diff --git a/src/main/resources/db/migration/V4__add_doc_generic_persistence_backbone.sql 
b/src/main/resources/db/migration/V4__add_doc_generic_persistence_backbone.sql new file mode 100644 index 0000000..4e81c62 --- /dev/null +++ b/src/main/resources/db/migration/V4__add_doc_generic_persistence_backbone.sql @@ -0,0 +1,281 @@ +-- Phase 1: Generic DOC persistence backbone for the Procon Document Intelligence Platform +-- This migration is additive and intentionally does not modify the existing TED runtime tables. + +CREATE SCHEMA IF NOT EXISTS DOC; + +SET search_path TO TED, DOC, public; + +DO $$ +BEGIN + CREATE EXTENSION IF NOT EXISTS pgcrypto SCHEMA public; +EXCEPTION + WHEN insufficient_privilege THEN + RAISE NOTICE 'Skipping pgcrypto extension creation (insufficient privileges)'; + WHEN duplicate_object THEN + RAISE NOTICE 'Extension pgcrypto already exists'; +END +$$; + +DO $$ +BEGIN + CREATE EXTENSION IF NOT EXISTS vector SCHEMA public; +EXCEPTION + WHEN insufficient_privilege THEN + RAISE NOTICE 'Skipping vector extension creation (insufficient privileges)'; + WHEN duplicate_object THEN + RAISE NOTICE 'Extension vector already exists'; + WHEN undefined_file THEN + RAISE WARNING 'Extension vector not available - install pgvector on the database server'; +END +$$; + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_document_visibility') THEN + CREATE TYPE DOC.doc_document_visibility AS ENUM ('PUBLIC', 'TENANT', 'SHARED', 'RESTRICTED'); + END IF; +END +$$; + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_document_type') THEN + CREATE TYPE DOC.doc_document_type AS ENUM ( + 'TED_NOTICE', 'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML', + 'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'UNKNOWN' + ); + END IF; +END +$$; + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_document_family') THEN + CREATE TYPE DOC.doc_document_family AS ENUM ('PROCUREMENT', 'MAIL', 'ATTACHMENT', 'KNOWLEDGE', 'GENERIC'); + END IF; +END +$$; + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 
FROM pg_type WHERE typname = 'doc_document_status') THEN + CREATE TYPE DOC.doc_document_status AS ENUM ('RECEIVED', 'CLASSIFIED', 'EXTRACTED', 'REPRESENTED', 'INDEXED', 'FAILED', 'ARCHIVED'); + END IF; +END +$$; + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_source_type') THEN + CREATE TYPE DOC.doc_source_type AS ENUM ('TED_PACKAGE', 'MAIL', 'FILE_SYSTEM', 'REST_UPLOAD', 'MANUAL_UPLOAD', 'ZIP_CHILD', 'API', 'MIGRATION'); + END IF; +END +$$; + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_content_role') THEN + CREATE TYPE DOC.doc_content_role AS ENUM ( + 'ORIGINAL', 'NORMALIZED_TEXT', 'OCR_TEXT', 'HTML_CLEAN', + 'EXTRACTED_METADATA_JSON', 'THUMBNAIL', 'DERIVED_BINARY' + ); + END IF; +END +$$; + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_storage_type') THEN + CREATE TYPE DOC.doc_storage_type AS ENUM ('DB_TEXT', 'DB_BINARY', 'FILE_PATH', 'OBJECT_STORAGE', 'EXTERNAL_REFERENCE'); + END IF; +END +$$; + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_representation_type') THEN + CREATE TYPE DOC.doc_representation_type AS ENUM ('FULLTEXT', 'SEMANTIC_TEXT', 'SUMMARY', 'TITLE_ABSTRACT', 'CHUNK', 'METADATA_ENRICHED'); + END IF; +END +$$; + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_embedding_status') THEN + CREATE TYPE DOC.doc_embedding_status AS ENUM ('PENDING', 'PROCESSING', 'COMPLETED', 'FAILED', 'SKIPPED'); + END IF; +END +$$; + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_distance_metric') THEN + CREATE TYPE DOC.doc_distance_metric AS ENUM ('COSINE', 'L2', 'INNER_PRODUCT'); + END IF; +END +$$; + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'doc_relation_type') THEN + CREATE TYPE DOC.doc_relation_type AS ENUM ('CONTAINS', 'ATTACHMENT_OF', 'EXTRACTED_FROM', 'DERIVED_FROM', 'PART_OF', 'VERSION_OF', 'RELATED_TO'); + END IF; +END +$$; + +CREATE TABLE IF NOT EXISTS DOC.doc_tenant ( 
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + tenant_key VARCHAR(120) NOT NULL UNIQUE, + display_name VARCHAR(255) NOT NULL, + description TEXT, + active BOOLEAN NOT NULL DEFAULT TRUE, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS DOC.doc_document ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + owner_tenant_id UUID REFERENCES DOC.doc_tenant(id), + visibility DOC.doc_document_visibility NOT NULL, + document_type DOC.doc_document_type NOT NULL, + document_family DOC.doc_document_family NOT NULL, + status DOC.doc_document_status NOT NULL DEFAULT 'RECEIVED', + title VARCHAR(1000), + summary TEXT, + language_code VARCHAR(16), + mime_type VARCHAR(255), + business_key VARCHAR(255), + dedup_hash VARCHAR(64), + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS DOC.doc_source ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE, + source_type DOC.doc_source_type NOT NULL, + external_source_id VARCHAR(500), + source_uri TEXT, + source_filename VARCHAR(1000), + parent_source_id UUID, + import_batch_id VARCHAR(255), + received_at TIMESTAMP WITH TIME ZONE, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS DOC.doc_content ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE, + content_role DOC.doc_content_role NOT NULL, + storage_type DOC.doc_storage_type NOT NULL, + mime_type VARCHAR(255), + charset_name VARCHAR(120), + text_content TEXT, + binary_ref TEXT, + content_hash VARCHAR(64), + size_bytes BIGINT, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT 
EXISTS DOC.doc_text_representation ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE, + content_id UUID REFERENCES DOC.doc_content(id) ON DELETE SET NULL, + representation_type DOC.doc_representation_type NOT NULL, + builder_key VARCHAR(255), + language_code VARCHAR(16), + token_count INTEGER, + char_count INTEGER, + chunk_index INTEGER, + chunk_start_offset INTEGER, + chunk_end_offset INTEGER, + is_primary BOOLEAN NOT NULL DEFAULT FALSE, + text_body TEXT NOT NULL, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS DOC.doc_embedding_model ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + model_key VARCHAR(255) NOT NULL UNIQUE, + provider VARCHAR(120) NOT NULL, + display_name VARCHAR(255), + dimensions INTEGER NOT NULL, + distance_metric DOC.doc_distance_metric NOT NULL DEFAULT 'COSINE', + query_prefix_required BOOLEAN NOT NULL DEFAULT FALSE, + active BOOLEAN NOT NULL DEFAULT TRUE, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS DOC.doc_embedding ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE, + representation_id UUID NOT NULL REFERENCES DOC.doc_text_representation(id) ON DELETE CASCADE, + model_id UUID NOT NULL REFERENCES DOC.doc_embedding_model(id), + embedding_status DOC.doc_embedding_status NOT NULL DEFAULT 'PENDING', + token_count INTEGER, + embedding_dimensions INTEGER, + error_message TEXT, + embedded_at TIMESTAMP WITH TIME ZONE, + embedding_vector public.vector, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS DOC.doc_relation ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + 
parent_document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE, + child_document_id UUID NOT NULL REFERENCES DOC.doc_document(id) ON DELETE CASCADE, + relation_type DOC.doc_relation_type NOT NULL, + sort_order INTEGER, + relation_metadata TEXT, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, + CONSTRAINT chk_doc_relation_no_self CHECK (parent_document_id <> child_document_id) +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_doc_tenant_key ON DOC.doc_tenant(tenant_key); +CREATE INDEX IF NOT EXISTS idx_doc_tenant_active ON DOC.doc_tenant(active); + +CREATE INDEX IF NOT EXISTS idx_doc_document_type ON DOC.doc_document(document_type); +CREATE INDEX IF NOT EXISTS idx_doc_document_family ON DOC.doc_document(document_family); +CREATE INDEX IF NOT EXISTS idx_doc_document_status ON DOC.doc_document(status); +CREATE INDEX IF NOT EXISTS idx_doc_document_visibility ON DOC.doc_document(visibility); +CREATE INDEX IF NOT EXISTS idx_doc_document_owner_tenant ON DOC.doc_document(owner_tenant_id); +CREATE INDEX IF NOT EXISTS idx_doc_document_dedup_hash ON DOC.doc_document(dedup_hash); +CREATE INDEX IF NOT EXISTS idx_doc_document_business_key ON DOC.doc_document(business_key); +CREATE INDEX IF NOT EXISTS idx_doc_document_created_at ON DOC.doc_document(created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_doc_source_document ON DOC.doc_source(document_id); +CREATE INDEX IF NOT EXISTS idx_doc_source_type ON DOC.doc_source(source_type); +CREATE INDEX IF NOT EXISTS idx_doc_source_external_id ON DOC.doc_source(external_source_id); +CREATE INDEX IF NOT EXISTS idx_doc_source_received_at ON DOC.doc_source(received_at DESC); +CREATE INDEX IF NOT EXISTS idx_doc_source_parent_source ON DOC.doc_source(parent_source_id); + +CREATE INDEX IF NOT EXISTS idx_doc_content_document ON DOC.doc_content(document_id); +CREATE INDEX IF NOT EXISTS idx_doc_content_role ON DOC.doc_content(content_role); +CREATE INDEX IF NOT EXISTS idx_doc_content_hash ON 
DOC.doc_content(content_hash); +CREATE INDEX IF NOT EXISTS idx_doc_content_storage_type ON DOC.doc_content(storage_type); + +CREATE INDEX IF NOT EXISTS idx_doc_text_repr_document ON DOC.doc_text_representation(document_id); +CREATE INDEX IF NOT EXISTS idx_doc_text_repr_content ON DOC.doc_text_representation(content_id); +CREATE INDEX IF NOT EXISTS idx_doc_text_repr_type ON DOC.doc_text_representation(representation_type); +CREATE INDEX IF NOT EXISTS idx_doc_text_repr_primary ON DOC.doc_text_representation(is_primary); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_doc_embedding_model_key ON DOC.doc_embedding_model(model_key); +CREATE INDEX IF NOT EXISTS idx_doc_embedding_model_active ON DOC.doc_embedding_model(active); + +CREATE INDEX IF NOT EXISTS idx_doc_embedding_document ON DOC.doc_embedding(document_id); +CREATE INDEX IF NOT EXISTS idx_doc_embedding_repr ON DOC.doc_embedding(representation_id); +CREATE INDEX IF NOT EXISTS idx_doc_embedding_model ON DOC.doc_embedding(model_id); +CREATE INDEX IF NOT EXISTS idx_doc_embedding_status ON DOC.doc_embedding(embedding_status); +CREATE INDEX IF NOT EXISTS idx_doc_embedding_embedded_at ON DOC.doc_embedding(embedded_at DESC); + +CREATE INDEX IF NOT EXISTS idx_doc_relation_parent ON DOC.doc_relation(parent_document_id); +CREATE INDEX IF NOT EXISTS idx_doc_relation_child ON DOC.doc_relation(child_document_id); +CREATE INDEX IF NOT EXISTS idx_doc_relation_type ON DOC.doc_relation(relation_type); + +COMMENT ON SCHEMA DOC IS 'Generic document platform schema introduced in Phase 1'; +COMMENT ON TABLE DOC.doc_document IS 'Canonical document root with optional owner tenant and mandatory visibility'; +COMMENT ON TABLE DOC.doc_content IS 'Stored payload variants for a canonical document'; +COMMENT ON TABLE DOC.doc_text_representation IS 'Search-oriented text representations derived from document content'; +COMMENT ON TABLE DOC.doc_embedding IS 'Embedding lifecycle separated from document structure'; diff --git 
a/src/main/resources/db/migration/V5__doc_phase2_vectorization_support.sql b/src/main/resources/db/migration/V5__doc_phase2_vectorization_support.sql new file mode 100644 index 0000000..cce7b19 --- /dev/null +++ b/src/main/resources/db/migration/V5__doc_phase2_vectorization_support.sql @@ -0,0 +1,14 @@ +-- Phase 2: Vectorization decoupling support in the generic DOC schema +-- Adds a unique index on DOC.doc_embedding(representation_id, model_id), composite (embedding_status, created_at) and (embedding_status, updated_at) indexes on DOC.doc_embedding, and a (document_id, is_primary) index on DOC.doc_text_representation. + +CREATE UNIQUE INDEX IF NOT EXISTS uq_doc_embedding_representation_model + ON DOC.doc_embedding(representation_id, model_id); + +CREATE INDEX IF NOT EXISTS idx_doc_embedding_status_created + ON DOC.doc_embedding(embedding_status, created_at); + +CREATE INDEX IF NOT EXISTS idx_doc_embedding_status_updated + ON DOC.doc_embedding(embedding_status, updated_at); + +CREATE INDEX IF NOT EXISTS idx_doc_text_repr_document_primary + ON DOC.doc_text_representation(document_id, is_primary);