fixed flyway db migration scripts

This commit is contained in:
trifonovt 2026-04-29 13:32:48 +02:00
parent 4bc503ed29
commit f9df7c8d22
26 changed files with 731 additions and 46 deletions

View File

@ -5,9 +5,9 @@ package at.procon.dip.architecture;
*/ */
public final class SchemaNames { public final class SchemaNames {
public static final String DOC = "DOC"; public static final String DOC = "doc";
public static final String TED = "TED"; public static final String TED = "ted";
public static final String TIME = "TIME"; public static final String TIME = "time";
private SchemaNames() { private SchemaNames() {
} }

View File

@ -6,6 +6,8 @@ import jakarta.persistence.*;
import java.time.OffsetDateTime; import java.time.OffsetDateTime;
import java.util.UUID; import java.util.UUID;
import lombok.*; import lombok.*;
import org.hibernate.annotations.JdbcTypeCode;
import org.hibernate.type.SqlTypes;
@Entity @Entity
@Table(schema = SchemaNames.TIME, name = "time_entry_search_projection", indexes = { @Table(schema = SchemaNames.TIME, name = "time_entry_search_projection", indexes = {
@ -35,7 +37,8 @@ public class TimeEntrySearchProjection {
private Document document; private Document document;
@Enumerated(EnumType.STRING) @Enumerated(EnumType.STRING)
@Column(name = "source_system", nullable = false, length = 32) @JdbcTypeCode(SqlTypes.NAMED_ENUM)
@Column(name = "source_system", nullable = false, columnDefinition = "TIME.time_source_system")
private TimeSourceSystem sourceSystem; private TimeSourceSystem sourceSystem;
@Column(name = "external_id", nullable = false, length = 255) @Column(name = "external_id", nullable = false, length = 255)

View File

@ -50,7 +50,7 @@ public class TimeSyncRun {
@Enumerated(EnumType.STRING) @Enumerated(EnumType.STRING)
@JdbcTypeCode(SqlTypes.NAMED_ENUM) @JdbcTypeCode(SqlTypes.NAMED_ENUM)
@Column(name = "run_type", columnDefinition = "TIME.time_sync_run_system") @Column(name = "run_type", columnDefinition = "TIME.time_sync_run_type")
private TimeSyncRunType runType; private TimeSyncRunType runType;
@Column(name = "scope_key", nullable = false, length = 255) @Column(name = "scope_key", nullable = false, length = 255)

View File

@ -38,13 +38,13 @@ public class Organization {
/** /**
* Internal organization reference from XML (e.g., "ORG-0001"). * Internal organization reference from XML (e.g., "ORG-0001").
*/ */
@Column(name = "org_reference", length = 50) @Column(name = "org_reference", length = 100)
private String orgReference; private String orgReference;
/** /**
* Role of the organization (e.g., "buyer", "review-body", "ted-esen"). * Role of the organization (e.g., "buyer", "review-body", "ted-esen").
*/ */
@Column(name = "role", length = 50) @Column(name = "role", length = 100)
private String role; private String role;
@Column(name = "name", columnDefinition = "TEXT") @Column(name = "name", columnDefinition = "TEXT")
@ -62,7 +62,7 @@ public class Organization {
@Column(name = "city", columnDefinition = "TEXT") @Column(name = "city", columnDefinition = "TEXT")
private String city; private String city;
@Column(name = "postal_code", length = 255) @Column(name = "postal_code", columnDefinition = "TEXT")
private String postalCode; private String postalCode;
@Column(name = "street_name", columnDefinition = "TEXT") @Column(name = "street_name", columnDefinition = "TEXT")
@ -77,7 +77,7 @@ public class Organization {
@Column(name = "email", length = 255) @Column(name = "email", length = 255)
private String email; private String email;
@Column(name = "phone", length = 50) @Column(name = "phone", length = 100)
private String phone; private String phone;
@Column(name = "created_at", nullable = false, updatable = false) @Column(name = "created_at", nullable = false, updatable = false)

View File

@ -49,11 +49,11 @@ dip:
execution-batch-size: 48 execution-batch-size: 48
startup: startup:
# Enqueue missing DOC representation embeddings on NEW-runtime startup. # Enqueue missing DOC representation embeddings on NEW-runtime startup.
enqueue-missing-enabled: true enqueue-missing-enabled: false
# Also process ready embedding jobs during startup. Requires dip.embedding.jobs.enabled=true. # Also process ready embedding jobs during startup. Requires dip.embedding.jobs.enabled=true.
process-ready-enabled: true process-ready-enabled: false
# Leave empty to enqueue missing embeddings for all document types, or set e.g. TED_NOTICE_LOT. # Leave empty to enqueue missing embeddings for all document types, or set e.g. TED_NOTICE_LOT.
document-type: TED_NOTICE_LOT document-type:
# Optional representation filter, e.g. SEMANTIC_TEXT. # Optional representation filter, e.g. SEMANTIC_TEXT.
representation-type: representation-type:
# Optional builder filter, e.g. ted-lot-clustering-text-v1. # Optional builder filter, e.g. ted-lot-clustering-text-v1.
@ -371,9 +371,9 @@ dip:
structured-search-facet-bucket-limit: 12 structured-search-facet-bucket-limit: 12
lot-documents: lot-documents:
# Materialize one canonical DOC document of type TED_NOTICE_LOT per TED lot. # Materialize one canonical DOC document of type TED_NOTICE_LOT per TED lot.
enabled: true enabled: false
# Optional startup/backfill path for notices that were imported before lot documents existed. # Optional startup/backfill path for notices that were imported before lot documents existed.
startup-backfill-enabled: true startup-backfill-enabled: false
# Maximum number of legacy TED lot documents to backfill during startup (0 = all) # Maximum number of legacy TED lot documents to backfill during startup (0 = all)
startup-backfill-limit: 0 startup-backfill-limit: 0
# Queue embeddings whenever the lot semantic text representation is created or changed. # Queue embeddings whenever the lot semantic text representation is created or changed.

View File

@ -14,8 +14,7 @@ spring:
name: document-intelligence-platform name: document-intelligence-platform
datasource: datasource:
url: jdbc:postgresql://localhost:5432/RELM url: jdbc:postgresql://${DB_HOST}:${DB_PORT}/${DB_NAME}
#url: jdbc:postgresql://94.130.218.54:32333/RELM
username: ${DB_USERNAME} username: ${DB_USERNAME}
password: ${DB_PASSWORD} password: ${DB_PASSWORD}
@ -37,21 +36,21 @@ spring:
hibernate: hibernate:
dialect: org.hibernate.dialect.PostgreSQLDialect dialect: org.hibernate.dialect.PostgreSQLDialect
format_sql: true format_sql: true
default_schema: TED default_schema: ted
jdbc: jdbc:
batch_size: 25 # Match chunk size for optimal batch processing batch_size: 25 # Match chunk size for optimal batch processing
order_inserts: true order_inserts: true
order_updates: true order_updates: true
flyway: flyway:
enabled: false enabled: true
locations: classpath:db/migration locations: classpath:db/migration
baseline-on-migrate: true baseline-on-migrate: true
create-schemas: true create-schemas: true
schemas: schemas:
- TED - ted
- DOC - doc
default-schema: TED default-schema: ted
# Apache Camel Configuration # Apache Camel Configuration
camel: camel:

View File

@ -12,6 +12,12 @@ BEGIN
FROM pg_constraint FROM pg_constraint
WHERE conname = 'uq_doc_embedding_representation_model' WHERE conname = 'uq_doc_embedding_representation_model'
AND conrelid = 'doc.doc_embedding'::regclass AND conrelid = 'doc.doc_embedding'::regclass
) AND NOT EXISTS (
SELECT 1
FROM pg_class c
JOIN pg_namespace n ON n.oid = c.relnamespace
WHERE n.nspname = 'doc'
AND c.relname = 'uq_doc_embedding_representation_model'
) THEN ) THEN
ALTER TABLE DOC.doc_embedding ALTER TABLE DOC.doc_embedding
ADD CONSTRAINT uq_doc_embedding_representation_model ADD CONSTRAINT uq_doc_embedding_representation_model
@ -36,4 +42,4 @@ COMMENT ON COLUMN DOC.doc_embedding.embedding_vector IS
-- --
-- If you use inner product or euclidean distance for a model, pick the matching operator class: -- If you use inner product or euclidean distance for a model, pick the matching operator class:
-- vector_ip_ops -- vector_ip_ops
-- vector_l2_ops -- vector_l2_ops

View File

@ -6,7 +6,7 @@
CREATE SCHEMA IF NOT EXISTS ted; CREATE SCHEMA IF NOT EXISTS ted;
-- Set search path to use TED schema -- Set search path to use TED schema
SET search_path TO ted; SET search_path TO ted, public;
-- Enable required PostgreSQL extensions (wenn Berechtigung vorhanden) -- Enable required PostgreSQL extensions (wenn Berechtigung vorhanden)
-- Falls Extensions nicht erstellt werden können, müssen diese vom DBA manuell erstellt werden -- Falls Extensions nicht erstellt werden können, müssen diese vom DBA manuell erstellt werden

View File

@ -17,7 +17,8 @@ WITH legacy_package_map AS (
FROM legacy_package_map l FROM legacy_package_map l
) )
INSERT INTO DOC.doc_document ( INSERT INTO DOC.doc_document (
id, visibility, document_type, document_family, status, title, summary, mime_type, business_key, dedup_hash id, visibility, document_type, document_family, status, title, summary, mime_type, business_key, dedup_hash,
created_at, updated_at
) )
SELECT SELECT
gen_random_uuid(), gen_random_uuid(),
@ -40,7 +41,9 @@ SELECT
END, END,
'application/gzip', 'application/gzip',
pd.business_key, pd.business_key,
pd.dedup_hash pd.dedup_hash,
CURRENT_TIMESTAMP,
CURRENT_TIMESTAMP
FROM package_documents pd FROM package_documents pd
LEFT JOIN TED.ted_daily_package pkg LEFT JOIN TED.ted_daily_package pkg
ON pkg.package_identifier = pd.package_identifier ON pkg.package_identifier = pd.package_identifier
@ -74,7 +77,7 @@ WITH legacy_package_map AS (
AND doc.business_key LIKE 'TED:package:%' AND doc.business_key LIKE 'TED:package:%'
) )
INSERT INTO DOC.doc_relation ( INSERT INTO DOC.doc_relation (
id, parent_document_id, child_document_id, relation_type, sort_order, relation_metadata id, parent_document_id, child_document_id, relation_type, sort_order, relation_metadata, created_at
) )
SELECT SELECT
gen_random_uuid(), gen_random_uuid(),
@ -82,7 +85,8 @@ SELECT
l.child_document_id, l.child_document_id,
'CONTAINS', 'CONTAINS',
NULL, NULL,
'packageIdentifier=' || l.package_identifier 'packageIdentifier=' || l.package_identifier,
CURRENT_TIMESTAMP
FROM legacy_package_map l FROM legacy_package_map l
JOIN package_documents pkg JOIN package_documents pkg
ON pkg.package_identifier = l.package_identifier ON pkg.package_identifier = l.package_identifier

View File

@ -29,9 +29,21 @@ CREATE INDEX IF NOT EXISTS idx_doc_attr_name_context
ALTER TABLE DOC.doc_document_attribute ALTER TABLE DOC.doc_document_attribute
ADD COLUMN IF NOT EXISTS string_value TEXT; ADD COLUMN IF NOT EXISTS string_value TEXT;
UPDATE DOC.doc_document_attribute DO $$
SET string_value = attribute_value BEGIN
WHERE string_value IS NULL AND attribute_value IS NOT NULL; IF EXISTS (
SELECT 1
FROM information_schema.columns
WHERE table_schema = 'doc'
AND table_name = 'doc_document_attribute'
AND column_name = 'attribute_value'
) THEN
UPDATE DOC.doc_document_attribute
SET string_value = attribute_value
WHERE string_value IS NULL AND attribute_value IS NOT NULL;
END IF;
END
$$;
ALTER TABLE DOC.doc_document_attribute ALTER TABLE DOC.doc_document_attribute
ADD COLUMN IF NOT EXISTS number_value NUMERIC; ADD COLUMN IF NOT EXISTS number_value NUMERIC;
@ -54,7 +66,7 @@ ALTER TABLE DOC.doc_document_attribute
(CASE WHEN number_value IS NOT NULL THEN 1 ELSE 0 END) + (CASE WHEN number_value IS NOT NULL THEN 1 ELSE 0 END) +
(CASE WHEN date_value IS NOT NULL THEN 1 ELSE 0 END) + (CASE WHEN date_value IS NOT NULL THEN 1 ELSE 0 END) +
(CASE WHEN boolean_value IS NOT NULL THEN 1 ELSE 0 END) = 1 (CASE WHEN boolean_value IS NOT NULL THEN 1 ELSE 0 END) = 1
); ) NOT VALID;
COMMENT ON COLUMN DOC.doc_attribute_name.attribute_context IS 'Optional namespace/context for avoiding name conflicts. GLOBAL is the default context.'; COMMENT ON COLUMN DOC.doc_attribute_name.attribute_context IS 'Optional namespace/context for avoiding name conflicts. GLOBAL is the default context.';
COMMENT ON COLUMN DOC.doc_attribute_name.attribute_value_type IS 'Declared type of the attribute value for this catalog entry.'; COMMENT ON COLUMN DOC.doc_attribute_name.attribute_value_type IS 'Declared type of the attribute value for this catalog entry.';

View File

@ -17,7 +17,7 @@ ALTER TABLE DOC.doc_document_attribute
(CASE WHEN date_value IS NOT NULL THEN 1 ELSE 0 END) + (CASE WHEN date_value IS NOT NULL THEN 1 ELSE 0 END) +
(CASE WHEN datetime_value IS NOT NULL THEN 1 ELSE 0 END) + (CASE WHEN datetime_value IS NOT NULL THEN 1 ELSE 0 END) +
(CASE WHEN boolean_value IS NOT NULL THEN 1 ELSE 0 END) = 1 (CASE WHEN boolean_value IS NOT NULL THEN 1 ELSE 0 END) = 1
); ) NOT VALID;
COMMENT ON COLUMN DOC.doc_document_attribute.integer_value IS 'Integer representation when the catalog entry type is INTEGER.'; COMMENT ON COLUMN DOC.doc_document_attribute.integer_value IS 'Integer representation when the catalog entry type is INTEGER.';
COMMENT ON COLUMN DOC.doc_document_attribute.datetime_value IS 'Date-time representation when the catalog entry type is DATETIME.'; COMMENT ON COLUMN DOC.doc_document_attribute.datetime_value IS 'Date-time representation when the catalog entry type is DATETIME.';

View File

@ -1,10 +1,10 @@
-- TIME Phase T3: search projection and representation materialization foundation for time entries. -- TIME Phase T3: search projection and representation materialization foundation for time entries.
CREATE TABLE IF NOT EXISTS TIME.time_entry_search_projection ( CREATE TABLE IF NOT EXISTS "time".time_entry_search_projection (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(), id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
time_entry_id UUID NOT NULL UNIQUE REFERENCES TIME.time_entry(id) ON DELETE CASCADE, time_entry_id UUID NOT NULL UNIQUE REFERENCES "time".time_entry(id) ON DELETE CASCADE,
document_id UUID NOT NULL UNIQUE REFERENCES DOC.doc_document(id) ON DELETE CASCADE, document_id UUID NOT NULL UNIQUE REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
source_system TIME.time_source_system NOT NULL, source_system "time".time_source_system NOT NULL,
external_id VARCHAR(255) NOT NULL, external_id VARCHAR(255) NOT NULL,
language_code VARCHAR(16), language_code VARCHAR(16),
entry_start TIMESTAMP WITH TIME ZONE, entry_start TIMESTAMP WITH TIME ZONE,
@ -54,8 +54,8 @@ CREATE TABLE IF NOT EXISTS TIME.time_entry_search_projection (
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
); );
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_document ON TIME.time_entry_search_projection(document_id); CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_document ON "time".time_entry_search_projection(document_id);
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_source ON TIME.time_entry_search_projection(source_system, external_id); CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_source ON "time".time_entry_search_projection(source_system, external_id);
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_entry_start ON TIME.time_entry_search_projection(entry_start DESC); CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_entry_start ON "time".time_entry_search_projection(entry_start DESC);
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_person ON TIME.time_entry_search_projection(person_external_id); CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_person ON "time".time_entry_search_projection(person_external_id);
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_activity_type ON TIME.time_entry_search_projection(activity_type_id); CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_activity_type ON "time".time_entry_search_projection(activity_type_id);

View File

@ -0,0 +1,61 @@
-- Re-create the filter check constraints on doc.doc_embedding_cluster_run so that
-- they accept the current Java DOC enum values.
-- V32 added TED_NOTICE_LOT to doc.doc_document, but cluster runs persist their own
-- optional document_type filter and therefore need the same enum expansion.
-- Each constraint is dropped (when present) and re-added within a single
-- ALTER TABLE statement; every check also admits NULL.

ALTER TABLE doc.doc_embedding_cluster_run
    DROP CONSTRAINT IF EXISTS doc_embedding_cluster_run_document_type_check,
    ADD CONSTRAINT doc_embedding_cluster_run_document_type_check
        CHECK (
            document_type IS NULL
            OR document_type IN (
                'TED_PACKAGE', 'TED_NOTICE', 'TED_NOTICE_LOT', 'TIME_ENTRY',
                'EMAIL', 'MIME_MESSAGE',
                'PDF', 'DOCX', 'HTML', 'XML_GENERIC', 'TEXT', 'MARKDOWN',
                'ZIP_ARCHIVE', 'GENERIC_BINARY', 'UNKNOWN'
            )
        );

ALTER TABLE doc.doc_embedding_cluster_run
    DROP CONSTRAINT IF EXISTS doc_embedding_cluster_run_document_family_check,
    ADD CONSTRAINT doc_embedding_cluster_run_document_family_check
        CHECK (
            document_family IS NULL
            OR document_family IN (
                'PROCUREMENT', 'TIME', 'MAIL', 'ATTACHMENT', 'KNOWLEDGE', 'GENERIC'
            )
        );

ALTER TABLE doc.doc_embedding_cluster_run
    DROP CONSTRAINT IF EXISTS doc_embedding_cluster_run_representation_type_check,
    ADD CONSTRAINT doc_embedding_cluster_run_representation_type_check
        CHECK (
            representation_type IS NULL
            OR representation_type IN (
                'FULLTEXT', 'SEMANTIC_TEXT', 'SUMMARY', 'TITLE_ABSTRACT',
                'CHUNK', 'METADATA_ENRICHED', 'ATTACHMENT_ROLLUP'
            )
        );

View File

@ -0,0 +1,172 @@
-- Align DOC enum-backed columns and Hibernate-created check constraints with
-- the current Java enum values.
--
-- Layout of this script:
--   1. Five DO blocks that add labels to native enum types in schema "doc".
--      Each block first looks the type up in pg_type/pg_namespace and does
--      nothing when the type is not present.
--   2. Three DO blocks that replace a named CHECK constraint. Each block only
--      acts when a constraint with that exact name already exists on the
--      table; it never creates a constraint from scratch.
-- NOTE(review): presumably a given database has either the native enum types
-- or the check constraints, depending on how the schema was created - confirm.

-- doc_document_type: add TED_PACKAGE, TED_NOTICE_LOT and TIME_ENTRY if missing.
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_type t
JOIN pg_namespace n ON n.oid = t.typnamespace
WHERE n.nspname = 'doc'
AND t.typname = 'doc_document_type'
) THEN
ALTER TYPE DOC.doc_document_type ADD VALUE IF NOT EXISTS 'TED_PACKAGE';
ALTER TYPE DOC.doc_document_type ADD VALUE IF NOT EXISTS 'TED_NOTICE_LOT';
ALTER TYPE DOC.doc_document_type ADD VALUE IF NOT EXISTS 'TIME_ENTRY';
END IF;
END
$$;
-- doc_document_family: add TIME if missing.
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_type t
JOIN pg_namespace n ON n.oid = t.typnamespace
WHERE n.nspname = 'doc'
AND t.typname = 'doc_document_family'
) THEN
ALTER TYPE DOC.doc_document_family ADD VALUE IF NOT EXISTS 'TIME';
END IF;
END
$$;
-- doc_source_type: add PACKAGE_CHILD and MAIL_ATTACHMENT if missing.
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_type t
JOIN pg_namespace n ON n.oid = t.typnamespace
WHERE n.nspname = 'doc'
AND t.typname = 'doc_source_type'
) THEN
ALTER TYPE DOC.doc_source_type ADD VALUE IF NOT EXISTS 'PACKAGE_CHILD';
ALTER TYPE DOC.doc_source_type ADD VALUE IF NOT EXISTS 'MAIL_ATTACHMENT';
END IF;
END
$$;
-- doc_representation_type: add ATTACHMENT_ROLLUP if missing.
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_type t
JOIN pg_namespace n ON n.oid = t.typnamespace
WHERE n.nspname = 'doc'
AND t.typname = 'doc_representation_type'
) THEN
ALTER TYPE DOC.doc_representation_type ADD VALUE IF NOT EXISTS 'ATTACHMENT_ROLLUP';
END IF;
END
$$;
-- doc_distance_metric: add EUCLIDEAN if missing.
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_type t
JOIN pg_namespace n ON n.oid = t.typnamespace
WHERE n.nspname = 'doc'
AND t.typname = 'doc_distance_metric'
) THEN
ALTER TYPE DOC.doc_distance_metric ADD VALUE IF NOT EXISTS 'EUCLIDEAN';
END IF;
END
$$;
-- doc.doc_document / document_type: if the check constraint
-- doc_document_document_type_check exists, drop it and re-add it with the
-- full list of accepted document type literals below.
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_constraint c
JOIN pg_class r ON r.oid = c.conrelid
JOIN pg_namespace n ON n.oid = r.relnamespace
WHERE n.nspname = 'doc'
AND r.relname = 'doc_document'
AND c.conname = 'doc_document_document_type_check'
) THEN
ALTER TABLE DOC.doc_document DROP CONSTRAINT doc_document_document_type_check;
ALTER TABLE DOC.doc_document
ADD CONSTRAINT doc_document_document_type_check
CHECK (
document_type IN (
'TED_PACKAGE',
'TED_NOTICE',
'TED_NOTICE_LOT',
'TIME_ENTRY',
'EMAIL',
'MIME_MESSAGE',
'PDF',
'DOCX',
'HTML',
'XML_GENERIC',
'TEXT',
'MARKDOWN',
'ZIP_ARCHIVE',
'GENERIC_BINARY',
'UNKNOWN'
)
);
END IF;
END
$$;
-- doc.doc_document / document_family: same replace-if-present pattern for
-- doc_document_document_family_check.
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_constraint c
JOIN pg_class r ON r.oid = c.conrelid
JOIN pg_namespace n ON n.oid = r.relnamespace
WHERE n.nspname = 'doc'
AND r.relname = 'doc_document'
AND c.conname = 'doc_document_document_family_check'
) THEN
ALTER TABLE DOC.doc_document DROP CONSTRAINT doc_document_document_family_check;
ALTER TABLE DOC.doc_document
ADD CONSTRAINT doc_document_document_family_check
CHECK (
document_family IN (
'PROCUREMENT',
'TIME',
'MAIL',
'ATTACHMENT',
'KNOWLEDGE',
'GENERIC'
)
);
END IF;
END
$$;
-- doc.doc_source / source_type: same replace-if-present pattern for
-- doc_source_source_type_check.
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_constraint c
JOIN pg_class r ON r.oid = c.conrelid
JOIN pg_namespace n ON n.oid = r.relnamespace
WHERE n.nspname = 'doc'
AND r.relname = 'doc_source'
AND c.conname = 'doc_source_source_type_check'
) THEN
ALTER TABLE DOC.doc_source DROP CONSTRAINT doc_source_source_type_check;
ALTER TABLE DOC.doc_source
ADD CONSTRAINT doc_source_source_type_check
CHECK (
source_type IN (
'TED_PACKAGE',
'PACKAGE_CHILD',
'MAIL',
'MAIL_ATTACHMENT',
'FILE_SYSTEM',
'REST_UPLOAD',
'MANUAL_UPLOAD',
'ZIP_CHILD',
'API',
'MIGRATION'
)
);
END IF;
END
$$;

View File

@ -0,0 +1,7 @@
-- The JPA mappings store these normalized label/description columns as TEXT,
-- so convert both time_entry columns in one ALTER TABLE statement.
ALTER TABLE "time".time_entry
    ALTER COLUMN description_short TYPE TEXT,
    ALTER COLUMN search_anchor_label TYPE TEXT;

View File

@ -0,0 +1,39 @@
-- Align legacy TED procurement_document table with the current JPA entity.

-- Add the columns the entity expects. IF NOT EXISTS makes each addition a
-- no-op on databases that already have the column.
ALTER TABLE TED.procurement_document
    ADD COLUMN IF NOT EXISTS notice_url VARCHAR(255),
    ADD COLUMN IF NOT EXISTS issue_datetime TIMESTAMP WITH TIME ZONE,
    ADD COLUMN IF NOT EXISTS embedding_token_count INTEGER;

-- Backfill issue_datetime from the legacy issue_date / issue_time columns.
-- The UPDATE reads both legacy columns, so both are probed before it runs:
--   * issue_date and issue_time present: combine them, treating a NULL time
--     as midnight;
--   * only issue_date present: use midnight of that date;
--   * issue_date absent: nothing to backfill.
-- The local timestamp is interpreted in the session time zone.
-- Rows that already have issue_datetime are left untouched.
DO $$
DECLARE
    has_issue_date BOOLEAN;
    has_issue_time BOOLEAN;
BEGIN
    SELECT
        COALESCE(bool_or(column_name = 'issue_date'), FALSE),
        COALESCE(bool_or(column_name = 'issue_time'), FALSE)
    INTO has_issue_date, has_issue_time
    FROM information_schema.columns
    WHERE table_schema = 'ted'
      AND table_name = 'procurement_document'
      AND column_name IN ('issue_date', 'issue_time');

    IF has_issue_date AND has_issue_time THEN
        UPDATE TED.procurement_document
        SET issue_datetime = (issue_date + COALESCE(issue_time, TIME '00:00')) AT TIME ZONE current_setting('TIMEZONE')
        WHERE issue_datetime IS NULL
          AND issue_date IS NOT NULL;
    ELSIF has_issue_date THEN
        -- PL/pgSQL only analyses a statement when it is reached, so the
        -- branch above may reference issue_time even when it does not exist.
        UPDATE TED.procurement_document
        SET issue_datetime = (issue_date + TIME '00:00') AT TIME ZONE current_setting('TIMEZONE')
        WHERE issue_datetime IS NULL
          AND issue_date IS NOT NULL;
    END IF;
END
$$;

-- Columns mapped as TEXT by the entity.
ALTER TABLE TED.procurement_document
    ALTER COLUMN buyer_city TYPE TEXT,
    ALTER COLUMN buyer_postal_code TYPE TEXT,
    ALTER COLUMN internal_reference TYPE TEXT;

ALTER TABLE TED.procurement_lot
    ALTER COLUMN internal_id TYPE TEXT;

View File

@ -0,0 +1,16 @@
-- Convert VARCHAR columns to TEXT where the current JPA entities map them as TEXT.
-- Changes that target the same table are grouped into one ALTER TABLE statement.

ALTER TABLE DOC.doc_document
    ALTER COLUMN title TYPE TEXT;

ALTER TABLE TED.ted_notice_projection
    ALTER COLUMN notice_url TYPE TEXT,
    ALTER COLUMN buyer_postal_code TYPE TEXT,
    ALTER COLUMN internal_reference TYPE TEXT;

ALTER TABLE TED.ted_notice_organization
    ALTER COLUMN company_id TYPE TEXT;

View File

@ -27,6 +27,7 @@ BEGIN
AND t.typname = 'doc_source_type' AND t.typname = 'doc_source_type'
) THEN ) THEN
ALTER TYPE DOC.doc_source_type ADD VALUE IF NOT EXISTS 'PACKAGE_CHILD'; ALTER TYPE DOC.doc_source_type ADD VALUE IF NOT EXISTS 'PACKAGE_CHILD';
ALTER TYPE DOC.doc_source_type ADD VALUE IF NOT EXISTS 'MAIL_ATTACHMENT';
END IF; END IF;
END END
$$; $$;
@ -34,6 +35,7 @@ $$;
DO $$ DO $$
BEGIN BEGIN
IF EXISTS ( IF EXISTS (
SELECT 1
FROM pg_constraint c FROM pg_constraint c
JOIN pg_class r ON r.oid = c.conrelid JOIN pg_class r ON r.oid = c.conrelid
JOIN pg_namespace n ON n.oid = r.relnamespace JOIN pg_namespace n ON n.oid = r.relnamespace
@ -46,7 +48,7 @@ BEGIN
ADD CONSTRAINT doc_document_document_type_check ADD CONSTRAINT doc_document_document_type_check
CHECK ( CHECK (
document_type IN ( document_type IN (
'TED_PACKAGE', 'TED_NOTICE', 'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML', 'TED_PACKAGE', 'TED_NOTICE', 'TED_NOTICE_LOT','EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML',
'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'TIME_ENTRY', 'UNKNOWN' 'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'TIME_ENTRY', 'UNKNOWN'
) )
); );
@ -70,7 +72,7 @@ BEGIN
ADD CONSTRAINT doc_source_source_type_check ADD CONSTRAINT doc_source_source_type_check
CHECK ( CHECK (
source_type IN ( source_type IN (
'TED_PACKAGE', 'PACKAGE_CHILD', 'MAIL', 'FILE_SYSTEM', 'REST_UPLOAD', 'TED_PACKAGE', 'PACKAGE_CHILD', 'MAIL', 'MAIL_ATTACHMENT', 'FILE_SYSTEM', 'REST_UPLOAD',
'MANUAL_UPLOAD', 'ZIP_CHILD', 'API', 'MIGRATION' 'MANUAL_UPLOAD', 'ZIP_CHILD', 'API', 'MIGRATION'
) )
); );

View File

@ -0,0 +1,95 @@
package at.procon.dip.domain.ted.startup;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import at.procon.dip.domain.ted.config.TedProjectionProperties;
import at.procon.dip.domain.ted.entity.TedNoticeProjection;
import at.procon.dip.domain.ted.repository.TedNoticeProjectionRepository;
import at.procon.dip.domain.ted.service.TedLotDocumentMaterializationService;
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
import at.procon.ted.model.entity.ProcurementDocument;
import at.procon.ted.repository.ProcurementDocumentRepository;
import java.util.List;
import java.util.UUID;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.ArgumentCaptor;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.springframework.data.domain.PageImpl;
import org.springframework.data.domain.Pageable;
/**
 * Unit tests for {@link TedProjectionStartupRunner} with a startup backfill limit of {@code 0}.
 *
 * <p>Each test enables one backfill path, lets the mocked repository return a single-element
 * page, and then checks that the runner requested a page of size 1000 and forwarded the
 * returned element to the corresponding service.
 */
@ExtendWith(MockitoExtension.class)
class TedProjectionStartupRunnerTest {

    @Mock
    private ProcurementDocumentRepository procurementDocumentRepository;

    @Mock
    private TedNoticeProjectionRepository projectionRepository;

    @Mock
    private TedNoticeProjectionService projectionService;

    @Mock
    private TedLotDocumentMaterializationService lotDocumentMaterializationService;

    private TedProjectionProperties properties;

    private TedProjectionStartupRunner runner;

    /** Builds a runner around freshly constructed properties and the mocked collaborators. */
    @BeforeEach
    void setUp() {
        properties = new TedProjectionProperties();
        runner = new TedProjectionStartupRunner(
                properties,
                procurementDocumentRepository,
                projectionRepository,
                projectionService,
                lotDocumentMaterializationService);
    }

    /** Lot-document backfill with limit 0 must page through projections 1000 at a time. */
    @Test
    void lotDocumentBackfillTreatsZeroLimitAsUnboundedBatch() {
        // given: lot-document backfill enabled with limit 0 and one stored projection
        properties.getLotDocuments().setEnabled(true);
        properties.getLotDocuments().setStartupBackfillEnabled(true);
        properties.getLotDocuments().setStartupBackfillLimit(0);

        UUID id = UUID.randomUUID();
        TedNoticeProjection storedProjection = new TedNoticeProjection();
        storedProjection.setId(id);

        when(projectionRepository.findAll(any(Pageable.class))).thenAnswer(call -> {
            // Echo the requested pageable back so the page carries the runner's page size.
            Pageable requested = call.getArgument(0);
            return new PageImpl<>(List.of(storedProjection), requested, 1);
        });
        when(lotDocumentMaterializationService.materializeProjectionLots(id)).thenReturn(2);

        // when
        runner.run(null);

        // then
        ArgumentCaptor<Pageable> pageRequest = ArgumentCaptor.forClass(Pageable.class);
        verify(projectionRepository).findAll(pageRequest.capture());
        int requestedPageSize = pageRequest.getValue().getPageSize();
        assertThat(requestedPageSize).isEqualTo(1000);
        verify(lotDocumentMaterializationService).materializeProjectionLots(id);
    }

    /** Notice-projection backfill with limit 0 must page through legacy documents 1000 at a time. */
    @Test
    void noticeProjectionBackfillTreatsZeroLimitAsUnboundedBatch() {
        // given: notice backfill enabled with limit 0 and one legacy document without projection
        properties.setStartupBackfillEnabled(true);
        properties.setStartupBackfillLimit(0);

        UUID id = UUID.randomUUID();
        ProcurementDocument storedDocument = new ProcurementDocument();
        storedDocument.setId(id);

        when(procurementDocumentRepository.findAll(any(Pageable.class))).thenAnswer(call -> {
            // Echo the requested pageable back so the page carries the runner's page size.
            Pageable requested = call.getArgument(0);
            return new PageImpl<>(List.of(storedDocument), requested, 1);
        });
        when(projectionRepository.existsByLegacyProcurementDocumentId(id)).thenReturn(false);

        // when
        runner.run(null);

        // then
        ArgumentCaptor<Pageable> pageRequest = ArgumentCaptor.forClass(Pageable.class);
        verify(procurementDocumentRepository).findAll(pageRequest.capture());
        int requestedPageSize = pageRequest.getValue().getPageSize();
        assertThat(requestedPageSize).isEqualTo(1000);
        verify(projectionService).registerOrRefreshProjection(storedDocument);
    }
}

View File

@ -0,0 +1,112 @@
package at.procon.dip.embedding.service;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.EmbeddingStatus;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.domain.document.entity.Document;
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
import at.procon.dip.embedding.job.service.EmbeddingJobService;
import at.procon.dip.embedding.model.EmbeddingJobType;
import at.procon.dip.embedding.model.EmbeddingModelDescriptor;
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
import java.util.List;
import java.util.UUID;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.ArgumentCaptor;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.springframework.data.domain.Pageable;
/**
 * Unit tests for {@link ScopedEmbeddingEnqueueService#enqueueByDocumentType}.
 *
 * <p>The repository, job service and model registry are mocked; the tests check which
 * arguments reach the repository query, which job is enqueued, and the page size used
 * when a very large limit is requested.
 */
@ExtendWith(MockitoExtension.class)
class ScopedEmbeddingEnqueueServiceTest {

    /** Model key used by every test in this class. */
    private static final String MODEL_KEY = "e5-default";

    @Mock
    private DocumentTextRepresentationRepository representationRepository;

    @Mock
    private EmbeddingJobService jobService;

    @Mock
    private EmbeddingModelRegistry modelRegistry;

    private ScopedEmbeddingEnqueueService service;

    @BeforeEach
    void setUp() {
        service = new ScopedEmbeddingEnqueueService(representationRepository, jobService, modelRegistry);
    }

    /**
     * With no explicit model key the registry default is used, the repository is queried with
     * the given filters, and one job is enqueued for the single matching representation.
     */
    @Test
    void queuesOnlyMatchingRepresentationsForDocumentType() {
        UUID docId = UUID.randomUUID();
        UUID reprId = UUID.randomUUID();

        Document owner = new Document();
        owner.setId(docId);
        DocumentTextRepresentation candidate = new DocumentTextRepresentation();
        candidate.setId(reprId);
        candidate.setDocument(owner);

        when(modelRegistry.getRequiredDefaultDocumentModelKey()).thenReturn(MODEL_KEY);
        when(modelRegistry.getRequired(MODEL_KEY)).thenReturn(descriptorFor(MODEL_KEY));
        stubCandidates(List.of(candidate));

        ScopedEmbeddingEnqueueResult outcome = service.enqueueByDocumentType(
                DocumentType.TED_NOTICE_LOT, RepresentationType.SEMANTIC_TEXT, null, true, null, false, 250);

        verify(representationRepository).findEmbeddingCandidatesByDocumentType(
                DocumentType.TED_NOTICE_LOT,
                RepresentationType.SEMANTIC_TEXT,
                null,
                true,
                MODEL_KEY,
                EmbeddingStatus.COMPLETED,
                false,
                Pageable.ofSize(250));
        verify(jobService).enqueueForRepresentation(docId, reprId, MODEL_KEY, EmbeddingJobType.DOCUMENT_EMBED);

        assertThat(outcome.jobsQueuedOrAlreadyActive()).isEqualTo(1);
        assertThat(outcome.matchedRepresentations()).isEqualTo(1);
        assertThat(outcome.modelKey()).isEqualTo(MODEL_KEY);
    }

    /** A requested limit of 50,000 must reach the repository as a page size of 10,000. */
    @Test
    void capsLargeLimits() {
        when(modelRegistry.getRequired(MODEL_KEY)).thenReturn(descriptorFor(MODEL_KEY));
        stubCandidates(List.of());

        service.enqueueByDocumentType(DocumentType.TED_NOTICE_LOT, null, null, false, MODEL_KEY, true, 50_000);

        ArgumentCaptor<Pageable> pageRequest = ArgumentCaptor.forClass(Pageable.class);
        verify(representationRepository).findEmbeddingCandidatesByDocumentType(
                any(), any(), any(), any(Boolean.class), any(), any(), any(Boolean.class), pageRequest.capture());
        int effectivePageSize = pageRequest.getValue().getPageSize();
        assertThat(effectivePageSize).isEqualTo(10_000);
    }

    /** Makes the candidate query return {@code candidates} regardless of the arguments passed. */
    private void stubCandidates(List<DocumentTextRepresentation> candidates) {
        when(representationRepository.findEmbeddingCandidatesByDocumentType(
                any(), any(), any(), any(Boolean.class), any(), any(), any(Boolean.class), any(Pageable.class)))
                .thenReturn(candidates);
    }

    /** Builds a descriptor whose first and third constructor arguments are both {@code modelKey}. */
    private EmbeddingModelDescriptor descriptorFor(String modelKey) {
        return new EmbeddingModelDescriptor(modelKey, "mock", modelKey, 3, null, true, false, null, true, null, null, null);
    }
}

View File

@ -0,0 +1,88 @@
package at.procon.dip.embedding.startup;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.embedding.config.EmbeddingProperties;
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueResult;
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueService;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
/**
 * Unit tests for {@link EmbeddingStartupRunner}, exercised against a mocked enqueue
 * service and a mocked orchestrator.
 */
@ExtendWith(MockitoExtension.class)
class EmbeddingStartupRunnerTest {

    @Mock
    private ScopedEmbeddingEnqueueService enqueueService;

    @Mock
    private RepresentationEmbeddingOrchestrator orchestrator;

    private EmbeddingProperties properties;
    private EmbeddingStartupRunner runner;

    /** Builds a runner around freshly created, enabled embedding properties. */
    @BeforeEach
    void setUp() {
        EmbeddingProperties enabledProperties = new EmbeddingProperties();
        enabledProperties.setEnabled(true);
        properties = enabledProperties;
        runner = new EmbeddingStartupRunner(enabledProperties, enqueueService, orchestrator);
    }

    /**
     * Stubs two successive enqueue calls (seventh argument 0, then 1; eighth argument 2)
     * and expects the runner to issue both of them while never asking the orchestrator
     * to process a ready batch.
     */
    @Test
    void startupCanEnqueueAllMissingTedLotEmbeddingsAcrossBatches() {
        final DocumentType docType = DocumentType.TED_NOTICE_LOT;
        final RepresentationType repType = RepresentationType.SEMANTIC_TEXT;

        EmbeddingProperties.StartupProperties startupConfig = properties.getStartup();
        startupConfig.setEnqueueMissingEnabled(true);
        startupConfig.setDocumentType(docType);
        startupConfig.setRepresentationType(repType);
        startupConfig.setPrimaryOnly(true);
        startupConfig.setBatchSize(2);

        ScopedEmbeddingEnqueueResult firstResult = new ScopedEmbeddingEnqueueResult(
                docType, repType, null, true, "e5-default", false, 2, 2, 2);
        ScopedEmbeddingEnqueueResult secondResult = new ScopedEmbeddingEnqueueResult(
                docType, repType, null, true, "e5-default", false, 2, 1, 1);
        when(enqueueService.enqueueByDocumentType(docType, repType, null, true, null, false, 0, 2))
                .thenReturn(firstResult);
        when(enqueueService.enqueueByDocumentType(docType, repType, null, true, null, false, 1, 2))
                .thenReturn(secondResult);

        runner.run(null);

        verify(enqueueService).enqueueByDocumentType(docType, repType, null, true, null, false, 0, 2);
        verify(enqueueService).enqueueByDocumentType(docType, repType, null, true, null, false, 1, 2);
        verify(orchestrator, never()).processNextReadyBatch();
    }

    /**
     * With both the process-ready startup flag and the jobs flag switched on, the runner
     * is expected to call {@code processNextReadyBatch()} on the orchestrator.
     */
    @Test
    void startupCanProcessReadyJobsWhenJobSubsystemIsEnabled() {
        properties.getJobs().setEnabled(true);
        properties.getStartup().setProcessReadyEnabled(true);
        when(orchestrator.processNextReadyBatch()).thenReturn(3);

        runner.run(null);

        verify(orchestrator).processNextReadyBatch();
    }
}

View File

@ -0,0 +1,69 @@
package at.procon.dip.embedding.web;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import at.procon.dip.domain.document.DocumentType;
import at.procon.dip.domain.document.RepresentationType;
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueResult;
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueService;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.ArgumentCaptor;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
/**
 * Unit test for {@link EmbeddingAdminController} backed by a mocked
 * {@link ScopedEmbeddingEnqueueService}.
 */
@ExtendWith(MockitoExtension.class)
class EmbeddingAdminControllerTest {

    @Mock
    private ScopedEmbeddingEnqueueService enqueueService;

    private EmbeddingAdminController controller;

    /** Wires the controller under test to the mocked service. */
    @BeforeEach
    void setUp() {
        controller = new EmbeddingAdminController(enqueueService);
    }

    /**
     * Passes the strings {@code "ted_notice_lot"} and {@code "semantic-text"} to the
     * controller and expects the service to receive the corresponding enum constants,
     * with the remaining arguments forwarded as given.
     */
    @Test
    void acceptsLowercaseDocumentTypeAndRepresentationType() {
        ScopedEmbeddingEnqueueResult cannedResult = new ScopedEmbeddingEnqueueResult(
                DocumentType.TED_NOTICE_LOT,
                RepresentationType.SEMANTIC_TEXT,
                null,
                true,
                "e5-default",
                false,
                1000,
                0,
                0);
        when(enqueueService.enqueueByDocumentType(
                org.mockito.ArgumentMatchers.any(),
                org.mockito.ArgumentMatchers.any(),
                org.mockito.ArgumentMatchers.any(),
                org.mockito.ArgumentMatchers.anyBoolean(),
                org.mockito.ArgumentMatchers.any(),
                org.mockito.ArgumentMatchers.anyBoolean(),
                org.mockito.ArgumentMatchers.anyInt()))
                .thenReturn(cannedResult);
        ArgumentCaptor<DocumentType> docTypeCaptor = ArgumentCaptor.forClass(DocumentType.class);
        ArgumentCaptor<RepresentationType> repTypeCaptor = ArgumentCaptor.forClass(RepresentationType.class);

        controller.enqueueByDocumentType("ted_notice_lot", "semantic-text", null, true, null, false, 1000);

        verify(enqueueService).enqueueByDocumentType(
                docTypeCaptor.capture(),
                repTypeCaptor.capture(),
                org.mockito.ArgumentMatchers.isNull(),
                org.mockito.ArgumentMatchers.eq(true),
                org.mockito.ArgumentMatchers.isNull(),
                org.mockito.ArgumentMatchers.eq(false),
                org.mockito.ArgumentMatchers.eq(1000));
        DocumentType receivedDocType = docTypeCaptor.getValue();
        RepresentationType receivedRepType = repTypeCaptor.getValue();
        assertThat(receivedDocType).isEqualTo(DocumentType.TED_NOTICE_LOT);
        assertThat(receivedRepType).isEqualTo(RepresentationType.SEMANTIC_TEXT);
    }
}

View File

@ -91,7 +91,7 @@ import static org.assertj.core.api.Assertions.assertThat;
"spring.jpa.hibernate.ddl-auto=create-drop", "spring.jpa.hibernate.ddl-auto=create-drop",
"spring.jpa.show-sql=false", "spring.jpa.show-sql=false",
"spring.jpa.open-in-view=false", "spring.jpa.open-in-view=false",
"spring.jpa.properties.hibernate.default_schema=DOC", "spring.jpa.properties.hibernate.default_schema=doc",
"ted.vectorization.enabled=false", "ted.vectorization.enabled=false",
"dip.ingestion.enabled=true", "dip.ingestion.enabled=true",
"dip.ingestion.mail-adapter-enabled=true", "dip.ingestion.mail-adapter-enabled=true",

View File

@ -25,7 +25,7 @@ import org.testcontainers.junit.jupiter.Testcontainers;
"spring.jpa.hibernate.ddl-auto=create-drop", "spring.jpa.hibernate.ddl-auto=create-drop",
"spring.jpa.show-sql=false", "spring.jpa.show-sql=false",
"spring.jpa.open-in-view=false", "spring.jpa.open-in-view=false",
"spring.jpa.properties.hibernate.default_schema=DOC", "spring.jpa.properties.hibernate.default_schema=doc",
"spring.main.lazy-initialization=true", "spring.main.lazy-initialization=true",
"ted.vectorization.enabled=false", "ted.vectorization.enabled=false",
"ted.search.default-page-size=20", "ted.search.default-page-size=20",

View File

@ -28,7 +28,7 @@ import org.testcontainers.utility.DockerImageName;
"spring.jpa.hibernate.ddl-auto=create-drop", "spring.jpa.hibernate.ddl-auto=create-drop",
"spring.jpa.show-sql=false", "spring.jpa.show-sql=false",
"spring.jpa.open-in-view=false", "spring.jpa.open-in-view=false",
"spring.jpa.properties.hibernate.default_schema=DOC", "spring.jpa.properties.hibernate.default_schema=doc",
"spring.main.lazy-initialization=true", "spring.main.lazy-initialization=true",
"server.servlet.context-path=/api", "server.servlet.context-path=/api",

View File

@ -26,7 +26,7 @@ import org.testcontainers.junit.jupiter.Testcontainers;
"spring.jpa.hibernate.ddl-auto=create-drop", "spring.jpa.hibernate.ddl-auto=create-drop",
"spring.jpa.show-sql=false", "spring.jpa.show-sql=false",
"spring.jpa.open-in-view=false", "spring.jpa.open-in-view=false",
"spring.jpa.properties.hibernate.default_schema=DOC", "spring.jpa.properties.hibernate.default_schema=doc",
"spring.main.lazy-initialization=true", "spring.main.lazy-initialization=true",
"dip.runtime.mode=NEW", "dip.runtime.mode=NEW",
"dip.search.default-page-size=20", "dip.search.default-page-size=20",