fixed flyway db migration scripts
This commit is contained in:
parent
4bc503ed29
commit
f9df7c8d22
|
|
@ -5,9 +5,9 @@ package at.procon.dip.architecture;
|
||||||
*/
|
*/
|
||||||
public final class SchemaNames {
|
public final class SchemaNames {
|
||||||
|
|
||||||
public static final String DOC = "DOC";
|
public static final String DOC = "doc";
|
||||||
public static final String TED = "TED";
|
public static final String TED = "ted";
|
||||||
public static final String TIME = "TIME";
|
public static final String TIME = "time";
|
||||||
|
|
||||||
private SchemaNames() {
|
private SchemaNames() {
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,8 @@ import jakarta.persistence.*;
|
||||||
import java.time.OffsetDateTime;
|
import java.time.OffsetDateTime;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
import lombok.*;
|
import lombok.*;
|
||||||
|
import org.hibernate.annotations.JdbcTypeCode;
|
||||||
|
import org.hibernate.type.SqlTypes;
|
||||||
|
|
||||||
@Entity
|
@Entity
|
||||||
@Table(schema = SchemaNames.TIME, name = "time_entry_search_projection", indexes = {
|
@Table(schema = SchemaNames.TIME, name = "time_entry_search_projection", indexes = {
|
||||||
|
|
@ -35,7 +37,8 @@ public class TimeEntrySearchProjection {
|
||||||
private Document document;
|
private Document document;
|
||||||
|
|
||||||
@Enumerated(EnumType.STRING)
|
@Enumerated(EnumType.STRING)
|
||||||
@Column(name = "source_system", nullable = false, length = 32)
|
@JdbcTypeCode(SqlTypes.NAMED_ENUM)
|
||||||
|
@Column(name = "source_system", nullable = false, columnDefinition = "TIME.time_source_system")
|
||||||
private TimeSourceSystem sourceSystem;
|
private TimeSourceSystem sourceSystem;
|
||||||
|
|
||||||
@Column(name = "external_id", nullable = false, length = 255)
|
@Column(name = "external_id", nullable = false, length = 255)
|
||||||
|
|
|
||||||
|
|
@ -50,7 +50,7 @@ public class TimeSyncRun {
|
||||||
|
|
||||||
@Enumerated(EnumType.STRING)
|
@Enumerated(EnumType.STRING)
|
||||||
@JdbcTypeCode(SqlTypes.NAMED_ENUM)
|
@JdbcTypeCode(SqlTypes.NAMED_ENUM)
|
||||||
@Column(name = "run_type", columnDefinition = "TIME.time_sync_run_system")
|
@Column(name = "run_type", columnDefinition = "TIME.time_sync_run_type")
|
||||||
private TimeSyncRunType runType;
|
private TimeSyncRunType runType;
|
||||||
|
|
||||||
@Column(name = "scope_key", nullable = false, length = 255)
|
@Column(name = "scope_key", nullable = false, length = 255)
|
||||||
|
|
|
||||||
|
|
@ -38,13 +38,13 @@ public class Organization {
|
||||||
/**
|
/**
|
||||||
* Internal organization reference from XML (e.g., "ORG-0001").
|
* Internal organization reference from XML (e.g., "ORG-0001").
|
||||||
*/
|
*/
|
||||||
@Column(name = "org_reference", length = 50)
|
@Column(name = "org_reference", length = 100)
|
||||||
private String orgReference;
|
private String orgReference;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Role of the organization (e.g., "buyer", "review-body", "ted-esen").
|
* Role of the organization (e.g., "buyer", "review-body", "ted-esen").
|
||||||
*/
|
*/
|
||||||
@Column(name = "role", length = 50)
|
@Column(name = "role", length = 100)
|
||||||
private String role;
|
private String role;
|
||||||
|
|
||||||
@Column(name = "name", columnDefinition = "TEXT")
|
@Column(name = "name", columnDefinition = "TEXT")
|
||||||
|
|
@ -62,7 +62,7 @@ public class Organization {
|
||||||
@Column(name = "city", columnDefinition = "TEXT")
|
@Column(name = "city", columnDefinition = "TEXT")
|
||||||
private String city;
|
private String city;
|
||||||
|
|
||||||
@Column(name = "postal_code", length = 255)
|
@Column(name = "postal_code", columnDefinition = "TEXT")
|
||||||
private String postalCode;
|
private String postalCode;
|
||||||
|
|
||||||
@Column(name = "street_name", columnDefinition = "TEXT")
|
@Column(name = "street_name", columnDefinition = "TEXT")
|
||||||
|
|
@ -77,7 +77,7 @@ public class Organization {
|
||||||
@Column(name = "email", length = 255)
|
@Column(name = "email", length = 255)
|
||||||
private String email;
|
private String email;
|
||||||
|
|
||||||
@Column(name = "phone", length = 50)
|
@Column(name = "phone", length = 100)
|
||||||
private String phone;
|
private String phone;
|
||||||
|
|
||||||
@Column(name = "created_at", nullable = false, updatable = false)
|
@Column(name = "created_at", nullable = false, updatable = false)
|
||||||
|
|
|
||||||
|
|
@ -49,11 +49,11 @@ dip:
|
||||||
execution-batch-size: 48
|
execution-batch-size: 48
|
||||||
startup:
|
startup:
|
||||||
# Enqueue missing DOC representation embeddings on NEW-runtime startup.
|
# Enqueue missing DOC representation embeddings on NEW-runtime startup.
|
||||||
enqueue-missing-enabled: true
|
enqueue-missing-enabled: false
|
||||||
# Also process ready embedding jobs during startup. Requires dip.embedding.jobs.enabled=true.
|
# Also process ready embedding jobs during startup. Requires dip.embedding.jobs.enabled=true.
|
||||||
process-ready-enabled: true
|
process-ready-enabled: false
|
||||||
# Leave empty to enqueue missing embeddings for all document types, or set e.g. TED_NOTICE_LOT.
|
# Leave empty to enqueue missing embeddings for all document types, or set e.g. TED_NOTICE_LOT.
|
||||||
document-type: TED_NOTICE_LOT
|
document-type:
|
||||||
# Optional representation filter, e.g. SEMANTIC_TEXT.
|
# Optional representation filter, e.g. SEMANTIC_TEXT.
|
||||||
representation-type:
|
representation-type:
|
||||||
# Optional builder filter, e.g. ted-lot-clustering-text-v1.
|
# Optional builder filter, e.g. ted-lot-clustering-text-v1.
|
||||||
|
|
@ -371,9 +371,9 @@ dip:
|
||||||
structured-search-facet-bucket-limit: 12
|
structured-search-facet-bucket-limit: 12
|
||||||
lot-documents:
|
lot-documents:
|
||||||
# Materialize one canonical DOC document of type TED_NOTICE_LOT per TED lot.
|
# Materialize one canonical DOC document of type TED_NOTICE_LOT per TED lot.
|
||||||
enabled: true
|
enabled: false
|
||||||
# Optional startup/backfill path for notices that were imported before lot documents existed.
|
# Optional startup/backfill path for notices that were imported before lot documents existed.
|
||||||
startup-backfill-enabled: true
|
startup-backfill-enabled: false
|
||||||
# Maximum number of legacy TED lot documents to backfill during startup (0 = all)
|
# Maximum number of legacy TED lot documents to backfill during startup (0 = all)
|
||||||
startup-backfill-limit: 0
|
startup-backfill-limit: 0
|
||||||
# Queue embeddings whenever the lot semantic text representation is created or changed.
|
# Queue embeddings whenever the lot semantic text representation is created or changed.
|
||||||
|
|
|
||||||
|
|
@ -14,8 +14,7 @@ spring:
|
||||||
name: document-intelligence-platform
|
name: document-intelligence-platform
|
||||||
|
|
||||||
datasource:
|
datasource:
|
||||||
url: jdbc:postgresql://localhost:5432/RELM
|
url: jdbc:postgresql://${DB_HOST}:${DB_PORT}/${DB_NAME}
|
||||||
#url: jdbc:postgresql://94.130.218.54:32333/RELM
|
|
||||||
username: ${DB_USERNAME}
|
username: ${DB_USERNAME}
|
||||||
password: ${DB_PASSWORD}
|
password: ${DB_PASSWORD}
|
||||||
|
|
||||||
|
|
@ -37,21 +36,21 @@ spring:
|
||||||
hibernate:
|
hibernate:
|
||||||
dialect: org.hibernate.dialect.PostgreSQLDialect
|
dialect: org.hibernate.dialect.PostgreSQLDialect
|
||||||
format_sql: true
|
format_sql: true
|
||||||
default_schema: TED
|
default_schema: ted
|
||||||
jdbc:
|
jdbc:
|
||||||
batch_size: 25 # Match chunk size for optimal batch processing
|
batch_size: 25 # Match chunk size for optimal batch processing
|
||||||
order_inserts: true
|
order_inserts: true
|
||||||
order_updates: true
|
order_updates: true
|
||||||
|
|
||||||
flyway:
|
flyway:
|
||||||
enabled: false
|
enabled: true
|
||||||
locations: classpath:db/migration
|
locations: classpath:db/migration
|
||||||
baseline-on-migrate: true
|
baseline-on-migrate: true
|
||||||
create-schemas: true
|
create-schemas: true
|
||||||
schemas:
|
schemas:
|
||||||
- TED
|
- ted
|
||||||
- DOC
|
- doc
|
||||||
default-schema: TED
|
default-schema: ted
|
||||||
|
|
||||||
# Apache Camel Configuration
|
# Apache Camel Configuration
|
||||||
camel:
|
camel:
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,12 @@ BEGIN
|
||||||
FROM pg_constraint
|
FROM pg_constraint
|
||||||
WHERE conname = 'uq_doc_embedding_representation_model'
|
WHERE conname = 'uq_doc_embedding_representation_model'
|
||||||
AND conrelid = 'doc.doc_embedding'::regclass
|
AND conrelid = 'doc.doc_embedding'::regclass
|
||||||
|
) AND NOT EXISTS (
|
||||||
|
SELECT 1
|
||||||
|
FROM pg_class c
|
||||||
|
JOIN pg_namespace n ON n.oid = c.relnamespace
|
||||||
|
WHERE n.nspname = 'doc'
|
||||||
|
AND c.relname = 'uq_doc_embedding_representation_model'
|
||||||
) THEN
|
) THEN
|
||||||
ALTER TABLE DOC.doc_embedding
|
ALTER TABLE DOC.doc_embedding
|
||||||
ADD CONSTRAINT uq_doc_embedding_representation_model
|
ADD CONSTRAINT uq_doc_embedding_representation_model
|
||||||
|
|
@ -36,4 +42,4 @@ COMMENT ON COLUMN DOC.doc_embedding.embedding_vector IS
|
||||||
--
|
--
|
||||||
-- If you use inner product or euclidean distance for a model, pick the matching operator class:
|
-- If you use inner product or euclidean distance for a model, pick the matching operator class:
|
||||||
-- vector_ip_ops
|
-- vector_ip_ops
|
||||||
-- vector_l2_ops
|
-- vector_l2_ops
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,7 @@
|
||||||
CREATE SCHEMA IF NOT EXISTS ted;
|
CREATE SCHEMA IF NOT EXISTS ted;
|
||||||
|
|
||||||
-- Set search path to use TED schema
|
-- Set search path to use TED schema
|
||||||
SET search_path TO ted;
|
SET search_path TO ted, public;
|
||||||
|
|
||||||
-- Enable required PostgreSQL extensions (wenn Berechtigung vorhanden)
|
-- Enable required PostgreSQL extensions (wenn Berechtigung vorhanden)
|
||||||
-- Falls Extensions nicht erstellt werden können, müssen diese vom DBA manuell erstellt werden
|
-- Falls Extensions nicht erstellt werden können, müssen diese vom DBA manuell erstellt werden
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,8 @@ WITH legacy_package_map AS (
|
||||||
FROM legacy_package_map l
|
FROM legacy_package_map l
|
||||||
)
|
)
|
||||||
INSERT INTO DOC.doc_document (
|
INSERT INTO DOC.doc_document (
|
||||||
id, visibility, document_type, document_family, status, title, summary, mime_type, business_key, dedup_hash
|
id, visibility, document_type, document_family, status, title, summary, mime_type, business_key, dedup_hash,
|
||||||
|
created_at, updated_at
|
||||||
)
|
)
|
||||||
SELECT
|
SELECT
|
||||||
gen_random_uuid(),
|
gen_random_uuid(),
|
||||||
|
|
@ -40,7 +41,9 @@ SELECT
|
||||||
END,
|
END,
|
||||||
'application/gzip',
|
'application/gzip',
|
||||||
pd.business_key,
|
pd.business_key,
|
||||||
pd.dedup_hash
|
pd.dedup_hash,
|
||||||
|
CURRENT_TIMESTAMP,
|
||||||
|
CURRENT_TIMESTAMP
|
||||||
FROM package_documents pd
|
FROM package_documents pd
|
||||||
LEFT JOIN TED.ted_daily_package pkg
|
LEFT JOIN TED.ted_daily_package pkg
|
||||||
ON pkg.package_identifier = pd.package_identifier
|
ON pkg.package_identifier = pd.package_identifier
|
||||||
|
|
@ -74,7 +77,7 @@ WITH legacy_package_map AS (
|
||||||
AND doc.business_key LIKE 'TED:package:%'
|
AND doc.business_key LIKE 'TED:package:%'
|
||||||
)
|
)
|
||||||
INSERT INTO DOC.doc_relation (
|
INSERT INTO DOC.doc_relation (
|
||||||
id, parent_document_id, child_document_id, relation_type, sort_order, relation_metadata
|
id, parent_document_id, child_document_id, relation_type, sort_order, relation_metadata, created_at
|
||||||
)
|
)
|
||||||
SELECT
|
SELECT
|
||||||
gen_random_uuid(),
|
gen_random_uuid(),
|
||||||
|
|
@ -82,7 +85,8 @@ SELECT
|
||||||
l.child_document_id,
|
l.child_document_id,
|
||||||
'CONTAINS',
|
'CONTAINS',
|
||||||
NULL,
|
NULL,
|
||||||
'packageIdentifier=' || l.package_identifier
|
'packageIdentifier=' || l.package_identifier,
|
||||||
|
CURRENT_TIMESTAMP
|
||||||
FROM legacy_package_map l
|
FROM legacy_package_map l
|
||||||
JOIN package_documents pkg
|
JOIN package_documents pkg
|
||||||
ON pkg.package_identifier = l.package_identifier
|
ON pkg.package_identifier = l.package_identifier
|
||||||
|
|
|
||||||
|
|
@ -29,9 +29,21 @@ CREATE INDEX IF NOT EXISTS idx_doc_attr_name_context
|
||||||
ALTER TABLE DOC.doc_document_attribute
|
ALTER TABLE DOC.doc_document_attribute
|
||||||
ADD COLUMN IF NOT EXISTS string_value TEXT;
|
ADD COLUMN IF NOT EXISTS string_value TEXT;
|
||||||
|
|
||||||
UPDATE DOC.doc_document_attribute
|
DO $$
|
||||||
SET string_value = attribute_value
|
BEGIN
|
||||||
WHERE string_value IS NULL AND attribute_value IS NOT NULL;
|
IF EXISTS (
|
||||||
|
SELECT 1
|
||||||
|
FROM information_schema.columns
|
||||||
|
WHERE table_schema = 'doc'
|
||||||
|
AND table_name = 'doc_document_attribute'
|
||||||
|
AND column_name = 'attribute_value'
|
||||||
|
) THEN
|
||||||
|
UPDATE DOC.doc_document_attribute
|
||||||
|
SET string_value = attribute_value
|
||||||
|
WHERE string_value IS NULL AND attribute_value IS NOT NULL;
|
||||||
|
END IF;
|
||||||
|
END
|
||||||
|
$$;
|
||||||
|
|
||||||
ALTER TABLE DOC.doc_document_attribute
|
ALTER TABLE DOC.doc_document_attribute
|
||||||
ADD COLUMN IF NOT EXISTS number_value NUMERIC;
|
ADD COLUMN IF NOT EXISTS number_value NUMERIC;
|
||||||
|
|
@ -54,7 +66,7 @@ ALTER TABLE DOC.doc_document_attribute
|
||||||
(CASE WHEN number_value IS NOT NULL THEN 1 ELSE 0 END) +
|
(CASE WHEN number_value IS NOT NULL THEN 1 ELSE 0 END) +
|
||||||
(CASE WHEN date_value IS NOT NULL THEN 1 ELSE 0 END) +
|
(CASE WHEN date_value IS NOT NULL THEN 1 ELSE 0 END) +
|
||||||
(CASE WHEN boolean_value IS NOT NULL THEN 1 ELSE 0 END) = 1
|
(CASE WHEN boolean_value IS NOT NULL THEN 1 ELSE 0 END) = 1
|
||||||
);
|
) NOT VALID;
|
||||||
|
|
||||||
COMMENT ON COLUMN DOC.doc_attribute_name.attribute_context IS 'Optional namespace/context for avoiding name conflicts. GLOBAL is the default context.';
|
COMMENT ON COLUMN DOC.doc_attribute_name.attribute_context IS 'Optional namespace/context for avoiding name conflicts. GLOBAL is the default context.';
|
||||||
COMMENT ON COLUMN DOC.doc_attribute_name.attribute_value_type IS 'Declared type of the attribute value for this catalog entry.';
|
COMMENT ON COLUMN DOC.doc_attribute_name.attribute_value_type IS 'Declared type of the attribute value for this catalog entry.';
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,7 @@ ALTER TABLE DOC.doc_document_attribute
|
||||||
(CASE WHEN date_value IS NOT NULL THEN 1 ELSE 0 END) +
|
(CASE WHEN date_value IS NOT NULL THEN 1 ELSE 0 END) +
|
||||||
(CASE WHEN datetime_value IS NOT NULL THEN 1 ELSE 0 END) +
|
(CASE WHEN datetime_value IS NOT NULL THEN 1 ELSE 0 END) +
|
||||||
(CASE WHEN boolean_value IS NOT NULL THEN 1 ELSE 0 END) = 1
|
(CASE WHEN boolean_value IS NOT NULL THEN 1 ELSE 0 END) = 1
|
||||||
);
|
) NOT VALID;
|
||||||
|
|
||||||
COMMENT ON COLUMN DOC.doc_document_attribute.integer_value IS 'Integer representation when the catalog entry type is INTEGER.';
|
COMMENT ON COLUMN DOC.doc_document_attribute.integer_value IS 'Integer representation when the catalog entry type is INTEGER.';
|
||||||
COMMENT ON COLUMN DOC.doc_document_attribute.datetime_value IS 'Date-time representation when the catalog entry type is DATETIME.';
|
COMMENT ON COLUMN DOC.doc_document_attribute.datetime_value IS 'Date-time representation when the catalog entry type is DATETIME.';
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,10 @@
|
||||||
-- TIME Phase T3: search projection and representation materialization foundation for time entries.
|
-- TIME Phase T3: search projection and representation materialization foundation for time entries.
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS TIME.time_entry_search_projection (
|
CREATE TABLE IF NOT EXISTS "time".time_entry_search_projection (
|
||||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||||
time_entry_id UUID NOT NULL UNIQUE REFERENCES TIME.time_entry(id) ON DELETE CASCADE,
|
time_entry_id UUID NOT NULL UNIQUE REFERENCES "time".time_entry(id) ON DELETE CASCADE,
|
||||||
document_id UUID NOT NULL UNIQUE REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
|
document_id UUID NOT NULL UNIQUE REFERENCES DOC.doc_document(id) ON DELETE CASCADE,
|
||||||
source_system TIME.time_source_system NOT NULL,
|
source_system "time".time_source_system NOT NULL,
|
||||||
external_id VARCHAR(255) NOT NULL,
|
external_id VARCHAR(255) NOT NULL,
|
||||||
language_code VARCHAR(16),
|
language_code VARCHAR(16),
|
||||||
entry_start TIMESTAMP WITH TIME ZONE,
|
entry_start TIMESTAMP WITH TIME ZONE,
|
||||||
|
|
@ -54,8 +54,8 @@ CREATE TABLE IF NOT EXISTS TIME.time_entry_search_projection (
|
||||||
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
|
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_document ON TIME.time_entry_search_projection(document_id);
|
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_document ON "time".time_entry_search_projection(document_id);
|
||||||
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_source ON TIME.time_entry_search_projection(source_system, external_id);
|
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_source ON "time".time_entry_search_projection(source_system, external_id);
|
||||||
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_entry_start ON TIME.time_entry_search_projection(entry_start DESC);
|
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_entry_start ON "time".time_entry_search_projection(entry_start DESC);
|
||||||
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_person ON TIME.time_entry_search_projection(person_external_id);
|
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_person ON "time".time_entry_search_projection(person_external_id);
|
||||||
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_activity_type ON TIME.time_entry_search_projection(activity_type_id);
|
CREATE INDEX IF NOT EXISTS idx_time_entry_search_projection_activity_type ON "time".time_entry_search_projection(activity_type_id);
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,61 @@
|
||||||
|
-- Re-create the filter CHECK constraints on doc.doc_embedding_cluster_run so they
-- accept the same values as the Java DOC enums.
-- V32 added TED_NOTICE_LOT to doc.doc_document; cluster runs persist their own
-- optional document_type filter, so the list has to be expanded here as well.
-- Every filter column is optional, hence NULL always passes.
-- DROP and ADD share one ALTER TABLE; PostgreSQL applies the drop before the add.

ALTER TABLE doc.doc_embedding_cluster_run
    DROP CONSTRAINT IF EXISTS doc_embedding_cluster_run_document_type_check,
    ADD CONSTRAINT doc_embedding_cluster_run_document_type_check CHECK (
        document_type IS NULL
        OR document_type IN (
            'TED_PACKAGE', 'TED_NOTICE', 'TED_NOTICE_LOT', 'TIME_ENTRY',
            'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML', 'XML_GENERIC',
            'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'UNKNOWN'
        )
    );

ALTER TABLE doc.doc_embedding_cluster_run
    DROP CONSTRAINT IF EXISTS doc_embedding_cluster_run_document_family_check,
    ADD CONSTRAINT doc_embedding_cluster_run_document_family_check CHECK (
        document_family IS NULL
        OR document_family IN (
            'PROCUREMENT', 'TIME', 'MAIL', 'ATTACHMENT', 'KNOWLEDGE', 'GENERIC'
        )
    );

ALTER TABLE doc.doc_embedding_cluster_run
    DROP CONSTRAINT IF EXISTS doc_embedding_cluster_run_representation_type_check,
    ADD CONSTRAINT doc_embedding_cluster_run_representation_type_check CHECK (
        representation_type IS NULL
        OR representation_type IN (
            'FULLTEXT', 'SEMANTIC_TEXT', 'SUMMARY', 'TITLE_ABSTRACT',
            'CHUNK', 'METADATA_ENRICHED', 'ATTACHMENT_ROLLUP'
        )
    );
|
||||||
|
|
@ -0,0 +1,172 @@
|
||||||
|
-- Align DOC enum-backed columns and Hibernate-created check constraints with
-- the current Java enum values.
--
-- Part 1 extends native enum types in schema doc, but only those that exist.
-- Part 2 re-creates the named CHECK constraints, but only those that exist.
-- A database that has neither the types nor the constraints is left untouched.

-- Part 1: append missing labels to existing enum types.
-- "step" fixes the processing order so labels are appended in a stable sequence.
DO $$
DECLARE
    addition RECORD;
BEGIN
    FOR addition IN
        SELECT v.type_name, v.enum_label
        FROM (VALUES
            (1, 'doc_document_type',       'TED_PACKAGE'),
            (2, 'doc_document_type',       'TED_NOTICE_LOT'),
            (3, 'doc_document_type',       'TIME_ENTRY'),
            (4, 'doc_document_family',     'TIME'),
            (5, 'doc_source_type',         'PACKAGE_CHILD'),
            (6, 'doc_source_type',         'MAIL_ATTACHMENT'),
            (7, 'doc_representation_type', 'ATTACHMENT_ROLLUP'),
            (8, 'doc_distance_metric',     'EUCLIDEAN')
        ) AS v(step, type_name, enum_label)
        ORDER BY v.step
    LOOP
        -- Skip types that are not present in schema doc.
        PERFORM 1
        FROM pg_namespace n
        JOIN pg_type t ON t.typnamespace = n.oid
        WHERE n.nspname = 'doc'
          AND t.typname = addition.type_name;

        IF FOUND THEN
            EXECUTE format(
                'ALTER TYPE doc.%I ADD VALUE IF NOT EXISTS %L',
                addition.type_name,
                addition.enum_label
            );
        END IF;
    END LOOP;
END
$$;

-- Part 2a: doc.doc_document.document_type
DO $$
BEGIN
    PERFORM 1
    FROM pg_constraint c
    JOIN pg_class r ON r.oid = c.conrelid
    JOIN pg_namespace n ON n.oid = r.relnamespace
    WHERE n.nspname = 'doc'
      AND r.relname = 'doc_document'
      AND c.conname = 'doc_document_document_type_check';

    -- Nothing to replace when the constraint was never created.
    IF NOT FOUND THEN
        RETURN;
    END IF;

    ALTER TABLE doc.doc_document
        DROP CONSTRAINT doc_document_document_type_check,
        ADD CONSTRAINT doc_document_document_type_check CHECK (
            document_type IN (
                'TED_PACKAGE', 'TED_NOTICE', 'TED_NOTICE_LOT', 'TIME_ENTRY',
                'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML', 'XML_GENERIC',
                'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'UNKNOWN'
            )
        );
END
$$;

-- Part 2b: doc.doc_document.document_family
DO $$
BEGIN
    PERFORM 1
    FROM pg_constraint c
    JOIN pg_class r ON r.oid = c.conrelid
    JOIN pg_namespace n ON n.oid = r.relnamespace
    WHERE n.nspname = 'doc'
      AND r.relname = 'doc_document'
      AND c.conname = 'doc_document_document_family_check';

    IF NOT FOUND THEN
        RETURN;
    END IF;

    ALTER TABLE doc.doc_document
        DROP CONSTRAINT doc_document_document_family_check,
        ADD CONSTRAINT doc_document_document_family_check CHECK (
            document_family IN (
                'PROCUREMENT', 'TIME', 'MAIL', 'ATTACHMENT', 'KNOWLEDGE', 'GENERIC'
            )
        );
END
$$;

-- Part 2c: doc.doc_source.source_type
DO $$
BEGIN
    PERFORM 1
    FROM pg_constraint c
    JOIN pg_class r ON r.oid = c.conrelid
    JOIN pg_namespace n ON n.oid = r.relnamespace
    WHERE n.nspname = 'doc'
      AND r.relname = 'doc_source'
      AND c.conname = 'doc_source_source_type_check';

    IF NOT FOUND THEN
        RETURN;
    END IF;

    ALTER TABLE doc.doc_source
        DROP CONSTRAINT doc_source_source_type_check,
        ADD CONSTRAINT doc_source_source_type_check CHECK (
            source_type IN (
                'TED_PACKAGE', 'PACKAGE_CHILD', 'MAIL', 'MAIL_ATTACHMENT',
                'FILE_SYSTEM', 'REST_UPLOAD', 'MANUAL_UPLOAD', 'ZIP_CHILD',
                'API', 'MIGRATION'
            )
        );
END
$$;
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
-- The JPA mappings store these normalized labels/descriptions as TEXT, so the
-- matching "time".time_entry columns are converted to TEXT as well.
-- The schema name is quoted because TIME is also an SQL type keyword.

ALTER TABLE "time".time_entry
    ALTER COLUMN description_short TYPE TEXT,
    ALTER COLUMN search_anchor_label TYPE TEXT;
|
||||||
|
|
@ -0,0 +1,39 @@
|
||||||
|
-- Align legacy TED procurement_document table with the current JPA entity.
--
-- Steps:
--   1. Add the columns the entity expects (skipped when already present).
--   2. Backfill issue_datetime from the legacy issue_date / issue_time columns,
--      touching only rows whose issue_datetime is still NULL.
--   3. Convert the listed columns to TEXT.

-- Step 1: new columns.
ALTER TABLE ted.procurement_document
    ADD COLUMN IF NOT EXISTS notice_url VARCHAR(255),
    ADD COLUMN IF NOT EXISTS issue_datetime TIMESTAMP WITH TIME ZONE,
    ADD COLUMN IF NOT EXISTS embedding_token_count INTEGER;

-- Step 2: backfill issue_datetime.
-- The UPDATE reads both issue_date and issue_time, so both are checked first.
-- PL/pgSQL plans a statement only when its branch is reached, so a branch that
-- names a missing column is harmless as long as it is never executed.
DO $$
DECLARE
    has_issue_date BOOLEAN;
    has_issue_time BOOLEAN;
BEGIN
    -- The aggregate always returns one row; COALESCE covers "no matching columns".
    SELECT COALESCE(bool_or(column_name = 'issue_date'), FALSE),
           COALESCE(bool_or(column_name = 'issue_time'), FALSE)
    INTO has_issue_date, has_issue_time
    FROM information_schema.columns
    WHERE table_schema = 'ted'
      AND table_name = 'procurement_document'
      AND column_name IN ('issue_date', 'issue_time');

    IF has_issue_date AND has_issue_time THEN
        -- A missing time of day counts as midnight; the local timestamp is
        -- interpreted in the session time zone.
        UPDATE ted.procurement_document
        SET issue_datetime = (issue_date + COALESCE(issue_time, TIME '00:00')) AT TIME ZONE current_setting('TIMEZONE')
        WHERE issue_datetime IS NULL
          AND issue_date IS NOT NULL;
    ELSIF has_issue_date THEN
        -- No issue_time column at all: every row is backfilled as midnight.
        UPDATE ted.procurement_document
        SET issue_datetime = (issue_date + TIME '00:00') AT TIME ZONE current_setting('TIMEZONE')
        WHERE issue_datetime IS NULL
          AND issue_date IS NOT NULL;
    END IF;
END
$$;

-- Step 3: columns mapped as TEXT in the entity.
ALTER TABLE ted.procurement_document
    ALTER COLUMN buyer_city TYPE TEXT,
    ALTER COLUMN buyer_postal_code TYPE TEXT,
    ALTER COLUMN internal_reference TYPE TEXT;

ALTER TABLE ted.procurement_lot
    ALTER COLUMN internal_id TYPE TEXT;
|
||||||
|
|
@ -0,0 +1,16 @@
|
||||||
|
-- Convert columns to TEXT where the current JPA entities map them as TEXT.
-- Statements are grouped per table.

ALTER TABLE doc.doc_document
    ALTER COLUMN title TYPE TEXT;

ALTER TABLE ted.ted_notice_projection
    ALTER COLUMN notice_url TYPE TEXT,
    ALTER COLUMN buyer_postal_code TYPE TEXT,
    ALTER COLUMN internal_reference TYPE TEXT;

ALTER TABLE ted.ted_notice_organization
    ALTER COLUMN company_id TYPE TEXT;
|
||||||
|
|
@ -27,6 +27,7 @@ BEGIN
|
||||||
AND t.typname = 'doc_source_type'
|
AND t.typname = 'doc_source_type'
|
||||||
) THEN
|
) THEN
|
||||||
ALTER TYPE DOC.doc_source_type ADD VALUE IF NOT EXISTS 'PACKAGE_CHILD';
|
ALTER TYPE DOC.doc_source_type ADD VALUE IF NOT EXISTS 'PACKAGE_CHILD';
|
||||||
|
ALTER TYPE DOC.doc_source_type ADD VALUE IF NOT EXISTS 'MAIL_ATTACHMENT';
|
||||||
END IF;
|
END IF;
|
||||||
END
|
END
|
||||||
$$;
|
$$;
|
||||||
|
|
@ -34,6 +35,7 @@ $$;
|
||||||
DO $$
|
DO $$
|
||||||
BEGIN
|
BEGIN
|
||||||
IF EXISTS (
|
IF EXISTS (
|
||||||
|
SELECT 1
|
||||||
FROM pg_constraint c
|
FROM pg_constraint c
|
||||||
JOIN pg_class r ON r.oid = c.conrelid
|
JOIN pg_class r ON r.oid = c.conrelid
|
||||||
JOIN pg_namespace n ON n.oid = r.relnamespace
|
JOIN pg_namespace n ON n.oid = r.relnamespace
|
||||||
|
|
@ -46,7 +48,7 @@ BEGIN
|
||||||
ADD CONSTRAINT doc_document_document_type_check
|
ADD CONSTRAINT doc_document_document_type_check
|
||||||
CHECK (
|
CHECK (
|
||||||
document_type IN (
|
document_type IN (
|
||||||
'TED_PACKAGE', 'TED_NOTICE', 'EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML',
|
'TED_PACKAGE', 'TED_NOTICE', 'TED_NOTICE_LOT','EMAIL', 'MIME_MESSAGE', 'PDF', 'DOCX', 'HTML',
|
||||||
'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'TIME_ENTRY', 'UNKNOWN'
|
'XML_GENERIC', 'TEXT', 'MARKDOWN', 'ZIP_ARCHIVE', 'GENERIC_BINARY', 'TIME_ENTRY', 'UNKNOWN'
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
|
@ -70,7 +72,7 @@ BEGIN
|
||||||
ADD CONSTRAINT doc_source_source_type_check
|
ADD CONSTRAINT doc_source_source_type_check
|
||||||
CHECK (
|
CHECK (
|
||||||
source_type IN (
|
source_type IN (
|
||||||
'TED_PACKAGE', 'PACKAGE_CHILD', 'MAIL', 'FILE_SYSTEM', 'REST_UPLOAD',
|
'TED_PACKAGE', 'PACKAGE_CHILD', 'MAIL', 'MAIL_ATTACHMENT', 'FILE_SYSTEM', 'REST_UPLOAD',
|
||||||
'MANUAL_UPLOAD', 'ZIP_CHILD', 'API', 'MIGRATION'
|
'MANUAL_UPLOAD', 'ZIP_CHILD', 'API', 'MIGRATION'
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,95 @@
|
||||||
|
package at.procon.dip.domain.ted.startup;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.ted.config.TedProjectionProperties;
|
||||||
|
import at.procon.dip.domain.ted.entity.TedNoticeProjection;
|
||||||
|
import at.procon.dip.domain.ted.repository.TedNoticeProjectionRepository;
|
||||||
|
import at.procon.dip.domain.ted.service.TedLotDocumentMaterializationService;
|
||||||
|
import at.procon.dip.domain.ted.service.TedNoticeProjectionService;
|
||||||
|
import at.procon.ted.model.entity.ProcurementDocument;
|
||||||
|
import at.procon.ted.repository.ProcurementDocumentRepository;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.mockito.ArgumentCaptor;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
import org.springframework.data.domain.PageImpl;
|
||||||
|
import org.springframework.data.domain.Pageable;
|
||||||
|
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
class TedProjectionStartupRunnerTest {
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private ProcurementDocumentRepository procurementDocumentRepository;
|
||||||
|
@Mock
|
||||||
|
private TedNoticeProjectionRepository projectionRepository;
|
||||||
|
@Mock
|
||||||
|
private TedNoticeProjectionService projectionService;
|
||||||
|
@Mock
|
||||||
|
private TedLotDocumentMaterializationService lotDocumentMaterializationService;
|
||||||
|
|
||||||
|
private TedProjectionProperties properties;
|
||||||
|
private TedProjectionStartupRunner runner;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
properties = new TedProjectionProperties();
|
||||||
|
runner = new TedProjectionStartupRunner(
|
||||||
|
properties,
|
||||||
|
procurementDocumentRepository,
|
||||||
|
projectionRepository,
|
||||||
|
projectionService,
|
||||||
|
lotDocumentMaterializationService
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void lotDocumentBackfillTreatsZeroLimitAsUnboundedBatch() {
|
||||||
|
properties.getLotDocuments().setEnabled(true);
|
||||||
|
properties.getLotDocuments().setStartupBackfillEnabled(true);
|
||||||
|
properties.getLotDocuments().setStartupBackfillLimit(0);
|
||||||
|
|
||||||
|
UUID projectionId = UUID.randomUUID();
|
||||||
|
TedNoticeProjection projection = new TedNoticeProjection();
|
||||||
|
projection.setId(projectionId);
|
||||||
|
|
||||||
|
when(projectionRepository.findAll(any(Pageable.class)))
|
||||||
|
.thenAnswer(invocation -> new PageImpl<>(List.of(projection), invocation.getArgument(0), 1));
|
||||||
|
when(lotDocumentMaterializationService.materializeProjectionLots(projectionId)).thenReturn(2);
|
||||||
|
|
||||||
|
runner.run(null);
|
||||||
|
|
||||||
|
ArgumentCaptor<Pageable> pageable = ArgumentCaptor.forClass(Pageable.class);
|
||||||
|
verify(projectionRepository).findAll(pageable.capture());
|
||||||
|
assertThat(pageable.getValue().getPageSize()).isEqualTo(1000);
|
||||||
|
verify(lotDocumentMaterializationService).materializeProjectionLots(projectionId);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void noticeProjectionBackfillTreatsZeroLimitAsUnboundedBatch() {
|
||||||
|
properties.setStartupBackfillEnabled(true);
|
||||||
|
properties.setStartupBackfillLimit(0);
|
||||||
|
|
||||||
|
UUID legacyDocumentId = UUID.randomUUID();
|
||||||
|
ProcurementDocument legacyDocument = new ProcurementDocument();
|
||||||
|
legacyDocument.setId(legacyDocumentId);
|
||||||
|
|
||||||
|
when(procurementDocumentRepository.findAll(any(Pageable.class)))
|
||||||
|
.thenAnswer(invocation -> new PageImpl<>(List.of(legacyDocument), invocation.getArgument(0), 1));
|
||||||
|
when(projectionRepository.existsByLegacyProcurementDocumentId(legacyDocumentId)).thenReturn(false);
|
||||||
|
|
||||||
|
runner.run(null);
|
||||||
|
|
||||||
|
ArgumentCaptor<Pageable> pageable = ArgumentCaptor.forClass(Pageable.class);
|
||||||
|
verify(procurementDocumentRepository).findAll(pageable.capture());
|
||||||
|
assertThat(pageable.getValue().getPageSize()).isEqualTo(1000);
|
||||||
|
verify(projectionService).registerOrRefreshProjection(legacyDocument);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,112 @@
|
||||||
|
package at.procon.dip.embedding.service;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import at.procon.dip.domain.document.entity.Document;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentTextRepresentationRepository;
|
||||||
|
import at.procon.dip.embedding.job.service.EmbeddingJobService;
|
||||||
|
import at.procon.dip.embedding.model.EmbeddingJobType;
|
||||||
|
import at.procon.dip.embedding.model.EmbeddingModelDescriptor;
|
||||||
|
import at.procon.dip.embedding.registry.EmbeddingModelRegistry;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.mockito.ArgumentCaptor;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
import org.springframework.data.domain.Pageable;
|
||||||
|
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
class ScopedEmbeddingEnqueueServiceTest {
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private DocumentTextRepresentationRepository representationRepository;
|
||||||
|
@Mock
|
||||||
|
private EmbeddingJobService jobService;
|
||||||
|
@Mock
|
||||||
|
private EmbeddingModelRegistry modelRegistry;
|
||||||
|
|
||||||
|
private ScopedEmbeddingEnqueueService service;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
service = new ScopedEmbeddingEnqueueService(representationRepository, jobService, modelRegistry);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void queuesOnlyMatchingRepresentationsForDocumentType() {
|
||||||
|
String modelKey = "e5-default";
|
||||||
|
UUID documentId = UUID.randomUUID();
|
||||||
|
UUID representationId = UUID.randomUUID();
|
||||||
|
Document document = new Document();
|
||||||
|
document.setId(documentId);
|
||||||
|
DocumentTextRepresentation representation = new DocumentTextRepresentation();
|
||||||
|
representation.setId(representationId);
|
||||||
|
representation.setDocument(document);
|
||||||
|
|
||||||
|
when(modelRegistry.getRequiredDefaultDocumentModelKey()).thenReturn(modelKey);
|
||||||
|
when(modelRegistry.getRequired(modelKey)).thenReturn(anyModel(modelKey));
|
||||||
|
when(representationRepository.findEmbeddingCandidatesByDocumentType(
|
||||||
|
any(), any(), any(), any(Boolean.class), any(), any(), any(Boolean.class), any(Pageable.class)))
|
||||||
|
.thenReturn(List.of(representation));
|
||||||
|
|
||||||
|
ScopedEmbeddingEnqueueResult result = service.enqueueByDocumentType(
|
||||||
|
DocumentType.TED_NOTICE_LOT,
|
||||||
|
RepresentationType.SEMANTIC_TEXT,
|
||||||
|
null,
|
||||||
|
true,
|
||||||
|
null,
|
||||||
|
false,
|
||||||
|
250
|
||||||
|
);
|
||||||
|
|
||||||
|
verify(representationRepository).findEmbeddingCandidatesByDocumentType(
|
||||||
|
DocumentType.TED_NOTICE_LOT,
|
||||||
|
RepresentationType.SEMANTIC_TEXT,
|
||||||
|
null,
|
||||||
|
true,
|
||||||
|
modelKey,
|
||||||
|
EmbeddingStatus.COMPLETED,
|
||||||
|
false,
|
||||||
|
Pageable.ofSize(250)
|
||||||
|
);
|
||||||
|
verify(jobService).enqueueForRepresentation(
|
||||||
|
documentId,
|
||||||
|
representationId,
|
||||||
|
modelKey,
|
||||||
|
EmbeddingJobType.DOCUMENT_EMBED
|
||||||
|
);
|
||||||
|
assertThat(result.jobsQueuedOrAlreadyActive()).isEqualTo(1);
|
||||||
|
assertThat(result.matchedRepresentations()).isEqualTo(1);
|
||||||
|
assertThat(result.modelKey()).isEqualTo(modelKey);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void capsLargeLimits() {
|
||||||
|
String modelKey = "e5-default";
|
||||||
|
when(modelRegistry.getRequired(modelKey)).thenReturn(anyModel(modelKey));
|
||||||
|
when(representationRepository.findEmbeddingCandidatesByDocumentType(
|
||||||
|
any(), any(), any(), any(Boolean.class), any(), any(), any(Boolean.class), any(Pageable.class)))
|
||||||
|
.thenReturn(List.of());
|
||||||
|
|
||||||
|
service.enqueueByDocumentType(DocumentType.TED_NOTICE_LOT, null, null, false, modelKey, true, 50_000);
|
||||||
|
|
||||||
|
ArgumentCaptor<Pageable> pageable = ArgumentCaptor.forClass(Pageable.class);
|
||||||
|
verify(representationRepository).findEmbeddingCandidatesByDocumentType(
|
||||||
|
any(), any(), any(), any(Boolean.class), any(), any(), any(Boolean.class), pageable.capture());
|
||||||
|
assertThat(pageable.getValue().getPageSize()).isEqualTo(10_000);
|
||||||
|
}
|
||||||
|
|
||||||
|
private EmbeddingModelDescriptor anyModel(String modelKey) {
|
||||||
|
return new EmbeddingModelDescriptor(modelKey, "mock", modelKey, 3, null, true, false, null, true, null, null, null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,88 @@
|
||||||
|
package at.procon.dip.embedding.startup;
|
||||||
|
|
||||||
|
import static org.mockito.Mockito.never;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import at.procon.dip.embedding.config.EmbeddingProperties;
|
||||||
|
import at.procon.dip.embedding.service.RepresentationEmbeddingOrchestrator;
|
||||||
|
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueResult;
|
||||||
|
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueService;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
class EmbeddingStartupRunnerTest {
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private ScopedEmbeddingEnqueueService enqueueService;
|
||||||
|
@Mock
|
||||||
|
private RepresentationEmbeddingOrchestrator orchestrator;
|
||||||
|
|
||||||
|
private EmbeddingProperties properties;
|
||||||
|
private EmbeddingStartupRunner runner;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
properties = new EmbeddingProperties();
|
||||||
|
properties.setEnabled(true);
|
||||||
|
runner = new EmbeddingStartupRunner(properties, enqueueService, orchestrator);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void startupCanEnqueueAllMissingTedLotEmbeddingsAcrossBatches() {
|
||||||
|
EmbeddingProperties.StartupProperties startup = properties.getStartup();
|
||||||
|
startup.setEnqueueMissingEnabled(true);
|
||||||
|
startup.setDocumentType(DocumentType.TED_NOTICE_LOT);
|
||||||
|
startup.setRepresentationType(RepresentationType.SEMANTIC_TEXT);
|
||||||
|
startup.setPrimaryOnly(true);
|
||||||
|
startup.setBatchSize(2);
|
||||||
|
|
||||||
|
when(enqueueService.enqueueByDocumentType(
|
||||||
|
DocumentType.TED_NOTICE_LOT,
|
||||||
|
RepresentationType.SEMANTIC_TEXT,
|
||||||
|
null,
|
||||||
|
true,
|
||||||
|
null,
|
||||||
|
false,
|
||||||
|
0,
|
||||||
|
2
|
||||||
|
)).thenReturn(new ScopedEmbeddingEnqueueResult(
|
||||||
|
DocumentType.TED_NOTICE_LOT, RepresentationType.SEMANTIC_TEXT, null, true, "e5-default", false, 2, 2, 2));
|
||||||
|
when(enqueueService.enqueueByDocumentType(
|
||||||
|
DocumentType.TED_NOTICE_LOT,
|
||||||
|
RepresentationType.SEMANTIC_TEXT,
|
||||||
|
null,
|
||||||
|
true,
|
||||||
|
null,
|
||||||
|
false,
|
||||||
|
1,
|
||||||
|
2
|
||||||
|
)).thenReturn(new ScopedEmbeddingEnqueueResult(
|
||||||
|
DocumentType.TED_NOTICE_LOT, RepresentationType.SEMANTIC_TEXT, null, true, "e5-default", false, 2, 1, 1));
|
||||||
|
|
||||||
|
runner.run(null);
|
||||||
|
|
||||||
|
verify(enqueueService).enqueueByDocumentType(
|
||||||
|
DocumentType.TED_NOTICE_LOT, RepresentationType.SEMANTIC_TEXT, null, true, null, false, 0, 2);
|
||||||
|
verify(enqueueService).enqueueByDocumentType(
|
||||||
|
DocumentType.TED_NOTICE_LOT, RepresentationType.SEMANTIC_TEXT, null, true, null, false, 1, 2);
|
||||||
|
verify(orchestrator, never()).processNextReadyBatch();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void startupCanProcessReadyJobsWhenJobSubsystemIsEnabled() {
|
||||||
|
properties.getStartup().setProcessReadyEnabled(true);
|
||||||
|
properties.getJobs().setEnabled(true);
|
||||||
|
when(orchestrator.processNextReadyBatch()).thenReturn(3);
|
||||||
|
|
||||||
|
runner.run(null);
|
||||||
|
|
||||||
|
verify(orchestrator).processNextReadyBatch();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,69 @@
|
||||||
|
package at.procon.dip.embedding.web;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueResult;
|
||||||
|
import at.procon.dip.embedding.service.ScopedEmbeddingEnqueueService;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.mockito.ArgumentCaptor;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
class EmbeddingAdminControllerTest {
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private ScopedEmbeddingEnqueueService enqueueService;
|
||||||
|
|
||||||
|
private EmbeddingAdminController controller;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
controller = new EmbeddingAdminController(enqueueService);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void acceptsLowercaseDocumentTypeAndRepresentationType() {
|
||||||
|
when(enqueueService.enqueueByDocumentType(
|
||||||
|
org.mockito.ArgumentMatchers.any(),
|
||||||
|
org.mockito.ArgumentMatchers.any(),
|
||||||
|
org.mockito.ArgumentMatchers.any(),
|
||||||
|
org.mockito.ArgumentMatchers.anyBoolean(),
|
||||||
|
org.mockito.ArgumentMatchers.any(),
|
||||||
|
org.mockito.ArgumentMatchers.anyBoolean(),
|
||||||
|
org.mockito.ArgumentMatchers.anyInt()))
|
||||||
|
.thenReturn(new ScopedEmbeddingEnqueueResult(
|
||||||
|
DocumentType.TED_NOTICE_LOT,
|
||||||
|
RepresentationType.SEMANTIC_TEXT,
|
||||||
|
null,
|
||||||
|
true,
|
||||||
|
"e5-default",
|
||||||
|
false,
|
||||||
|
1000,
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
));
|
||||||
|
|
||||||
|
controller.enqueueByDocumentType("ted_notice_lot", "semantic-text", null, true, null, false, 1000);
|
||||||
|
|
||||||
|
ArgumentCaptor<DocumentType> documentType = ArgumentCaptor.forClass(DocumentType.class);
|
||||||
|
ArgumentCaptor<RepresentationType> representationType = ArgumentCaptor.forClass(RepresentationType.class);
|
||||||
|
verify(enqueueService).enqueueByDocumentType(
|
||||||
|
documentType.capture(),
|
||||||
|
representationType.capture(),
|
||||||
|
org.mockito.ArgumentMatchers.isNull(),
|
||||||
|
org.mockito.ArgumentMatchers.eq(true),
|
||||||
|
org.mockito.ArgumentMatchers.isNull(),
|
||||||
|
org.mockito.ArgumentMatchers.eq(false),
|
||||||
|
org.mockito.ArgumentMatchers.eq(1000)
|
||||||
|
);
|
||||||
|
assertThat(documentType.getValue()).isEqualTo(DocumentType.TED_NOTICE_LOT);
|
||||||
|
assertThat(representationType.getValue()).isEqualTo(RepresentationType.SEMANTIC_TEXT);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -91,7 +91,7 @@ import static org.assertj.core.api.Assertions.assertThat;
|
||||||
"spring.jpa.hibernate.ddl-auto=create-drop",
|
"spring.jpa.hibernate.ddl-auto=create-drop",
|
||||||
"spring.jpa.show-sql=false",
|
"spring.jpa.show-sql=false",
|
||||||
"spring.jpa.open-in-view=false",
|
"spring.jpa.open-in-view=false",
|
||||||
"spring.jpa.properties.hibernate.default_schema=DOC",
|
"spring.jpa.properties.hibernate.default_schema=doc",
|
||||||
"ted.vectorization.enabled=false",
|
"ted.vectorization.enabled=false",
|
||||||
"dip.ingestion.enabled=true",
|
"dip.ingestion.enabled=true",
|
||||||
"dip.ingestion.mail-adapter-enabled=true",
|
"dip.ingestion.mail-adapter-enabled=true",
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@ import org.testcontainers.junit.jupiter.Testcontainers;
|
||||||
"spring.jpa.hibernate.ddl-auto=create-drop",
|
"spring.jpa.hibernate.ddl-auto=create-drop",
|
||||||
"spring.jpa.show-sql=false",
|
"spring.jpa.show-sql=false",
|
||||||
"spring.jpa.open-in-view=false",
|
"spring.jpa.open-in-view=false",
|
||||||
"spring.jpa.properties.hibernate.default_schema=DOC",
|
"spring.jpa.properties.hibernate.default_schema=doc",
|
||||||
"spring.main.lazy-initialization=true",
|
"spring.main.lazy-initialization=true",
|
||||||
"ted.vectorization.enabled=false",
|
"ted.vectorization.enabled=false",
|
||||||
"ted.search.default-page-size=20",
|
"ted.search.default-page-size=20",
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ import org.testcontainers.utility.DockerImageName;
|
||||||
"spring.jpa.hibernate.ddl-auto=create-drop",
|
"spring.jpa.hibernate.ddl-auto=create-drop",
|
||||||
"spring.jpa.show-sql=false",
|
"spring.jpa.show-sql=false",
|
||||||
"spring.jpa.open-in-view=false",
|
"spring.jpa.open-in-view=false",
|
||||||
"spring.jpa.properties.hibernate.default_schema=DOC",
|
"spring.jpa.properties.hibernate.default_schema=doc",
|
||||||
"spring.main.lazy-initialization=true",
|
"spring.main.lazy-initialization=true",
|
||||||
"server.servlet.context-path=/api",
|
"server.servlet.context-path=/api",
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@ import org.testcontainers.junit.jupiter.Testcontainers;
|
||||||
"spring.jpa.hibernate.ddl-auto=create-drop",
|
"spring.jpa.hibernate.ddl-auto=create-drop",
|
||||||
"spring.jpa.show-sql=false",
|
"spring.jpa.show-sql=false",
|
||||||
"spring.jpa.open-in-view=false",
|
"spring.jpa.open-in-view=false",
|
||||||
"spring.jpa.properties.hibernate.default_schema=DOC",
|
"spring.jpa.properties.hibernate.default_schema=doc",
|
||||||
"spring.main.lazy-initialization=true",
|
"spring.main.lazy-initialization=true",
|
||||||
"dip.runtime.mode=NEW",
|
"dip.runtime.mode=NEW",
|
||||||
"dip.search.default-page-size=20",
|
"dip.search.default-page-size=20",
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue