From 1cd8ebe0660bcf1381f8dc3a4101e5dd37e72e02 Mon Sep 17 00:00:00 2001 From: trifonovt <87468028+TihomirTrifonov@users.noreply.github.com> Date: Tue, 21 Apr 2026 14:07:16 +0200 Subject: [PATCH] introduced document attributes and text import rest api --- .../document/DocumentAttributeValueType.java | 10 + .../document/entity/DocumentAttribute.java | 80 +++++ .../entity/DocumentAttributeName.java | 72 ++++ .../DocumentAttributeNameRepository.java | 11 + .../DocumentAttributeRepository.java | 13 + .../service/DocumentAttributeService.java | 315 ++++++++++++++++++ .../repository/EmbeddingJobRepository.java | 2 +- .../GenericDocumentImportController.java | 75 ++++- .../dto/GenericNameValuePairRequest.java | 20 ++ .../dto/GenericTextImportRequest.java | 4 +- ...oc_mail_processing_stabilization_step1.sql | 24 ++ ...xt_import_typed_attributes_and_context.sql | 64 ++++ ...ext_import_integer_datetime_attributes.sql | 23 ++ .../service/DocumentAttributeServiceTest.java | 125 +++++++ .../GenericDocumentImportControllerTest.java | 115 +++++++ 15 files changed, 941 insertions(+), 12 deletions(-) create mode 100644 src/main/java/at/procon/dip/domain/document/DocumentAttributeValueType.java create mode 100644 src/main/java/at/procon/dip/domain/document/entity/DocumentAttribute.java create mode 100644 src/main/java/at/procon/dip/domain/document/entity/DocumentAttributeName.java create mode 100644 src/main/java/at/procon/dip/domain/document/repository/DocumentAttributeNameRepository.java create mode 100644 src/main/java/at/procon/dip/domain/document/repository/DocumentAttributeRepository.java create mode 100644 src/main/java/at/procon/dip/domain/document/service/DocumentAttributeService.java create mode 100644 src/main/java/at/procon/dip/ingestion/dto/GenericNameValuePairRequest.java create mode 100644 src/main/resources/db/migration/V25__doc_text_import_typed_attributes_and_context.sql create mode 100644 
src/main/resources/db/migration/V26__doc_text_import_integer_datetime_attributes.sql create mode 100644 src/test/java/at/procon/dip/domain/document/service/DocumentAttributeServiceTest.java create mode 100644 src/test/java/at/procon/dip/ingestion/controller/GenericDocumentImportControllerTest.java diff --git a/src/main/java/at/procon/dip/domain/document/DocumentAttributeValueType.java b/src/main/java/at/procon/dip/domain/document/DocumentAttributeValueType.java new file mode 100644 index 0000000..cf1c665 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/DocumentAttributeValueType.java @@ -0,0 +1,10 @@ +package at.procon.dip.domain.document; + +public enum DocumentAttributeValueType { + STRING, + INTEGER, + NUMBER, + DATE, + DATETIME, + BOOLEAN +} diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentAttribute.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentAttribute.java new file mode 100644 index 0000000..d9c64a4 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentAttribute.java @@ -0,0 +1,80 @@ +package at.procon.dip.domain.document.entity; + +import at.procon.dip.architecture.SchemaNames; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.FetchType; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.JoinColumn; +import jakarta.persistence.ManyToOne; +import jakarta.persistence.PrePersist; +import jakarta.persistence.Table; +import java.math.BigDecimal; +import java.time.LocalDate; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +@Entity +@Table(schema = SchemaNames.DOC, name = "doc_document_attribute", indexes = { + @Index(name = "idx_doc_doc_attr_document", 
columnList = "document_id"), + @Index(name = "idx_doc_doc_attr_name", columnList = "attribute_name_id"), + @Index(name = "idx_doc_doc_attr_value_hash", columnList = "attribute_value_hash"), + @Index(name = "idx_doc_doc_attr_doc_name_hash", columnList = "document_id, attribute_name_id, attribute_value_hash", unique = true) +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class DocumentAttribute { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @ManyToOne(fetch = FetchType.LAZY, optional = false) + @JoinColumn(name = "document_id", nullable = false) + private Document document; + + @ManyToOne(fetch = FetchType.LAZY, optional = false) + @JoinColumn(name = "attribute_name_id", nullable = false) + private DocumentAttributeName attributeName; + + @Column(name = "string_value", columnDefinition = "TEXT") + private String stringValue; + + @Column(name = "integer_value") + private Long integerValue; + + @Column(name = "number_value", columnDefinition = "NUMERIC") + private BigDecimal numberValue; + + @Column(name = "date_value") + private LocalDate dateValue; + + @Column(name = "datetime_value") + private OffsetDateTime datetimeValue; + + @Column(name = "boolean_value") + private Boolean booleanValue; + + @Column(name = "attribute_value_hash", nullable = false, length = 64) + private String attributeValueHash; + + @Builder.Default + @Column(name = "created_at", nullable = false, updatable = false) + private OffsetDateTime createdAt = OffsetDateTime.now(); + + @PrePersist + protected void onCreate() { + createdAt = OffsetDateTime.now(); + } +} diff --git a/src/main/java/at/procon/dip/domain/document/entity/DocumentAttributeName.java b/src/main/java/at/procon/dip/domain/document/entity/DocumentAttributeName.java new file mode 100644 index 0000000..91d2fcf --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/entity/DocumentAttributeName.java @@ -0,0 +1,72 @@ +package 
at.procon.dip.domain.document.entity; + +import at.procon.dip.architecture.SchemaNames; +import at.procon.dip.domain.document.DocumentAttributeValueType; +import jakarta.persistence.Column; +import jakarta.persistence.Entity; +import jakarta.persistence.EnumType; +import jakarta.persistence.Enumerated; +import jakarta.persistence.GeneratedValue; +import jakarta.persistence.GenerationType; +import jakarta.persistence.Id; +import jakarta.persistence.Index; +import jakarta.persistence.PrePersist; +import jakarta.persistence.PreUpdate; +import jakarta.persistence.Table; +import java.time.OffsetDateTime; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +@Entity +@Table(schema = SchemaNames.DOC, name = "doc_attribute_name", indexes = { + @Index(name = "idx_doc_attr_name_name", columnList = "attribute_name"), + @Index(name = "idx_doc_attr_name_context", columnList = "attribute_context"), + @Index(name = "idx_doc_attr_name_normalized_ctx", columnList = "normalized_name, attribute_context", unique = true) +}) +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class DocumentAttributeName { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + private UUID id; + + @Column(name = "attribute_name", nullable = false, length = 255) + private String attributeName; + + @Column(name = "normalized_name", nullable = false, length = 255) + private String normalizedName; + + @Column(name = "attribute_context", nullable = false, length = 100) + private String attributeContext; + + @Enumerated(EnumType.STRING) + @Column(name = "attribute_value_type", nullable = false, length = 32) + private DocumentAttributeValueType attributeValueType; + + @Builder.Default + @Column(name = "created_at", nullable = false, updatable = false) + private OffsetDateTime createdAt = OffsetDateTime.now(); + + @Builder.Default + @Column(name = "updated_at", nullable = 
false) + private OffsetDateTime updatedAt = OffsetDateTime.now(); + + @PrePersist + protected void onCreate() { + createdAt = OffsetDateTime.now(); + updatedAt = OffsetDateTime.now(); + } + + @PreUpdate + protected void onUpdate() { + updatedAt = OffsetDateTime.now(); + } +} diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentAttributeNameRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentAttributeNameRepository.java new file mode 100644 index 0000000..c163b80 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentAttributeNameRepository.java @@ -0,0 +1,11 @@ +package at.procon.dip.domain.document.repository; + +import at.procon.dip.domain.document.entity.DocumentAttributeName; +import java.util.Optional; +import java.util.UUID; +import org.springframework.data.jpa.repository.JpaRepository; + +public interface DocumentAttributeNameRepository extends JpaRepository<DocumentAttributeName, UUID> { + + Optional<DocumentAttributeName> findByNormalizedNameAndAttributeContext(String normalizedName, String attributeContext); +} diff --git a/src/main/java/at/procon/dip/domain/document/repository/DocumentAttributeRepository.java b/src/main/java/at/procon/dip/domain/document/repository/DocumentAttributeRepository.java new file mode 100644 index 0000000..4d5c734 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/repository/DocumentAttributeRepository.java @@ -0,0 +1,13 @@ +package at.procon.dip.domain.document.repository; + +import at.procon.dip.domain.document.entity.DocumentAttribute; +import at.procon.dip.domain.document.entity.DocumentAttributeName; +import org.springframework.data.jpa.repository.JpaRepository; + +import java.util.Optional; +import java.util.UUID; + +public interface DocumentAttributeRepository extends JpaRepository<DocumentAttribute, UUID> { + + boolean existsByDocument_IdAndAttributeName_IdAndAttributeValueHash(UUID documentId, UUID attributeNameId, String attributeValueHash); +} diff --git 
a/src/main/java/at/procon/dip/domain/document/service/DocumentAttributeService.java b/src/main/java/at/procon/dip/domain/document/service/DocumentAttributeService.java new file mode 100644 index 0000000..028f326 --- /dev/null +++ b/src/main/java/at/procon/dip/domain/document/service/DocumentAttributeService.java @@ -0,0 +1,315 @@ +package at.procon.dip.domain.document.service; + +import at.procon.dip.domain.document.DocumentAttributeValueType; +import at.procon.dip.domain.document.entity.Document; +import at.procon.dip.domain.document.entity.DocumentAttribute; +import at.procon.dip.domain.document.entity.DocumentAttributeName; +import at.procon.dip.domain.document.repository.DocumentAttributeNameRepository; +import at.procon.dip.domain.document.repository.DocumentAttributeRepository; +import at.procon.dip.ingestion.dto.GenericNameValuePairRequest; +import at.procon.ted.util.HashUtils; +import java.math.BigDecimal; +import java.time.LocalDate; +import java.time.OffsetDateTime; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import lombok.RequiredArgsConstructor; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; +import org.springframework.util.StringUtils; + +@Service +@RequiredArgsConstructor +@Transactional +public class DocumentAttributeService { + + public static final String GLOBAL_CONTEXT = "GLOBAL"; + + private final DocumentService documentService; + private final DocumentAttributeNameRepository attributeNameRepository; + private final DocumentAttributeRepository attributeRepository; + + public void addAttributes(UUID documentId, List<GenericNameValuePairRequest> attributes) { + if (attributes == null || attributes.isEmpty()) { + return; + } + + Document document = documentService.getRequired(documentId); + Set<String> seenInRequest = new HashSet<>(); + Map<String, DocumentAttributeName> catalogEntriesByKey = new HashMap<>(); + + for 
(GenericNameValuePairRequest attribute : attributes) { + ResolvedAttributeInput resolved = resolveInput(attribute); + if (resolved == null) { + continue; + } + + String catalogKey = resolved.normalizedName() + "|" + resolved.context(); + DocumentAttributeName catalogEntry = catalogEntriesByKey.computeIfAbsent(catalogKey, ignored -> + resolveOrCreateCatalogEntry(resolved)); + + if (catalogEntry.getAttributeValueType() != resolved.type()) { + throw new IllegalArgumentException( + "Attribute '" + resolved.name() + "' in context '" + resolved.context() + "' is already registered as type " + + catalogEntry.getAttributeValueType() + " but request uses " + resolved.type() + ); + } + + String attributeValueHash = HashUtils.computeSha256(resolved.canonicalValue()); + String requestKey = catalogEntry.getId() + ":" + attributeValueHash; + if (!seenInRequest.add(requestKey)) { + continue; + } + + boolean alreadyAssigned = attributeRepository.existsByDocument_IdAndAttributeName_IdAndAttributeValueHash( + document.getId(), + catalogEntry.getId(), + attributeValueHash + ); + if (alreadyAssigned) { + continue; + } + + DocumentAttribute entity = DocumentAttribute.builder() + .document(document) + .attributeName(catalogEntry) + .attributeValueHash(attributeValueHash) + .build(); + applyTypedValue(entity, resolved); + attributeRepository.save(entity); + } + } + + private DocumentAttributeName resolveOrCreateCatalogEntry(ResolvedAttributeInput resolved) { + return attributeNameRepository.findByNormalizedNameAndAttributeContext(resolved.normalizedName(), resolved.context()) + .orElseGet(() -> attributeNameRepository.save(DocumentAttributeName.builder() + .attributeName(resolved.name()) + .normalizedName(resolved.normalizedName()) + .attributeContext(resolved.context()) + .attributeValueType(resolved.type()) + .build())); + } + + private void applyTypedValue(DocumentAttribute entity, ResolvedAttributeInput resolved) { + switch (resolved.type()) { + case STRING -> 
entity.setStringValue(resolved.stringValue()); + case INTEGER -> entity.setIntegerValue(resolved.integerValue()); + case NUMBER -> entity.setNumberValue(resolved.numberValue()); + case DATE -> entity.setDateValue(resolved.dateValue()); + case DATETIME -> entity.setDatetimeValue(resolved.datetimeValue()); + case BOOLEAN -> entity.setBooleanValue(resolved.booleanValue()); + } + } + + private ResolvedAttributeInput resolveInput(GenericNameValuePairRequest attribute) { + if (attribute == null || !StringUtils.hasText(attribute.name())) { + return null; + } + + String name = attribute.name().trim(); + String normalizedName = normalizeName(name); + String context = normalizeContext(attribute.context()); + + int explicitTypedValues = countExplicitTypedValues(attribute); + String legacyValue = StringUtils.hasText(attribute.value()) ? attribute.value().trim() : null; + String explicitStringValue = StringUtils.hasText(attribute.stringValue()) ? attribute.stringValue().trim() : null; + DocumentAttributeValueType requestedType = attribute.type(); + + if (requestedType == null) { + if (legacyValue != null && explicitTypedValues > 0) { + throw new IllegalArgumentException("Attribute '" + name + "' uses both legacy and typed value fields"); + } + if (explicitTypedValues > 1) { + throw new IllegalArgumentException("Attribute '" + name + "' defines multiple typed values without declaring a type"); + } + if (legacyValue != null) { + return new ResolvedAttributeInput( + name, normalizedName, context, DocumentAttributeValueType.STRING, legacyValue, null, null, null, null, null, legacyValue + ); + } + if (explicitStringValue != null && explicitTypedValues == 1) { + return new ResolvedAttributeInput( + name, normalizedName, context, DocumentAttributeValueType.STRING, explicitStringValue, null, null, null, null, null, explicitStringValue + ); + } + if (attribute.integerValue() != null && explicitTypedValues == 1) { + return new ResolvedAttributeInput( + name, normalizedName, context, 
DocumentAttributeValueType.INTEGER, null, attribute.integerValue(), null, null, null, null, attribute.integerValue().toString() + ); + } + if (attribute.numberValue() != null && explicitTypedValues == 1) { + BigDecimal normalized = normalizeNumber(attribute.numberValue()); + return new ResolvedAttributeInput( + name, normalizedName, context, DocumentAttributeValueType.NUMBER, null, null, normalized, null, null, null, normalized.toPlainString() + ); + } + if (attribute.dateValue() != null && explicitTypedValues == 1) { + return new ResolvedAttributeInput( + name, normalizedName, context, DocumentAttributeValueType.DATE, null, null, null, attribute.dateValue(), null, null, attribute.dateValue().toString() + ); + } + if (attribute.datetimeValue() != null && explicitTypedValues == 1) { + OffsetDateTime normalized = normalizeDateTime(attribute.datetimeValue()); + return new ResolvedAttributeInput( + name, normalizedName, context, DocumentAttributeValueType.DATETIME, null, null, null, null, normalized, null, normalized.toString() + ); + } + if (attribute.booleanValue() != null && explicitTypedValues == 1) { + return new ResolvedAttributeInput( + name, normalizedName, context, DocumentAttributeValueType.BOOLEAN, null, null, null, null, null, attribute.booleanValue(), attribute.booleanValue().toString() + ); + } + return null; + } + + return switch (requestedType) { + case STRING -> { + String value = explicitStringValue != null ? 
explicitStringValue : legacyValue; + if (!StringUtils.hasText(value) || hasOtherTypedValues(attribute, DocumentAttributeValueType.STRING)) { + throw new IllegalArgumentException("String attribute '" + name + "' requires only string or value input"); + } + yield new ResolvedAttributeInput( + name, normalizedName, context, DocumentAttributeValueType.STRING, value.trim(), null, null, null, null, null, value.trim() + ); + } + case INTEGER -> { + Long value = attribute.integerValue(); + if (value == null && legacyValue != null) { + value = Long.parseLong(legacyValue); + } + if (value == null || hasOtherTypedValues(attribute, DocumentAttributeValueType.INTEGER)) { + throw new IllegalArgumentException("Integer attribute '" + name + "' requires only integer input"); + } + yield new ResolvedAttributeInput( + name, normalizedName, context, DocumentAttributeValueType.INTEGER, null, value, null, null, null, null, value.toString() + ); + } + case NUMBER -> { + BigDecimal value = attribute.numberValue(); + if (value == null && legacyValue != null) { + value = normalizeNumber(new BigDecimal(legacyValue)); + } + if (value == null || hasOtherTypedValues(attribute, DocumentAttributeValueType.NUMBER)) { + throw new IllegalArgumentException("Number attribute '" + name + "' requires only number input"); + } + value = normalizeNumber(value); + yield new ResolvedAttributeInput( + name, normalizedName, context, DocumentAttributeValueType.NUMBER, null, null, value, null, null, null, value.toPlainString() + ); + } + case DATE -> { + LocalDate value = attribute.dateValue(); + if (value == null && legacyValue != null) { + value = LocalDate.parse(legacyValue); + } + if (value == null || hasOtherTypedValues(attribute, DocumentAttributeValueType.DATE)) { + throw new IllegalArgumentException("Date attribute '" + name + "' requires only date input"); + } + yield new ResolvedAttributeInput( + name, normalizedName, context, DocumentAttributeValueType.DATE, null, null, null, value, null, null, 
value.toString() + ); + } + case DATETIME -> { + OffsetDateTime value = attribute.datetimeValue(); + if (value == null && legacyValue != null) { + value = OffsetDateTime.parse(legacyValue); + } + if (value == null || hasOtherTypedValues(attribute, DocumentAttributeValueType.DATETIME)) { + throw new IllegalArgumentException("Datetime attribute '" + name + "' requires only datetime input"); + } + value = normalizeDateTime(value); + yield new ResolvedAttributeInput( + name, normalizedName, context, DocumentAttributeValueType.DATETIME, null, null, null, null, value, null, value.toString() + ); + } + case BOOLEAN -> { + Boolean value = attribute.booleanValue(); + if (value == null && legacyValue != null) { + if ("true".equalsIgnoreCase(legacyValue) || "false".equalsIgnoreCase(legacyValue)) { + value = Boolean.parseBoolean(legacyValue); + } else { + throw new IllegalArgumentException("Boolean attribute '" + name + "' requires true or false"); + } + } + if (value == null || hasOtherTypedValues(attribute, DocumentAttributeValueType.BOOLEAN)) { + throw new IllegalArgumentException("Boolean attribute '" + name + "' requires only boolean input"); + } + yield new ResolvedAttributeInput( + name, normalizedName, context, DocumentAttributeValueType.BOOLEAN, null, null, null, null, null, value, value.toString() + ); + } + }; + } + + private int countExplicitTypedValues(GenericNameValuePairRequest attribute) { + int count = 0; + if (StringUtils.hasText(attribute.stringValue())) { + count++; + } + if (attribute.integerValue() != null) { + count++; + } + if (attribute.numberValue() != null) { + count++; + } + if (attribute.dateValue() != null) { + count++; + } + if (attribute.datetimeValue() != null) { + count++; + } + if (attribute.booleanValue() != null) { + count++; + } + return count; + } + + private boolean hasOtherTypedValues(GenericNameValuePairRequest attribute, DocumentAttributeValueType expectedType) { + return switch (expectedType) { + case STRING -> 
attribute.integerValue() != null || attribute.numberValue() != null || attribute.dateValue() != null || attribute.datetimeValue() != null || attribute.booleanValue() != null; + case INTEGER -> StringUtils.hasText(attribute.stringValue()) || attribute.numberValue() != null || attribute.dateValue() != null || attribute.datetimeValue() != null || attribute.booleanValue() != null; + case NUMBER -> StringUtils.hasText(attribute.stringValue()) || attribute.integerValue() != null || attribute.dateValue() != null || attribute.datetimeValue() != null || attribute.booleanValue() != null; + case DATE -> StringUtils.hasText(attribute.stringValue()) || attribute.integerValue() != null || attribute.numberValue() != null || attribute.datetimeValue() != null || attribute.booleanValue() != null; + case DATETIME -> StringUtils.hasText(attribute.stringValue()) || attribute.integerValue() != null || attribute.numberValue() != null || attribute.dateValue() != null || attribute.booleanValue() != null; + case BOOLEAN -> StringUtils.hasText(attribute.stringValue()) || attribute.integerValue() != null || attribute.numberValue() != null || attribute.dateValue() != null || attribute.datetimeValue() != null; + }; + } + + private BigDecimal normalizeNumber(BigDecimal value) { + return value.stripTrailingZeros(); + } + + private OffsetDateTime normalizeDateTime(OffsetDateTime value) { + return value.withNano(0); + } + + private String normalizeName(String value) { + return value.trim().toLowerCase(Locale.ROOT); + } + + private String normalizeContext(String value) { + return StringUtils.hasText(value) + ? 
value.trim().toUpperCase(Locale.ROOT) + : GLOBAL_CONTEXT; + } + + private record ResolvedAttributeInput( + String name, + String normalizedName, + String context, + DocumentAttributeValueType type, + String stringValue, + Long integerValue, + BigDecimal numberValue, + LocalDate dateValue, + OffsetDateTime datetimeValue, + Boolean booleanValue, + String canonicalValue + ) { + } +} diff --git a/src/main/java/at/procon/dip/embedding/job/repository/EmbeddingJobRepository.java b/src/main/java/at/procon/dip/embedding/job/repository/EmbeddingJobRepository.java index 92b5606..1e33f8a 100644 --- a/src/main/java/at/procon/dip/embedding/job/repository/EmbeddingJobRepository.java +++ b/src/main/java/at/procon/dip/embedding/job/repository/EmbeddingJobRepository.java @@ -25,7 +25,7 @@ public interface EmbeddingJobRepository extends JpaRepository upload( @@ -76,13 +81,7 @@ public class GenericDocumentImportController { @PostMapping(path = "/text", consumes = MediaType.APPLICATION_JSON_VALUE) public ResponseEntity importText(@RequestBody GenericTextImportRequest request) { ensureRestUploadEnabled(); - Map attributes = new LinkedHashMap<>(); - if (StringUtils.hasText(request.languageCode())) { - attributes.put("languageCode", request.languageCode()); - } - if (StringUtils.hasText(request.title())) { - attributes.put("title", request.title()); - } + Map attributes = buildDescriptorAttributes(request); SourceDescriptor descriptor = new SourceDescriptor( buildAccessContext(request.ownerTenantKey(), request.visibility()), @@ -91,14 +90,70 @@ public class GenericDocumentImportController { null, request.fileName(), request.mediaType(), - request.text() == null ? null : request.text().getBytes(java.nio.charset.StandardCharsets.UTF_8), + request.text() == null ? 
null : request.text().getBytes(StandardCharsets.UTF_8), request.text(), OffsetDateTime.now(), OriginalContentStoragePolicy.DEFAULT, attributes ); IngestionResult result = ingestionGateway.ingest(descriptor); - return ResponseEntity.ok(toResponse(result)); + GenericImportResponse response = toResponse(result); + if (response.documentId() != null) { + documentAttributeService.addAttributes(response.documentId(), request.attributes()); + } + return ResponseEntity.ok(response); + } + + private Map<String, String> buildDescriptorAttributes(GenericTextImportRequest request) { + Map<String, String> attributes = new LinkedHashMap<>(); + if (request.attributes() != null) { + for (GenericNameValuePairRequest attribute : request.attributes()) { + if (attribute == null || !StringUtils.hasText(attribute.name()) || !isGlobalContext(attribute.context())) { + continue; + } + String renderedValue = renderAsDescriptorValue(attribute); + if (StringUtils.hasText(renderedValue)) { + attributes.put(attribute.name().trim(), renderedValue); + } + } + } + if (StringUtils.hasText(request.languageCode())) { + attributes.put("languageCode", request.languageCode()); + } + if (StringUtils.hasText(request.title())) { + attributes.put("title", request.title()); + } + return attributes; + } + + private String renderAsDescriptorValue(GenericNameValuePairRequest attribute) { + if (StringUtils.hasText(attribute.value())) { + return attribute.value().trim(); + } + if (StringUtils.hasText(attribute.stringValue())) { + return attribute.stringValue().trim(); + } + if (attribute.integerValue() != null) { + return attribute.integerValue().toString(); + } + if (attribute.numberValue() != null) { + BigDecimal number = attribute.numberValue().stripTrailingZeros(); + return number.toPlainString(); + } + if (attribute.dateValue() != null) { + return attribute.dateValue().toString(); + } + if (attribute.datetimeValue() != null) { + return attribute.datetimeValue().withNano(0).toString(); + } + if (attribute.booleanValue() != null) { + return 
attribute.booleanValue().toString(); + } + return null; + } + + private boolean isGlobalContext(String context) { + return !StringUtils.hasText(context) || DocumentAttributeService.GLOBAL_CONTEXT.equalsIgnoreCase(context.trim()); } private void ensureRestUploadEnabled() { diff --git a/src/main/java/at/procon/dip/ingestion/dto/GenericNameValuePairRequest.java b/src/main/java/at/procon/dip/ingestion/dto/GenericNameValuePairRequest.java new file mode 100644 index 0000000..0448a2e --- /dev/null +++ b/src/main/java/at/procon/dip/ingestion/dto/GenericNameValuePairRequest.java @@ -0,0 +1,20 @@ +package at.procon.dip.ingestion.dto; + +import at.procon.dip.domain.document.DocumentAttributeValueType; +import java.math.BigDecimal; +import java.time.LocalDate; +import java.time.OffsetDateTime; + +public record GenericNameValuePairRequest( + String name, + String context, + DocumentAttributeValueType type, + String value, + String stringValue, + Long integerValue, + BigDecimal numberValue, + LocalDate dateValue, + OffsetDateTime datetimeValue, + Boolean booleanValue +) { +} diff --git a/src/main/java/at/procon/dip/ingestion/dto/GenericTextImportRequest.java b/src/main/java/at/procon/dip/ingestion/dto/GenericTextImportRequest.java index c6560f9..ba18265 100644 --- a/src/main/java/at/procon/dip/ingestion/dto/GenericTextImportRequest.java +++ b/src/main/java/at/procon/dip/ingestion/dto/GenericTextImportRequest.java @@ -1,6 +1,7 @@ package at.procon.dip.ingestion.dto; import at.procon.dip.domain.access.DocumentVisibility; +import java.util.List; public record GenericTextImportRequest( String text, @@ -10,6 +11,7 @@ public record GenericTextImportRequest( DocumentVisibility visibility, String languageCode, String title, - String sourceIdentifier + String sourceIdentifier, + List<GenericNameValuePairRequest> attributes ) { } diff --git a/src/main/resources/db/migration/V23__doc_mail_processing_stabilization_step1.sql b/src/main/resources/db/migration/V23__doc_mail_processing_stabilization_step1.sql index 
12ababd..375530a 100644 --- a/src/main/resources/db/migration/V23__doc_mail_processing_stabilization_step1.sql +++ b/src/main/resources/db/migration/V23__doc_mail_processing_stabilization_step1.sql @@ -88,3 +88,27 @@ CREATE UNIQUE INDEX IF NOT EXISTS uq_doc_mail_attachment_mail_index CREATE UNIQUE INDEX IF NOT EXISTS uq_doc_mail_attachment_mail_part ON DOC.doc_mail_attachment (mail_document_id, part_path) WHERE part_path IS NOT NULL; + +DO $$ +BEGIN + IF EXISTS ( + SELECT 1 + FROM pg_constraint c + JOIN pg_class r ON r.oid = c.conrelid + JOIN pg_namespace n ON n.oid = r.relnamespace + WHERE n.nspname = 'doc' + AND r.relname = 'doc_source' + AND c.conname = 'doc_source_source_type_check' + ) THEN + ALTER TABLE DOC.doc_source DROP CONSTRAINT doc_source_source_type_check; + ALTER TABLE DOC.doc_source + ADD CONSTRAINT doc_source_source_type_check + CHECK ( + source_type IN ( + 'TED_PACKAGE', 'PACKAGE_CHILD', 'MAIL', 'MAIL_ATTACHMENT', 'FILE_SYSTEM', 'REST_UPLOAD', + 'MANUAL_UPLOAD', 'ZIP_CHILD', 'API', 'MIGRATION' + ) + ); + END IF; +END +$$; \ No newline at end of file diff --git a/src/main/resources/db/migration/V25__doc_text_import_typed_attributes_and_context.sql b/src/main/resources/db/migration/V25__doc_text_import_typed_attributes_and_context.sql new file mode 100644 index 0000000..1b0a51d --- /dev/null +++ b/src/main/resources/db/migration/V25__doc_text_import_typed_attributes_and_context.sql @@ -0,0 +1,64 @@ +-- Upgrade text-import attribute catalog to typed values with optional context. 
+ +ALTER TABLE DOC.doc_attribute_name + ADD COLUMN IF NOT EXISTS attribute_context VARCHAR(100); + +UPDATE DOC.doc_attribute_name +SET attribute_context = 'GLOBAL' +WHERE attribute_context IS NULL OR BTRIM(attribute_context) = ''; + +ALTER TABLE DOC.doc_attribute_name + ALTER COLUMN attribute_context SET NOT NULL; + +ALTER TABLE DOC.doc_attribute_name + ADD COLUMN IF NOT EXISTS attribute_value_type VARCHAR(32); + +UPDATE DOC.doc_attribute_name +SET attribute_value_type = 'STRING' +WHERE attribute_value_type IS NULL OR BTRIM(attribute_value_type) = ''; + +ALTER TABLE DOC.doc_attribute_name + ALTER COLUMN attribute_value_type SET NOT NULL; + +DROP INDEX IF EXISTS DOC.idx_doc_attr_name_normalized; +CREATE UNIQUE INDEX IF NOT EXISTS idx_doc_attr_name_normalized_ctx + ON DOC.doc_attribute_name(normalized_name, attribute_context); +CREATE INDEX IF NOT EXISTS idx_doc_attr_name_context + ON DOC.doc_attribute_name(attribute_context); + +ALTER TABLE DOC.doc_document_attribute + ADD COLUMN IF NOT EXISTS string_value TEXT; + +UPDATE DOC.doc_document_attribute +SET string_value = attribute_value +WHERE string_value IS NULL AND attribute_value IS NOT NULL; + +ALTER TABLE DOC.doc_document_attribute + ADD COLUMN IF NOT EXISTS number_value NUMERIC; + +ALTER TABLE DOC.doc_document_attribute + ADD COLUMN IF NOT EXISTS date_value DATE; + +ALTER TABLE DOC.doc_document_attribute + ADD COLUMN IF NOT EXISTS boolean_value BOOLEAN; + +ALTER TABLE DOC.doc_document_attribute + DROP COLUMN IF EXISTS attribute_value; + +ALTER TABLE DOC.doc_document_attribute + DROP CONSTRAINT IF EXISTS chk_doc_document_attribute_single_typed_value; + +ALTER TABLE DOC.doc_document_attribute + ADD CONSTRAINT chk_doc_document_attribute_single_typed_value CHECK ( + (CASE WHEN string_value IS NOT NULL THEN 1 ELSE 0 END) + + (CASE WHEN number_value IS NOT NULL THEN 1 ELSE 0 END) + + (CASE WHEN date_value IS NOT NULL THEN 1 ELSE 0 END) + + (CASE WHEN boolean_value IS NOT NULL THEN 1 ELSE 0 END) = 1 + ); + +COMMENT ON 
COLUMN DOC.doc_attribute_name.attribute_context IS 'Optional namespace/context for avoiding name conflicts. GLOBAL is the default context.'; +COMMENT ON COLUMN DOC.doc_attribute_name.attribute_value_type IS 'Declared type of the attribute value for this catalog entry.'; +COMMENT ON COLUMN DOC.doc_document_attribute.string_value IS 'String representation when the catalog entry type is STRING.'; +COMMENT ON COLUMN DOC.doc_document_attribute.number_value IS 'Numeric representation when the catalog entry type is NUMBER.'; +COMMENT ON COLUMN DOC.doc_document_attribute.date_value IS 'Date representation when the catalog entry type is DATE.'; +COMMENT ON COLUMN DOC.doc_document_attribute.boolean_value IS 'Boolean representation when the catalog entry type is BOOLEAN.'; diff --git a/src/main/resources/db/migration/V26__doc_text_import_integer_datetime_attributes.sql b/src/main/resources/db/migration/V26__doc_text_import_integer_datetime_attributes.sql new file mode 100644 index 0000000..cf7fe5e --- /dev/null +++ b/src/main/resources/db/migration/V26__doc_text_import_integer_datetime_attributes.sql @@ -0,0 +1,23 @@ +-- Extend typed text-import attributes with INTEGER and DATETIME support. 
+ +ALTER TABLE DOC.doc_document_attribute + ADD COLUMN IF NOT EXISTS integer_value BIGINT; + +ALTER TABLE DOC.doc_document_attribute + ADD COLUMN IF NOT EXISTS datetime_value TIMESTAMP WITH TIME ZONE; + +ALTER TABLE DOC.doc_document_attribute + DROP CONSTRAINT IF EXISTS chk_doc_document_attribute_single_typed_value; /* dropped so the check can be re-created below with integer_value and datetime_value included */ + +ALTER TABLE DOC.doc_document_attribute + ADD CONSTRAINT chk_doc_document_attribute_single_typed_value CHECK ( + (CASE WHEN string_value IS NOT NULL THEN 1 ELSE 0 END) + + (CASE WHEN integer_value IS NOT NULL THEN 1 ELSE 0 END) + + (CASE WHEN number_value IS NOT NULL THEN 1 ELSE 0 END) + + (CASE WHEN date_value IS NOT NULL THEN 1 ELSE 0 END) + + (CASE WHEN datetime_value IS NOT NULL THEN 1 ELSE 0 END) + + (CASE WHEN boolean_value IS NOT NULL THEN 1 ELSE 0 END) = 1 + ); /* the six CASE terms sum to 1 only when exactly one typed column is non-NULL */ + +COMMENT ON COLUMN DOC.doc_document_attribute.integer_value IS 'Integer representation when the catalog entry type is INTEGER.'; +COMMENT ON COLUMN DOC.doc_document_attribute.datetime_value IS 'Date-time representation when the catalog entry type is DATETIME.'; diff --git a/src/test/java/at/procon/dip/domain/document/service/DocumentAttributeServiceTest.java b/src/test/java/at/procon/dip/domain/document/service/DocumentAttributeServiceTest.java new file mode 100644 index 0000000..d9253c0 --- /dev/null +++ b/src/test/java/at/procon/dip/domain/document/service/DocumentAttributeServiceTest.java @@ -0,0 +1,125 @@ +package at.procon.dip.domain.document.service; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import at.procon.dip.domain.document.DocumentAttributeValueType; +import at.procon.dip.domain.document.entity.Document; +import at.procon.dip.domain.document.entity.DocumentAttribute; +import
at.procon.dip.domain.document.entity.DocumentAttributeName; +import at.procon.dip.domain.document.repository.DocumentAttributeNameRepository; +import at.procon.dip.domain.document.repository.DocumentAttributeRepository; +import at.procon.dip.ingestion.dto.GenericNameValuePairRequest; +import java.math.BigDecimal; +import java.time.LocalDate; +import java.time.OffsetDateTime; +import java.util.List; +import java.util.Optional; +import java.util.UUID; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.ArgumentCaptor; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +/** Unit tests for DocumentAttributeService, wired in setUp() with a mocked DocumentService and mocked attribute repositories. */ @ExtendWith(MockitoExtension.class) +class DocumentAttributeServiceTest { + + @Mock + private DocumentService documentService; + @Mock + private DocumentAttributeNameRepository attributeNameRepository; + @Mock + private DocumentAttributeRepository attributeRepository; + + private DocumentAttributeService service; + + @BeforeEach + void setUp() { + service = new DocumentAttributeService(documentService, attributeNameRepository, attributeRepository); + } + + @Test /* Eight requests are submitted but seven saves are expected: the last entry repeats the first one (GLOBAL status = published) apart from the case of its name. */ + void shouldCreateTypedCatalogEntriesIncludingIntegerAndDatetimeAndUseContextToAvoidConflicts() { + UUID documentId = UUID.randomUUID(); + when(documentService.getRequired(documentId)).thenReturn(Document.builder().id(documentId).build()); + + when(attributeNameRepository.findByNormalizedNameAndAttributeContext("status", "GLOBAL")).thenReturn(Optional.empty()); + when(attributeNameRepository.findByNormalizedNameAndAttributeContext("status", "TED")).thenReturn(Optional.empty()); + when(attributeNameRepository.findByNormalizedNameAndAttributeContext("estimatedvalue", "GLOBAL")).thenReturn(Optional.empty()); + when(attributeNameRepository.findByNormalizedNameAndAttributeContext("publisheddate", "GLOBAL")).thenReturn(Optional.empty()); + when(attributeNameRepository.findByNormalizedNameAndAttributeContext("framework",
"GLOBAL")).thenReturn(Optional.empty()); + when(attributeNameRepository.findByNormalizedNameAndAttributeContext("version", "GLOBAL")).thenReturn(Optional.empty()); + when(attributeNameRepository.findByNormalizedNameAndAttributeContext("lastsyncedat", "GLOBAL")).thenReturn(Optional.empty()); + when(attributeNameRepository.save(any(DocumentAttributeName.class))).thenAnswer(invocation -> { + DocumentAttributeName value = invocation.getArgument(0); + value.setId(UUID.randomUUID()); + return value; + }); + when(attributeRepository.existsByDocument_IdAndAttributeName_IdAndAttributeValueHash(any(), any(), any())) + .thenReturn(false); + + OffsetDateTime syncedAt = OffsetDateTime.parse("2026-04-21T11:05:00+02:00"); + + service.addAttributes(documentId, List.of( + new GenericNameValuePairRequest("Status", null, DocumentAttributeValueType.STRING, null, "published", null, null, null, null, null), + new GenericNameValuePairRequest("Status", "ted", DocumentAttributeValueType.STRING, null, "closed", null, null, null, null, null), + new GenericNameValuePairRequest("estimatedValue", null, DocumentAttributeValueType.NUMBER, null, null, null, new BigDecimal("125000.50"), null, null, null), + new GenericNameValuePairRequest("publishedDate", null, DocumentAttributeValueType.DATE, null, null, null, null, LocalDate.of(2026, 4, 21), null, null), + new GenericNameValuePairRequest("framework", null, DocumentAttributeValueType.BOOLEAN, null, null, null, null, null, null, true), + new GenericNameValuePairRequest("version", null, DocumentAttributeValueType.INTEGER, null, null, 7L, null, null, null, null), + new GenericNameValuePairRequest("lastSyncedAt", null, DocumentAttributeValueType.DATETIME, null, null, null, null, null, syncedAt, null), + new GenericNameValuePairRequest("status", null, DocumentAttributeValueType.STRING, null, "published", null, null, null, null, null) + )); + + verify(attributeNameRepository, times(7)).save(any(DocumentAttributeName.class)); /* one catalog entry per distinct (normalized name, context) pair in the request list */ + ArgumentCaptor<DocumentAttribute> captor =
ArgumentCaptor.forClass(DocumentAttribute.class); + verify(attributeRepository, times(7)).save(captor.capture()); + List<DocumentAttribute> saved = captor.getAllValues(); + + assertThat(saved).hasSize(7); + assertThat(saved).anySatisfy(attribute -> { + assertThat(attribute.getAttributeName().getAttributeContext()).isEqualTo("GLOBAL"); + assertThat(attribute.getAttributeName().getNormalizedName()).isEqualTo("status"); + assertThat(attribute.getStringValue()).isEqualTo("published"); + }); + assertThat(saved).anySatisfy(attribute -> { + assertThat(attribute.getAttributeName().getAttributeContext()).isEqualTo("TED"); + assertThat(attribute.getAttributeName().getNormalizedName()).isEqualTo("status"); + assertThat(attribute.getStringValue()).isEqualTo("closed"); + }); + assertThat(saved).anySatisfy(attribute -> assertThat(attribute.getNumberValue()).isEqualByComparingTo("125000.5")); + assertThat(saved).anySatisfy(attribute -> assertThat(attribute.getDateValue()).isEqualTo(LocalDate.of(2026, 4, 21))); + assertThat(saved).anySatisfy(attribute -> assertThat(attribute.getBooleanValue()).isTrue()); + assertThat(saved).anySatisfy(attribute -> assertThat(attribute.getIntegerValue()).isEqualTo(7L)); + assertThat(saved).anySatisfy(attribute -> assertThat(attribute.getDatetimeValue()).isEqualTo(syncedAt.withNano(0))); + } + + @Test /* The repository already holds "status" in context TED as STRING; an INTEGER request for the same name and context must be rejected and nothing saved. */ + void shouldRejectTypeConflictForExistingCatalogEntryInSameContext() { + UUID documentId = UUID.randomUUID(); + when(documentService.getRequired(documentId)).thenReturn(Document.builder().id(documentId).build()); + when(attributeNameRepository.findByNormalizedNameAndAttributeContext("status", "TED")).thenReturn(Optional.of( + DocumentAttributeName.builder() + .id(UUID.randomUUID()) + .attributeName("status") + .normalizedName("status") + .attributeContext("TED") + .attributeValueType(DocumentAttributeValueType.STRING) + .build() + )); + + assertThatThrownBy(() -> service.addAttributes(documentId, List.of( + new GenericNameValuePairRequest("status", "ted",
DocumentAttributeValueType.INTEGER, null, null, 1L, null, null, null, null) + ))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("already registered as type STRING"); + + verify(attributeRepository, times(0)).save(any()); + } +} diff --git a/src/test/java/at/procon/dip/ingestion/controller/GenericDocumentImportControllerTest.java b/src/test/java/at/procon/dip/ingestion/controller/GenericDocumentImportControllerTest.java new file mode 100644 index 0000000..75f2fbb --- /dev/null +++ b/src/test/java/at/procon/dip/ingestion/controller/GenericDocumentImportControllerTest.java @@ -0,0 +1,115 @@ +package at.procon.dip.ingestion.controller; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import at.procon.dip.domain.access.DocumentAccessContext; +import at.procon.dip.domain.access.DocumentVisibility; +import at.procon.dip.domain.document.CanonicalDocumentMetadata; +import at.procon.dip.domain.document.DocumentAttributeValueType; +import at.procon.dip.domain.document.DocumentFamily; +import at.procon.dip.domain.document.DocumentStatus; +import at.procon.dip.domain.document.DocumentType; +import at.procon.dip.domain.document.service.DocumentAttributeService; +import at.procon.dip.ingestion.config.DipIngestionProperties; +import at.procon.dip.ingestion.dto.GenericNameValuePairRequest; +import at.procon.dip.ingestion.dto.GenericTextImportRequest; +import at.procon.dip.ingestion.service.DocumentIngestionGateway; +import at.procon.dip.ingestion.spi.IngestionResult; +import at.procon.dip.ingestion.spi.SourceDescriptor; +import java.math.BigDecimal; +import java.time.LocalDate; +import java.time.OffsetDateTime; +import java.util.List; +import java.util.UUID; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import
org.mockito.ArgumentCaptor; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +/** Unit tests for GenericDocumentImportController, built in setUp() with ingestion and REST upload enabled and mocked gateway and attribute service. */ @ExtendWith(MockitoExtension.class) +class GenericDocumentImportControllerTest { + + @Mock + private DocumentIngestionGateway ingestionGateway; + @Mock + private DocumentAttributeService documentAttributeService; + + private GenericDocumentImportController controller; + + @BeforeEach + void setUp() { + DipIngestionProperties properties = new DipIngestionProperties(); + properties.setEnabled(true); + properties.setRestUploadEnabled(true); + properties.setDefaultVisibility(DocumentVisibility.PUBLIC); + controller = new GenericDocumentImportController(properties, ingestionGateway, documentAttributeService); + } + + @Test /* Pairs without a context must appear as string entries in the SourceDescriptor attributes; the pair with context "ted" must not, yet the full pair list is still handed to the attribute service. */ + void shouldPassGlobalTypedAttributesIntoIngestionAndPersistAllAttributes() { + UUID documentId = UUID.randomUUID(); + OffsetDateTime syncedAt = OffsetDateTime.parse("2026-04-21T11:05:00+02:00"); + List<GenericNameValuePairRequest> pairs = List.of( + new GenericNameValuePairRequest("title", null, DocumentAttributeValueType.STRING, null, "Text title from pair", null, null, null, null, null), + new GenericNameValuePairRequest("languageCode", null, DocumentAttributeValueType.STRING, null, "de", null, null, null, null, null), + new GenericNameValuePairRequest("country", null, null, "AT", null, null, null, null, null, null), + new GenericNameValuePairRequest("estimatedValue", null, DocumentAttributeValueType.NUMBER, null, null, null, new BigDecimal("125000.50"), null, null, null), + new GenericNameValuePairRequest("publishedDate", null, DocumentAttributeValueType.DATE, null, null, null, null, LocalDate.of(2026, 4, 21), null, null), + new GenericNameValuePairRequest("version", null, DocumentAttributeValueType.INTEGER, null, null, 7L, null, null, null, null), + new GenericNameValuePairRequest("lastSyncedAt", null, DocumentAttributeValueType.DATETIME, null, null, null, null, null, syncedAt, null), + new GenericNameValuePairRequest("status", "ted",
DocumentAttributeValueType.STRING, null, "closed", null, null, null, null, null) + ); + GenericTextImportRequest request = new GenericTextImportRequest( + "Hello world", + "sample.txt", + "text/plain", + null, + null, + null, + null, + "source-1", + pairs + ); + + when(ingestionGateway.ingest(any(SourceDescriptor.class))).thenReturn(new IngestionResult( + List.of(new CanonicalDocumentMetadata( + documentId, + new DocumentAccessContext(null, DocumentVisibility.PUBLIC), + DocumentType.TEXT, + DocumentFamily.GENERIC, + DocumentStatus.REPRESENTED, + "Text title from pair", + "de", + "text/plain", + null, + OffsetDateTime.now(), + OffsetDateTime.now() + )), + List.of() + )); + + var response = controller.importText(request); + + ArgumentCaptor<SourceDescriptor> descriptorCaptor = ArgumentCaptor.forClass(SourceDescriptor.class); + verify(ingestionGateway).ingest(descriptorCaptor.capture()); + SourceDescriptor descriptor = descriptorCaptor.getValue(); + assertThat(descriptor.textContent()).isEqualTo("Hello world"); + assertThat(descriptor.attributes()).containsEntry("title", "Text title from pair"); + assertThat(descriptor.attributes()).containsEntry("languageCode", "de"); + assertThat(descriptor.attributes()).containsEntry("country", "AT"); + assertThat(descriptor.attributes()).containsEntry("estimatedValue", "125000.5"); + assertThat(descriptor.attributes()).containsEntry("publishedDate", "2026-04-21"); + assertThat(descriptor.attributes()).containsEntry("version", "7"); + assertThat(descriptor.attributes()).containsEntry("lastSyncedAt", "2026-04-21T11:05+02:00"); + assertThat(descriptor.attributes()).doesNotContainKey("status"); + + verify(documentAttributeService).addAttributes(documentId, pairs); + assertThat(response.getBody()).isNotNull(); + assertThat(response.getBody().documentId()).isEqualTo(documentId); + } +}