Refactor phases 4
parent
adc4f2da43
commit
ac59730f3e
@ -0,0 +1,58 @@
# Phase 4 - Generic Ingestion Pipeline

## Goal

Add the first generic ingestion path so arbitrary documents can be imported into the canonical DOC model,
normalized into text representations, and queued for vectorization without depending on the TED-specific model.

## Scope implemented

### Input channels

- file-system polling route for arbitrary documents
- REST/API upload endpoints

### Detection

- file extension + media type based classification

### Extraction

- PDF -> text via PDFBox
- HTML -> cleaned text via JSoup
- text / markdown / generic XML -> normalized UTF-8 text
- unsupported binary types -> fallback warning only

### Representation building

- default generic builder creates:
  - FULLTEXT
  - SEMANTIC_TEXT
  - TITLE_ABSTRACT

### Persistence

- original content stored in DOC.doc_content
- binary originals can now be stored inline in `binary_content`
- derived text variants persisted as additional DOC.doc_content rows
- text representations persisted in DOC.doc_text_representation
- pending embeddings created in DOC.doc_embedding when enabled

## Access model

The generic pipeline uses the Phase 0/1 access model:

- optional owner tenant
- mandatory visibility

This supports both:

- public documents (`owner_tenant_id = null`, `visibility = PUBLIC`)
- tenant-owned documents (`owner_tenant_id != null`, `visibility = TENANT/SHARED/...`)

## Deliberately deferred

- DOCX extraction
- ZIP recursive child import in the generic pipeline
- MIME/EML structured parsing
- generic structured projections beyond TED
- chunked long-document representations
@ -0,0 +1,40 @@
# Phase 4 - Generic Ingestion Pipeline

Phase 4 introduces the first generalized ingestion flow on top of the DOC backbone.

## What is included

- generic ingestion gateway with adapter selection
- file-system ingestion adapter and Camel route
- REST/API upload controller for arbitrary documents
- document type detection by media type / extension
- first extractors for:
  - plain text / markdown / generic XML
  - HTML
  - PDF
  - binary fallback
- default representation builder for non-TED documents
- binary payload support in `DOC.doc_content.binary_content`
- automatic creation of pending generic embeddings for imported representations

## Important behavior

- current TED runtime remains intact
- generic ingestion is disabled by default and must be enabled with:
  - `ted.generic-ingestion.enabled=true`
- file-system polling is separately controlled with:
  - `ted.generic-ingestion.file-system-enabled=true`
- REST/API upload endpoints are under:
  - `/api/v1/dip/import/upload`
  - `/api/v1/dip/import/text`

## Current supported generic document types

- PDF
- HTML
- TEXT
- MARKDOWN
- XML_GENERIC
- UNKNOWN text-like files

DOCX, ZIP child extraction, and MIME body parsing are intentionally left for later phases.
@ -0,0 +1,82 @@
|
|||||||
|
package at.procon.dip.classification.detector;
|
||||||
|
|
||||||
|
import at.procon.dip.classification.spi.DetectionResult;
|
||||||
|
import at.procon.dip.classification.spi.DocumentTypeDetector;
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import at.procon.dip.ingestion.util.DocumentImportSupport;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Map;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Basic Phase 4 detector using file extension, media type, and lightweight heuristics.
|
||||||
|
*/
|
||||||
|
@Component
|
||||||
|
public class BasicMimeAndExtensionDocumentTypeDetector implements DocumentTypeDetector {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(SourceDescriptor sourceDescriptor) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DetectionResult detect(SourceDescriptor sourceDescriptor) {
|
||||||
|
String normalizedMediaType = DocumentImportSupport.normalizeMediaType(sourceDescriptor.mediaType());
|
||||||
|
String extension = DocumentImportSupport.extensionOf(sourceDescriptor.fileName());
|
||||||
|
|
||||||
|
DocumentType documentType = detectByMediaType(normalizedMediaType);
|
||||||
|
if (documentType == DocumentType.UNKNOWN) {
|
||||||
|
documentType = detectByExtension(extension);
|
||||||
|
}
|
||||||
|
if (documentType == DocumentType.UNKNOWN && sourceDescriptor.hasInlineTextContent()) {
|
||||||
|
documentType = DocumentType.TEXT;
|
||||||
|
}
|
||||||
|
|
||||||
|
DocumentFamily family = DocumentImportSupport.familyFor(documentType);
|
||||||
|
String languageCode = sourceDescriptor.attributes() == null ? null : sourceDescriptor.attributes().get("languageCode");
|
||||||
|
Map<String, String> attributes = new HashMap<>();
|
||||||
|
attributes.put("detectedExtension", extension);
|
||||||
|
if (normalizedMediaType != null) {
|
||||||
|
attributes.put("normalizedMediaType", normalizedMediaType);
|
||||||
|
}
|
||||||
|
if (StringUtils.hasText(sourceDescriptor.fileName())) {
|
||||||
|
attributes.put("fileName", sourceDescriptor.fileName());
|
||||||
|
}
|
||||||
|
|
||||||
|
return new DetectionResult(documentType, family, normalizedMediaType, languageCode, attributes);
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentType detectByMediaType(String mediaType) {
|
||||||
|
if (!StringUtils.hasText(mediaType)) {
|
||||||
|
return DocumentType.UNKNOWN;
|
||||||
|
}
|
||||||
|
return switch (mediaType.toLowerCase(Locale.ROOT)) {
|
||||||
|
case "application/pdf", "application/x-pdf" -> DocumentType.PDF;
|
||||||
|
case "text/html", "application/xhtml+xml" -> DocumentType.HTML;
|
||||||
|
case "text/plain" -> DocumentType.TEXT;
|
||||||
|
case "text/markdown", "text/x-markdown" -> DocumentType.MARKDOWN;
|
||||||
|
case "application/xml", "text/xml" -> DocumentType.XML_GENERIC;
|
||||||
|
case "message/rfc822" -> DocumentType.MIME_MESSAGE;
|
||||||
|
case "application/zip", "application/x-zip-compressed" -> DocumentType.ZIP_ARCHIVE;
|
||||||
|
default -> mediaType.startsWith("text/") ? DocumentType.TEXT : DocumentType.UNKNOWN;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentType detectByExtension(String extension) {
|
||||||
|
return switch (extension) {
|
||||||
|
case "pdf" -> DocumentType.PDF;
|
||||||
|
case "html", "htm" -> DocumentType.HTML;
|
||||||
|
case "txt", "log", "csv", "json", "yaml", "yml" -> DocumentType.TEXT;
|
||||||
|
case "md", "markdown" -> DocumentType.MARKDOWN;
|
||||||
|
case "xml", "xsd", "xslt" -> DocumentType.XML_GENERIC;
|
||||||
|
case "eml", "msg" -> DocumentType.MIME_MESSAGE;
|
||||||
|
case "zip" -> DocumentType.ZIP_ARCHIVE;
|
||||||
|
case "docx" -> DocumentType.DOCX;
|
||||||
|
default -> DocumentType.UNKNOWN;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,26 @@
|
|||||||
|
package at.procon.dip.classification.service;
|
||||||
|
|
||||||
|
import at.procon.dip.classification.spi.DetectionResult;
|
||||||
|
import at.procon.dip.classification.spi.DocumentTypeDetector;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import java.util.List;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Selects the first matching detector for a source descriptor.
|
||||||
|
*/
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class DocumentClassificationService {
|
||||||
|
|
||||||
|
private final List<DocumentTypeDetector> detectors;
|
||||||
|
|
||||||
|
public DetectionResult detect(SourceDescriptor sourceDescriptor) {
|
||||||
|
return detectors.stream()
|
||||||
|
.filter(detector -> detector.supports(sourceDescriptor))
|
||||||
|
.findFirst()
|
||||||
|
.orElseThrow(() -> new IllegalStateException("No document type detector available"))
|
||||||
|
.detect(sourceDescriptor);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,30 @@
|
|||||||
|
package at.procon.dip.extraction.impl;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.extraction.spi.DocumentExtractor;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionRequest;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fallback extractor for binary formats not yet supported by specialized extractors.
|
||||||
|
*/
|
||||||
|
@Component
|
||||||
|
public class BinaryPassThroughDocumentExtractor implements DocumentExtractor {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(DocumentType documentType, String mimeType) {
|
||||||
|
return documentType == DocumentType.DOCX
|
||||||
|
|| documentType == DocumentType.ZIP_ARCHIVE
|
||||||
|
|| documentType == DocumentType.GENERIC_BINARY
|
||||||
|
|| documentType == DocumentType.MIME_MESSAGE;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ExtractionResult extract(ExtractionRequest extractionRequest) {
|
||||||
|
return new ExtractionResult(Map.of(), List.of(), List.of(
|
||||||
|
"No specialized extractor available yet for " + extractionRequest.detectionResult().documentType()));
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,60 @@
|
|||||||
|
package at.procon.dip.extraction.impl;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
|
||||||
|
import at.procon.dip.extraction.spi.DocumentExtractor;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionRequest;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
public class HtmlDocumentExtractor implements DocumentExtractor {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(DocumentType documentType, String mimeType) {
|
||||||
|
return documentType == DocumentType.HTML || "text/html".equalsIgnoreCase(mimeType)
|
||||||
|
|| "application/xhtml+xml".equalsIgnoreCase(mimeType);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ExtractionResult extract(ExtractionRequest extractionRequest) {
|
||||||
|
String html = extractionRequest.textContent();
|
||||||
|
if (!StringUtils.hasText(html) && extractionRequest.binaryContent() != null) {
|
||||||
|
html = new String(extractionRequest.binaryContent(), StandardCharsets.UTF_8);
|
||||||
|
}
|
||||||
|
if (!StringUtils.hasText(html)) {
|
||||||
|
return new ExtractionResult(Map.of(), List.of(), List.of("No HTML content available"));
|
||||||
|
}
|
||||||
|
|
||||||
|
Document parsed = Jsoup.parse(html);
|
||||||
|
parsed.outputSettings().prettyPrint(false);
|
||||||
|
String cleanText = parsed.text();
|
||||||
|
|
||||||
|
Map<ContentRole, String> derivedText = new LinkedHashMap<>();
|
||||||
|
derivedText.put(ContentRole.HTML_CLEAN, parsed.body() != null ? parsed.body().text() : cleanText);
|
||||||
|
derivedText.put(ContentRole.NORMALIZED_TEXT, cleanText);
|
||||||
|
|
||||||
|
Map<String, Object> payload = new LinkedHashMap<>();
|
||||||
|
if (StringUtils.hasText(parsed.title())) {
|
||||||
|
payload.put("title", parsed.title().trim());
|
||||||
|
}
|
||||||
|
if (parsed.body() != null) {
|
||||||
|
payload.put("bodyTextLength", parsed.body().text().length());
|
||||||
|
}
|
||||||
|
List<String> warnings = new ArrayList<>();
|
||||||
|
if (!StringUtils.hasText(parsed.title())) {
|
||||||
|
warnings.add("HTML document has no <title> element");
|
||||||
|
}
|
||||||
|
|
||||||
|
return new ExtractionResult(derivedText, List.of(new ExtractedStructuredPayload("html-document", payload)), warnings);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,69 @@
|
|||||||
|
package at.procon.dip.extraction.impl;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
|
||||||
|
import at.procon.dip.extraction.spi.DocumentExtractor;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionRequest;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||||
|
import at.procon.ted.service.attachment.PdfExtractionService;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class PdfDocumentExtractor implements DocumentExtractor {
|
||||||
|
|
||||||
|
private final PdfExtractionService pdfExtractionService;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(DocumentType documentType, String mimeType) {
|
||||||
|
return documentType == DocumentType.PDF || pdfExtractionService.canHandle("dummy.pdf", mimeType);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ExtractionResult extract(ExtractionRequest extractionRequest) {
|
||||||
|
byte[] binary = extractionRequest.binaryContent();
|
||||||
|
if (binary == null || binary.length == 0) {
|
||||||
|
return new ExtractionResult(Map.of(), List.of(), List.of("No PDF binary payload available"));
|
||||||
|
}
|
||||||
|
|
||||||
|
PdfExtractionService.ExtractionResult extraction = pdfExtractionService.extract(
|
||||||
|
binary,
|
||||||
|
extractionRequest.sourceDescriptor().fileName(),
|
||||||
|
extractionRequest.detectionResult().mimeType()
|
||||||
|
);
|
||||||
|
if (!extraction.success()) {
|
||||||
|
return new ExtractionResult(Map.of(), List.of(), List.of(extraction.errorMessage()));
|
||||||
|
}
|
||||||
|
|
||||||
|
String text = extraction.extractedText();
|
||||||
|
if (!StringUtils.hasText(text)) {
|
||||||
|
return new ExtractionResult(Map.of(), List.of(), List.of("PDF text extraction returned no text"));
|
||||||
|
}
|
||||||
|
|
||||||
|
Map<String, Object> payload = new LinkedHashMap<>();
|
||||||
|
payload.put("title", deriveTitle(text, extractionRequest.sourceDescriptor().fileName()));
|
||||||
|
payload.put("extractor", "pdfbox");
|
||||||
|
|
||||||
|
return new ExtractionResult(
|
||||||
|
Map.of(ContentRole.NORMALIZED_TEXT, text),
|
||||||
|
List.of(new ExtractedStructuredPayload("pdf-document", payload)),
|
||||||
|
List.of()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String deriveTitle(String text, String fallback) {
|
||||||
|
for (String line : text.split("\\n")) {
|
||||||
|
if (StringUtils.hasText(line)) {
|
||||||
|
String trimmed = line.trim();
|
||||||
|
return trimmed.length() > 240 ? trimmed.substring(0, 240) : trimmed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fallback;
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,69 @@
|
|||||||
|
package at.procon.dip.extraction.impl;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
|
||||||
|
import at.procon.dip.extraction.spi.DocumentExtractor;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionRequest;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||||
|
import at.procon.dip.ingestion.util.DocumentImportSupport;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
public class PlainTextDocumentExtractor implements DocumentExtractor {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(DocumentType documentType, String mimeType) {
|
||||||
|
return documentType == DocumentType.TEXT
|
||||||
|
|| documentType == DocumentType.MARKDOWN
|
||||||
|
|| documentType == DocumentType.XML_GENERIC
|
||||||
|
|| documentType == DocumentType.UNKNOWN
|
||||||
|
|| DocumentImportSupport.isLikelyTextMime(mimeType);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ExtractionResult extract(ExtractionRequest extractionRequest) {
|
||||||
|
String text = extractionRequest.textContent();
|
||||||
|
if (!StringUtils.hasText(text) && extractionRequest.binaryContent() != null) {
|
||||||
|
text = new String(extractionRequest.binaryContent(), StandardCharsets.UTF_8);
|
||||||
|
}
|
||||||
|
text = normalizeText(text);
|
||||||
|
|
||||||
|
if (!StringUtils.hasText(text)) {
|
||||||
|
return new ExtractionResult(Map.of(), List.of(), List.of("No text content extracted"));
|
||||||
|
}
|
||||||
|
|
||||||
|
String title = deriveTitle(text, extractionRequest.sourceDescriptor().fileName());
|
||||||
|
return new ExtractionResult(
|
||||||
|
Map.of(ContentRole.NORMALIZED_TEXT, text),
|
||||||
|
List.of(new ExtractedStructuredPayload("generic-document", Map.of("title", title))),
|
||||||
|
List.of()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String normalizeText(String text) {
|
||||||
|
if (text == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return text.replace("\r\n", "\n")
|
||||||
|
.replace('\r', '\n')
|
||||||
|
.replaceAll("\\n{3,}", "\n\n")
|
||||||
|
.replaceAll("[ \t]+\n", "\n")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String deriveTitle(String text, String fallback) {
|
||||||
|
if (StringUtils.hasText(text)) {
|
||||||
|
for (String line : text.split("\\n")) {
|
||||||
|
if (StringUtils.hasText(line)) {
|
||||||
|
return DocumentImportSupport.ellipsize(line.trim(), 240);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fallback;
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,26 @@
|
|||||||
|
package at.procon.dip.extraction.service;
|
||||||
|
|
||||||
|
import at.procon.dip.extraction.spi.DocumentExtractor;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionRequest;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||||
|
import java.util.List;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class DocumentExtractionService {
|
||||||
|
|
||||||
|
private final List<DocumentExtractor> extractors;
|
||||||
|
|
||||||
|
public ExtractionResult extract(ExtractionRequest extractionRequest) {
|
||||||
|
return extractors.stream()
|
||||||
|
.filter(extractor -> extractor.supports(
|
||||||
|
extractionRequest.detectionResult().documentType(),
|
||||||
|
extractionRequest.detectionResult().mimeType()))
|
||||||
|
.findFirst()
|
||||||
|
.orElseThrow(() -> new IllegalStateException(
|
||||||
|
"No extractor registered for type " + extractionRequest.detectionResult().documentType()))
|
||||||
|
.extract(extractionRequest);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,28 @@
|
|||||||
|
package at.procon.dip.ingestion.adapter;
|
||||||
|
|
||||||
|
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
||||||
|
import at.procon.dip.ingestion.service.GenericDocumentImportService;
|
||||||
|
import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
|
||||||
|
import at.procon.dip.ingestion.spi.IngestionResult;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import java.util.List;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class FileSystemDocumentIngestionAdapter implements DocumentIngestionAdapter {
|
||||||
|
|
||||||
|
private final GenericDocumentImportService importService;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(SourceDescriptor sourceDescriptor) {
|
||||||
|
return sourceDescriptor.sourceType() == at.procon.dip.domain.document.SourceType.FILE_SYSTEM;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
|
||||||
|
ImportedDocumentResult imported = importService.importDocument(sourceDescriptor);
|
||||||
|
return new IngestionResult(List.of(imported.document().toCanonicalMetadata()), imported.warnings());
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,31 @@
|
|||||||
|
package at.procon.dip.ingestion.adapter;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.SourceType;
|
||||||
|
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
||||||
|
import at.procon.dip.ingestion.service.GenericDocumentImportService;
|
||||||
|
import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
|
||||||
|
import at.procon.dip.ingestion.spi.IngestionResult;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import java.util.List;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class InlineContentDocumentIngestionAdapter implements DocumentIngestionAdapter {
|
||||||
|
|
||||||
|
private final GenericDocumentImportService importService;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(SourceDescriptor sourceDescriptor) {
|
||||||
|
return sourceDescriptor.sourceType() == SourceType.REST_UPLOAD
|
||||||
|
|| sourceDescriptor.sourceType() == SourceType.MANUAL_UPLOAD
|
||||||
|
|| sourceDescriptor.sourceType() == SourceType.API;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
|
||||||
|
ImportedDocumentResult imported = importService.importDocument(sourceDescriptor);
|
||||||
|
return new IngestionResult(List.of(imported.document().toCanonicalMetadata()), imported.warnings());
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,88 @@
|
|||||||
|
package at.procon.dip.ingestion.camel;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||||
|
import at.procon.dip.domain.access.DocumentVisibility;
|
||||||
|
import at.procon.dip.domain.document.SourceType;
|
||||||
|
import at.procon.dip.domain.tenant.TenantRef;
|
||||||
|
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.apache.camel.Exchange;
|
||||||
|
import org.apache.camel.builder.RouteBuilder;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
public class GenericFileSystemIngestionRoute extends RouteBuilder {
|
||||||
|
|
||||||
|
private final TedProcessorProperties properties;
|
||||||
|
private final DocumentIngestionGateway ingestionGateway;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void configure() {
|
||||||
|
if (!properties.getGenericIngestion().isEnabled() || !properties.getGenericIngestion().isFileSystemEnabled()) {
|
||||||
|
log.info("Phase 4 generic filesystem ingestion route disabled");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
var config = properties.getGenericIngestion();
|
||||||
|
log.info("Configuring Phase 4 generic filesystem ingestion from {}", config.getInputDirectory());
|
||||||
|
|
||||||
|
fromF("file:%s?recursive=true&include=%s&delay=%d&maxMessagesPerPoll=%d&move=%s&moveFailed=%s",
|
||||||
|
config.getInputDirectory(),
|
||||||
|
config.getFilePattern(),
|
||||||
|
config.getPollInterval(),
|
||||||
|
config.getMaxMessagesPerPoll(),
|
||||||
|
config.getProcessedDirectory(),
|
||||||
|
config.getErrorDirectory())
|
||||||
|
.routeId("dip-generic-filesystem-ingestion")
|
||||||
|
.process(exchange -> processFile(exchange))
|
||||||
|
.log("Imported generic document from ${header.CamelFileAbsolutePath}");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processFile(Exchange exchange) throws Exception {
|
||||||
|
Path path = exchange.getIn().getBody(Path.class);
|
||||||
|
if (path == null) {
|
||||||
|
String absolutePath = exchange.getIn().getHeader(Exchange.FILE_PATH, String.class);
|
||||||
|
path = Path.of(absolutePath);
|
||||||
|
}
|
||||||
|
byte[] payload = Files.readAllBytes(path);
|
||||||
|
Map<String, String> attributes = new LinkedHashMap<>();
|
||||||
|
String languageCode = properties.getGenericIngestion().getDefaultLanguageCode();
|
||||||
|
if (StringUtils.hasText(languageCode)) {
|
||||||
|
attributes.put("languageCode", languageCode);
|
||||||
|
}
|
||||||
|
|
||||||
|
SourceDescriptor descriptor = new SourceDescriptor(
|
||||||
|
buildDefaultAccessContext(),
|
||||||
|
SourceType.FILE_SYSTEM,
|
||||||
|
path.getFileName().toString(),
|
||||||
|
path.toAbsolutePath().toString(),
|
||||||
|
path.getFileName().toString(),
|
||||||
|
Files.probeContentType(path),
|
||||||
|
payload,
|
||||||
|
null,
|
||||||
|
OffsetDateTime.now(),
|
||||||
|
attributes
|
||||||
|
);
|
||||||
|
ingestionGateway.ingest(descriptor);
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentAccessContext buildDefaultAccessContext() {
|
||||||
|
String ownerTenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey();
|
||||||
|
DocumentVisibility visibility = properties.getGenericIngestion().getDefaultVisibility();
|
||||||
|
if (!StringUtils.hasText(ownerTenantKey)) {
|
||||||
|
return new DocumentAccessContext(null, visibility);
|
||||||
|
}
|
||||||
|
return new DocumentAccessContext(new TenantRef(null, ownerTenantKey, ownerTenantKey), visibility);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,128 @@
|
|||||||
|
package at.procon.dip.ingestion.controller;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||||
|
import at.procon.dip.domain.access.DocumentVisibility;
|
||||||
|
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
|
||||||
|
import at.procon.dip.domain.document.SourceType;
|
||||||
|
import at.procon.dip.domain.tenant.TenantRef;
|
||||||
|
import at.procon.dip.ingestion.dto.GenericImportResponse;
|
||||||
|
import at.procon.dip.ingestion.dto.GenericTextImportRequest;
|
||||||
|
import at.procon.dip.ingestion.service.DocumentIngestionGateway;
|
||||||
|
import at.procon.dip.ingestion.spi.IngestionResult;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.http.MediaType;
|
||||||
|
import org.springframework.http.ResponseEntity;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
import org.springframework.web.bind.annotation.PostMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestBody;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
|
@RestController
|
||||||
|
@RequestMapping("/v1/dip/import")
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class GenericDocumentImportController {
|
||||||
|
|
||||||
|
private final TedProcessorProperties properties;
|
||||||
|
private final DocumentIngestionGateway ingestionGateway;
|
||||||
|
|
||||||
|
@PostMapping(path = "/upload", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
|
||||||
|
public ResponseEntity<GenericImportResponse> upload(
|
||||||
|
@RequestParam("file") MultipartFile file,
|
||||||
|
@RequestParam(value = "ownerTenantKey", required = false) String ownerTenantKey,
|
||||||
|
@RequestParam(value = "visibility", required = false) DocumentVisibility visibility,
|
||||||
|
@RequestParam(value = "languageCode", required = false) String languageCode,
|
||||||
|
@RequestParam(value = "title", required = false) String title,
|
||||||
|
@RequestParam(value = "sourceIdentifier", required = false) String sourceIdentifier
|
||||||
|
) throws Exception {
|
||||||
|
ensureRestUploadEnabled();
|
||||||
|
|
||||||
|
Map<String, String> attributes = new LinkedHashMap<>();
|
||||||
|
if (StringUtils.hasText(languageCode)) {
|
||||||
|
attributes.put("languageCode", languageCode);
|
||||||
|
}
|
||||||
|
if (StringUtils.hasText(title)) {
|
||||||
|
attributes.put("title", title);
|
||||||
|
}
|
||||||
|
|
||||||
|
SourceDescriptor descriptor = new SourceDescriptor(
|
||||||
|
buildAccessContext(ownerTenantKey, visibility),
|
||||||
|
SourceType.REST_UPLOAD,
|
||||||
|
StringUtils.hasText(sourceIdentifier) ? sourceIdentifier : file.getOriginalFilename(),
|
||||||
|
null,
|
||||||
|
file.getOriginalFilename(),
|
||||||
|
file.getContentType(),
|
||||||
|
file.getBytes(),
|
||||||
|
null,
|
||||||
|
OffsetDateTime.now(),
|
||||||
|
attributes
|
||||||
|
);
|
||||||
|
IngestionResult result = ingestionGateway.ingest(descriptor);
|
||||||
|
return ResponseEntity.ok(toResponse(result));
|
||||||
|
}
|
||||||
|
|
||||||
|
@PostMapping(path = "/text", consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||||
|
public ResponseEntity<GenericImportResponse> importText(@RequestBody GenericTextImportRequest request) {
|
||||||
|
ensureRestUploadEnabled();
|
||||||
|
Map<String, String> attributes = new LinkedHashMap<>();
|
||||||
|
if (StringUtils.hasText(request.languageCode())) {
|
||||||
|
attributes.put("languageCode", request.languageCode());
|
||||||
|
}
|
||||||
|
if (StringUtils.hasText(request.title())) {
|
||||||
|
attributes.put("title", request.title());
|
||||||
|
}
|
||||||
|
|
||||||
|
SourceDescriptor descriptor = new SourceDescriptor(
|
||||||
|
buildAccessContext(request.ownerTenantKey(), request.visibility()),
|
||||||
|
SourceType.API,
|
||||||
|
StringUtils.hasText(request.sourceIdentifier()) ? request.sourceIdentifier() : request.fileName(),
|
||||||
|
null,
|
||||||
|
request.fileName(),
|
||||||
|
request.mediaType(),
|
||||||
|
request.text() == null ? null : request.text().getBytes(java.nio.charset.StandardCharsets.UTF_8),
|
||||||
|
request.text(),
|
||||||
|
OffsetDateTime.now(),
|
||||||
|
attributes
|
||||||
|
);
|
||||||
|
IngestionResult result = ingestionGateway.ingest(descriptor);
|
||||||
|
return ResponseEntity.ok(toResponse(result));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void ensureRestUploadEnabled() {
|
||||||
|
if (!properties.getGenericIngestion().isEnabled() || !properties.getGenericIngestion().isRestUploadEnabled()) {
|
||||||
|
throw new IllegalStateException("Generic REST import is disabled");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentAccessContext buildAccessContext(String ownerTenantKey, DocumentVisibility visibility) {
|
||||||
|
DocumentVisibility effectiveVisibility = visibility != null
|
||||||
|
? visibility
|
||||||
|
: properties.getGenericIngestion().getDefaultVisibility();
|
||||||
|
if (!StringUtils.hasText(ownerTenantKey)) {
|
||||||
|
return new DocumentAccessContext(null, effectiveVisibility);
|
||||||
|
}
|
||||||
|
return new DocumentAccessContext(new TenantRef(null, ownerTenantKey, ownerTenantKey), effectiveVisibility);
|
||||||
|
}
|
||||||
|
|
||||||
|
private GenericImportResponse toResponse(IngestionResult result) {
|
||||||
|
CanonicalDocumentMetadata metadata = result.documents().stream()
|
||||||
|
.findFirst()
|
||||||
|
.orElseThrow(() -> new IllegalStateException("No imported document metadata returned"));
|
||||||
|
return new GenericImportResponse(
|
||||||
|
metadata.documentId(),
|
||||||
|
metadata.title(),
|
||||||
|
metadata.documentType(),
|
||||||
|
metadata.documentFamily(),
|
||||||
|
metadata.status(),
|
||||||
|
result.warnings().stream().anyMatch(w -> w != null && w.contains("already imported")),
|
||||||
|
result.warnings()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,18 @@
|
|||||||
|
package at.procon.dip.ingestion.dto;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.DocumentStatus;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
/**
 * REST response DTO for the generic Phase 4 import endpoints.
 *
 * @param documentId     canonical id of the created (or pre-existing, when deduplicated) document
 * @param title          resolved document title
 * @param documentType   detected document type
 * @param documentFamily detected document family
 * @param status         document status after the import pipeline finished
 * @param deduplicated   true when the content hash matched an already-imported document
 * @param warnings       non-fatal warnings collected during the import
 */
public record GenericImportResponse(
        UUID documentId,
        String title,
        DocumentType documentType,
        DocumentFamily documentFamily,
        DocumentStatus status,
        boolean deduplicated,
        List<String> warnings
) {
}
|
||||||
@ -0,0 +1,15 @@
|
|||||||
|
package at.procon.dip.ingestion.dto;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.access.DocumentVisibility;
|
||||||
|
|
||||||
|
/**
 * JSON request body for importing an inline text document via the generic REST endpoint.
 * All fields except {@code text} are optional hints; blanks/nulls fall back to configured defaults.
 *
 * @param text             the raw text payload to import
 * @param fileName         optional logical file name of the source
 * @param mediaType        optional media type hint (detected from content/extension when absent)
 * @param ownerTenantKey   optional owning tenant key; absent means no owner tenant
 * @param visibility       optional visibility; configured default applies when null
 * @param languageCode     optional language hint
 * @param title            optional display title
 * @param sourceIdentifier optional stable external identifier; falls back to {@code fileName}
 */
public record GenericTextImportRequest(
        String text,
        String fileName,
        String mediaType,
        String ownerTenantKey,
        DocumentVisibility visibility,
        String languageCode,
        String title,
        String sourceIdentifier
) {
}
|
||||||
@ -0,0 +1,16 @@
|
|||||||
|
package at.procon.dip.ingestion.dto;
|
||||||
|
|
||||||
|
import at.procon.dip.classification.spi.DetectionResult;
|
||||||
|
import at.procon.dip.domain.document.entity.Document;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * Internal result of the generic Phase 4 import pipeline.
 *
 * @param document        the persisted (or pre-existing, when deduplicated) document entity
 * @param detectionResult outcome of media-type/document-type classification
 * @param warnings        non-fatal warnings collected while importing
 * @param deduplicated    true when the content hash matched an already-imported document
 */
public record ImportedDocumentResult(
        Document document,
        DetectionResult detectionResult,
        List<String> warnings,
        boolean deduplicated
) {
}
|
||||||
@ -0,0 +1,24 @@
|
|||||||
|
package at.procon.dip.ingestion.service;
|
||||||
|
|
||||||
|
import at.procon.dip.ingestion.spi.DocumentIngestionAdapter;
|
||||||
|
import at.procon.dip.ingestion.spi.IngestionResult;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import java.util.List;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class DocumentIngestionGateway {
|
||||||
|
|
||||||
|
private final List<DocumentIngestionAdapter> adapters;
|
||||||
|
|
||||||
|
public IngestionResult ingest(SourceDescriptor sourceDescriptor) {
|
||||||
|
return adapters.stream()
|
||||||
|
.filter(adapter -> adapter.supports(sourceDescriptor))
|
||||||
|
.findFirst()
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException(
|
||||||
|
"No ingestion adapter registered for source type " + sourceDescriptor.sourceType()))
|
||||||
|
.ingest(sourceDescriptor);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,393 @@
|
|||||||
|
package at.procon.dip.ingestion.service;
|
||||||
|
|
||||||
|
import at.procon.dip.classification.service.DocumentClassificationService;
|
||||||
|
import at.procon.dip.classification.spi.DetectionResult;
|
||||||
|
import at.procon.dip.domain.access.DocumentAccessContext;
|
||||||
|
import at.procon.dip.domain.access.DocumentVisibility;
|
||||||
|
import at.procon.dip.domain.document.CanonicalDocumentMetadata;
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.DocumentStatus;
|
||||||
|
import at.procon.dip.domain.document.StorageType;
|
||||||
|
import at.procon.dip.domain.document.entity.Document;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentContent;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
|
||||||
|
import at.procon.dip.domain.document.entity.DocumentSource;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentEmbeddingRepository;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentRepository;
|
||||||
|
import at.procon.dip.domain.document.repository.DocumentSourceRepository;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentContentService;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentEmbeddingService;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentRepresentationService;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentService;
|
||||||
|
import at.procon.dip.domain.document.service.DocumentSourceService;
|
||||||
|
import at.procon.dip.domain.document.service.command.AddDocumentContentCommand;
|
||||||
|
import at.procon.dip.domain.document.service.command.AddDocumentSourceCommand;
|
||||||
|
import at.procon.dip.domain.document.service.command.AddDocumentTextRepresentationCommand;
|
||||||
|
import at.procon.dip.domain.document.service.command.CreateDocumentCommand;
|
||||||
|
import at.procon.dip.domain.document.service.command.RegisterEmbeddingModelCommand;
|
||||||
|
import at.procon.dip.extraction.service.DocumentExtractionService;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionRequest;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractionResult;
|
||||||
|
import at.procon.dip.ingestion.dto.ImportedDocumentResult;
|
||||||
|
import at.procon.dip.ingestion.spi.SourceDescriptor;
|
||||||
|
import at.procon.dip.ingestion.util.DocumentImportSupport;
|
||||||
|
import at.procon.dip.normalization.service.TextRepresentationBuildService;
|
||||||
|
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
||||||
|
import at.procon.dip.normalization.spi.TextRepresentationDraft;
|
||||||
|
import at.procon.ted.config.TedProcessorProperties;
|
||||||
|
import at.procon.ted.util.HashUtils;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.UUID;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
/**
 * Phase 4 generic import pipeline that persists arbitrary document types into the DOC model.
 *
 * <p>Pipeline stages (all inside one transaction):
 * <ol>
 *   <li>resolve the raw payload (inline bytes, inline text, or file read via {@code sourceUri})</li>
 *   <li>classify (type/family/mime/language) and compute a SHA-256 dedup hash</li>
 *   <li>short-circuit on a hash match when dedup is enabled (only links the new source)</li>
 *   <li>create the document, link its source, persist the original content</li>
 *   <li>extract text, persist derived content rows, build text representations</li>
 *   <li>optionally queue pending embeddings and advance the document status</li>
 * </ol>
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class GenericDocumentImportService {

    // Configuration (generic-ingestion and vectorization toggles/limits).
    private final TedProcessorProperties properties;
    // Repositories used for dedup lookup and source-link checks.
    private final DocumentRepository documentRepository;
    private final DocumentSourceRepository documentSourceRepository;
    // NOTE(review): not referenced anywhere in this class — confirm whether it can be removed
    // (removal changes the Lombok-generated constructor signature).
    private final DocumentEmbeddingRepository documentEmbeddingRepository;
    // Domain services performing the actual persistence and status transitions.
    private final DocumentService documentService;
    private final DocumentSourceService documentSourceService;
    private final DocumentContentService documentContentService;
    private final DocumentRepresentationService documentRepresentationService;
    private final DocumentEmbeddingService documentEmbeddingService;
    // Pipeline collaborators: classification, extraction, representation building.
    private final DocumentClassificationService classificationService;
    private final DocumentExtractionService extractionService;
    private final TextRepresentationBuildService representationBuildService;

    /**
     * Runs the full import pipeline for one source descriptor.
     *
     * @param sourceDescriptor the source to import (payload inline or referenced via URI)
     * @return the persisted document plus detection result, warnings, and a dedup flag
     * @throws IllegalArgumentException when no payload can be resolved or the source file cannot be read
     */
    @Transactional
    public ImportedDocumentResult importDocument(SourceDescriptor sourceDescriptor) {
        ResolvedPayload payload = resolvePayload(sourceDescriptor);
        // Classification sees the descriptor with an inferred media type when none was supplied.
        DetectionResult detection = classificationService.detect(withResolvedMediaType(sourceDescriptor, payload));
        String dedupHash = HashUtils.computeSha256(payload.binaryContent());

        // Dedup short-circuit: identical content only gets a new source link on the existing document.
        if (properties.getGenericIngestion().isDeduplicateByContentHash()) {
            Optional<Document> existing = documentRepository.findByDedupHash(dedupHash);
            if (existing.isPresent()) {
                Document document = existing.get();
                ensureSource(document, sourceDescriptor);
                List<String> warnings = List.of("Document content hash already imported; linked new source to existing document");
                return new ImportedDocumentResult(document, detection, warnings, true);
            }
        }

        // Fall back to the configured default access context when the caller supplied none.
        DocumentAccessContext accessContext = sourceDescriptor.accessContext() == null
                ? defaultAccessContext()
                : sourceDescriptor.accessContext();

        Document document = documentService.create(new CreateDocumentCommand(
                accessContext.ownerTenant() == null ? null : accessContext.ownerTenant().tenantKey(),
                accessContext.visibility(),
                detection.documentType(),
                detection.documentFamily(),
                DocumentStatus.RECEIVED,
                determineInitialTitle(sourceDescriptor, detection, payload),
                null,
                detection.languageCode(),
                detection.mimeType(),
                buildBusinessKey(sourceDescriptor),
                dedupHash
        ));

        ensureSource(document, sourceDescriptor);
        documentService.updateStatus(document.getId(), DocumentStatus.CLASSIFIED);

        DocumentContent originalContent = persistOriginalContent(document, sourceDescriptor, detection, payload, dedupHash);

        ExtractionResult extractionResult = extractionService.extract(new ExtractionRequest(
                sourceDescriptor,
                detection,
                payload.textContent(),
                payload.binaryContent()
        ));
        List<String> warnings = new ArrayList<>(extractionResult.warnings());

        Map<ContentRole, DocumentContent> persistedDerivedContent = persistDerivedContent(document, detection, extractionResult, dedupHash);
        documentService.updateStatus(document.getId(), DocumentStatus.EXTRACTED);

        var drafts = representationBuildService.build(new RepresentationBuildRequest(sourceDescriptor, detection, extractionResult));
        persistRepresentationsAndEmbeddings(document, originalContent, persistedDerivedContent, drafts);

        // persistRepresentationsAndEmbeddings only advances the status when it produced drafts;
        // this check covers the empty-drafts path so the document still reaches REPRESENTED.
        Document reloaded = documentService.getRequired(document.getId());
        if (reloaded.getStatus() == DocumentStatus.EXTRACTED) {
            documentService.updateStatus(reloaded.getId(), DocumentStatus.REPRESENTED);
            reloaded = documentService.getRequired(reloaded.getId());
        }

        if (!extractionResult.structuredPayloads().isEmpty()) {
            applyStructuredTitleIfMissing(reloaded, extractionResult);
            reloaded = documentService.getRequired(reloaded.getId());
        }

        return new ImportedDocumentResult(reloaded, detection, warnings, false);
    }

    /**
     * Returns the descriptor unchanged when it carries a media type, otherwise a copy with the
     * media type inferred from the payload (extension-based fallback).
     */
    private SourceDescriptor withResolvedMediaType(SourceDescriptor sourceDescriptor, ResolvedPayload payload) {
        if (StringUtils.hasText(sourceDescriptor.mediaType())) {
            return sourceDescriptor;
        }
        return new SourceDescriptor(
                sourceDescriptor.accessContext(),
                sourceDescriptor.sourceType(),
                sourceDescriptor.sourceIdentifier(),
                sourceDescriptor.sourceUri(),
                sourceDescriptor.fileName(),
                payload.mediaType(),
                sourceDescriptor.binaryContent(),
                sourceDescriptor.textContent(),
                sourceDescriptor.receivedAt(),
                sourceDescriptor.attributes()
        );
    }

    /**
     * Resolves the binary payload from (in order): inline bytes, inline text (UTF-8 encoded),
     * or the file at {@code sourceUri}. Also derives a text view for likely-text media types.
     *
     * @throws IllegalArgumentException when no payload is available or the file cannot be read
     */
    private ResolvedPayload resolvePayload(SourceDescriptor sourceDescriptor) {
        byte[] binary = sourceDescriptor.binaryContent();
        String text = sourceDescriptor.textContent();

        if ((binary == null || binary.length == 0) && StringUtils.hasText(text)) {
            binary = text.getBytes(StandardCharsets.UTF_8);
        }
        if ((binary == null || binary.length == 0) && StringUtils.hasText(sourceDescriptor.sourceUri())) {
            try {
                // NOTE(review): sourceUri is treated as a plain file-system path here, not a URI — confirm callers.
                java.nio.file.Path path = java.nio.file.Path.of(sourceDescriptor.sourceUri());
                if (java.nio.file.Files.exists(path)) {
                    binary = java.nio.file.Files.readAllBytes(path);
                }
            } catch (Exception e) {
                throw new IllegalArgumentException("Failed to read source payload from " + sourceDescriptor.sourceUri(), e);
            }
        }
        if (binary == null || binary.length == 0) {
            throw new IllegalArgumentException("No payload content available for source " + sourceDescriptor.sourceIdentifier());
        }
        // Decode a text view for text-like media types so downstream stages can work on characters.
        if (!StringUtils.hasText(text) && DocumentImportSupport.isLikelyTextMime(sourceDescriptor.mediaType())) {
            text = new String(binary, StandardCharsets.UTF_8);
        }

        return new ResolvedPayload(binary, text, inferMediaType(sourceDescriptor));
    }

    /**
     * Normalizes the declared media type, or infers one from the file extension when absent.
     * Returns null for unknown extensions.
     */
    private String inferMediaType(SourceDescriptor sourceDescriptor) {
        if (StringUtils.hasText(sourceDescriptor.mediaType())) {
            return DocumentImportSupport.normalizeMediaType(sourceDescriptor.mediaType());
        }
        String extension = DocumentImportSupport.extensionOf(sourceDescriptor.fileName());
        return switch (extension) {
            case "pdf" -> "application/pdf";
            case "html", "htm" -> "text/html";
            case "md", "markdown" -> "text/markdown";
            case "xml" -> "application/xml";
            case "txt", "log", "csv", "json", "yaml", "yml" -> "text/plain";
            case "eml" -> "message/rfc822";
            default -> null;
        };
    }

    /** Builds the configured default access context (used when the descriptor carries none). */
    private DocumentAccessContext defaultAccessContext() {
        String tenantKey = properties.getGenericIngestion().getDefaultOwnerTenantKey();
        DocumentVisibility visibility = properties.getGenericIngestion().getDefaultVisibility();
        if (!StringUtils.hasText(tenantKey)) {
            return new DocumentAccessContext(null, visibility);
        }
        return new DocumentAccessContext(new at.procon.dip.domain.tenant.TenantRef(null, tenantKey, tenantKey), visibility);
    }

    /**
     * Picks the initial document title: explicit "title" attribute, then file name,
     * then the first non-blank line of the text payload (truncated), then the document type name.
     */
    private String determineInitialTitle(SourceDescriptor sourceDescriptor, DetectionResult detection, ResolvedPayload payload) {
        if (sourceDescriptor.attributes() != null && StringUtils.hasText(sourceDescriptor.attributes().get("title"))) {
            return sourceDescriptor.attributes().get("title");
        }
        if (StringUtils.hasText(sourceDescriptor.fileName())) {
            return sourceDescriptor.fileName();
        }
        if (StringUtils.hasText(payload.textContent())) {
            for (String line : payload.textContent().split("\\n")) {
                if (StringUtils.hasText(line)) {
                    return DocumentImportSupport.ellipsize(line.trim(), 240);
                }
            }
        }
        return detection.documentType().name();
    }

    /**
     * Builds a business key of the form "{sourceType}:{identifier|uri|randomUUID}".
     * The random-UUID fallback makes keys for anonymous sources unique but non-reproducible.
     */
    private String buildBusinessKey(SourceDescriptor sourceDescriptor) {
        if (StringUtils.hasText(sourceDescriptor.sourceIdentifier())) {
            return sourceDescriptor.sourceType() + ":" + sourceDescriptor.sourceIdentifier();
        }
        if (StringUtils.hasText(sourceDescriptor.sourceUri())) {
            return sourceDescriptor.sourceType() + ":" + sourceDescriptor.sourceUri();
        }
        return sourceDescriptor.sourceType() + ":" + java.util.UUID.randomUUID();
    }

    /**
     * Links the descriptor as a source of the document unless an identical link
     * (same external id, URI, and filename) already exists. Idempotent.
     */
    private void ensureSource(Document document, SourceDescriptor sourceDescriptor) {
        boolean alreadyLinked = documentSourceRepository.findByDocument_Id(document.getId()).stream().anyMatch(existing ->
                equalsNullable(existing.getExternalSourceId(), sourceDescriptor.sourceIdentifier())
                && equalsNullable(existing.getSourceUri(), sourceDescriptor.sourceUri())
                && equalsNullable(existing.getSourceFilename(), sourceDescriptor.fileName()));
        if (alreadyLinked) {
            return;
        }

        documentSourceService.addSource(new AddDocumentSourceCommand(
                document.getId(),
                sourceDescriptor.sourceType(),
                sourceDescriptor.sourceIdentifier(),
                sourceDescriptor.sourceUri(),
                sourceDescriptor.fileName(),
                null,
                properties.getGenericIngestion().getImportBatchId(),
                sourceDescriptor.receivedAt() == null ? OffsetDateTime.now() : sourceDescriptor.receivedAt()
        ));
    }

    /**
     * Persists the ORIGINAL content row. Storage strategy:
     * text-like payloads go to DB_TEXT; binaries go to DB_BINARY when enabled and under the size cap,
     * otherwise only an external reference (the source URI) is stored.
     */
    private DocumentContent persistOriginalContent(Document document,
                                                   SourceDescriptor sourceDescriptor,
                                                   DetectionResult detection,
                                                   ResolvedPayload payload,
                                                   String dedupHash) {
        boolean storeBinaryInDb = shouldStoreBinaryInDb(payload.binaryContent());
        boolean textPreferred = DocumentImportSupport.isLikelyTextMime(detection.mimeType()) || sourceDescriptor.hasInlineTextContent();

        // binaryContent is guaranteed non-empty here — resolvePayload throws otherwise.
        return documentContentService.addContent(new AddDocumentContentCommand(
                document.getId(),
                ContentRole.ORIGINAL,
                textPreferred ? StorageType.DB_TEXT : (storeBinaryInDb ? StorageType.DB_BINARY : StorageType.EXTERNAL_REFERENCE),
                detection.mimeType(),
                textPreferred ? StandardCharsets.UTF_8.name() : null,
                textPreferred ? payload.textContent() : null,
                storeBinaryInDb && !textPreferred ? payload.binaryContent() : null,
                !storeBinaryInDb && !textPreferred ? sourceDescriptor.sourceUri() : null,
                dedupHash,
                (long) payload.binaryContent().length
        ));
    }

    /** True when in-DB binary storage is enabled and the payload is within the configured size cap. */
    private boolean shouldStoreBinaryInDb(byte[] binaryContent) {
        return properties.getGenericIngestion().isStoreOriginalBinaryInDb()
                && binaryContent != null
                && binaryContent.length <= properties.getGenericIngestion().getMaxBinaryBytesInDb();
    }

    /**
     * Persists each non-blank derived text (per content role) as a DB_TEXT content row.
     * Each row gets a hash derived from the original hash + role + text so variants stay distinct.
     *
     * @return the persisted content rows keyed by content role (insertion-ordered)
     */
    private Map<ContentRole, DocumentContent> persistDerivedContent(Document document,
                                                                    DetectionResult detection,
                                                                    ExtractionResult extractionResult,
                                                                    String baseHash) {
        Map<ContentRole, DocumentContent> result = new LinkedHashMap<>();
        extractionResult.derivedTextByRole().forEach((role, text) -> {
            if (!StringUtils.hasText(text)) {
                return;
            }
            String contentHash = HashUtils.computeSha256(baseHash + ":" + role.name() + ":" + text);
            DocumentContent content = documentContentService.addContent(new AddDocumentContentCommand(
                    document.getId(),
                    role,
                    StorageType.DB_TEXT,
                    detection.mimeType(),
                    StandardCharsets.UTF_8.name(),
                    text,
                    null,
                    null,
                    contentHash,
                    // NOTE(review): char count, while the ORIGINAL row stores a byte count — confirm the
                    // intended unit of this size field (UTF-8 bytes would be text.getBytes(UTF_8).length).
                    (long) text.length()
            ));
            result.put(role, content);
        });
        return result;
    }

    /**
     * Persists text representations for all non-blank drafts and, when vectorization is enabled
     * for the generic pipeline, queues a pending embedding per eligible representation.
     * Advances the document to REPRESENTED when at least one draft was supplied.
     */
    private void persistRepresentationsAndEmbeddings(Document document,
                                                     DocumentContent originalContent,
                                                     Map<ContentRole, DocumentContent> derivedContent,
                                                     List<TextRepresentationDraft> drafts) {
        if (drafts == null || drafts.isEmpty()) {
            return;
        }

        // Register (or look up) the embedding model only when the generic pipeline should vectorize.
        DocumentEmbeddingModel model = null;
        if (properties.getVectorization().isEnabled() && properties.getVectorization().isGenericPipelineEnabled()) {
            model = documentEmbeddingService.registerModel(new RegisterEmbeddingModelCommand(
                    properties.getVectorization().getModelName(),
                    properties.getVectorization().getEmbeddingProvider(),
                    properties.getVectorization().getModelName(),
                    properties.getVectorization().getDimensions(),
                    null,
                    false,
                    true
            ));
        }

        for (TextRepresentationDraft draft : drafts) {
            if (!StringUtils.hasText(draft.textBody())) {
                continue;
            }
            // Every representation type links to the normalized text when available, else to the original.
            DocumentContent linkedContent = switch (draft.representationType()) {
                case FULLTEXT, SEMANTIC_TEXT, SUMMARY, TITLE_ABSTRACT, METADATA_ENRICHED, CHUNK ->
                        derivedContent.getOrDefault(ContentRole.NORMALIZED_TEXT, originalContent);
            };

            var representation = documentRepresentationService.addRepresentation(new AddDocumentTextRepresentationCommand(
                    document.getId(),
                    linkedContent == null ? null : linkedContent.getId(),
                    draft.representationType(),
                    "phase4-generic-builder",
                    draft.languageCode(),
                    null,
                    draft.chunkIndex(),
                    null,
                    null,
                    draft.primary(),
                    draft.textBody()
            ));

            if (model != null && shouldQueueEmbedding(draft)) {
                documentEmbeddingService.ensurePendingEmbedding(document.getId(), representation.getId(), model.getId());
            }
        }
        documentService.updateStatus(document.getId(), DocumentStatus.REPRESENTED);
    }

    /** When configured to vectorize only the primary representation, filters by the draft's primary flag. */
    private boolean shouldQueueEmbedding(TextRepresentationDraft draft) {
        return properties.getGenericIngestion().isVectorizePrimaryRepresentationOnly() ? draft.primary() : true;
    }

    /**
     * Replaces a missing/placeholder title (blank or equal to the document-type name, i.e. the
     * determineInitialTitle fallback) with the first "title" attribute found in structured payloads.
     */
    private void applyStructuredTitleIfMissing(Document document, ExtractionResult extractionResult) {
        boolean missingTitle = !StringUtils.hasText(document.getTitle()) || document.getTitle().equals(document.getDocumentType().name());
        if (!missingTitle) {
            return;
        }
        for (var payload : extractionResult.structuredPayloads()) {
            if (payload.attributes() == null) {
                continue;
            }
            Object title = payload.attributes().get("title");
            if (title instanceof String titleValue && StringUtils.hasText(titleValue)) {
                document.setTitle(titleValue);
                documentService.save(document);
                return;
            }
        }
    }

    /** Null-safe string equality (both-null counts as equal). */
    private boolean equalsNullable(String left, String right) {
        return java.util.Objects.equals(left, right);
    }

    /**
     * Resolved payload triple: raw bytes (never empty once resolved), an optional decoded text view,
     * and the normalized/inferred media type (may be null for unknown types).
     */
    private record ResolvedPayload(byte[] binaryContent, String textContent, String mediaType) {
    }
}
|
||||||
@ -0,0 +1,84 @@
|
|||||||
|
package at.procon.dip.ingestion.util;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.DocumentFamily;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Map;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Shared utility methods for Phase 4 generic ingestion.
|
||||||
|
*/
|
||||||
|
public final class DocumentImportSupport {
|
||||||
|
|
||||||
|
private DocumentImportSupport() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String extensionOf(String fileName) {
|
||||||
|
if (!StringUtils.hasText(fileName) || !fileName.contains(".")) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
return fileName.substring(fileName.lastIndexOf('.') + 1).toLowerCase(Locale.ROOT);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static boolean isLikelyTextMime(String mediaType) {
|
||||||
|
if (!StringUtils.hasText(mediaType)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
String normalized = normalizeMediaType(mediaType);
|
||||||
|
return normalized.startsWith("text/")
|
||||||
|
|| normalized.contains("json")
|
||||||
|
|| normalized.contains("xml")
|
||||||
|
|| normalized.contains("javascript")
|
||||||
|
|| normalized.equals("application/xhtml+xml");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String normalizeMediaType(String mediaType) {
|
||||||
|
if (!StringUtils.hasText(mediaType)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
int idx = mediaType.indexOf(';');
|
||||||
|
String result = idx >= 0 ? mediaType.substring(0, idx) : mediaType;
|
||||||
|
return result.trim().toLowerCase(Locale.ROOT);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static DocumentFamily familyFor(DocumentType documentType) {
|
||||||
|
return switch (documentType) {
|
||||||
|
case TED_NOTICE -> DocumentFamily.PROCUREMENT;
|
||||||
|
case EMAIL, MIME_MESSAGE -> DocumentFamily.MAIL;
|
||||||
|
case PDF, DOCX, HTML, XML_GENERIC, TEXT, MARKDOWN, ZIP_ARCHIVE, GENERIC_BINARY, UNKNOWN ->
|
||||||
|
DocumentFamily.GENERIC;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String safeUtf8(byte[] bytes) {
|
||||||
|
if (bytes == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return new String(bytes, StandardCharsets.UTF_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String firstNonBlank(Map<String, String> values, String... keys) {
|
||||||
|
for (String key : keys) {
|
||||||
|
if (values == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
String value = values.get(key);
|
||||||
|
if (StringUtils.hasText(value)) {
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String ellipsize(String text, int maxLength) {
|
||||||
|
if (text == null || text.length() <= maxLength) {
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
if (maxLength <= 3) {
|
||||||
|
return text.substring(0, maxLength);
|
||||||
|
}
|
||||||
|
return text.substring(0, maxLength - 3) + "...";
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,86 @@
|
|||||||
|
package at.procon.dip.normalization.impl;
|
||||||
|
|
||||||
|
import at.procon.dip.domain.document.ContentRole;
|
||||||
|
import at.procon.dip.domain.document.DocumentType;
|
||||||
|
import at.procon.dip.domain.document.RepresentationType;
|
||||||
|
import at.procon.dip.extraction.spi.ExtractedStructuredPayload;
|
||||||
|
import at.procon.dip.ingestion.util.DocumentImportSupport;
|
||||||
|
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
||||||
|
import at.procon.dip.normalization.spi.TextRepresentationBuilder;
|
||||||
|
import at.procon.dip.normalization.spi.TextRepresentationDraft;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
public class DefaultGenericTextRepresentationBuilder implements TextRepresentationBuilder {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(DocumentType documentType) {
|
||||||
|
return documentType != DocumentType.TED_NOTICE;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<TextRepresentationDraft> build(RepresentationBuildRequest request) {
|
||||||
|
String normalizedText = request.extractionResult().derivedTextByRole().get(ContentRole.NORMALIZED_TEXT);
|
||||||
|
String htmlCleanText = request.extractionResult().derivedTextByRole().get(ContentRole.HTML_CLEAN);
|
||||||
|
String baseText = StringUtils.hasText(normalizedText) ? normalizedText : htmlCleanText;
|
||||||
|
if (!StringUtils.hasText(baseText)) {
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
|
||||||
|
String title = findStructuredTitle(request).orElse(request.sourceDescriptor().fileName());
|
||||||
|
String summary = DocumentImportSupport.ellipsize(baseText.replace('\n', ' ').trim(), 1200);
|
||||||
|
String semantic = buildSemanticText(title, summary, request.detectionResult().documentType());
|
||||||
|
|
||||||
|
List<TextRepresentationDraft> drafts = new ArrayList<>();
|
||||||
|
drafts.add(new TextRepresentationDraft(
|
||||||
|
RepresentationType.FULLTEXT,
|
||||||
|
request.detectionResult().languageCode(),
|
||||||
|
baseText,
|
||||||
|
false,
|
||||||
|
null
|
||||||
|
));
|
||||||
|
drafts.add(new TextRepresentationDraft(
|
||||||
|
RepresentationType.SEMANTIC_TEXT,
|
||||||
|
request.detectionResult().languageCode(),
|
||||||
|
semantic,
|
||||||
|
true,
|
||||||
|
null
|
||||||
|
));
|
||||||
|
if (StringUtils.hasText(title)) {
|
||||||
|
drafts.add(new TextRepresentationDraft(
|
||||||
|
RepresentationType.TITLE_ABSTRACT,
|
||||||
|
request.detectionResult().languageCode(),
|
||||||
|
title + "\n\n" + summary,
|
||||||
|
false,
|
||||||
|
null
|
||||||
|
));
|
||||||
|
}
|
||||||
|
return drafts;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Optional<String> findStructuredTitle(RepresentationBuildRequest request) {
|
||||||
|
return request.extractionResult().structuredPayloads().stream()
|
||||||
|
.map(ExtractedStructuredPayload::attributes)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.map(attributes -> attributes.get("title"))
|
||||||
|
.filter(String.class::isInstance)
|
||||||
|
.map(String.class::cast)
|
||||||
|
.filter(StringUtils::hasText)
|
||||||
|
.findFirst();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String buildSemanticText(String title, String summary, DocumentType documentType) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
sb.append("Document type: ").append(documentType.name()).append('\n');
|
||||||
|
if (StringUtils.hasText(title)) {
|
||||||
|
sb.append("Title: ").append(title.trim()).append("\n\n");
|
||||||
|
}
|
||||||
|
sb.append(summary);
|
||||||
|
return sb.toString().trim();
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,24 @@
|
|||||||
|
package at.procon.dip.normalization.service;
|
||||||
|
|
||||||
|
import at.procon.dip.normalization.spi.RepresentationBuildRequest;
|
||||||
|
import at.procon.dip.normalization.spi.TextRepresentationBuilder;
|
||||||
|
import at.procon.dip.normalization.spi.TextRepresentationDraft;
|
||||||
|
import java.util.List;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class TextRepresentationBuildService {
|
||||||
|
|
||||||
|
private final List<TextRepresentationBuilder> builders;
|
||||||
|
|
||||||
|
public List<TextRepresentationDraft> build(RepresentationBuildRequest request) {
|
||||||
|
return builders.stream()
|
||||||
|
.filter(builder -> builder.supports(request.detectionResult().documentType()))
|
||||||
|
.findFirst()
|
||||||
|
.orElseThrow(() -> new IllegalStateException(
|
||||||
|
"No text representation builder registered for " + request.detectionResult().documentType()))
|
||||||
|
.build(request);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,9 @@
|
|||||||
|
-- Phase 4: Generic ingestion support for arbitrary document types.
-- Add binary payload storage to the generic DOC content table so non-text originals
-- such as PDFs can be stored directly in the database when configured.

-- Nullable column: existing rows (text-only originals) keep NULL.
-- IF NOT EXISTS keeps the migration idempotent if it is re-applied.
ALTER TABLE DOC.doc_content
ADD COLUMN IF NOT EXISTS binary_content BYTEA;

-- Composite index for looking up a document's content rows by role; the trailing
-- created_at DESC key orders each (document_id, content_role) group newest-first,
-- which serves "latest content of this role" queries directly from the index.
CREATE INDEX IF NOT EXISTS idx_doc_content_document_role
ON DOC.doc_content(document_id, content_role, created_at DESC);
|
||||||
Loading…
Reference in New Issue