You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
DIP/src/main/java/at/procon/ted/controller/SimilaritySearchController....

179 lines
7.5 KiB
Java

package at.procon.ted.controller;
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
import at.procon.dip.runtime.config.RuntimeMode;
import at.procon.ted.service.SimilaritySearchService;
import at.procon.ted.service.SimilaritySearchService.SimilaritySearchResponse;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.media.Content;
import io.swagger.v3.oas.annotations.media.Schema;
import io.swagger.v3.oas.annotations.responses.ApiResponse;
import io.swagger.v3.oas.annotations.responses.ApiResponses;
import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import java.io.IOException;
/**
 * REST controller for similarity search on TED procurement documents.
 *
 * <p>Exposes two endpoints below {@code /similarity}: one that accepts the query as plain
 * text in a JSON body and one that accepts an uploaded PDF file. Both delegate the actual
 * search to {@link SimilaritySearchService} and translate failures into HTTP status codes.
 *
 * <p>NOTE(review): the {@code @ConditionalOnRuntimeMode(RuntimeMode.LEGACY)} annotation
 * presumably restricts registration of this controller to the legacy runtime mode;
 * confirm against the annotation's definition.
 *
 * @author Martin.Schweitzer@procon.co.at and claude.ai
 */
@RestController
@RequestMapping("/similarity")
@RequiredArgsConstructor
@Slf4j
@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
@Tag(name = "Similarity Search", description = "Vector-based semantic similarity search on TED procurement documents")
public class SimilaritySearchController {

    /** Search backend; injected through the constructor generated by {@code @RequiredArgsConstructor}. */
    private final SimilaritySearchService similaritySearchService;

    /**
     * Searches for similar documents using a text query.
     *
     * <p>Responds with 400 when the text is {@code null} or blank, 503 when the service
     * throws {@link IllegalStateException}, and 500 for any other exception.
     *
     * @param request JSON body carrying the query text plus optional {@code topK} and
     *                {@code threshold}; the latter two are forwarded to the service as-is
     *                (possibly {@code null})
     * @return the search response with status 200, or an empty body with an error status
     */
    @PostMapping("/text")
    @Operation(
            summary = "Search by text",
            description = "Find similar TED procurement documents based on text content using vector similarity (cosine distance)"
    )
    @ApiResponses({
            @ApiResponse(responseCode = "200", description = "Search completed successfully",
                    content = @Content(schema = @Schema(implementation = SimilaritySearchResponse.class))),
            @ApiResponse(responseCode = "400", description = "Invalid request (empty text)"),
            @ApiResponse(responseCode = "503", description = "Vectorization service unavailable")
    })
    public ResponseEntity<SimilaritySearchResponse> searchByText(
            @Parameter(description = "Text content to search for similar documents", required = true)
            @RequestBody TextSearchRequest request
    ) {
        // Log only the length of the text, never the text itself.
        log.info("Text similarity search request: {} chars, topK={}, threshold={}",
                request.getText() != null ? request.getText().length() : 0,
                request.getTopK(),
                request.getThreshold());

        if (request.getText() == null || request.getText().isBlank()) {
            return ResponseEntity.badRequest().build();
        }

        try {
            SimilaritySearchResponse response = similaritySearchService.searchByText(
                    request.getText(),
                    request.getTopK(),
                    request.getThreshold()
            );
            return ResponseEntity.ok(response);
        } catch (IllegalStateException e) {
            // The service signals an unavailable vectorization backend this way.
            log.error("Vectorization service unavailable: {}", e.getMessage());
            return ResponseEntity.status(503).build();
        } catch (Exception e) {
            // Controller boundary: anything unexpected becomes a 500.
            log.error("Text similarity search failed: {}", e.getMessage(), e);
            return ResponseEntity.internalServerError().build();
        }
    }

    /**
     * Searches for similar documents using an uploaded PDF file.
     *
     * <p>Responds with 400 when the upload is missing, empty, not recognisable as a PDF,
     * or cannot be read; 422 when the service throws a {@link RuntimeException} whose
     * message contains {@code "extraction failed"}; 503 when the service throws
     * {@link IllegalStateException}; and 500 for any other exception.
     *
     * @param file      the uploaded multipart part named {@code file}
     * @param topK      number of results to return; defaults to 20 when the parameter is absent
     * @param threshold minimum similarity; defaults to 0.5 when the parameter is absent
     * @return the search response with status 200, or an empty body with an error status
     */
    @PostMapping(value = "/pdf", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
    @Operation(
            summary = "Search by PDF",
            description = "Upload a PDF document to find similar TED procurement documents. " +
                    "Text is extracted from the PDF and used for vector similarity search."
    )
    @ApiResponses({
            @ApiResponse(responseCode = "200", description = "Search completed successfully",
                    content = @Content(schema = @Schema(implementation = SimilaritySearchResponse.class))),
            @ApiResponse(responseCode = "400", description = "Invalid request (no file or not a PDF)"),
            @ApiResponse(responseCode = "422", description = "Could not extract text from PDF"),
            @ApiResponse(responseCode = "503", description = "Vectorization service unavailable")
    })
    public ResponseEntity<SimilaritySearchResponse> searchByPdf(
            @Parameter(description = "PDF file to search for similar documents", required = true)
            @RequestPart("file") MultipartFile file,
            @Parameter(description = "Number of top results to return (default: 20, max: 100)")
            @RequestParam(required = false, defaultValue = "20") Integer topK,
            @Parameter(description = "Minimum similarity threshold (0.0-1.0, default: 0.5)")
            @RequestParam(required = false, defaultValue = "0.5") Double threshold
    ) {
        if (file == null || file.isEmpty()) {
            log.warn("PDF search request with empty file");
            return ResponseEntity.badRequest().build();
        }

        String filename = file.getOriginalFilename();
        String contentType = file.getContentType();
        log.info("PDF similarity search request: filename='{}', size={} bytes, topK={}, threshold={}",
                filename, file.getSize(), topK, threshold);

        // Validate file type. A missing Content-Type must not bypass the check: the
        // upload is accepted only if the content type or the file name identifies a PDF.
        if (!looksLikePdf(contentType, filename)) {
            log.warn("Invalid file type: {} ({})", filename, contentType);
            return ResponseEntity.badRequest().build();
        }

        try {
            byte[] pdfData = file.getBytes();
            SimilaritySearchResponse response = similaritySearchService.searchByPdf(
                    pdfData,
                    filename,
                    topK,
                    threshold
            );
            return ResponseEntity.ok(response);
        } catch (IOException e) {
            log.error("Failed to read PDF file: {}", e.getMessage());
            return ResponseEntity.badRequest().build();
        } catch (IllegalStateException e) {
            // The service signals an unavailable vectorization backend this way.
            log.error("Vectorization service unavailable: {}", e.getMessage());
            return ResponseEntity.status(503).build();
        } catch (RuntimeException e) {
            // Extraction failures are recognised by their message text and mapped to 422.
            if (e.getMessage() != null && e.getMessage().contains("extraction failed")) {
                log.error("PDF extraction failed: {}", e.getMessage());
                return ResponseEntity.unprocessableEntity().build();
            }
            log.error("PDF similarity search failed: {}", e.getMessage(), e);
            return ResponseEntity.internalServerError().build();
        } catch (Exception e) {
            // Kept in case the service declares checked exceptions other than IOException.
            log.error("PDF similarity search failed: {}", e.getMessage(), e);
            return ResponseEntity.internalServerError().build();
        }
    }

    /**
     * Decides whether an upload can be treated as a PDF.
     *
     * <p>The upload qualifies when its content type contains {@code "pdf"} or its file name
     * ends with {@code ".pdf"}, both compared case-insensitively. A {@code null} value
     * never qualifies on its own.
     *
     * @param contentType content type reported for the upload, may be {@code null}
     * @param filename    original file name of the upload, may be {@code null}
     * @return {@code true} if either value identifies the upload as a PDF
     */
    private static boolean looksLikePdf(String contentType, String filename) {
        // Locale.ROOT keeps the case folding independent of the server's default locale.
        boolean pdfContentType = contentType != null
                && contentType.toLowerCase(java.util.Locale.ROOT).contains("pdf");
        boolean pdfExtension = filename != null
                && filename.toLowerCase(java.util.Locale.ROOT).endsWith(".pdf");
        return pdfContentType || pdfExtension;
    }

    /**
     * Request DTO for text-based similarity search.
     *
     * <p>Accessors and constructors are generated by Lombok. {@code topK} and
     * {@code threshold} are optional and stay {@code null} when omitted from the body.
     */
    @lombok.Data
    @lombok.NoArgsConstructor
    @lombok.AllArgsConstructor
    public static class TextSearchRequest {

        /** Query text; the controller rejects {@code null} or blank values with 400. */
        @Schema(description = "Text content to search for similar documents", required = true)
        private String text;

        /** Requested number of results; optional. */
        @Schema(description = "Number of top results to return (default: 20, max: 100)")
        private Integer topK;

        /** Requested minimum similarity; optional. */
        @Schema(description = "Minimum similarity threshold (0.0-1.0, default: 0.5)")
        private Double threshold;
    }
}