You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
179 lines
7.5 KiB
Java
179 lines
7.5 KiB
Java
package at.procon.ted.controller;
|
|
|
|
import at.procon.dip.runtime.condition.ConditionalOnRuntimeMode;
|
|
import at.procon.dip.runtime.config.RuntimeMode;
|
|
import at.procon.ted.service.SimilaritySearchService;
|
|
import at.procon.ted.service.SimilaritySearchService.SimilaritySearchResponse;
|
|
import io.swagger.v3.oas.annotations.Operation;
|
|
import io.swagger.v3.oas.annotations.Parameter;
|
|
import io.swagger.v3.oas.annotations.media.Content;
|
|
import io.swagger.v3.oas.annotations.media.Schema;
|
|
import io.swagger.v3.oas.annotations.responses.ApiResponse;
|
|
import io.swagger.v3.oas.annotations.responses.ApiResponses;
|
|
import io.swagger.v3.oas.annotations.tags.Tag;
|
|
import lombok.RequiredArgsConstructor;
|
|
import lombok.extern.slf4j.Slf4j;
|
|
import org.springframework.http.MediaType;
|
|
import org.springframework.http.ResponseEntity;
|
|
import org.springframework.web.bind.annotation.*;
|
|
import org.springframework.web.multipart.MultipartFile;
|
|
|
|
import java.io.IOException;
|
|
|
|
/**
|
|
* REST Controller for similarity search on TED procurement documents.
|
|
* Provides endpoints for searching similar documents using text or PDF input.
|
|
*
|
|
* @author Martin.Schweitzer@procon.co.at and claude.ai
|
|
*/
|
|
@RestController
|
|
@RequestMapping("/similarity")
|
|
@RequiredArgsConstructor
|
|
@Slf4j
|
|
@ConditionalOnRuntimeMode(RuntimeMode.LEGACY)
|
|
@Tag(name = "Similarity Search", description = "Vector-based semantic similarity search on TED procurement documents")
|
|
public class SimilaritySearchController {
|
|
|
|
private final SimilaritySearchService similaritySearchService;
|
|
|
|
/**
|
|
* Search for similar documents using text query.
|
|
*/
|
|
@PostMapping("/text")
|
|
@Operation(
|
|
summary = "Search by text",
|
|
description = "Find similar TED procurement documents based on text content using vector similarity (cosine distance)"
|
|
)
|
|
@ApiResponses({
|
|
@ApiResponse(responseCode = "200", description = "Search completed successfully",
|
|
content = @Content(schema = @Schema(implementation = SimilaritySearchResponse.class))),
|
|
@ApiResponse(responseCode = "400", description = "Invalid request (empty text)"),
|
|
@ApiResponse(responseCode = "503", description = "Vectorization service unavailable")
|
|
})
|
|
public ResponseEntity<SimilaritySearchResponse> searchByText(
|
|
@Parameter(description = "Text content to search for similar documents", required = true)
|
|
@RequestBody TextSearchRequest request
|
|
) {
|
|
log.info("Text similarity search request: {} chars, topK={}, threshold={}",
|
|
request.getText() != null ? request.getText().length() : 0,
|
|
request.getTopK(),
|
|
request.getThreshold());
|
|
|
|
if (request.getText() == null || request.getText().isBlank()) {
|
|
return ResponseEntity.badRequest().build();
|
|
}
|
|
|
|
try {
|
|
SimilaritySearchResponse response = similaritySearchService.searchByText(
|
|
request.getText(),
|
|
request.getTopK(),
|
|
request.getThreshold()
|
|
);
|
|
return ResponseEntity.ok(response);
|
|
|
|
} catch (IllegalStateException e) {
|
|
log.error("Vectorization service unavailable: {}", e.getMessage());
|
|
return ResponseEntity.status(503).build();
|
|
|
|
} catch (Exception e) {
|
|
log.error("Text similarity search failed: {}", e.getMessage(), e);
|
|
return ResponseEntity.internalServerError().build();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Search for similar documents using PDF file.
|
|
*/
|
|
@PostMapping(value = "/pdf", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
|
|
@Operation(
|
|
summary = "Search by PDF",
|
|
description = "Upload a PDF document to find similar TED procurement documents. " +
|
|
"Text is extracted from the PDF and used for vector similarity search."
|
|
)
|
|
@ApiResponses({
|
|
@ApiResponse(responseCode = "200", description = "Search completed successfully",
|
|
content = @Content(schema = @Schema(implementation = SimilaritySearchResponse.class))),
|
|
@ApiResponse(responseCode = "400", description = "Invalid request (no file or not a PDF)"),
|
|
@ApiResponse(responseCode = "422", description = "Could not extract text from PDF"),
|
|
@ApiResponse(responseCode = "503", description = "Vectorization service unavailable")
|
|
})
|
|
public ResponseEntity<SimilaritySearchResponse> searchByPdf(
|
|
@Parameter(description = "PDF file to search for similar documents", required = true)
|
|
@RequestPart("file") MultipartFile file,
|
|
|
|
@Parameter(description = "Number of top results to return (default: 20, max: 100)")
|
|
@RequestParam(required = false, defaultValue = "20") Integer topK,
|
|
|
|
@Parameter(description = "Minimum similarity threshold (0.0-1.0, default: 0.5)")
|
|
@RequestParam(required = false, defaultValue = "0.5") Double threshold
|
|
) {
|
|
if (file == null || file.isEmpty()) {
|
|
log.warn("PDF search request with empty file");
|
|
return ResponseEntity.badRequest().build();
|
|
}
|
|
|
|
String filename = file.getOriginalFilename();
|
|
String contentType = file.getContentType();
|
|
|
|
log.info("PDF similarity search request: filename='{}', size={} bytes, topK={}, threshold={}",
|
|
filename, file.getSize(), topK, threshold);
|
|
|
|
// Validate file type
|
|
if (contentType != null && !contentType.toLowerCase().contains("pdf")) {
|
|
if (filename == null || !filename.toLowerCase().endsWith(".pdf")) {
|
|
log.warn("Invalid file type: {} ({})", filename, contentType);
|
|
return ResponseEntity.badRequest().build();
|
|
}
|
|
}
|
|
|
|
try {
|
|
byte[] pdfData = file.getBytes();
|
|
|
|
SimilaritySearchResponse response = similaritySearchService.searchByPdf(
|
|
pdfData,
|
|
filename,
|
|
topK,
|
|
threshold
|
|
);
|
|
return ResponseEntity.ok(response);
|
|
|
|
} catch (IOException e) {
|
|
log.error("Failed to read PDF file: {}", e.getMessage());
|
|
return ResponseEntity.badRequest().build();
|
|
|
|
} catch (IllegalStateException e) {
|
|
log.error("Vectorization service unavailable: {}", e.getMessage());
|
|
return ResponseEntity.status(503).build();
|
|
|
|
} catch (RuntimeException e) {
|
|
if (e.getMessage() != null && e.getMessage().contains("extraction failed")) {
|
|
log.error("PDF extraction failed: {}", e.getMessage());
|
|
return ResponseEntity.unprocessableEntity().build();
|
|
}
|
|
log.error("PDF similarity search failed: {}", e.getMessage(), e);
|
|
return ResponseEntity.internalServerError().build();
|
|
|
|
} catch (Exception e) {
|
|
log.error("PDF similarity search failed: {}", e.getMessage(), e);
|
|
return ResponseEntity.internalServerError().build();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Request DTO for text-based similarity search.
|
|
*/
|
|
@lombok.Data
|
|
@lombok.NoArgsConstructor
|
|
@lombok.AllArgsConstructor
|
|
public static class TextSearchRequest {
|
|
@Schema(description = "Text content to search for similar documents", required = true)
|
|
private String text;
|
|
|
|
@Schema(description = "Number of top results to return (default: 20, max: 100)")
|
|
private Integer topK;
|
|
|
|
@Schema(description = "Minimum similarity threshold (0.0-1.0, default: 0.5)")
|
|
private Double threshold;
|
|
}
|
|
}
|