clustering
This commit is contained in:
parent
c6efbf40f6
commit
979f1ba18e
|
|
@ -0,0 +1,40 @@
|
|||
# TIME selective materialization by person
|
||||
|
||||
This NEW-only patch adds the ability to materialize canonical `TIME.time_entry` rows and refresh projection/representations only for Leitstand time recordings that belong to one selected person after the raw `TIME.ls_*` import is already present.
|
||||
|
||||
## Service methods
|
||||
|
||||
- `LeitstandTimeImportService.materializeCanonicalTimeEntriesForPersonDbk(String personDbk, boolean rebuildProjection)`
|
||||
- `LeitstandTimeImportService.materializeCanonicalTimeEntriesForPersonNumber(Integer personNumber, boolean rebuildProjection)`
|
||||
- `LeitstandTimeProjectionService.refreshForPersonDbk(String personDbk)`
|
||||
|
||||
## Optional startup runner
|
||||
|
||||
Enable with:
|
||||
|
||||
```yaml
|
||||
dip:
|
||||
time:
|
||||
leitstand:
|
||||
startup-selective-materialization-enabled: true
|
||||
selective-materialization-person-dbk: 100919970619190804070001
|
||||
selective-materialization-build-projection: true
|
||||
```
|
||||
|
||||
or:
|
||||
|
||||
```yaml
|
||||
dip:
|
||||
time:
|
||||
leitstand:
|
||||
startup-selective-materialization-enabled: true
|
||||
selective-materialization-person-number: 12345
|
||||
selective-materialization-build-projection: true
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- intended for already imported `TIME.ls_*` rows
|
||||
- no legacy code changes
|
||||
- no raw source sync is triggered by this runner
|
||||
- if projection rebuild is enabled, representations/embedding enqueueing continue to use the existing T3 behavior
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
# Python clustering backend for DBSCAN and advanced algorithms
|
||||
|
||||
This patch adds a dedicated Python service for clustering algorithms that are better supported in the Python scientific stack than in Java.
|
||||
|
||||
## Why Python for this step
|
||||
|
||||
The Spring module remains the orchestrator for:
|
||||
- embedding selection
|
||||
- run metadata
|
||||
- result persistence
|
||||
- cluster browsing APIs
|
||||
|
||||
The Python backend executes the actual clustering for algorithms such as:
|
||||
- `DBSCAN`
|
||||
- `HDBSCAN`
|
||||
- `MINI_BATCH_KMEANS`
|
||||
- `AGGLOMERATIVE`
|
||||
- `KMEANS` with optional reduction
|
||||
|
||||
## Spring-side contract changes in this patch
|
||||
|
||||
The Spring request model now supports generic algorithm parameters through `parameters` instead of only `k`.
|
||||
|
||||
Examples:
|
||||
- KMeans: `{ "k": 25 }`
|
||||
- DBSCAN: `{ "eps": 0.25, "minSamples": 5 }`
|
||||
- HDBSCAN: `{ "minClusterSize": 15, "minSamples": 5 }`
|
||||
- Agglomerative: `{ "k": 20, "linkage": "average", "metric": "euclidean" }`
|
||||
|
||||
The Python response is now mapped with:
|
||||
- `noise`
|
||||
- `membershipScore`
|
||||
- `distanceToCentroid`
|
||||
- noise cluster rows
|
||||
- `noiseCount`
|
||||
|
||||
Those values are persisted back into:
|
||||
- `doc.doc_embedding_cluster`
|
||||
- `doc.doc_embedding_cluster_assignment`
|
||||
- `doc.doc_embedding_cluster_run`
|
||||
|
||||
## Recommended defaults for embeddings
|
||||
|
||||
For high-dimensional text embeddings, use:
|
||||
- `normalizeVectors=true`
|
||||
- `reductionMethod=PCA`
|
||||
- `reductionDimensions=50..150`
|
||||
|
||||
Typical starting points:
|
||||
- DBSCAN: `eps=0.20..0.35`, `minSamples=5`
|
||||
- HDBSCAN: `minClusterSize=10..30`, `minSamples=3..10`
|
||||
|
||||
The right values still depend on:
|
||||
- embedding model
|
||||
- whether vectors are normalized
|
||||
- whether full documents or chunks are clustered
|
||||
- the semantic density of the selected dataset
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
{
|
||||
"info": {
|
||||
"name": "DIP Clustering Dual Python Modes",
|
||||
"_postman_id": "4c2f4f68-c9c3-4977-9627-b4f7422dd001",
|
||||
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json",
|
||||
"description": "Direct vector-upload and compact runId Python endpoints."
|
||||
},
|
||||
"variable": [
|
||||
{
|
||||
"key": "pythonBaseUrl",
|
||||
"value": "http://localhost:8001"
|
||||
},
|
||||
{
|
||||
"key": "runId",
|
||||
"value": ""
|
||||
}
|
||||
],
|
||||
"item": [
|
||||
{
|
||||
"name": "Health",
|
||||
"item": [
|
||||
{
|
||||
"name": "GET /health",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": {
|
||||
"raw": "{{pythonBaseUrl}}/health",
|
||||
"host": [
|
||||
"{{pythonBaseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"health"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Direct vector upload",
|
||||
"item": [
|
||||
{
|
||||
"name": "POST /cluster",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": {
|
||||
"raw": "{{pythonBaseUrl}}/cluster",
|
||||
"host": [
|
||||
"{{pythonBaseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"cluster"
|
||||
]
|
||||
},
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"algorithm\": \"KMEANS\",\n \"parameters\": {\n \"k\": 2,\n \"normalizeVectors\": true\n },\n \"reductionMethod\": \"NONE\",\n \"reductionDimensions\": null,\n \"items\": [\n {\n \"embeddingId\": \"11111111-1111-1111-1111-111111111111\",\n \"documentId\": \"22222222-2222-2222-2222-222222222222\",\n \"representationId\": \"33333333-3333-3333-3333-333333333333\",\n \"vector\": [\n 0.1,\n 0.2,\n 0.3\n ]\n },\n {\n \"embeddingId\": \"11111111-1111-1111-1111-111111111112\",\n \"documentId\": \"22222222-2222-2222-2222-222222222223\",\n \"representationId\": \"33333333-3333-3333-3333-333333333334\",\n \"vector\": [\n 0.11,\n 0.19,\n 0.31\n ]\n }\n ]\n}"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Compact runId mode",
|
||||
"item": [
|
||||
{
|
||||
"name": "POST /cluster-run",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": {
|
||||
"raw": "{{pythonBaseUrl}}/cluster-run",
|
||||
"host": [
|
||||
"{{pythonBaseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"cluster-run"
|
||||
]
|
||||
},
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"runId\": \"{{runId}}\"\n}"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
{
|
||||
"info": {
|
||||
"name": "DIP Clustering Phase E Compact Run",
|
||||
"_postman_id": "57e745df-cb97-4a13-8c74-9e5c689ef0ac",
|
||||
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json",
|
||||
"description": "Phase E compact run execution: Spring keeps metadata, Python receives only runId."
|
||||
},
|
||||
"variable": [
|
||||
{
|
||||
"key": "springBaseUrl",
|
||||
"value": "http://localhost:8889/api"
|
||||
},
|
||||
{
|
||||
"key": "pythonBaseUrl",
|
||||
"value": "http://localhost:8001"
|
||||
},
|
||||
{
|
||||
"key": "runId",
|
||||
"value": ""
|
||||
}
|
||||
],
|
||||
"item": [
|
||||
{
|
||||
"name": "Python Health",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": {
|
||||
"raw": "{{pythonBaseUrl}}/health",
|
||||
"host": [
|
||||
"{{pythonBaseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"health"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Python Cluster Run",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": {
|
||||
"raw": "{{pythonBaseUrl}}/cluster-run",
|
||||
"host": [
|
||||
"{{pythonBaseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"cluster-run"
|
||||
]
|
||||
},
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"runId\": \"{{runId}}\"\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Create TED DBSCAN run",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": {
|
||||
"raw": "{{springBaseUrl}}/v1/dip/clustering/runs",
|
||||
"host": [
|
||||
"{{springBaseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs"
|
||||
]
|
||||
},
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"name\": \"TED notices DBSCAN PCA200\",\n \"algorithm\": \"DBSCAN\",\n \"executionBackend\": \"PYTHON_REMOTE\",\n \"reduction\": {\n \"method\": \"PCA\",\n \"targetDimensions\": 200\n },\n \"selection\": {\n \"documentTypes\": [\n \"TED_NOTICE\"\n ],\n \"representationTypes\": [\n \"SEMANTIC_TEXT\"\n ],\n \"embeddingStatuses\": [\n \"COMPLETED\"\n ],\n \"primaryRepresentationOnly\": true\n },\n \"parameters\": {\n \"eps\": 0.25,\n \"minSamples\": 5,\n \"metric\": \"euclidean\",\n \"normalizeVectors\": true\n }\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Queue Run",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"url": {
|
||||
"raw": "{{springBaseUrl}}/v1/dip/clustering/runs/{{runId}}/start",
|
||||
"host": [
|
||||
"{{springBaseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs",
|
||||
"{{runId}}",
|
||||
"start"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Get Run",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": {
|
||||
"raw": "{{springBaseUrl}}/v1/dip/clustering/runs/{{runId}}",
|
||||
"host": [
|
||||
"{{springBaseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs",
|
||||
"{{runId}}"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,157 @@
|
|||
{
|
||||
"info": {
|
||||
"name": "DIP Clustering Phase C",
|
||||
"_postman_id": "fa4b1e24-7d8d-4b1a-bd67-0f3f1b601111",
|
||||
"description": "Operational Postman collection for clustering sets, async runs, cancellation, and text-aware result inspection.",
|
||||
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
|
||||
},
|
||||
"variable": [
|
||||
{
|
||||
"key": "baseUrl",
|
||||
"value": "http://localhost:8080/api"
|
||||
},
|
||||
{
|
||||
"key": "runId",
|
||||
"value": ""
|
||||
},
|
||||
{
|
||||
"key": "clusterSetId",
|
||||
"value": ""
|
||||
},
|
||||
{
|
||||
"key": "clusterId",
|
||||
"value": ""
|
||||
}
|
||||
],
|
||||
"item": [
|
||||
{
|
||||
"name": "Cluster Sets",
|
||||
"item": [
|
||||
{
|
||||
"name": "Create TED cluster set",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": "{{baseUrl}}/v1/dip/clustering/sets",
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"code\": \"TED_NOTICE_E5_PRIMARY\",\n \"name\": \"TED notices primary semantic text\",\n \"description\": \"Saved TED notice clustering selection\",\n \"selection\": {\n \"documentTypes\": [\n \"TED_NOTICE\"\n ],\n \"representationTypes\": [\n \"SEMANTIC_TEXT\"\n ],\n \"embeddingStatuses\": [\n \"COMPLETED\"\n ],\n \"primaryRepresentationOnly\": true\n },\n \"active\": true\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "List cluster sets",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "{{baseUrl}}/v1/dip/clustering/sets"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Get cluster set",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "{{baseUrl}}/v1/dip/clustering/sets/{{clusterSetId}}"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Runs",
|
||||
"item": [
|
||||
{
|
||||
"name": "Create TED KMeans run",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": "{{baseUrl}}/v1/dip/clustering/runs",
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"name\": \"TED notices KMeans async run 1\",\n \"algorithm\": \"KMEANS\",\n \"executionBackend\": \"JAVA_LOCAL\",\n \"reduction\": {\n \"method\": \"NONE\"\n },\n \"selection\": {\n \"documentTypes\": [\n \"TED_NOTICE\"\n ],\n \"representationTypes\": [\n \"SEMANTIC_TEXT\"\n ],\n \"embeddingStatuses\": [\n \"COMPLETED\"\n ],\n \"primaryRepresentationOnly\": true\n },\n \"k\": 25\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Create HDBSCAN Python run",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": "{{baseUrl}}/v1/dip/clustering/runs",
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"name\": \"TED notices HDBSCAN PCA100\",\n \"algorithm\": \"HDBSCAN\",\n \"executionBackend\": \"PYTHON_REMOTE\",\n \"reduction\": {\n \"method\": \"PCA\",\n \"targetDimensions\": 100\n },\n \"selection\": {\n \"documentTypes\": [\n \"TED_NOTICE\"\n ],\n \"representationTypes\": [\n \"SEMANTIC_TEXT\"\n ],\n \"embeddingStatuses\": [\n \"COMPLETED\"\n ],\n \"primaryRepresentationOnly\": true\n },\n \"k\": 25\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Start run async",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"url": "{{baseUrl}}/v1/dip/clustering/runs/{{runId}}/start"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Cancel run",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"url": "{{baseUrl}}/v1/dip/clustering/runs/{{runId}}/cancel"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Get run",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "{{baseUrl}}/v1/dip/clustering/runs/{{runId}}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "List runs",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "{{baseUrl}}/v1/dip/clustering/runs?status=COMPLETED"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Results",
|
||||
"item": [
|
||||
{
|
||||
"name": "List clusters",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "{{baseUrl}}/v1/dip/clustering/runs/{{runId}}/clusters"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Assignments with text",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "{{baseUrl}}/v1/dip/clustering/runs/{{runId}}/assignments?includeText=true"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Cluster members with text",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": "{{baseUrl}}/v1/dip/clustering/runs/{{runId}}/clusters/{{clusterId}}/members?includeText=true"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,150 @@
|
|||
{
|
||||
"info": {
|
||||
"name": "DIP Clustering Phase D",
|
||||
"_postman_id": "0c39a7cf-8fde-43f9-8fdb-5d8890ad7676",
|
||||
"description": "Spring clustering API examples with generic algorithm parameters and remote Python backend.",
|
||||
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
|
||||
},
|
||||
"variable": [
|
||||
{
|
||||
"key": "baseUrl",
|
||||
"value": "http://localhost:8080"
|
||||
},
|
||||
{
|
||||
"key": "runId",
|
||||
"value": ""
|
||||
}
|
||||
],
|
||||
"item": [
|
||||
{
|
||||
"name": "Create DBSCAN run for TED notices",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/v1/dip/clustering/runs",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs"
|
||||
]
|
||||
},
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"name\": \"TED notices DBSCAN PCA100\",\n \"algorithm\": \"DBSCAN\",\n \"executionBackend\": \"PYTHON_REMOTE\",\n \"reduction\": {\n \"method\": \"PCA\",\n \"targetDimensions\": 100\n },\n \"selection\": {\n \"documentTypes\": [\n \"TED_NOTICE\"\n ],\n \"representationTypes\": [\n \"SEMANTIC_TEXT\"\n ],\n \"embeddingStatuses\": [\n \"COMPLETED\"\n ],\n \"primaryRepresentationOnly\": true\n },\n \"parameters\": {\n \"eps\": 0.25,\n \"minSamples\": 5,\n \"metric\": \"euclidean\",\n \"normalizeVectors\": true\n }\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Create HDBSCAN run for Leitstand TIME",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/v1/dip/clustering/runs",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs"
|
||||
]
|
||||
},
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"name\": \"Leitstand TIME HDBSCAN PCA50\",\n \"algorithm\": \"HDBSCAN\",\n \"executionBackend\": \"PYTHON_REMOTE\",\n \"reduction\": {\n \"method\": \"PCA\",\n \"targetDimensions\": 50\n },\n \"selection\": {\n \"documentTypes\": [\n \"TIME_ENTRY\"\n ],\n \"builderKeys\": [\n \"time-entry-structured-text\"\n ],\n \"embeddingStatuses\": [\n \"COMPLETED\"\n ],\n \"primaryRepresentationOnly\": true\n },\n \"parameters\": {\n \"minClusterSize\": 15,\n \"minSamples\": 5,\n \"metric\": \"euclidean\",\n \"clusterSelectionMethod\": \"eom\",\n \"normalizeVectors\": true\n }\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Create Agglomerative run",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/v1/dip/clustering/runs",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs"
|
||||
]
|
||||
},
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"name\": \"TED notices Agglomerative\",\n \"algorithm\": \"AGGLOMERATIVE\",\n \"executionBackend\": \"PYTHON_REMOTE\",\n \"reduction\": {\n \"method\": \"PCA\",\n \"targetDimensions\": 100\n },\n \"selection\": {\n \"documentTypes\": [\n \"TED_NOTICE\"\n ],\n \"representationTypes\": [\n \"SEMANTIC_TEXT\"\n ],\n \"embeddingStatuses\": [\n \"COMPLETED\"\n ],\n \"primaryRepresentationOnly\": true\n },\n \"parameters\": {\n \"k\": 25,\n \"linkage\": \"average\",\n \"metric\": \"euclidean\",\n \"normalizeVectors\": true\n }\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Start run",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/v1/dip/clustering/runs/{{runId}}/start",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs",
|
||||
"{{runId}}",
|
||||
"start"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Assignments with text",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/v1/dip/clustering/runs/{{runId}}/assignments?includeText=true",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs",
|
||||
"{{runId}}",
|
||||
"assignments"
|
||||
],
|
||||
"query": [
|
||||
{
|
||||
"key": "includeText",
|
||||
"value": "true"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
{
|
||||
"info": {
|
||||
"name": "DIP Python Clustering Service",
|
||||
"_postman_id": "59df0e88-01d6-42a4-9071-195b43f96787",
|
||||
"description": "Direct calls to the remote Python clustering service.",
|
||||
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
|
||||
},
|
||||
"variable": [
|
||||
{
|
||||
"key": "baseUrl",
|
||||
"value": "http://localhost:8001"
|
||||
}
|
||||
],
|
||||
"item": [
|
||||
{
|
||||
"name": "Health",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/health",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"health"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "DBSCAN PCA request",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/cluster",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"cluster"
|
||||
]
|
||||
},
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"algorithm\": \"DBSCAN\",\n \"parameters\": {\n \"eps\": 0.25,\n \"minSamples\": 2,\n \"normalizeVectors\": false\n },\n \"reductionMethod\": \"NONE\",\n \"items\": [\n {\n \"embeddingId\": \"11111111-1111-1111-1111-111111111111\",\n \"documentId\": \"22222222-2222-2222-2222-222222222221\",\n \"representationId\": \"33333333-3333-3333-3333-333333333331\",\n \"vector\": [\n 0.0,\n 0.0\n ]\n },\n {\n \"embeddingId\": \"11111111-1111-1111-1111-111111111112\",\n \"documentId\": \"22222222-2222-2222-2222-222222222222\",\n \"representationId\": \"33333333-3333-3333-3333-333333333332\",\n \"vector\": [\n 0.05,\n 0.0\n ]\n },\n {\n \"embeddingId\": \"11111111-1111-1111-1111-111111111113\",\n \"documentId\": \"22222222-2222-2222-2222-222222222223\",\n \"representationId\": \"33333333-3333-3333-3333-333333333333\",\n \"vector\": [\n 10.0,\n 10.0\n ]\n }\n ]\n}"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "HDBSCAN PCA request",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/cluster",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"cluster"
|
||||
]
|
||||
},
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"algorithm\": \"HDBSCAN\",\n \"parameters\": {\n \"minClusterSize\": 2,\n \"minSamples\": 1,\n \"normalizeVectors\": false\n },\n \"reductionMethod\": \"NONE\",\n \"items\": [\n {\n \"embeddingId\": \"11111111-1111-1111-1111-111111111121\",\n \"documentId\": \"22222222-2222-2222-2222-222222222231\",\n \"representationId\": \"33333333-3333-3333-3333-333333333341\",\n \"vector\": [\n 0.0,\n 0.0\n ]\n },\n {\n \"embeddingId\": \"11111111-1111-1111-1111-111111111122\",\n \"documentId\": \"22222222-2222-2222-2222-222222222232\",\n \"representationId\": \"33333333-3333-3333-3333-333333333342\",\n \"vector\": [\n 0.03,\n 0.01\n ]\n },\n {\n \"embeddingId\": \"11111111-1111-1111-1111-111111111123\",\n \"documentId\": \"22222222-2222-2222-2222-222222222233\",\n \"representationId\": \"33333333-3333-3333-3333-333333333343\",\n \"vector\": [\n 5.0,\n 5.0\n ]\n },\n {\n \"embeddingId\": \"11111111-1111-1111-1111-111111111124\",\n \"documentId\": \"22222222-2222-2222-2222-222222222234\",\n \"representationId\": \"33333333-3333-3333-3333-333333333344\",\n \"vector\": [\n 5.05,\n 4.98\n ]\n }\n ]\n}"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,248 @@
|
|||
{
|
||||
"info": {
|
||||
"name": "DIP Clustering Phase A",
|
||||
"_postman_id": "5d5f3a8f-1c0c-4f7a-9e14-4d6f2c6d8f3a",
|
||||
"description": "Sample Postman collection for DIP clustering Phase A endpoints.\n\nVariables:\n- baseUrl: Spring Boot base URL\n- runId: cluster run id returned by create run\n\nThis collection contains example requests for TED notice embeddings and Leitstand TIME entry embeddings.",
|
||||
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
|
||||
},
|
||||
"variable": [
|
||||
{
|
||||
"key": "baseUrl",
|
||||
"value": "http://localhost:8080"
|
||||
},
|
||||
{
|
||||
"key": "runId",
|
||||
"value": ""
|
||||
}
|
||||
],
|
||||
"item": [
|
||||
{
|
||||
"name": "Selection",
|
||||
"item": [
|
||||
{
|
||||
"name": "Count TED selection",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/v1/dip/clustering/selection/count",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"selection",
|
||||
"count"
|
||||
]
|
||||
},
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"documentTypes\": [\n \"TED_NOTICE\"\n ],\n \"representationTypes\": [\n \"SEMANTIC_TEXT\"\n ],\n \"embeddingStatuses\": [\n \"COMPLETED\"\n ],\n \"primaryRepresentationOnly\": true\n}"
|
||||
},
|
||||
"description": "Counts the number of completed TED_NOTICE embeddings eligible for clustering."
|
||||
},
|
||||
"response": []
|
||||
},
|
||||
{
|
||||
"name": "Count Leitstand TIME selection",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/v1/dip/clustering/selection/count",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"selection",
|
||||
"count"
|
||||
]
|
||||
},
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"documentTypes\": [\n \"TIME_ENTRY\"\n ],\n \"builderKeys\": [\n \"time-entry-structured-text\"\n ],\n \"embeddingStatuses\": [\n \"COMPLETED\"\n ],\n \"primaryRepresentationOnly\": true\n}"
|
||||
},
|
||||
"description": "Counts the number of completed Leitstand TIME_ENTRY embeddings eligible for clustering."
|
||||
},
|
||||
"response": []
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Runs",
|
||||
"item": [
|
||||
{
|
||||
"name": "Create TED KMeans run",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/v1/dip/clustering/runs",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs"
|
||||
]
|
||||
},
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"name\": \"TED notices KMeans run 1\",\n \"algorithm\": \"KMEANS\",\n \"selection\": {\n \"documentTypes\": [\n \"TED_NOTICE\"\n ],\n \"representationTypes\": [\n \"SEMANTIC_TEXT\"\n ],\n \"embeddingStatuses\": [\n \"COMPLETED\"\n ],\n \"primaryRepresentationOnly\": true\n },\n \"k\": 25\n}"
|
||||
},
|
||||
"description": "Creates a cluster run for TED notice embeddings. Copy the returned id into the Postman variable runId."
|
||||
},
|
||||
"response": []
|
||||
},
|
||||
{
|
||||
"name": "Create Leitstand TIME KMeans run",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [
|
||||
{
|
||||
"key": "Content-Type",
|
||||
"value": "application/json"
|
||||
}
|
||||
],
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/v1/dip/clustering/runs",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs"
|
||||
]
|
||||
},
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\n \"name\": \"Leitstand TIME KMeans run 1\",\n \"algorithm\": \"KMEANS\",\n \"selection\": {\n \"documentTypes\": [\n \"TIME_ENTRY\"\n ],\n \"builderKeys\": [\n \"time-entry-structured-text\"\n ],\n \"embeddingStatuses\": [\n \"COMPLETED\"\n ],\n \"primaryRepresentationOnly\": true\n },\n \"k\": 15\n}"
|
||||
},
|
||||
"description": "Creates a cluster run for Leitstand TIME_ENTRY embeddings. Copy the returned id into the Postman variable runId."
|
||||
},
|
||||
"response": []
|
||||
},
|
||||
{
|
||||
"name": "Start run",
|
||||
"request": {
|
||||
"method": "POST",
|
||||
"header": [],
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/v1/dip/clustering/runs/{{runId}}/start",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs",
|
||||
"{{runId}}",
|
||||
"start"
|
||||
]
|
||||
},
|
||||
"description": "Starts clustering for the run stored in the runId variable."
|
||||
},
|
||||
"response": []
|
||||
},
|
||||
{
|
||||
"name": "Get run",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"header": [],
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/v1/dip/clustering/runs/{{runId}}",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs",
|
||||
"{{runId}}"
|
||||
]
|
||||
},
|
||||
"description": "Returns run metadata and status."
|
||||
},
|
||||
"response": []
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Results",
|
||||
"item": [
|
||||
{
|
||||
"name": "Get clusters for run",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"header": [],
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/v1/dip/clustering/runs/{{runId}}/clusters",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs",
|
||||
"{{runId}}",
|
||||
"clusters"
|
||||
]
|
||||
},
|
||||
"description": "Lists clusters discovered in the run."
|
||||
},
|
||||
"response": []
|
||||
},
|
||||
{
|
||||
"name": "Get assignments for run",
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"header": [],
|
||||
"url": {
|
||||
"raw": "{{baseUrl}}/v1/dip/clustering/runs/{{runId}}/assignments",
|
||||
"host": [
|
||||
"{{baseUrl}}"
|
||||
],
|
||||
"path": [
|
||||
"v1",
|
||||
"dip",
|
||||
"clustering",
|
||||
"runs",
|
||||
"{{runId}}",
|
||||
"assignments"
|
||||
]
|
||||
},
|
||||
"description": "Lists embedding-to-cluster assignments for the run."
|
||||
},
|
||||
"response": []
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
FROM python:3.11-slim
|
||||
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends build-essential \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY app ./app
|
||||
|
||||
EXPOSE 8001
|
||||
|
||||
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8001"]
|
||||
|
|
@ -0,0 +1,204 @@
|
|||
# DIP Clustering Service
|
||||
|
||||
Remote Python clustering backend for the DIP Spring clustering module.
|
||||
|
||||
## Main execution mode
|
||||
|
||||
The preferred execution mode is now:
|
||||
|
||||
- Spring keeps run metadata, selection snapshot, and lifecycle.
|
||||
- Spring sends only a compact request containing `runId`.
|
||||
- Python loads the run metadata and selected embeddings directly from Postgres.
|
||||
- Python returns compact assignments keyed by `embeddingId`.
|
||||
|
||||
This avoids sending the full embedding matrix through HTTP.
|
||||
|
||||
## Implemented algorithms
|
||||
|
||||
- `KMEANS`
|
||||
- `MINI_BATCH_KMEANS`
|
||||
- `DBSCAN`
|
||||
- `HDBSCAN`
|
||||
- `AGGLOMERATIVE`
|
||||
|
||||
## Implemented reductions
|
||||
|
||||
- `NONE`
|
||||
- `PCA`
|
||||
- `UMAP`
|
||||
|
||||
## API
|
||||
|
||||
### `GET /health`
|
||||
|
||||
Returns service status and supported algorithms/reduction methods.
|
||||
|
||||
### `POST /cluster-run`
|
||||
|
||||
Preferred endpoint. Accepts only the cluster run id.
|
||||
|
||||
Example request body:
|
||||
|
||||
```json
|
||||
{
|
||||
"runId": "6c3bc3a3-24b0-47a5-9e35-92dd4b7275f8"
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
This service supports two remote execution modes at the same time:
|
||||
|
||||
- `POST /cluster`
|
||||
- Spring uploads embeddings in the request body.
|
||||
- This keeps the original implementation intact.
|
||||
- `POST /cluster-run`
|
||||
- Spring sends only `runId`.
|
||||
- Python loads run metadata and embeddings directly from Postgres.
|
||||
|
||||
## Start
|
||||
|
||||
```powershell
|
||||
py -3.11 -m venv .venv
|
||||
.\.venv\Scripts\python.exe -m pip install --upgrade pip
|
||||
.\.venv\Scripts\python.exe -m pip install -r requirements.txt
|
||||
```
|
||||
|
||||
Configure DB access for `/cluster-run` with either:
|
||||
|
||||
|
||||
### `POST /cluster`
|
||||
|
||||
Accepts the Spring `PythonClusteringRequest` payload and returns `PythonClusteringResponse`.
|
||||
|
||||
Example request body:
|
||||
|
||||
```json
|
||||
{
|
||||
"algorithm": "DBSCAN",
|
||||
"parameters": {
|
||||
"eps": 0.25,
|
||||
"minSamples": 5,
|
||||
"metric": "euclidean",
|
||||
"normalizeVectors": true
|
||||
},
|
||||
"reductionMethod": "PCA",
|
||||
"reductionDimensions": 100,
|
||||
"items": [
|
||||
{
|
||||
"embeddingId": "11111111-1111-1111-1111-111111111111",
|
||||
"documentId": "22222222-2222-2222-2222-222222222222",
|
||||
"representationId": "33333333-3333-3333-3333-333333333333",
|
||||
"vector": [0.1, 0.2, 0.3]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Parameters by algorithm
|
||||
|
||||
### KMEANS
|
||||
- `k` required
|
||||
- `randomState` optional, default `42`
|
||||
- `nInit` optional, default `10`
|
||||
- `maxIter` optional, default `300`
|
||||
|
||||
### MINI_BATCH_KMEANS
|
||||
- `k` required
|
||||
- `batchSize` optional
|
||||
- `randomState` optional, default `42`
|
||||
- `nInit` optional, default `10`
|
||||
- `maxIter` optional, default `300`
|
||||
|
||||
### DBSCAN
|
||||
- `eps` required
|
||||
- `minSamples` optional, default `5`
|
||||
- `metric` optional, default `euclidean`
|
||||
- `algorithm` optional, default `auto`
|
||||
- `nJobs` optional, default `-1`
|
||||
|
||||
### HDBSCAN
|
||||
- `minClusterSize` optional, default `10`
|
||||
- `minSamples` optional
|
||||
- `metric` optional, default `euclidean`
|
||||
- `clusterSelectionMethod` optional, default `eom`
|
||||
|
||||
### AGGLOMERATIVE
|
||||
- `k` required
|
||||
- `linkage` optional, default `average`
|
||||
- `metric` optional, default `euclidean`
|
||||
- `computeDistances` optional, default `false`
|
||||
|
||||
## Shared parameters
|
||||
|
||||
- `normalizeVectors` optional, default `true`
|
||||
- `randomState` optional, used by `KMEANS`, `MINI_BATCH_KMEANS`, `PCA`, `UMAP`
|
||||
|
||||
## UMAP reduction parameters
|
||||
|
||||
- `reductionMetric` optional, default `cosine`
|
||||
- `umapNeighbors` optional, default `15`
|
||||
- `umapMinDist` optional, default `0.0`
|
||||
|
||||
## Local run
|
||||
## Required database configuration
|
||||
|
||||
Set either:
|
||||
|
||||
- `CLUSTERING_DB_DSN`
|
||||
- or `DATABASE_URL`
|
||||
- or `CLUSTERING_DB_HOST`, `CLUSTERING_DB_PORT`, `CLUSTERING_DB_NAME`, `CLUSTERING_DB_USER`, `CLUSTERING_DB_PASSWORD`
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
export CLUSTERING_DB_DSN=postgresql://postgres:postgres@localhost:5432/dip
|
||||
```
|
||||
|
||||
## Local run on Windows
|
||||
|
||||
```powershell
|
||||
$env:CLUSTERING_DB_DSN="postgresql://postgres:postgres@localhost:5432/dip"
|
||||
.\.venv\Scripts\python.exe -m uvicorn app.main:app --host 0.0.0.0 --port 8001 --reload
|
||||
```
|
||||
|
||||
|
||||
## Docker run
|
||||
|
||||
```bash
|
||||
docker build -t dip-clustering-service .
|
||||
docker run --rm -p 8001:8001 dip-clustering-service
|
||||
```
|
||||
|
||||
## Spring configuration
|
||||
|
||||
Use the original request-upload mode:
|
||||
|
||||
```yaml
|
||||
dip:
|
||||
clustering:
|
||||
python:
|
||||
enabled: true
|
||||
base-url: http://localhost:8001
|
||||
cluster-path: /cluster
|
||||
cluster-run-path: /cluster-run
|
||||
request-mode: INLINE_VECTORS
|
||||
connect-timeout: 30s
|
||||
read-timeout: 30m
|
||||
```
|
||||
|
||||
Use compact `runId` mode:
|
||||
|
||||
```yaml
|
||||
dip:
|
||||
clustering:
|
||||
python:
|
||||
enabled: true
|
||||
base-url: http://localhost:8001
|
||||
cluster-path: /cluster
|
||||
cluster-run-path: /cluster-run
|
||||
request-mode: RUN_ID
|
||||
connect-timeout: 30s
|
||||
read-timeout: 30m
|
||||
```
|
||||
|
||||
`INLINE_VECTORS` is the default if `request-mode` is omitted.
|
||||
|
|
@ -0,0 +1,311 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans, MiniBatchKMeans
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.preprocessing import normalize
|
||||
|
||||
try:
|
||||
import hdbscan
|
||||
except Exception: # pragma: no cover - runtime dependency guard
|
||||
hdbscan = None
|
||||
|
||||
try:
|
||||
import umap
|
||||
except Exception: # pragma: no cover - runtime dependency guard
|
||||
umap = None
|
||||
|
||||
from .models import (
|
||||
ClusteringAlgorithm,
|
||||
PythonAssignment,
|
||||
PythonCluster,
|
||||
PythonClusteringItem,
|
||||
PythonClusteringRequest,
|
||||
PythonClusteringResponse,
|
||||
ReductionMethod,
|
||||
RunMetadata,
|
||||
)
|
||||
|
||||
|
||||
class ClusteringServiceError(ValueError):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class PreparedData:
|
||||
original: np.ndarray
|
||||
transformed: np.ndarray
|
||||
items: list[PythonClusteringItem]
|
||||
|
||||
|
||||
def cluster_embeddings(request: PythonClusteringRequest) -> PythonClusteringResponse:
|
||||
return cluster_items(
|
||||
algorithm=request.algorithm,
|
||||
parameters=request.parameters or {},
|
||||
reduction_method=request.reductionMethod,
|
||||
reduction_dimensions=request.reductionDimensions,
|
||||
items=request.items,
|
||||
)
|
||||
|
||||
|
||||
def cluster_run(metadata: RunMetadata, items: list[PythonClusteringItem]) -> PythonClusteringResponse:
|
||||
return cluster_items(
|
||||
algorithm=metadata.algorithm,
|
||||
parameters=metadata.parameters or {},
|
||||
reduction_method=metadata.reductionMethod,
|
||||
reduction_dimensions=metadata.reductionDimensions,
|
||||
items=items,
|
||||
)
|
||||
|
||||
|
||||
def cluster_items(
|
||||
algorithm: ClusteringAlgorithm,
|
||||
parameters: dict[str, Any],
|
||||
reduction_method: ReductionMethod,
|
||||
reduction_dimensions: int | None,
|
||||
items: list[PythonClusteringItem],
|
||||
) -> PythonClusteringResponse:
|
||||
if not items:
|
||||
raise ClusteringServiceError("Request contains no items")
|
||||
|
||||
prepared = _prepare_data(
|
||||
items=items,
|
||||
parameters=parameters,
|
||||
reduction_method=reduction_method,
|
||||
reduction_dimensions=reduction_dimensions,
|
||||
)
|
||||
labels, membership_scores = _run_algorithm(
|
||||
algorithm=algorithm,
|
||||
vectors=prepared.transformed,
|
||||
parameters=parameters,
|
||||
)
|
||||
return _build_response(prepared, labels, membership_scores)
|
||||
|
||||
|
||||
def _prepare_data(
|
||||
items: list[PythonClusteringItem],
|
||||
parameters: dict[str, Any],
|
||||
reduction_method: ReductionMethod,
|
||||
reduction_dimensions: int | None,
|
||||
) -> PreparedData:
|
||||
vectors = np.asarray([item.vector for item in items], dtype=np.float32)
|
||||
if vectors.ndim != 2 or vectors.shape[0] == 0:
|
||||
raise ClusteringServiceError("Vectors must form a non-empty 2D array")
|
||||
|
||||
if _bool_param(parameters, "normalizeVectors", True):
|
||||
vectors = normalize(vectors, norm="l2")
|
||||
|
||||
transformed = vectors
|
||||
if reduction_method == ReductionMethod.PCA:
|
||||
target_dims = reduction_dimensions
|
||||
if target_dims is None:
|
||||
raise ClusteringServiceError("PCA reduction requires reductionDimensions")
|
||||
max_components = min(transformed.shape[0], transformed.shape[1])
|
||||
if target_dims <= 0 or target_dims > max_components:
|
||||
raise ClusteringServiceError(
|
||||
f"PCA reductionDimensions must be between 1 and {max_components}"
|
||||
)
|
||||
pca = PCA(
|
||||
n_components=target_dims,
|
||||
random_state=_int_param(parameters, "randomState", 42),
|
||||
)
|
||||
transformed = pca.fit_transform(transformed)
|
||||
elif reduction_method == ReductionMethod.UMAP:
|
||||
target_dims = reduction_dimensions
|
||||
if target_dims is None:
|
||||
raise ClusteringServiceError("UMAP reduction requires reductionDimensions")
|
||||
if umap is None:
|
||||
raise ClusteringServiceError("UMAP reduction requested but umap-learn is not installed")
|
||||
reducer = umap.UMAP(
|
||||
n_components=target_dims,
|
||||
metric=_str_param(parameters, "reductionMetric", "cosine"),
|
||||
n_neighbors=_int_param(parameters, "umapNeighbors", 15),
|
||||
min_dist=_float_param(parameters, "umapMinDist", 0.0),
|
||||
random_state=_int_param(parameters, "randomState", 42),
|
||||
)
|
||||
transformed = reducer.fit_transform(transformed)
|
||||
|
||||
return PreparedData(original=vectors, transformed=np.asarray(transformed, dtype=np.float32), items=items)
|
||||
|
||||
|
||||
def _run_algorithm(
|
||||
algorithm: ClusteringAlgorithm,
|
||||
vectors: np.ndarray,
|
||||
parameters: dict[str, Any],
|
||||
) -> tuple[np.ndarray, np.ndarray | None]:
|
||||
if algorithm == ClusteringAlgorithm.KMEANS:
|
||||
k = _required_int_param(parameters, "k")
|
||||
model = KMeans(
|
||||
n_clusters=k,
|
||||
random_state=_int_param(parameters, "randomState", 42),
|
||||
n_init=_int_param(parameters, "nInit", 10),
|
||||
max_iter=_int_param(parameters, "maxIter", 300),
|
||||
)
|
||||
labels = model.fit_predict(vectors)
|
||||
return np.asarray(labels, dtype=np.int32), None
|
||||
|
||||
if algorithm == ClusteringAlgorithm.MINI_BATCH_KMEANS:
|
||||
k = _required_int_param(parameters, "k")
|
||||
batch_size = _int_param(parameters, "batchSize", min(max(k * 16, 256), 4096))
|
||||
model = MiniBatchKMeans(
|
||||
n_clusters=k,
|
||||
random_state=_int_param(parameters, "randomState", 42),
|
||||
n_init=_int_param(parameters, "nInit", 10),
|
||||
max_iter=_int_param(parameters, "maxIter", 300),
|
||||
batch_size=batch_size,
|
||||
)
|
||||
labels = model.fit_predict(vectors)
|
||||
return np.asarray(labels, dtype=np.int32), None
|
||||
|
||||
if algorithm == ClusteringAlgorithm.DBSCAN:
|
||||
eps = _required_float_param(parameters, "eps")
|
||||
model = DBSCAN(
|
||||
eps=eps,
|
||||
min_samples=_int_param(parameters, "minSamples", 5),
|
||||
metric=_str_param(parameters, "metric", "euclidean"),
|
||||
algorithm=_str_param(parameters, "algorithm", "auto"),
|
||||
n_jobs=_int_param(parameters, "nJobs", -1),
|
||||
)
|
||||
labels = model.fit_predict(vectors)
|
||||
return np.asarray(labels, dtype=np.int32), None
|
||||
|
||||
if algorithm == ClusteringAlgorithm.HDBSCAN:
|
||||
if hdbscan is None:
|
||||
raise ClusteringServiceError("HDBSCAN requested but hdbscan is not installed")
|
||||
model = hdbscan.HDBSCAN(
|
||||
min_cluster_size=_int_param(parameters, "minClusterSize", 10),
|
||||
min_samples=_nullable_int_param(parameters, "minSamples"),
|
||||
metric=_str_param(parameters, "metric", "euclidean"),
|
||||
cluster_selection_method=_str_param(parameters, "clusterSelectionMethod", "eom"),
|
||||
)
|
||||
labels = model.fit_predict(vectors)
|
||||
probabilities = getattr(model, "probabilities_", None)
|
||||
return np.asarray(labels, dtype=np.int32), None if probabilities is None else np.asarray(probabilities, dtype=np.float32)
|
||||
|
||||
if algorithm == ClusteringAlgorithm.AGGLOMERATIVE:
|
||||
k = _required_int_param(parameters, "k")
|
||||
linkage = _str_param(parameters, "linkage", "average")
|
||||
metric = _str_param(parameters, "metric", "euclidean")
|
||||
if linkage == "ward":
|
||||
metric = "euclidean"
|
||||
model = AgglomerativeClustering(
|
||||
n_clusters=k,
|
||||
linkage=linkage,
|
||||
metric=metric,
|
||||
compute_distances=_bool_param(parameters, "computeDistances", False),
|
||||
)
|
||||
labels = model.fit_predict(vectors)
|
||||
return np.asarray(labels, dtype=np.int32), None
|
||||
|
||||
raise ClusteringServiceError(f"Unsupported algorithm: {algorithm}")
|
||||
|
||||
|
||||
def _build_response(
|
||||
prepared: PreparedData,
|
||||
labels: np.ndarray,
|
||||
membership_scores: np.ndarray | None,
|
||||
) -> PythonClusteringResponse:
|
||||
unique_labels = sorted(int(label) for label in np.unique(labels))
|
||||
clusters: list[PythonCluster] = []
|
||||
assignments: list[PythonAssignment] = []
|
||||
|
||||
centroids: dict[int, np.ndarray] = {}
|
||||
for label in unique_labels:
|
||||
mask = labels == label
|
||||
item_count = int(mask.sum())
|
||||
noise_cluster = label == -1
|
||||
clusters.append(PythonCluster(clusterLabel=label, itemCount=item_count, noiseCluster=noise_cluster))
|
||||
if not noise_cluster:
|
||||
centroids[label] = prepared.transformed[mask].mean(axis=0)
|
||||
|
||||
for index, item in enumerate(prepared.items):
|
||||
label = int(labels[index])
|
||||
noise = label == -1
|
||||
distance = None if noise else float(np.linalg.norm(prepared.transformed[index] - centroids[label]))
|
||||
membership = None
|
||||
if membership_scores is not None:
|
||||
membership = float(membership_scores[index])
|
||||
assignments.append(
|
||||
PythonAssignment(
|
||||
embeddingId=item.embeddingId,
|
||||
clusterLabel=label,
|
||||
distanceToCentroid=distance,
|
||||
membershipScore=membership,
|
||||
noise=noise,
|
||||
)
|
||||
)
|
||||
|
||||
noise_count = int((labels == -1).sum())
|
||||
return PythonClusteringResponse(clusters=clusters, assignments=assignments, noiseCount=noise_count)
|
||||
|
||||
|
||||
def _required_int_param(parameters: dict[str, Any], key: str) -> int:
|
||||
if key not in parameters or parameters[key] is None:
|
||||
raise ClusteringServiceError(f"Missing required parameter: {key}")
|
||||
return _coerce_int(parameters[key], key)
|
||||
|
||||
|
||||
def _required_float_param(parameters: dict[str, Any], key: str) -> float:
|
||||
if key not in parameters or parameters[key] is None:
|
||||
raise ClusteringServiceError(f"Missing required parameter: {key}")
|
||||
return _coerce_float(parameters[key], key)
|
||||
|
||||
|
||||
def _nullable_int_param(parameters: dict[str, Any], key: str) -> int | None:
|
||||
if key not in parameters or parameters[key] is None:
|
||||
return None
|
||||
return _coerce_int(parameters[key], key)
|
||||
|
||||
|
||||
def _int_param(parameters: dict[str, Any], key: str, default: int) -> int:
|
||||
if key not in parameters or parameters[key] is None:
|
||||
return default
|
||||
return _coerce_int(parameters[key], key)
|
||||
|
||||
|
||||
def _float_param(parameters: dict[str, Any], key: str, default: float) -> float:
|
||||
if key not in parameters or parameters[key] is None:
|
||||
return default
|
||||
return _coerce_float(parameters[key], key)
|
||||
|
||||
|
||||
def _bool_param(parameters: dict[str, Any], key: str, default: bool) -> bool:
|
||||
if key not in parameters or parameters[key] is None:
|
||||
return default
|
||||
value = parameters[key]
|
||||
if isinstance(value, bool):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
normalized = value.strip().lower()
|
||||
if normalized in {"true", "1", "yes", "y"}:
|
||||
return True
|
||||
if normalized in {"false", "0", "no", "n"}:
|
||||
return False
|
||||
raise ClusteringServiceError(f"Parameter {key} must be boolean-compatible")
|
||||
|
||||
|
||||
def _str_param(parameters: dict[str, Any], key: str, default: str) -> str:
|
||||
if key not in parameters or parameters[key] is None:
|
||||
return default
|
||||
return str(parameters[key])
|
||||
|
||||
|
||||
def _coerce_int(value: Any, key: str) -> int:
|
||||
if isinstance(value, bool):
|
||||
raise ClusteringServiceError(f"Parameter {key} must be integer-compatible")
|
||||
try:
|
||||
return int(value)
|
||||
except (TypeError, ValueError) as exc:
|
||||
raise ClusteringServiceError(f"Parameter {key} must be integer-compatible") from exc
|
||||
|
||||
|
||||
def _coerce_float(value: Any, key: str) -> float:
|
||||
if isinstance(value, bool):
|
||||
raise ClusteringServiceError(f"Parameter {key} must be float-compatible")
|
||||
try:
|
||||
return float(value)
|
||||
except (TypeError, ValueError) as exc:
|
||||
raise ClusteringServiceError(f"Parameter {key} must be float-compatible") from exc
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi.middleware.gzip import GZipMiddleware
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .cluster_service import ClusteringServiceError, cluster_embeddings, cluster_run
|
||||
from .models import (
|
||||
ClusteringAlgorithm,
|
||||
PythonClusteringRequest,
|
||||
PythonClusteringResponse,
|
||||
PythonRunExecutionRequest,
|
||||
ReductionMethod,
|
||||
)
|
||||
from .run_db_loader import load_run_and_embeddings
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="DIP Clustering Service",
|
||||
version="2.0.0",
|
||||
description="Remote clustering backend for DIP embedding clustering runs.",
|
||||
)
|
||||
app.add_middleware(GZipMiddleware, minimum_size=1024)
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
|
||||
status: str
|
||||
algorithms: list[str]
|
||||
reductionMethods: list[str]
|
||||
endpoints: list[str]
|
||||
|
||||
|
||||
@app.get("/health", response_model=HealthResponse)
|
||||
def health() -> HealthResponse:
|
||||
return HealthResponse(
|
||||
status="UP",
|
||||
algorithms=[algorithm.value for algorithm in ClusteringAlgorithm],
|
||||
reductionMethods=[method.value for method in ReductionMethod],
|
||||
endpoints=["/cluster", "/cluster-run"],
|
||||
)
|
||||
|
||||
|
||||
@app.post("/cluster", response_model=PythonClusteringResponse)
|
||||
def cluster_direct(request: PythonClusteringRequest) -> PythonClusteringResponse:
|
||||
try:
|
||||
return cluster_embeddings(request)
|
||||
except ClusteringServiceError as exc:
|
||||
raise HTTPException(status_code=400, detail=str(exc)) from exc
|
||||
except Exception as exc: # pragma: no cover - last-resort guard
|
||||
raise HTTPException(status_code=500, detail=f"Unexpected clustering failure: {exc}") from exc
|
||||
|
||||
|
||||
@app.post("/cluster-run", response_model=PythonClusteringResponse)
|
||||
def cluster_by_run(request: PythonRunExecutionRequest) -> PythonClusteringResponse:
|
||||
try:
|
||||
metadata, items = load_run_and_embeddings(request.runId)
|
||||
return cluster_run(metadata, items)
|
||||
except ClusteringServiceError as exc:
|
||||
raise HTTPException(status_code=400, detail=str(exc)) from exc
|
||||
except ValueError as exc:
|
||||
raise HTTPException(status_code=404, detail=str(exc)) from exc
|
||||
except Exception as exc: # pragma: no cover - last-resort guard
|
||||
raise HTTPException(status_code=500, detail=f"Unexpected clustering failure: {exc}") from exc
|
||||
|
|
@ -0,0 +1,75 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
|
||||
class ClusteringAlgorithm(str, Enum):
|
||||
KMEANS = "KMEANS"
|
||||
MINI_BATCH_KMEANS = "MINI_BATCH_KMEANS"
|
||||
DBSCAN = "DBSCAN"
|
||||
HDBSCAN = "HDBSCAN"
|
||||
AGGLOMERATIVE = "AGGLOMERATIVE"
|
||||
|
||||
|
||||
class ReductionMethod(str, Enum):
|
||||
NONE = "NONE"
|
||||
PCA = "PCA"
|
||||
UMAP = "UMAP"
|
||||
|
||||
|
||||
class PythonClusteringItem(BaseModel):
|
||||
embeddingId: UUID
|
||||
documentId: UUID | None = None
|
||||
representationId: UUID | None = None
|
||||
vector: list[float]
|
||||
|
||||
|
||||
class PythonClusteringRequest(BaseModel):
|
||||
algorithm: ClusteringAlgorithm
|
||||
parameters: dict[str, Any] = Field(default_factory=dict)
|
||||
reductionMethod: ReductionMethod = ReductionMethod.NONE
|
||||
reductionDimensions: int | None = None
|
||||
items: list[PythonClusteringItem]
|
||||
|
||||
model_config = ConfigDict(use_enum_values=True)
|
||||
|
||||
|
||||
class PythonRunExecutionRequest(BaseModel):
|
||||
runId: UUID
|
||||
|
||||
|
||||
class PythonCluster(BaseModel):
|
||||
clusterLabel: int
|
||||
itemCount: int
|
||||
noiseCluster: bool = False
|
||||
|
||||
|
||||
class PythonAssignment(BaseModel):
|
||||
embeddingId: UUID
|
||||
documentId: UUID | None = None
|
||||
representationId: UUID | None = None
|
||||
clusterLabel: int
|
||||
distanceToCentroid: float | None = None
|
||||
membershipScore: float | None = None
|
||||
noise: bool = False
|
||||
|
||||
|
||||
class PythonClusteringResponse(BaseModel):
|
||||
clusters: list[PythonCluster]
|
||||
assignments: list[PythonAssignment]
|
||||
noiseCount: int
|
||||
|
||||
|
||||
class RunMetadata(BaseModel):
|
||||
runId: UUID
|
||||
algorithm: ClusteringAlgorithm
|
||||
parameters: dict[str, Any] = Field(default_factory=dict)
|
||||
reductionMethod: ReductionMethod = ReductionMethod.NONE
|
||||
reductionDimensions: int | None = None
|
||||
selection: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
model_config = ConfigDict(use_enum_values=True)
|
||||
|
|
@ -0,0 +1,163 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
import numpy as np
|
||||
import psycopg2
|
||||
|
||||
from .models import ClusteringAlgorithm, PythonClusteringItem, ReductionMethod, RunMetadata
|
||||
from .settings import ServiceSettings
|
||||
|
||||
|
||||
def load_run_and_embeddings(run_id: UUID) -> tuple[RunMetadata, list[PythonClusteringItem]]:
|
||||
settings = ServiceSettings.from_env()
|
||||
with psycopg2.connect(settings.db_dsn) as connection:
|
||||
run = _load_run_metadata(connection, run_id)
|
||||
items = _load_embeddings(connection, run.selection)
|
||||
return run, items
|
||||
|
||||
|
||||
def _load_run_metadata(connection, run_id: UUID) -> RunMetadata:
|
||||
with connection.cursor() as cursor:
|
||||
cursor.execute(
|
||||
"""
|
||||
select
|
||||
id,
|
||||
algorithm,
|
||||
coalesce(parameters_json::text, '{}'),
|
||||
reduction_method,
|
||||
reduction_dimensions,
|
||||
coalesce(selection_json::text, '{}')
|
||||
from doc.doc_embedding_cluster_run
|
||||
where id = %s
|
||||
""",
|
||||
(str(run_id),),
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
|
||||
if row is None:
|
||||
raise ValueError(f"Cluster run not found: {run_id}")
|
||||
|
||||
parameters = _json_to_dict(row[2])
|
||||
selection = _json_to_dict(row[5])
|
||||
|
||||
return RunMetadata(
|
||||
runId=row[0],
|
||||
algorithm=ClusteringAlgorithm(row[1]),
|
||||
parameters=parameters,
|
||||
reductionMethod=ReductionMethod(row[3]) if row[3] else ReductionMethod.NONE,
|
||||
reductionDimensions=row[4],
|
||||
selection=selection,
|
||||
)
|
||||
|
||||
|
||||
def _load_embeddings(connection, selection: dict[str, Any]) -> list[PythonClusteringItem]:
|
||||
sql_parts = [
|
||||
"""
|
||||
select
|
||||
e.id as embedding_id,
|
||||
e.document_id,
|
||||
e.representation_id,
|
||||
e.embedding_vector::text as embedding_vector_text
|
||||
from doc.doc_embedding e
|
||||
join doc.doc_document d on d.id = e.document_id
|
||||
join doc.doc_text_representation r on r.id = e.representation_id
|
||||
where e.embedding_status = 'COMPLETED'
|
||||
and e.embedding_vector is not null
|
||||
"""
|
||||
]
|
||||
params: list[Any] = []
|
||||
|
||||
_apply_selection_filters(selection, sql_parts, params)
|
||||
sql_parts.append(" order by e.created_at asc")
|
||||
sql = "".join(sql_parts)
|
||||
|
||||
items: list[PythonClusteringItem] = []
|
||||
with connection.cursor(name="cluster_embedding_selection") as cursor:
|
||||
cursor.itersize = 2000
|
||||
cursor.execute(sql, params)
|
||||
for embedding_id, document_id, representation_id, vector_text in cursor:
|
||||
items.append(
|
||||
PythonClusteringItem(
|
||||
embeddingId=embedding_id,
|
||||
documentId=document_id,
|
||||
representationId=representation_id,
|
||||
vector=_parse_vector_text(vector_text),
|
||||
)
|
||||
)
|
||||
return items
|
||||
|
||||
|
||||
def _apply_selection_filters(selection: dict[str, Any], sql_parts: list[str], params: list[Any]) -> None:
|
||||
if not selection:
|
||||
return
|
||||
|
||||
_append_in_filter(sql_parts, params, "documentTypes", "d.document_type", selection.get("documentTypes"))
|
||||
_append_in_filter(sql_parts, params, "documentFamilies", "d.document_family", selection.get("documentFamilies"))
|
||||
_append_in_filter(sql_parts, params, "representationTypes", "r.representation_type", selection.get("representationTypes"))
|
||||
_append_in_filter(sql_parts, params, "embeddingStatuses", "e.embedding_status", selection.get("embeddingStatuses"))
|
||||
_append_in_filter(sql_parts, params, "modelIds", "e.model_id", selection.get("modelIds"))
|
||||
_append_in_filter(sql_parts, params, "prefixProfileIds", "e.prefix_profile_id", selection.get("prefixProfileIds"))
|
||||
_append_in_filter(sql_parts, params, "builderKeys", "r.builder_key", selection.get("builderKeys"))
|
||||
_append_in_filter(sql_parts, params, "languageCodes", "r.language_code", selection.get("languageCodes"))
|
||||
_append_in_filter(sql_parts, params, "ownerTenantIds", "d.owner_tenant_id", selection.get("ownerTenantIds"))
|
||||
|
||||
business_key_like = selection.get("businessKeyLike")
|
||||
if business_key_like:
|
||||
sql_parts.append(" and d.business_key like %s")
|
||||
params.append(business_key_like)
|
||||
|
||||
created_from = selection.get("createdFrom")
|
||||
if created_from:
|
||||
sql_parts.append(" and d.created_at >= %s")
|
||||
params.append(created_from)
|
||||
|
||||
created_to = selection.get("createdTo")
|
||||
if created_to:
|
||||
sql_parts.append(" and d.created_at < %s")
|
||||
params.append(created_to)
|
||||
|
||||
if selection.get("primaryRepresentationOnly") is True:
|
||||
sql_parts.append(" and r.is_primary = true")
|
||||
|
||||
|
||||
def _append_in_filter(
|
||||
sql_parts: list[str],
|
||||
params: list[Any],
|
||||
_key: str,
|
||||
column_name: str,
|
||||
values: list[Any] | None,
|
||||
) -> None:
|
||||
if not values:
|
||||
return
|
||||
placeholders = ", ".join(["%s"] * len(values))
|
||||
sql_parts.append(f" and {column_name} in ({placeholders})")
|
||||
params.extend(values)
|
||||
|
||||
|
||||
def _parse_vector_text(raw_value: str) -> list[float]:
|
||||
if raw_value is None:
|
||||
return []
|
||||
|
||||
value = raw_value.strip()
|
||||
if value.startswith("[") and value.endswith("]"):
|
||||
value = value[1:-1]
|
||||
|
||||
if not value:
|
||||
return []
|
||||
|
||||
vector = np.fromstring(value, sep=",", dtype=np.float32)
|
||||
return vector.astype(float).tolist()
|
||||
|
||||
|
||||
def _json_to_dict(raw_json: str | dict[str, Any] | None) -> dict[str, Any]:
|
||||
if raw_json is None:
|
||||
return {}
|
||||
if isinstance(raw_json, dict):
|
||||
return raw_json
|
||||
if not raw_json.strip():
|
||||
return {}
|
||||
loaded = json.loads(raw_json)
|
||||
return loaded if isinstance(loaded, dict) else {}
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ServiceSettings:
|
||||
db_dsn: str
|
||||
|
||||
@staticmethod
|
||||
def from_env() -> "ServiceSettings":
|
||||
dsn = (
|
||||
os.getenv("CLUSTERING_DB_DSN")
|
||||
or os.getenv("DATABASE_URL")
|
||||
or _build_dsn_from_parts()
|
||||
)
|
||||
if not dsn:
|
||||
raise RuntimeError(
|
||||
"No database connection configured. Set CLUSTERING_DB_DSN or DATABASE_URL, "
|
||||
"or provide CLUSTERING_DB_HOST / CLUSTERING_DB_PORT / CLUSTERING_DB_NAME / "
|
||||
"CLUSTERING_DB_USER / CLUSTERING_DB_PASSWORD."
|
||||
)
|
||||
return ServiceSettings(db_dsn=dsn)
|
||||
|
||||
|
||||
def _build_dsn_from_parts() -> str | None:
|
||||
host = os.getenv("CLUSTERING_DB_HOST")
|
||||
database = os.getenv("CLUSTERING_DB_NAME")
|
||||
user = os.getenv("CLUSTERING_DB_USER")
|
||||
password = os.getenv("CLUSTERING_DB_PASSWORD")
|
||||
port = os.getenv("CLUSTERING_DB_PORT", "5432")
|
||||
|
||||
if not host or not database or not user:
|
||||
return None
|
||||
|
||||
if password:
|
||||
return f"postgresql://{user}:{password}@{host}:{port}/{database}"
|
||||
return f"postgresql://{user}@{host}:{port}/{database}"
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
fastapi==0.115.12
|
||||
uvicorn[standard]==0.34.2
|
||||
numpy==2.2.5
|
||||
scikit-learn==1.7.0
|
||||
hdbscan==0.8.40
|
||||
umap-learn==0.5.7
|
||||
psycopg2-binary==2.9.10
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
from fastapi.testclient import TestClient
|
||||
|
||||
from app.main import app
|
||||
|
||||
|
||||
client = TestClient(app)
|
||||
|
||||
|
||||
def test_health():
|
||||
response = client.get("/health")
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["status"] == "UP"
|
||||
assert "DBSCAN" in data["algorithms"]
|
||||
|
||||
|
||||
def test_kmeans_cluster():
|
||||
body = {
|
||||
"algorithm": "KMEANS",
|
||||
"parameters": {"k": 2},
|
||||
"reductionMethod": "NONE",
|
||||
"items": [
|
||||
{
|
||||
"embeddingId": "11111111-1111-1111-1111-111111111111",
|
||||
"documentId": "22222222-2222-2222-2222-222222222221",
|
||||
"representationId": "33333333-3333-3333-3333-333333333331",
|
||||
"vector": [1.0, 1.0]
|
||||
},
|
||||
{
|
||||
"embeddingId": "11111111-1111-1111-1111-111111111112",
|
||||
"documentId": "22222222-2222-2222-2222-222222222222",
|
||||
"representationId": "33333333-3333-3333-3333-333333333332",
|
||||
"vector": [1.1, 1.0]
|
||||
},
|
||||
{
|
||||
"embeddingId": "11111111-1111-1111-1111-111111111113",
|
||||
"documentId": "22222222-2222-2222-2222-222222222223",
|
||||
"representationId": "33333333-3333-3333-3333-333333333333",
|
||||
"vector": [-1.0, -1.0]
|
||||
},
|
||||
{
|
||||
"embeddingId": "11111111-1111-1111-1111-111111111114",
|
||||
"documentId": "22222222-2222-2222-2222-222222222224",
|
||||
"representationId": "33333333-3333-3333-3333-333333333334",
|
||||
"vector": [-1.1, -1.0]
|
||||
}
|
||||
]
|
||||
}
|
||||
response = client.post("/cluster", json=body)
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert len(data["clusters"]) == 2
|
||||
assert data["noiseCount"] == 0
|
||||
|
||||
|
||||
def test_dbscan_cluster_with_noise():
|
||||
body = {
|
||||
"algorithm": "DBSCAN",
|
||||
"parameters": {"eps": 0.25, "minSamples": 2, "normalizeVectors": False},
|
||||
"reductionMethod": "NONE",
|
||||
"items": [
|
||||
{
|
||||
"embeddingId": "11111111-1111-1111-1111-111111111211",
|
||||
"documentId": "22222222-2222-2222-2222-222222222211",
|
||||
"representationId": "33333333-3333-3333-3333-333333333211",
|
||||
"vector": [0.0, 0.0]
|
||||
},
|
||||
{
|
||||
"embeddingId": "11111111-1111-1111-1111-111111111212",
|
||||
"documentId": "22222222-2222-2222-2222-222222222212",
|
||||
"representationId": "33333333-3333-3333-3333-333333333212",
|
||||
"vector": [0.05, 0.0]
|
||||
},
|
||||
{
|
||||
"embeddingId": "11111111-1111-1111-1111-111111111213",
|
||||
"documentId": "22222222-2222-2222-2222-222222222213",
|
||||
"representationId": "33333333-3333-3333-3333-333333333213",
|
||||
"vector": [10.0, 10.0]
|
||||
}
|
||||
]
|
||||
}
|
||||
response = client.post("/cluster", json=body)
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["noiseCount"] == 1
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
package at.procon.dip.clustering;
|
||||
|
||||
public enum ClusterRunStatus {
|
||||
CREATED,
|
||||
QUEUED,
|
||||
RUNNING,
|
||||
CANCEL_REQUESTED,
|
||||
COMPLETED,
|
||||
FAILED,
|
||||
CANCELLED
|
||||
}
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
package at.procon.dip.clustering;
|
||||
|
||||
public enum ClusteringAlgorithm {
|
||||
KMEANS,
|
||||
MINI_BATCH_KMEANS,
|
||||
DBSCAN,
|
||||
HDBSCAN,
|
||||
AGGLOMERATIVE
|
||||
}
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
package at.procon.dip.clustering;
|
||||
|
||||
public enum ClusteringExecutionBackend {
|
||||
JAVA_LOCAL,
|
||||
PYTHON_REMOTE
|
||||
}
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
package at.procon.dip.clustering;
|
||||
|
||||
public enum PythonRequestMode {
|
||||
INLINE_VECTORS,
|
||||
RUN_ID
|
||||
}
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
package at.procon.dip.clustering;
|
||||
|
||||
public enum ReductionMethod {
|
||||
NONE,
|
||||
PCA,
|
||||
UMAP
|
||||
}
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
package at.procon.dip.clustering.client;
|
||||
|
||||
import at.procon.dip.clustering.dto.PythonClusteringRequest;
|
||||
import at.procon.dip.clustering.dto.PythonClusteringResponse;
|
||||
import at.procon.dip.clustering.dto.PythonRunExecutionRequest;
|
||||
|
||||
public interface PythonClusteringClient {
|
||||
|
||||
PythonClusteringResponse cluster(PythonClusteringRequest request);
|
||||
|
||||
PythonClusteringResponse clusterRun(PythonRunExecutionRequest request);
|
||||
}
|
||||
|
|
@ -0,0 +1,96 @@
|
|||
package at.procon.dip.clustering.client;
|
||||
|
||||
import at.procon.dip.clustering.config.ClusteringPhaseBProperties;
|
||||
import at.procon.dip.clustering.dto.PythonClusteringRequest;
|
||||
import at.procon.dip.clustering.dto.PythonClusteringResponse;
|
||||
import at.procon.dip.clustering.dto.PythonRunExecutionRequest;
|
||||
import java.net.http.HttpClient;
|
||||
import java.time.Duration;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.client.JdkClientHttpRequestFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.web.client.RestClient;
|
||||
import org.springframework.web.server.ResponseStatusException;
|
||||
|
||||
@Component
|
||||
@ConditionalOnProperty(prefix = "dip.clustering.python", name = "enabled", havingValue = "true")
|
||||
public class RestPythonClusteringClient implements PythonClusteringClient {
|
||||
|
||||
private static final Duration DEFAULT_CONNECT_TIMEOUT = Duration.ofSeconds(30);
|
||||
private static final Duration DEFAULT_READ_TIMEOUT = Duration.ofMinutes(30);
|
||||
|
||||
private final ClusteringPhaseBProperties properties;
|
||||
private final RestClient restClient;
|
||||
|
||||
public RestPythonClusteringClient(ClusteringPhaseBProperties properties) {
|
||||
this.properties = properties;
|
||||
|
||||
Duration connectTimeout = properties.connectTimeout() != null
|
||||
? properties.connectTimeout()
|
||||
: DEFAULT_CONNECT_TIMEOUT;
|
||||
Duration readTimeout = properties.readTimeout() != null
|
||||
? properties.readTimeout()
|
||||
: DEFAULT_READ_TIMEOUT;
|
||||
|
||||
HttpClient httpClient = HttpClient.newBuilder()
|
||||
.connectTimeout(connectTimeout)
|
||||
.version(HttpClient.Version.HTTP_1_1)
|
||||
.build();
|
||||
|
||||
JdkClientHttpRequestFactory requestFactory = new JdkClientHttpRequestFactory(httpClient);
|
||||
requestFactory.setReadTimeout(readTimeout);
|
||||
|
||||
this.restClient = RestClient.builder()
|
||||
.requestFactory(requestFactory)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public PythonClusteringResponse cluster(PythonClusteringRequest request) {
|
||||
String url = properties.resolvedClusterUrl();
|
||||
if (url == null || url.isBlank()) {
|
||||
throw new ResponseStatusException(org.springframework.http.HttpStatus.BAD_REQUEST,
|
||||
"Python clustering is enabled but no baseUrl/clusterPath is configured");
|
||||
}
|
||||
|
||||
try {
|
||||
return restClient.post()
|
||||
.uri(url)
|
||||
.contentType(MediaType.APPLICATION_JSON)
|
||||
.accept(MediaType.APPLICATION_JSON)
|
||||
.body(request)
|
||||
.retrieve()
|
||||
.body(PythonClusteringResponse.class);
|
||||
} catch (Exception ex) {
|
||||
throw new ResponseStatusException(
|
||||
org.springframework.http.HttpStatus.BAD_GATEWAY,
|
||||
"Python cluster request failed: " + ex.getMessage(),
|
||||
ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public PythonClusteringResponse clusterRun(PythonRunExecutionRequest request) {
|
||||
String url = properties.resolvedClusterRunUrl();
|
||||
if (url == null || url.isBlank()) {
|
||||
throw new ResponseStatusException(org.springframework.http.HttpStatus.BAD_REQUEST,
|
||||
"Python clustering is enabled but no baseUrl/clusterRunPath is configured");
|
||||
}
|
||||
|
||||
try {
|
||||
return restClient.post()
|
||||
.uri(url)
|
||||
.contentType(MediaType.APPLICATION_JSON)
|
||||
.accept(MediaType.APPLICATION_JSON)
|
||||
.body(request)
|
||||
.retrieve()
|
||||
.body(PythonClusteringResponse.class);
|
||||
} catch (Exception ex) {
|
||||
throw new ResponseStatusException(
|
||||
org.springframework.http.HttpStatus.BAD_GATEWAY,
|
||||
"Python cluster-run request failed: " + ex.getMessage(),
|
||||
ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
package at.procon.dip.clustering.config;
|
||||
|
||||
import java.util.concurrent.Executor;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
|
||||
|
||||
@Configuration
|
||||
@EnableConfigurationProperties(ClusteringExecutionProperties.class)
|
||||
public class ClusteringExecutionConfiguration {
|
||||
|
||||
@Bean(name = "clusteringRunExecutor")
|
||||
public Executor clusteringRunExecutor(ClusteringExecutionProperties properties) {
|
||||
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
|
||||
executor.setThreadNamePrefix("clustering-run-");
|
||||
executor.setCorePoolSize(properties.resolvedCorePoolSize());
|
||||
executor.setMaxPoolSize(Math.max(properties.resolvedCorePoolSize(), properties.resolvedMaxPoolSize()));
|
||||
executor.setQueueCapacity(properties.resolvedQueueCapacity());
|
||||
executor.setWaitForTasksToCompleteOnShutdown(true);
|
||||
executor.setAwaitTerminationSeconds(30);
|
||||
executor.initialize();
|
||||
return executor;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
package at.procon.dip.clustering.config;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
@ConfigurationProperties(prefix = "dip.clustering.execution")
|
||||
public record ClusteringExecutionProperties(
|
||||
int corePoolSize,
|
||||
int maxPoolSize,
|
||||
int queueCapacity
|
||||
) {
|
||||
public int resolvedCorePoolSize() {
|
||||
return corePoolSize > 0 ? corePoolSize : 1;
|
||||
}
|
||||
|
||||
public int resolvedMaxPoolSize() {
|
||||
return maxPoolSize > 0 ? maxPoolSize : Math.max(1, resolvedCorePoolSize());
|
||||
}
|
||||
|
||||
public int resolvedQueueCapacity() {
|
||||
return queueCapacity >= 0 ? queueCapacity : 50;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
package at.procon.dip.clustering.config;
|
||||
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
@Configuration
|
||||
@EnableConfigurationProperties(ClusteringPhaseBProperties.class)
|
||||
public class ClusteringPhaseBConfig {
|
||||
}
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
package at.procon.dip.clustering.config;
|
||||
|
||||
import at.procon.dip.clustering.PythonRequestMode;
|
||||
import java.time.Duration;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
@ConfigurationProperties(prefix = "dip.clustering.python")
|
||||
public record ClusteringPhaseBProperties(
|
||||
boolean enabled,
|
||||
String baseUrl,
|
||||
String clusterPath,
|
||||
String clusterRunPath,
|
||||
Duration connectTimeout,
|
||||
Duration readTimeout,
|
||||
PythonRequestMode requestMode
|
||||
) {
|
||||
public String resolvedClusterUrl() {
|
||||
return resolveUrl(clusterPath == null || clusterPath.isBlank() ? "/cluster" : clusterPath);
|
||||
}
|
||||
|
||||
public String resolvedClusterRunUrl() {
|
||||
return resolveUrl(clusterRunPath == null || clusterRunPath.isBlank() ? "/cluster-run" : clusterRunPath);
|
||||
}
|
||||
|
||||
public PythonRequestMode effectiveRequestMode() {
|
||||
return requestMode == null ? PythonRequestMode.INLINE_VECTORS : requestMode;
|
||||
}
|
||||
|
||||
private String resolveUrl(String path) {
|
||||
if (baseUrl == null || baseUrl.isBlank()) {
|
||||
return null;
|
||||
}
|
||||
return baseUrl.endsWith("/")
|
||||
? baseUrl.substring(0, baseUrl.length() - 1) + path
|
||||
: baseUrl + path;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
public record ClusterAssignmentResponse(
|
||||
UUID id,
|
||||
UUID clusterId,
|
||||
UUID embeddingId,
|
||||
UUID documentId,
|
||||
UUID representationId,
|
||||
Integer clusterLabelRaw,
|
||||
Double membershipScore,
|
||||
Double distanceToCentroid,
|
||||
boolean noise
|
||||
) {
|
||||
}
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import java.util.UUID;
|
||||
|
||||
public record ClusterAssignmentViewResponse(
|
||||
UUID id,
|
||||
UUID clusterId,
|
||||
UUID embeddingId,
|
||||
UUID documentId,
|
||||
UUID representationId,
|
||||
Integer clusterLabelRaw,
|
||||
Double membershipScore,
|
||||
Double distanceToCentroid,
|
||||
boolean noise,
|
||||
String businessKey,
|
||||
DocumentType documentType,
|
||||
RepresentationType representationType,
|
||||
String builderKey,
|
||||
String languageCode,
|
||||
Integer textLength,
|
||||
String textPreview,
|
||||
String textBody
|
||||
) {
|
||||
}
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
public record ClusterMembersResponse(
|
||||
UUID clusterId,
|
||||
Integer clusterLabel,
|
||||
ClusterAssignmentViewResponse member
|
||||
) {
|
||||
}
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
public record ClusterResponse(
|
||||
UUID id,
|
||||
Integer clusterLabel,
|
||||
String displayName,
|
||||
Long itemCount,
|
||||
boolean noiseCluster,
|
||||
String summaryText
|
||||
) {
|
||||
}
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import at.procon.dip.clustering.ClusterRunStatus;
|
||||
import at.procon.dip.clustering.ClusteringAlgorithm;
|
||||
import at.procon.dip.clustering.ClusteringExecutionBackend;
|
||||
import at.procon.dip.clustering.ReductionMethod;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
|
||||
public record ClusterRunResponse(
|
||||
UUID id,
|
||||
String name,
|
||||
ClusterRunStatus status,
|
||||
ClusteringAlgorithm algorithm,
|
||||
ClusteringExecutionBackend executionBackend,
|
||||
ReductionMethod reductionMethod,
|
||||
Integer reductionDimensions,
|
||||
Long itemCount,
|
||||
Long clusterCount,
|
||||
Long noiseCount,
|
||||
OffsetDateTime startedAt,
|
||||
OffsetDateTime finishedAt,
|
||||
String errorMessage
|
||||
) {
|
||||
}
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
|
||||
public record ClusterSetResponse(
|
||||
UUID id,
|
||||
String code,
|
||||
String name,
|
||||
String description,
|
||||
boolean active,
|
||||
EmbeddingSelectionSpec selection,
|
||||
OffsetDateTime createdAt,
|
||||
OffsetDateTime updatedAt
|
||||
) {
|
||||
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
public record ClusteringEngineAssignment(
|
||||
UUID embeddingId,
|
||||
UUID documentId,
|
||||
UUID representationId,
|
||||
int clusterLabel,
|
||||
Double distanceToCentroid,
|
||||
Double membershipScore,
|
||||
boolean noise
|
||||
) {
|
||||
public ClusteringEngineAssignment(
|
||||
UUID embeddingId,
|
||||
int clusterLabel,
|
||||
Double distanceToCentroid,
|
||||
Double membershipScore,
|
||||
boolean noise
|
||||
) {
|
||||
this(embeddingId, null, null, clusterLabel, distanceToCentroid, membershipScore, noise);
|
||||
}
|
||||
|
||||
public ClusteringEngineAssignment(
|
||||
UUID embeddingId,
|
||||
UUID documentId,
|
||||
UUID representationId,
|
||||
int clusterLabel,
|
||||
Double distanceToCentroid
|
||||
) {
|
||||
this(embeddingId, documentId, representationId, clusterLabel, distanceToCentroid, null, false);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
public record ClusteringEngineCluster(
|
||||
int clusterLabel,
|
||||
long itemCount,
|
||||
boolean noiseCluster
|
||||
) {
|
||||
public ClusteringEngineCluster(int clusterLabel, long itemCount) {
|
||||
this(clusterLabel, itemCount, false);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public record ClusteringEngineRequest(Map<String, Object> parameters) {
|
||||
|
||||
public ClusteringEngineRequest {
|
||||
parameters = parameters == null
|
||||
? Map.of()
|
||||
: Collections.unmodifiableMap(new LinkedHashMap<>(parameters));
|
||||
}
|
||||
|
||||
public int requiredInt(String key) {
|
||||
Object value = parameters.get(key);
|
||||
if (value == null) {
|
||||
throw new IllegalArgumentException("Missing required clustering parameter: " + key);
|
||||
}
|
||||
if (value instanceof Number number) {
|
||||
return number.intValue();
|
||||
}
|
||||
return Integer.parseInt(String.valueOf(value));
|
||||
}
|
||||
|
||||
public int intValue(String key, int defaultValue) {
|
||||
Object value = parameters.get(key);
|
||||
if (value == null) {
|
||||
return defaultValue;
|
||||
}
|
||||
if (value instanceof Number number) {
|
||||
return number.intValue();
|
||||
}
|
||||
return Integer.parseInt(String.valueOf(value));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record ClusteringEngineResult(
|
||||
List<ClusteringEngineCluster> clusters,
|
||||
List<ClusteringEngineAssignment> assignments,
|
||||
long noiseCount
|
||||
) {
|
||||
public ClusteringEngineResult(
|
||||
List<ClusteringEngineCluster> clusters,
|
||||
List<ClusteringEngineAssignment> assignments
|
||||
) {
|
||||
this(clusters, assignments, 0L);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import at.procon.dip.clustering.ClusteringAlgorithm;
|
||||
import at.procon.dip.clustering.ClusteringExecutionBackend;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import jakarta.validation.Valid;
|
||||
import jakarta.validation.constraints.AssertTrue;
|
||||
import jakarta.validation.constraints.NotBlank;
|
||||
import jakarta.validation.constraints.NotNull;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public record CreateClusterRunRequest(
|
||||
String clusterSetCode,
|
||||
@NotBlank String name,
|
||||
@NotNull ClusteringAlgorithm algorithm,
|
||||
ClusteringExecutionBackend executionBackend,
|
||||
@Valid ReductionConfig reduction,
|
||||
@Valid @NotNull EmbeddingSelectionSpec selection,
|
||||
Integer k,
|
||||
Map<String, Object> parameters
|
||||
) {
|
||||
|
||||
@JsonIgnore
|
||||
public Map<String, Object> resolvedParameters() {
|
||||
Map<String, Object> merged = new LinkedHashMap<>();
|
||||
if (parameters != null) {
|
||||
merged.putAll(parameters);
|
||||
}
|
||||
if (k != null && !merged.containsKey("k")) {
|
||||
merged.put("k", k);
|
||||
}
|
||||
return merged;
|
||||
}
|
||||
|
||||
@AssertTrue(message = "k must be > 0 for KMEANS and MINI_BATCH_KMEANS; for other algorithms it must be omitted or > 0")
|
||||
@JsonIgnore
|
||||
public boolean isValidKConfiguration() {
|
||||
Integer effectiveK = extractPositiveInteger(resolvedParameters().get("k"));
|
||||
|
||||
if (algorithm == ClusteringAlgorithm.KMEANS
|
||||
|| algorithm == ClusteringAlgorithm.MINI_BATCH_KMEANS) {
|
||||
return effectiveK != null;
|
||||
}
|
||||
|
||||
Object rawK = resolvedParameters().get("k");
|
||||
return rawK == null || effectiveK != null;
|
||||
}
|
||||
|
||||
private Integer extractPositiveInteger(Object value) {
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
if (value instanceof Number number) {
|
||||
int intValue = number.intValue();
|
||||
return intValue > 0 ? intValue : null;
|
||||
}
|
||||
try {
|
||||
int intValue = Integer.parseInt(String.valueOf(value));
|
||||
return intValue > 0 ? intValue : null;
|
||||
} catch (NumberFormatException ex) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import jakarta.validation.Valid;
|
||||
import jakarta.validation.constraints.NotBlank;
|
||||
import jakarta.validation.constraints.NotNull;
|
||||
|
||||
public record CreateClusterSetRequest(
|
||||
@NotBlank String code,
|
||||
@NotBlank String name,
|
||||
String description,
|
||||
@Valid @NotNull EmbeddingSelectionSpec selection,
|
||||
Boolean active
|
||||
) {
|
||||
}
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.EmbeddingStatus;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
|
||||
public record EmbeddingSelectionSpec(
|
||||
Set<DocumentType> documentTypes,
|
||||
Set<DocumentFamily> documentFamilies,
|
||||
Set<RepresentationType> representationTypes,
|
||||
Set<EmbeddingStatus> embeddingStatuses,
|
||||
Set<UUID> modelIds,
|
||||
Set<UUID> prefixProfileIds,
|
||||
Set<String> builderKeys,
|
||||
Set<String> languageCodes,
|
||||
Set<UUID> ownerTenantIds,
|
||||
String businessKeyLike,
|
||||
OffsetDateTime createdFrom,
|
||||
OffsetDateTime createdTo,
|
||||
Boolean primaryRepresentationOnly
|
||||
) {
|
||||
}
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import at.procon.dip.clustering.ClusteringAlgorithm;
|
||||
import at.procon.dip.clustering.ReductionMethod;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
|
||||
public record PythonClusteringRequest(
|
||||
ClusteringAlgorithm algorithm,
|
||||
Map<String, Object> parameters,
|
||||
ReductionMethod reductionMethod,
|
||||
Integer reductionDimensions,
|
||||
List<PythonClusteringItem> items
|
||||
) {
|
||||
public record PythonClusteringItem(
|
||||
UUID embeddingId,
|
||||
UUID documentId,
|
||||
UUID representationId,
|
||||
float[] vector
|
||||
) {
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
public record PythonClusteringResponse(
|
||||
List<PythonCluster> clusters,
|
||||
List<PythonAssignment> assignments,
|
||||
Long noiseCount
|
||||
) {
|
||||
public record PythonCluster(
|
||||
Integer clusterLabel,
|
||||
Long itemCount,
|
||||
Boolean noiseCluster
|
||||
) {
|
||||
}
|
||||
|
||||
public record PythonAssignment(
|
||||
UUID embeddingId,
|
||||
UUID documentId,
|
||||
UUID representationId,
|
||||
Integer clusterLabel,
|
||||
Double distanceToCentroid,
|
||||
Double membershipScore,
|
||||
Boolean noise
|
||||
) {
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
public record PythonRunExecutionRequest(
|
||||
UUID runId
|
||||
) {
|
||||
}
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import at.procon.dip.clustering.ReductionMethod;
|
||||
|
||||
public record ReductionConfig(
|
||||
ReductionMethod method,
|
||||
Integer targetDimensions
|
||||
) {
|
||||
}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import java.util.UUID;
|
||||
|
||||
public record SelectedEmbeddingRow(
|
||||
UUID embeddingId,
|
||||
UUID documentId,
|
||||
UUID representationId,
|
||||
UUID modelId,
|
||||
UUID prefixProfileId,
|
||||
DocumentType documentType,
|
||||
DocumentFamily documentFamily,
|
||||
RepresentationType representationType,
|
||||
String builderKey,
|
||||
String languageCode,
|
||||
String businessKey,
|
||||
float[] embeddingVector
|
||||
) {
|
||||
}
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
public record SelectionCountResponse(long count) {
|
||||
}
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
package at.procon.dip.clustering.dto;
|
||||
|
||||
import jakarta.validation.Valid;
|
||||
import jakarta.validation.constraints.NotBlank;
|
||||
import jakarta.validation.constraints.NotNull;
|
||||
|
||||
public record UpdateClusterSetRequest(
|
||||
@NotBlank String name,
|
||||
String description,
|
||||
@Valid @NotNull EmbeddingSelectionSpec selection,
|
||||
Boolean active
|
||||
) {
|
||||
}
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
package at.procon.dip.clustering.entity;
|
||||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.FetchType;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Index;
|
||||
import jakarta.persistence.JoinColumn;
|
||||
import jakarta.persistence.ManyToOne;
|
||||
import jakarta.persistence.PrePersist;
|
||||
import jakarta.persistence.Table;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import org.hibernate.annotations.JdbcTypeCode;
|
||||
import org.hibernate.type.SqlTypes;
|
||||
|
||||
@Entity
|
||||
@Table(schema = SchemaNames.DOC, name = "doc_embedding_cluster", indexes = {
|
||||
@Index(name = "idx_doc_cluster_cluster_run_jpa", columnList = "cluster_run_id")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class EmbeddingCluster {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||
@JoinColumn(name = "cluster_run_id", nullable = false)
|
||||
private EmbeddingClusterRun clusterRun;
|
||||
|
||||
@Column(name = "cluster_label", nullable = false)
|
||||
private Integer clusterLabel;
|
||||
|
||||
@Column(name = "display_name", length = 255)
|
||||
private String displayName;
|
||||
|
||||
@Column(name = "item_count", nullable = false)
|
||||
private Long itemCount;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "is_noise_cluster", nullable = false)
|
||||
private boolean noiseCluster = false;
|
||||
|
||||
@Column(name = "summary_text", columnDefinition = "TEXT")
|
||||
private String summaryText;
|
||||
|
||||
@JdbcTypeCode(SqlTypes.JSON)
|
||||
@Column(name = "top_terms_json", columnDefinition = "jsonb")
|
||||
private String topTermsJson;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||
|
||||
@PrePersist
|
||||
protected void onCreate() {
|
||||
createdAt = OffsetDateTime.now();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,84 @@
|
|||
package at.procon.dip.clustering.entity;
|
||||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import at.procon.dip.domain.document.entity.Document;
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbedding;
|
||||
import at.procon.dip.domain.document.entity.DocumentTextRepresentation;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.FetchType;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Index;
|
||||
import jakarta.persistence.JoinColumn;
|
||||
import jakarta.persistence.ManyToOne;
|
||||
import jakarta.persistence.PrePersist;
|
||||
import jakarta.persistence.Table;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
@Entity
|
||||
@Table(schema = SchemaNames.DOC, name = "doc_embedding_cluster_assignment", indexes = {
|
||||
@Index(name = "idx_doc_cluster_assignment_run_jpa", columnList = "cluster_run_id"),
|
||||
@Index(name = "idx_doc_cluster_assignment_cluster_jpa", columnList = "cluster_id"),
|
||||
@Index(name = "idx_doc_cluster_assignment_document_jpa", columnList = "cluster_run_id, document_id")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class EmbeddingClusterAssignment {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||
@JoinColumn(name = "cluster_run_id", nullable = false)
|
||||
private EmbeddingClusterRun clusterRun;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY)
|
||||
@JoinColumn(name = "cluster_id")
|
||||
private EmbeddingCluster cluster;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||
@JoinColumn(name = "embedding_id", nullable = false)
|
||||
private DocumentEmbedding embedding;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||
@JoinColumn(name = "document_id", nullable = false)
|
||||
private Document document;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY, optional = false)
|
||||
@JoinColumn(name = "representation_id", nullable = false)
|
||||
private DocumentTextRepresentation representation;
|
||||
|
||||
@Column(name = "cluster_label_raw", nullable = false)
|
||||
private Integer clusterLabelRaw;
|
||||
|
||||
@Column(name = "membership_score")
|
||||
private Double membershipScore;
|
||||
|
||||
@Column(name = "distance_to_centroid")
|
||||
private Double distanceToCentroid;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "is_noise", nullable = false)
|
||||
private boolean noise = false;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||
|
||||
@PrePersist
|
||||
protected void onCreate() {
|
||||
createdAt = OffsetDateTime.now();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,147 @@
|
|||
package at.procon.dip.clustering.entity;
|
||||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import at.procon.dip.clustering.ClusterRunStatus;
|
||||
import at.procon.dip.clustering.ClusteringAlgorithm;
|
||||
import at.procon.dip.clustering.ClusteringExecutionBackend;
|
||||
import at.procon.dip.clustering.ReductionMethod;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbeddingModel;
|
||||
import at.procon.dip.domain.document.entity.DocumentEmbeddingPrefixProfile;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.EnumType;
|
||||
import jakarta.persistence.Enumerated;
|
||||
import jakarta.persistence.FetchType;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Index;
|
||||
import jakarta.persistence.JoinColumn;
|
||||
import jakarta.persistence.ManyToOne;
|
||||
import jakarta.persistence.PrePersist;
|
||||
import jakarta.persistence.PreUpdate;
|
||||
import jakarta.persistence.Table;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import org.hibernate.annotations.JdbcTypeCode;
|
||||
import org.hibernate.type.SqlTypes;
|
||||
|
||||
@Entity
|
||||
@Table(schema = SchemaNames.DOC, name = "doc_embedding_cluster_run", indexes = {
|
||||
@Index(name = "idx_doc_cluster_run_status_jpa", columnList = "status"),
|
||||
@Index(name = "idx_doc_cluster_run_algorithm_jpa", columnList = "algorithm"),
|
||||
@Index(name = "idx_doc_cluster_run_created_at_jpa", columnList = "created_at")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class EmbeddingClusterRun {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY)
|
||||
@JoinColumn(name = "cluster_set_id")
|
||||
private EmbeddingClusterSet clusterSet;
|
||||
|
||||
@Column(name = "name", nullable = false, length = 255)
|
||||
private String name;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "status", nullable = false, length = 32)
|
||||
private ClusterRunStatus status;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "algorithm", nullable = false, length = 64)
|
||||
private ClusteringAlgorithm algorithm;
|
||||
|
||||
@Column(name = "algorithm_version", length = 64)
|
||||
private String algorithmVersion;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "execution_backend", length = 64)
|
||||
private ClusteringExecutionBackend executionBackend;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "reduction_method", length = 32)
|
||||
private ReductionMethod reductionMethod;
|
||||
|
||||
@Column(name = "reduction_dimensions")
|
||||
private Integer reductionDimensions;
|
||||
|
||||
@JdbcTypeCode(SqlTypes.JSON)
|
||||
@Column(name = "selection_json", nullable = false, columnDefinition = "jsonb")
|
||||
private String selectionJson;
|
||||
|
||||
@JdbcTypeCode(SqlTypes.JSON)
|
||||
@Column(name = "parameters_json", nullable = false, columnDefinition = "jsonb")
|
||||
private String parametersJson;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY)
|
||||
@JoinColumn(name = "embedding_model_id")
|
||||
private DocumentEmbeddingModel embeddingModel;
|
||||
|
||||
@ManyToOne(fetch = FetchType.LAZY)
|
||||
@JoinColumn(name = "prefix_profile_id")
|
||||
private DocumentEmbeddingPrefixProfile prefixProfile;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "document_type", length = 64)
|
||||
private DocumentType documentType;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "document_family", length = 64)
|
||||
private DocumentFamily documentFamily;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(name = "representation_type", length = 64)
|
||||
private RepresentationType representationType;
|
||||
|
||||
@Column(name = "builder_key", length = 255)
|
||||
private String builderKey;
|
||||
|
||||
@Column(name = "item_count")
|
||||
private Long itemCount;
|
||||
|
||||
@Column(name = "cluster_count")
|
||||
private Long clusterCount;
|
||||
|
||||
@Column(name = "noise_count")
|
||||
private Long noiseCount;
|
||||
|
||||
@Column(name = "started_at")
|
||||
private OffsetDateTime startedAt;
|
||||
|
||||
@Column(name = "finished_at")
|
||||
private OffsetDateTime finishedAt;
|
||||
|
||||
@Column(name = "error_message", columnDefinition = "TEXT")
|
||||
private String errorMessage;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||
|
||||
@PrePersist
|
||||
protected void onCreate() {
|
||||
createdAt = OffsetDateTime.now();
|
||||
}
|
||||
|
||||
@PreUpdate
|
||||
protected void onUpdate() {
|
||||
if (status == null) {
|
||||
status = ClusterRunStatus.CREATED;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,74 @@
|
|||
package at.procon.dip.clustering.entity;
|
||||
|
||||
import at.procon.dip.architecture.SchemaNames;
|
||||
import jakarta.persistence.Column;
|
||||
import jakarta.persistence.Entity;
|
||||
import jakarta.persistence.GeneratedValue;
|
||||
import jakarta.persistence.GenerationType;
|
||||
import jakarta.persistence.Id;
|
||||
import jakarta.persistence.Index;
|
||||
import jakarta.persistence.PrePersist;
|
||||
import jakarta.persistence.PreUpdate;
|
||||
import jakarta.persistence.Table;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.UUID;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import org.hibernate.annotations.JdbcTypeCode;
|
||||
import org.hibernate.type.SqlTypes;
|
||||
|
||||
@Entity
|
||||
@Table(schema = SchemaNames.DOC, name = "doc_embedding_cluster_set", indexes = {
|
||||
@Index(name = "idx_doc_embedding_cluster_set_code", columnList = "code", unique = true),
|
||||
@Index(name = "idx_doc_embedding_cluster_set_active", columnList = "active")
|
||||
})
|
||||
@Getter
|
||||
@Setter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class EmbeddingClusterSet {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
private UUID id;
|
||||
|
||||
@Column(name = "code", nullable = false, length = 128, unique = true)
|
||||
private String code;
|
||||
|
||||
@Column(name = "name", nullable = false, length = 255)
|
||||
private String name;
|
||||
|
||||
@Column(name = "description", columnDefinition = "TEXT")
|
||||
private String description;
|
||||
|
||||
@JdbcTypeCode(SqlTypes.JSON)
|
||||
@Column(name = "selection_json", nullable = false, columnDefinition = "jsonb")
|
||||
private String selectionJson;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "active", nullable = false)
|
||||
private boolean active = true;
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
private OffsetDateTime createdAt = OffsetDateTime.now();
|
||||
|
||||
@Builder.Default
|
||||
@Column(name = "updated_at", nullable = false)
|
||||
private OffsetDateTime updatedAt = OffsetDateTime.now();
|
||||
|
||||
@PrePersist
|
||||
protected void onCreate() {
|
||||
createdAt = OffsetDateTime.now();
|
||||
updatedAt = OffsetDateTime.now();
|
||||
}
|
||||
|
||||
@PreUpdate
|
||||
protected void onUpdate() {
|
||||
updatedAt = OffsetDateTime.now();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
package at.procon.dip.clustering.repository;
|
||||
|
||||
import at.procon.dip.clustering.dto.EmbeddingSelectionSpec;
|
||||
import at.procon.dip.clustering.dto.SelectedEmbeddingRow;
|
||||
import java.util.List;
|
||||
|
||||
public interface DocumentEmbeddingClusterSelectionRepository {
|
||||
List<SelectedEmbeddingRow> findSelection(EmbeddingSelectionSpec spec);
|
||||
long countSelection(EmbeddingSelectionSpec spec);
|
||||
}
|
||||
|
|
@ -0,0 +1,219 @@
|
|||
package at.procon.dip.clustering.repository;
|
||||
|
||||
import at.procon.dip.clustering.dto.EmbeddingSelectionSpec;
|
||||
import at.procon.dip.clustering.dto.SelectedEmbeddingRow;
|
||||
import at.procon.dip.domain.document.DocumentFamily;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.jdbc.core.RowMapper;
|
||||
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
|
||||
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
|
||||
import org.springframework.stereotype.Repository;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
import org.springframework.util.StringUtils;
|
||||
|
||||
@Repository
|
||||
@RequiredArgsConstructor
|
||||
public class DocumentEmbeddingClusterSelectionRepositoryImpl implements DocumentEmbeddingClusterSelectionRepository {
|
||||
|
||||
private final NamedParameterJdbcTemplate jdbcTemplate;
|
||||
|
||||
@Override
|
||||
public List<SelectedEmbeddingRow> findSelection(EmbeddingSelectionSpec spec) {
|
||||
StringBuilder sql = new StringBuilder("""
|
||||
select
|
||||
e.id as embedding_id,
|
||||
d.id as document_id,
|
||||
r.id as representation_id,
|
||||
e.model_id,
|
||||
e.prefix_profile_id,
|
||||
d.document_type,
|
||||
d.document_family,
|
||||
r.representation_type,
|
||||
r.builder_key,
|
||||
r.language_code,
|
||||
d.business_key,
|
||||
e.embedding_vector::text as embedding_vector_text
|
||||
from doc.doc_embedding e
|
||||
join doc.doc_document d on d.id = e.document_id
|
||||
join doc.doc_text_representation r on r.id = e.representation_id
|
||||
where e.embedding_status = 'COMPLETED'
|
||||
and e.embedding_vector is not null
|
||||
and e.prefix_profile_id is not null
|
||||
""");
|
||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
applyFilters(spec, sql, params);
|
||||
sql.append(" order by e.created_at asc");
|
||||
|
||||
List<RawSelectedEmbeddingRow> rawRows = jdbcTemplate.query(
|
||||
sql.toString(),
|
||||
params,
|
||||
new RawSelectedEmbeddingRowMapper());
|
||||
|
||||
return rawRows.stream()
|
||||
.map(this::toSelectedEmbeddingRow)
|
||||
.toList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long countSelection(EmbeddingSelectionSpec spec) {
|
||||
StringBuilder sql = new StringBuilder("""
|
||||
select count(*)
|
||||
from doc.doc_embedding e
|
||||
join doc.doc_document d on d.id = e.document_id
|
||||
join doc.doc_text_representation r on r.id = e.representation_id
|
||||
where e.embedding_status = 'COMPLETED'
|
||||
and e.embedding_vector is not null
|
||||
and e.prefix_profile_id is not null
|
||||
""");
|
||||
MapSqlParameterSource params = new MapSqlParameterSource();
|
||||
applyFilters(spec, sql, params);
|
||||
Long result = jdbcTemplate.queryForObject(sql.toString(), params, Long.class);
|
||||
return result == null ? 0L : result;
|
||||
}
|
||||
|
||||
private void applyFilters(EmbeddingSelectionSpec spec, StringBuilder sql, MapSqlParameterSource params) {
|
||||
if (spec == null) {
|
||||
return;
|
||||
}
|
||||
if (!CollectionUtils.isEmpty(spec.documentTypes())) {
|
||||
sql.append(" and d.document_type in (:documentTypes)");
|
||||
params.addValue("documentTypes", enumNames(spec.documentTypes()));
|
||||
}
|
||||
if (!CollectionUtils.isEmpty(spec.documentFamilies())) {
|
||||
sql.append(" and d.document_family in (:documentFamilies)");
|
||||
params.addValue("documentFamilies", enumNames(spec.documentFamilies()));
|
||||
}
|
||||
if (!CollectionUtils.isEmpty(spec.representationTypes())) {
|
||||
sql.append(" and r.representation_type in (:representationTypes)");
|
||||
params.addValue("representationTypes", enumNames(spec.representationTypes()));
|
||||
}
|
||||
if (!CollectionUtils.isEmpty(spec.embeddingStatuses())) {
|
||||
sql.append(" and e.embedding_status in (:embeddingStatuses)");
|
||||
params.addValue("embeddingStatuses", enumNames(spec.embeddingStatuses()));
|
||||
}
|
||||
if (!CollectionUtils.isEmpty(spec.modelIds())) {
|
||||
sql.append(" and e.model_id in (:modelIds)");
|
||||
params.addValue("modelIds", spec.modelIds());
|
||||
}
|
||||
if (!CollectionUtils.isEmpty(spec.prefixProfileIds())) {
|
||||
sql.append(" and e.prefix_profile_id in (:prefixProfileIds)");
|
||||
params.addValue("prefixProfileIds", spec.prefixProfileIds());
|
||||
}
|
||||
if (!CollectionUtils.isEmpty(spec.builderKeys())) {
|
||||
sql.append(" and r.builder_key in (:builderKeys)");
|
||||
params.addValue("builderKeys", spec.builderKeys());
|
||||
}
|
||||
if (!CollectionUtils.isEmpty(spec.languageCodes())) {
|
||||
sql.append(" and r.language_code in (:languageCodes)");
|
||||
params.addValue("languageCodes", spec.languageCodes());
|
||||
}
|
||||
if (!CollectionUtils.isEmpty(spec.ownerTenantIds())) {
|
||||
sql.append(" and d.owner_tenant_id in (:ownerTenantIds)");
|
||||
params.addValue("ownerTenantIds", spec.ownerTenantIds());
|
||||
}
|
||||
if (StringUtils.hasText(spec.businessKeyLike())) {
|
||||
sql.append(" and d.business_key like :businessKeyLike");
|
||||
params.addValue("businessKeyLike", spec.businessKeyLike());
|
||||
}
|
||||
if (spec.createdFrom() != null) {
|
||||
sql.append(" and d.created_at >= :createdFrom");
|
||||
params.addValue("createdFrom", spec.createdFrom());
|
||||
}
|
||||
if (spec.createdTo() != null) {
|
||||
sql.append(" and d.created_at < :createdTo");
|
||||
params.addValue("createdTo", spec.createdTo());
|
||||
}
|
||||
if (Boolean.TRUE.equals(spec.primaryRepresentationOnly())) {
|
||||
sql.append(" and r.is_primary = true");
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> enumNames(Iterable<?> values) {
|
||||
List<String> result = new ArrayList<>();
|
||||
for (Object value : values) {
|
||||
result.add(String.valueOf(value));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private SelectedEmbeddingRow toSelectedEmbeddingRow(RawSelectedEmbeddingRow row) {
|
||||
return new SelectedEmbeddingRow(
|
||||
row.embeddingId(),
|
||||
row.documentId(),
|
||||
row.representationId(),
|
||||
row.modelId(),
|
||||
row.prefixProfileId(),
|
||||
row.documentType(),
|
||||
row.documentFamily(),
|
||||
row.representationType(),
|
||||
row.builderKey(),
|
||||
row.languageCode(),
|
||||
row.businessKey(),
|
||||
parseVector(row.embeddingVectorText()));
|
||||
}
|
||||
|
||||
private float[] parseVector(String raw) {
|
||||
if (raw == null) {
|
||||
return null;
|
||||
}
|
||||
String value = raw.trim();
|
||||
if (value.length() < 2) {
|
||||
return new float[0];
|
||||
}
|
||||
if (value.charAt(0) == '[' && value.charAt(value.length() - 1) == ']') {
|
||||
value = value.substring(1, value.length() - 1);
|
||||
}
|
||||
if (value.isBlank()) {
|
||||
return new float[0];
|
||||
}
|
||||
String[] parts = value.split(",");
|
||||
float[] result = new float[parts.length];
|
||||
for (int i = 0; i < parts.length; i++) {
|
||||
result[i] = Float.parseFloat(parts[i].trim());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private record RawSelectedEmbeddingRow(
|
||||
UUID embeddingId,
|
||||
UUID documentId,
|
||||
UUID representationId,
|
||||
UUID modelId,
|
||||
UUID prefixProfileId,
|
||||
DocumentType documentType,
|
||||
DocumentFamily documentFamily,
|
||||
RepresentationType representationType,
|
||||
String builderKey,
|
||||
String languageCode,
|
||||
String businessKey,
|
||||
String embeddingVectorText
|
||||
) {
|
||||
}
|
||||
|
||||
private static class RawSelectedEmbeddingRowMapper implements RowMapper<RawSelectedEmbeddingRow> {
|
||||
@Override
|
||||
public RawSelectedEmbeddingRow mapRow(ResultSet rs, int rowNum) throws SQLException {
|
||||
return new RawSelectedEmbeddingRow(
|
||||
rs.getObject("embedding_id", UUID.class),
|
||||
rs.getObject("document_id", UUID.class),
|
||||
rs.getObject("representation_id", UUID.class),
|
||||
rs.getObject("model_id", UUID.class),
|
||||
rs.getObject("prefix_profile_id", UUID.class),
|
||||
DocumentType.valueOf(rs.getString("document_type")),
|
||||
DocumentFamily.valueOf(rs.getString("document_family")),
|
||||
RepresentationType.valueOf(rs.getString("representation_type")),
|
||||
rs.getString("builder_key"),
|
||||
rs.getString("language_code"),
|
||||
rs.getString("business_key"),
|
||||
rs.getString("embedding_vector_text")
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
package at.procon.dip.clustering.repository;
|
||||
|
||||
import at.procon.dip.clustering.entity.EmbeddingClusterAssignment;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
public interface EmbeddingClusterAssignmentRepository extends JpaRepository<EmbeddingClusterAssignment, UUID> {
|
||||
List<EmbeddingClusterAssignment> findByClusterRun_IdOrderByClusterLabelRawAscDocument_IdAsc(UUID clusterRunId);
|
||||
}
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
package at.procon.dip.clustering.repository;
|
||||
|
||||
import at.procon.dip.clustering.entity.EmbeddingCluster;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
public interface EmbeddingClusterRepository extends JpaRepository<EmbeddingCluster, UUID> {
|
||||
List<EmbeddingCluster> findByClusterRun_IdOrderByClusterLabelAsc(UUID clusterRunId);
|
||||
}
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
package at.procon.dip.clustering.repository;
|
||||
|
||||
import at.procon.dip.clustering.dto.ClusterAssignmentViewResponse;
|
||||
import at.procon.dip.clustering.dto.ClusterMembersResponse;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
public interface EmbeddingClusterResultQueryRepository {
|
||||
List<ClusterAssignmentViewResponse> findAssignments(UUID runId, boolean includeText, int previewLength);
|
||||
List<ClusterMembersResponse> findClusterMembers(UUID runId, UUID clusterId, boolean includeText, int previewLength);
|
||||
}
|
||||
|
|
@ -0,0 +1,87 @@
|
|||
package at.procon.dip.clustering.repository;
|
||||
|
||||
import at.procon.dip.clustering.dto.ClusterAssignmentViewResponse;
|
||||
import at.procon.dip.clustering.dto.ClusterMembersResponse;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import at.procon.dip.domain.document.RepresentationType;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.jdbc.core.RowMapper;
|
||||
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
|
||||
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
@Repository
|
||||
@RequiredArgsConstructor
|
||||
public class EmbeddingClusterResultQueryRepositoryImpl implements EmbeddingClusterResultQueryRepository {
|
||||
|
||||
private final NamedParameterJdbcTemplate jdbcTemplate;
|
||||
|
||||
@Override
|
||||
public List<ClusterAssignmentViewResponse> findAssignments(UUID runId, boolean includeText, int previewLength) {
|
||||
String sql = baseSql(includeText) + " where a.cluster_run_id = :runId order by a.cluster_label_raw asc, d.id asc";
|
||||
MapSqlParameterSource params = new MapSqlParameterSource()
|
||||
.addValue("runId", runId)
|
||||
.addValue("previewLength", previewLength);
|
||||
return jdbcTemplate.query(sql, params, assignmentRowMapper(includeText));
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<ClusterMembersResponse> findClusterMembers(UUID runId, UUID clusterId, boolean includeText, int previewLength) {
|
||||
String sql = "select c.id as result_cluster_id, c.cluster_label as result_cluster_label, x.* from doc.doc_embedding_cluster c "
|
||||
+ "join (" + baseSql(includeText) + ") x on x.cluster_id = c.id "
|
||||
+ "where c.cluster_run_id = :runId and c.id = :clusterId order by x.business_key asc, x.document_id asc";
|
||||
MapSqlParameterSource params = new MapSqlParameterSource()
|
||||
.addValue("runId", runId)
|
||||
.addValue("clusterId", clusterId)
|
||||
.addValue("previewLength", previewLength);
|
||||
return jdbcTemplate.query(sql, params, (rs, rowNum) -> new ClusterMembersResponse(
|
||||
rs.getObject("result_cluster_id", UUID.class),
|
||||
rs.getInt("result_cluster_label"),
|
||||
assignmentRowMapper(includeText).mapRow(rs, rowNum)
|
||||
));
|
||||
}
|
||||
|
||||
private String baseSql(boolean includeText) {
|
||||
String textBodyExpression = includeText ? "r.text_body" : "null::text";
|
||||
return "select a.id, a.cluster_id, a.embedding_id, a.document_id, a.representation_id, a.cluster_label_raw, "
|
||||
+ "a.membership_score, a.distance_to_centroid, a.is_noise, d.business_key, d.document_type, "
|
||||
+ "r.representation_type, r.builder_key, r.language_code, r.char_count as text_length, "
|
||||
+ "case when r.text_body is null then null "
|
||||
+ "when char_length(r.text_body) <= :previewLength then r.text_body "
|
||||
+ "else substring(r.text_body from 1 for :previewLength) end as text_preview, "
|
||||
+ textBodyExpression + " as text_body "
|
||||
+ "from doc.doc_embedding_cluster_assignment a "
|
||||
+ "join doc.doc_document d on d.id = a.document_id "
|
||||
+ "join doc.doc_text_representation r on r.id = a.representation_id";
|
||||
}
|
||||
|
||||
private RowMapper<ClusterAssignmentViewResponse> assignmentRowMapper(boolean includeText) {
|
||||
return (rs, rowNum) -> new ClusterAssignmentViewResponse(
|
||||
rs.getObject("id", UUID.class),
|
||||
rs.getObject("cluster_id", UUID.class),
|
||||
rs.getObject("embedding_id", UUID.class),
|
||||
rs.getObject("document_id", UUID.class),
|
||||
rs.getObject("representation_id", UUID.class),
|
||||
rs.getInt("cluster_label_raw"),
|
||||
rs.getObject("membership_score", Double.class),
|
||||
rs.getObject("distance_to_centroid", Double.class),
|
||||
rs.getBoolean("is_noise"),
|
||||
rs.getString("business_key"),
|
||||
enumValue(DocumentType.class, rs.getString("document_type")),
|
||||
enumValue(RepresentationType.class, rs.getString("representation_type")),
|
||||
rs.getString("builder_key"),
|
||||
rs.getString("language_code"),
|
||||
rs.getObject("text_length", Integer.class),
|
||||
rs.getString("text_preview"),
|
||||
includeText ? rs.getString("text_body") : null
|
||||
);
|
||||
}
|
||||
|
||||
private <T extends Enum<T>> T enumValue(Class<T> type, String value) {
|
||||
return value == null ? null : Enum.valueOf(type, value);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
package at.procon.dip.clustering.repository;
|
||||
|
||||
import at.procon.dip.clustering.entity.EmbeddingClusterRun;
|
||||
import java.util.UUID;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.data.jpa.repository.JpaSpecificationExecutor;
|
||||
|
||||
public interface EmbeddingClusterRunRepository
|
||||
extends JpaRepository<EmbeddingClusterRun, UUID>, JpaSpecificationExecutor<EmbeddingClusterRun> {
|
||||
}
|
||||
|
|
@ -0,0 +1,45 @@
|
|||
package at.procon.dip.clustering.repository;
|
||||
|
||||
import at.procon.dip.clustering.ClusterRunStatus;
|
||||
import at.procon.dip.clustering.ClusteringAlgorithm;
|
||||
import at.procon.dip.clustering.ClusteringExecutionBackend;
|
||||
import at.procon.dip.clustering.entity.EmbeddingClusterRun;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import java.time.OffsetDateTime;
|
||||
import org.springframework.data.jpa.domain.Specification;
|
||||
|
||||
public final class EmbeddingClusterRunSpecifications {
|
||||
|
||||
private EmbeddingClusterRunSpecifications() {
|
||||
}
|
||||
|
||||
public static Specification<EmbeddingClusterRun> hasStatus(ClusterRunStatus status) {
|
||||
return (root, query, cb) -> status == null ? null : cb.equal(root.get("status"), status);
|
||||
}
|
||||
|
||||
public static Specification<EmbeddingClusterRun> hasAlgorithm(ClusteringAlgorithm algorithm) {
|
||||
return (root, query, cb) -> algorithm == null ? null : cb.equal(root.get("algorithm"), algorithm);
|
||||
}
|
||||
|
||||
public static Specification<EmbeddingClusterRun> hasExecutionBackend(ClusteringExecutionBackend executionBackend) {
|
||||
return (root, query, cb) -> executionBackend == null ? null : cb.equal(root.get("executionBackend"), executionBackend);
|
||||
}
|
||||
|
||||
public static Specification<EmbeddingClusterRun> hasDocumentType(DocumentType documentType) {
|
||||
return (root, query, cb) -> documentType == null ? null : cb.equal(root.get("documentType"), documentType);
|
||||
}
|
||||
|
||||
public static Specification<EmbeddingClusterRun> nameContains(String value) {
|
||||
return (root, query, cb) -> value == null || value.isBlank()
|
||||
? null
|
||||
: cb.like(cb.lower(root.get("name")), "%" + value.toLowerCase() + "%");
|
||||
}
|
||||
|
||||
public static Specification<EmbeddingClusterRun> createdAtFrom(OffsetDateTime value) {
|
||||
return (root, query, cb) -> value == null ? null : cb.greaterThanOrEqualTo(root.get("createdAt"), value);
|
||||
}
|
||||
|
||||
public static Specification<EmbeddingClusterRun> createdAtTo(OffsetDateTime value) {
|
||||
return (root, query, cb) -> value == null ? null : cb.lessThan(root.get("createdAt"), value);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
package at.procon.dip.clustering.repository;
|
||||
|
||||
import at.procon.dip.clustering.entity.EmbeddingClusterSet;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
public interface EmbeddingClusterSetRepository extends JpaRepository<EmbeddingClusterSet, UUID> {
|
||||
Optional<EmbeddingClusterSet> findByCode(String code);
|
||||
|
||||
boolean existsByCodeIgnoreCaseAndIdNot(String code, UUID id);
|
||||
|
||||
boolean existsByCodeIgnoreCase(String code);
|
||||
|
||||
List<EmbeddingClusterSet> findAllByActiveOrderByCodeAsc(boolean active);
|
||||
|
||||
List<EmbeddingClusterSet> findAllByOrderByCodeAsc();
|
||||
}
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
package at.procon.dip.clustering.service;
|
||||
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.scheduling.annotation.Async;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class EmbeddingClusterAsyncExecutionService {
|
||||
|
||||
private final EmbeddingClusterRunService runService;
|
||||
|
||||
@Async("clusteringRunExecutor")
|
||||
public void executeRunAsync(UUID runId) {
|
||||
try {
|
||||
runService.executeQueuedRun(runId);
|
||||
} catch (Exception ex) {
|
||||
log.error("Cluster run {} failed during async execution: {}", runId, ex.getMessage(), ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
package at.procon.dip.clustering.service;
|
||||
|
||||
import at.procon.dip.clustering.dto.ClusteringEngineAssignment;
|
||||
import at.procon.dip.clustering.dto.ClusteringEngineCluster;
|
||||
import at.procon.dip.clustering.dto.ClusteringEngineResult;
|
||||
import at.procon.dip.clustering.entity.EmbeddingCluster;
|
||||
import at.procon.dip.clustering.entity.EmbeddingClusterRun;
|
||||
import at.procon.dip.clustering.repository.EmbeddingClusterRepository;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
|
||||
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class EmbeddingClusterPersistenceService {
|
||||
|
||||
private final EmbeddingClusterRepository clusterRepository;
|
||||
private final NamedParameterJdbcTemplate jdbcTemplate;
|
||||
|
||||
@Transactional
|
||||
public void persist(EmbeddingClusterRun run,
|
||||
List<ClusteringEngineCluster> clusters,
|
||||
List<ClusteringEngineAssignment> assignments) {
|
||||
persist(run, new ClusteringEngineResult(clusters, assignments));
|
||||
}
|
||||
|
||||
@Transactional
|
||||
public void persist(EmbeddingClusterRun run, ClusteringEngineResult result) {
|
||||
Map<Integer, EmbeddingCluster> clusterByLabel = new HashMap<>();
|
||||
for (ClusteringEngineCluster cluster : result.clusters()) {
|
||||
EmbeddingCluster saved = clusterRepository.save(EmbeddingCluster.builder()
|
||||
.clusterRun(run)
|
||||
.clusterLabel(cluster.clusterLabel())
|
||||
.itemCount(cluster.itemCount())
|
||||
.noiseCluster(cluster.noiseCluster())
|
||||
.build());
|
||||
clusterRepository.flush();
|
||||
clusterByLabel.put(cluster.clusterLabel(), saved);
|
||||
}
|
||||
|
||||
if (result.assignments().isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
MapSqlParameterSource[] batch = result.assignments().stream()
|
||||
.map(assignment -> toInsertParams(run, clusterByLabel.get(assignment.clusterLabel()), assignment))
|
||||
.toArray(MapSqlParameterSource[]::new);
|
||||
|
||||
jdbcTemplate.batchUpdate("""
|
||||
insert into doc.doc_embedding_cluster_assignment (
|
||||
id,
|
||||
cluster_run_id,
|
||||
cluster_id,
|
||||
embedding_id,
|
||||
document_id,
|
||||
representation_id,
|
||||
cluster_label_raw,
|
||||
membership_score,
|
||||
distance_to_centroid,
|
||||
is_noise,
|
||||
created_at
|
||||
)
|
||||
select
|
||||
:id,
|
||||
:clusterRunId,
|
||||
:clusterId,
|
||||
e.id,
|
||||
e.document_id,
|
||||
e.representation_id,
|
||||
:clusterLabelRaw,
|
||||
:membershipScore,
|
||||
:distanceToCentroid,
|
||||
:isNoise,
|
||||
:createdAt
|
||||
from doc.doc_embedding e
|
||||
where e.id = :embeddingId
|
||||
""", batch);
|
||||
}
|
||||
|
||||
private MapSqlParameterSource toInsertParams(
|
||||
EmbeddingClusterRun run,
|
||||
EmbeddingCluster cluster,
|
||||
ClusteringEngineAssignment assignment
|
||||
) {
|
||||
return new MapSqlParameterSource()
|
||||
.addValue("id", UUID.randomUUID())
|
||||
.addValue("clusterRunId", run.getId())
|
||||
.addValue("clusterId", cluster == null ? null : cluster.getId())
|
||||
.addValue("embeddingId", assignment.embeddingId())
|
||||
.addValue("clusterLabelRaw", assignment.clusterLabel())
|
||||
.addValue("membershipScore", assignment.membershipScore())
|
||||
.addValue("distanceToCentroid", assignment.distanceToCentroid())
|
||||
.addValue("isNoise", assignment.noise())
|
||||
.addValue("createdAt", OffsetDateTime.now());
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,580 @@
|
|||
package at.procon.dip.clustering.service;
|
||||
|
||||
import static at.procon.dip.clustering.repository.EmbeddingClusterRunSpecifications.createdAtFrom;
|
||||
import static at.procon.dip.clustering.repository.EmbeddingClusterRunSpecifications.createdAtTo;
|
||||
import static at.procon.dip.clustering.repository.EmbeddingClusterRunSpecifications.hasAlgorithm;
|
||||
import static at.procon.dip.clustering.repository.EmbeddingClusterRunSpecifications.hasDocumentType;
|
||||
import static at.procon.dip.clustering.repository.EmbeddingClusterRunSpecifications.hasExecutionBackend;
|
||||
import static at.procon.dip.clustering.repository.EmbeddingClusterRunSpecifications.hasStatus;
|
||||
import static at.procon.dip.clustering.repository.EmbeddingClusterRunSpecifications.nameContains;
|
||||
|
||||
import at.procon.dip.clustering.ClusterRunStatus;
|
||||
import at.procon.dip.clustering.ClusteringAlgorithm;
|
||||
import at.procon.dip.clustering.ClusteringExecutionBackend;
|
||||
import at.procon.dip.clustering.ReductionMethod;
|
||||
import at.procon.dip.clustering.PythonRequestMode;
|
||||
import at.procon.dip.clustering.client.PythonClusteringClient;
|
||||
import at.procon.dip.clustering.config.ClusteringPhaseBProperties;
|
||||
import at.procon.dip.clustering.dto.ClusterAssignmentViewResponse;
|
||||
import at.procon.dip.clustering.dto.ClusterMembersResponse;
|
||||
import at.procon.dip.clustering.dto.ClusterResponse;
|
||||
import at.procon.dip.clustering.dto.ClusterRunResponse;
|
||||
import at.procon.dip.clustering.dto.ClusteringEngineAssignment;
|
||||
import at.procon.dip.clustering.dto.ClusteringEngineCluster;
|
||||
import at.procon.dip.clustering.dto.ClusteringEngineRequest;
|
||||
import at.procon.dip.clustering.dto.ClusteringEngineResult;
|
||||
import at.procon.dip.clustering.dto.CreateClusterRunRequest;
|
||||
import at.procon.dip.clustering.dto.EmbeddingSelectionSpec;
|
||||
import at.procon.dip.clustering.dto.PythonClusteringRequest;
|
||||
import at.procon.dip.clustering.dto.PythonClusteringResponse;
|
||||
import at.procon.dip.clustering.dto.PythonRunExecutionRequest;
|
||||
import at.procon.dip.clustering.dto.ReductionConfig;
|
||||
import at.procon.dip.clustering.dto.SelectedEmbeddingRow;
|
||||
import at.procon.dip.clustering.entity.EmbeddingClusterRun;
|
||||
import at.procon.dip.clustering.entity.EmbeddingClusterSet;
|
||||
import at.procon.dip.clustering.repository.EmbeddingClusterRepository;
|
||||
import at.procon.dip.clustering.repository.EmbeddingClusterResultQueryRepository;
|
||||
import at.procon.dip.clustering.repository.EmbeddingClusterRunRepository;
|
||||
import at.procon.dip.clustering.repository.EmbeddingClusterSetRepository;
|
||||
import at.procon.dip.clustering.spi.EmbeddingClusteringEngine;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.core.type.TypeReference;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.data.domain.Sort;
|
||||
import org.springframework.data.jpa.domain.Specification;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
import org.springframework.web.server.ResponseStatusException;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class EmbeddingClusterRunService {
|
||||
|
||||
private static final int DEFAULT_PREVIEW_LENGTH = 1000;
|
||||
private static final TypeReference<Map<String, Object>> PARAMETERS_TYPE = new TypeReference<>() {
|
||||
};
|
||||
private static final Set<ClusterRunStatus> STARTABLE_STATUSES = Set.of(ClusterRunStatus.CREATED);
|
||||
private static final Set<ClusterRunStatus> FINAL_STATUSES = Set.of(
|
||||
ClusterRunStatus.COMPLETED, ClusterRunStatus.FAILED, ClusterRunStatus.CANCELLED);
|
||||
|
||||
private final EmbeddingClusterRunRepository runRepository;
|
||||
private final EmbeddingClusterSetRepository clusterSetRepository;
|
||||
private final EmbeddingClusterRepository clusterRepository;
|
||||
private final EmbeddingClusterResultQueryRepository resultQueryRepository;
|
||||
private final EmbeddingSelectionService selectionService;
|
||||
private final EmbeddingClusterPersistenceService persistenceService;
|
||||
private final List<EmbeddingClusteringEngine> clusteringEngines;
|
||||
private final ClusteringPhaseBProperties pythonProperties;
|
||||
private final Optional<PythonClusteringClient> pythonClusteringClient;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
@Transactional
|
||||
public ClusterRunResponse createRun(CreateClusterRunRequest request) {
|
||||
EmbeddingClusterRun run = EmbeddingClusterRun.builder()
|
||||
.clusterSet(resolveClusterSet(request.clusterSetCode()))
|
||||
.name(request.name())
|
||||
.status(ClusterRunStatus.CREATED)
|
||||
.algorithm(request.algorithm())
|
||||
.algorithmVersion("phase-e-dual-python")
|
||||
.executionBackend(resolveExecutionBackend(request))
|
||||
.reductionMethod(resolveReductionMethod(request.reduction()))
|
||||
.reductionDimensions(resolveReductionDimensions(request.reduction()))
|
||||
.selectionJson(writeJson(request.selection()))
|
||||
.parametersJson(writeJson(request.resolvedParameters()))
|
||||
.documentType(firstOrNull(request.selection() == null ? null : request.selection().documentTypes()))
|
||||
.documentFamily(firstOrNull(request.selection() == null ? null : request.selection().documentFamilies()))
|
||||
.representationType(firstOrNull(request.selection() == null ? null : request.selection().representationTypes()))
|
||||
.builderKey(firstOrNullString(request.selection() == null ? null : request.selection().builderKeys()))
|
||||
.build();
|
||||
return toResponse(runRepository.save(run));
|
||||
}
|
||||
|
||||
public ClusterRunResponse executeRun(UUID runId) {
|
||||
EmbeddingClusterRun run = loadRun(runId);
|
||||
return run.getExecutionBackend() == ClusteringExecutionBackend.PYTHON_REMOTE
|
||||
? executeRunRemote(run)
|
||||
: executeRunLocal(run);
|
||||
}
|
||||
|
||||
@Transactional
|
||||
public ClusterRunResponse queueRun(UUID runId) {
|
||||
EmbeddingClusterRun run = loadRun(runId);
|
||||
if (!STARTABLE_STATUSES.contains(run.getStatus())) {
|
||||
throw new ResponseStatusException(HttpStatus.CONFLICT,
|
||||
"Cluster run cannot be started from status " + run.getStatus());
|
||||
}
|
||||
run.setStatus(ClusterRunStatus.QUEUED);
|
||||
run.setStartedAt(null);
|
||||
run.setFinishedAt(null);
|
||||
run.setErrorMessage(null);
|
||||
run.setItemCount(null);
|
||||
run.setClusterCount(null);
|
||||
run.setNoiseCount(null);
|
||||
return toResponse(runRepository.save(run));
|
||||
}
|
||||
|
||||
public ClusterRunResponse executeQueuedRun(UUID runId) {
|
||||
EmbeddingClusterRun run = loadRun(runId);
|
||||
if (run.getStatus() == ClusterRunStatus.CANCELLED) {
|
||||
return toResponse(run);
|
||||
}
|
||||
if (run.getStatus() != ClusterRunStatus.QUEUED && run.getStatus() != ClusterRunStatus.CANCEL_REQUESTED) {
|
||||
throw new ResponseStatusException(HttpStatus.CONFLICT,
|
||||
"Cluster run is not queued for execution: " + runId);
|
||||
}
|
||||
|
||||
transitionToRunning(runId);
|
||||
|
||||
if (loadRun(runId).getExecutionBackend() == ClusteringExecutionBackend.PYTHON_REMOTE) {
|
||||
return executeQueuedRemoteRun(runId);
|
||||
}
|
||||
return executeQueuedLocalRun(runId);
|
||||
}
|
||||
|
||||
public ClusterRunResponse getRun(UUID runId) {
|
||||
return toResponse(loadRun(runId));
|
||||
}
|
||||
|
||||
public List<ClusterRunResponse> listRuns(
|
||||
ClusterRunStatus status,
|
||||
ClusteringAlgorithm algorithm,
|
||||
ClusteringExecutionBackend executionBackend,
|
||||
DocumentType documentType,
|
||||
String nameLike,
|
||||
OffsetDateTime createdFrom,
|
||||
OffsetDateTime createdTo) {
|
||||
|
||||
Specification<EmbeddingClusterRun> spec = Specification
|
||||
.where(hasStatus(status))
|
||||
.and(hasAlgorithm(algorithm))
|
||||
.and(hasExecutionBackend(executionBackend))
|
||||
.and(hasDocumentType(documentType))
|
||||
.and(nameContains(nameLike))
|
||||
.and(createdAtFrom(createdFrom))
|
||||
.and(createdAtTo(createdTo));
|
||||
|
||||
return runRepository.findAll(spec, Sort.by(Sort.Direction.DESC, "createdAt")).stream()
|
||||
.map(this::toResponse)
|
||||
.toList();
|
||||
}
|
||||
|
||||
@Transactional
|
||||
public ClusterRunResponse requestCancellation(UUID runId) {
|
||||
EmbeddingClusterRun run = loadRun(runId);
|
||||
if (FINAL_STATUSES.contains(run.getStatus())) {
|
||||
return toResponse(run);
|
||||
}
|
||||
if (run.getStatus() == ClusterRunStatus.CREATED || run.getStatus() == ClusterRunStatus.QUEUED) {
|
||||
run.setStatus(ClusterRunStatus.CANCELLED);
|
||||
run.setFinishedAt(OffsetDateTime.now());
|
||||
run.setErrorMessage("Cluster run was cancelled");
|
||||
return toResponse(runRepository.save(run));
|
||||
}
|
||||
if (run.getStatus() == ClusterRunStatus.RUNNING) {
|
||||
run.setStatus(ClusterRunStatus.CANCEL_REQUESTED);
|
||||
run.setErrorMessage("Cluster run cancellation requested");
|
||||
return toResponse(runRepository.save(run));
|
||||
}
|
||||
return toResponse(run);
|
||||
}
|
||||
|
||||
public List<ClusterResponse> listClusters(UUID runId) {
|
||||
ensureRunExists(runId);
|
||||
return clusterRepository.findByClusterRun_IdOrderByClusterLabelAsc(runId).stream()
|
||||
.map(cluster -> new ClusterResponse(
|
||||
cluster.getId(),
|
||||
cluster.getClusterLabel(),
|
||||
cluster.getDisplayName(),
|
||||
cluster.getItemCount(),
|
||||
cluster.isNoiseCluster(),
|
||||
cluster.getSummaryText()))
|
||||
.toList();
|
||||
}
|
||||
|
||||
public List<ClusterAssignmentViewResponse> listAssignments(UUID runId, boolean includeText) {
|
||||
ensureRunExists(runId);
|
||||
return resultQueryRepository.findAssignments(runId, includeText, DEFAULT_PREVIEW_LENGTH);
|
||||
}
|
||||
|
||||
public List<ClusterMembersResponse> listClusterMembers(UUID runId, UUID clusterId, boolean includeText) {
|
||||
ensureRunExists(runId);
|
||||
return resultQueryRepository.findClusterMembers(runId, clusterId, includeText, DEFAULT_PREVIEW_LENGTH);
|
||||
}
|
||||
|
||||
private ClusterRunResponse executeRunRemote(EmbeddingClusterRun run) {
|
||||
EmbeddingSelectionSpec selection = readSelection(run.getSelectionJson());
|
||||
List<SelectedEmbeddingRow> selected = null;
|
||||
long itemCount;
|
||||
|
||||
if (useCompactPythonRunMode()) {
|
||||
itemCount = selectionService.count(selection);
|
||||
} else {
|
||||
selected = selectionService.load(selection);
|
||||
itemCount = selected.size();
|
||||
}
|
||||
|
||||
if (itemCount == 0L) {
|
||||
failRun(run.getId(), "Selection contains no completed embeddings");
|
||||
return getRun(run.getId());
|
||||
}
|
||||
|
||||
run.setStatus(ClusterRunStatus.RUNNING);
|
||||
run.setStartedAt(OffsetDateTime.now());
|
||||
run.setItemCount(itemCount);
|
||||
run.setErrorMessage(null);
|
||||
run.setFinishedAt(null);
|
||||
run.setClusterCount(null);
|
||||
run.setNoiseCount(null);
|
||||
runRepository.save(run);
|
||||
|
||||
try {
|
||||
ClusteringEngineResult result = executePythonClustering(run, selected);
|
||||
persistenceService.persist(run, result);
|
||||
run.setStatus(ClusterRunStatus.COMPLETED);
|
||||
run.setFinishedAt(OffsetDateTime.now());
|
||||
run.setClusterCount(result.clusters().stream().filter(cluster -> !cluster.noiseCluster()).count());
|
||||
run.setNoiseCount(result.noiseCount());
|
||||
runRepository.save(run);
|
||||
return toResponse(run);
|
||||
} catch (Exception ex) {
|
||||
failRun(run.getId(), ex.getMessage());
|
||||
return getRun(run.getId());
|
||||
}
|
||||
}
|
||||
|
||||
private ClusterRunResponse executeRunLocal(EmbeddingClusterRun run) {
|
||||
EmbeddingSelectionSpec selection = readSelection(run.getSelectionJson());
|
||||
List<SelectedEmbeddingRow> selected = selectionService.load(selection);
|
||||
if (selected.isEmpty()) {
|
||||
failRun(run.getId(), "Selection contains no completed embeddings");
|
||||
return getRun(run.getId());
|
||||
}
|
||||
|
||||
run.setStatus(ClusterRunStatus.RUNNING);
|
||||
run.setStartedAt(OffsetDateTime.now());
|
||||
run.setItemCount((long) selected.size());
|
||||
run.setErrorMessage(null);
|
||||
run.setFinishedAt(null);
|
||||
run.setClusterCount(null);
|
||||
run.setNoiseCount(null);
|
||||
runRepository.save(run);
|
||||
|
||||
try {
|
||||
ClusteringEngineResult result = executeLocalClustering(run, selected);
|
||||
persistenceService.persist(run, result);
|
||||
run.setStatus(ClusterRunStatus.COMPLETED);
|
||||
run.setFinishedAt(OffsetDateTime.now());
|
||||
run.setClusterCount(result.clusters().stream().filter(cluster -> !cluster.noiseCluster()).count());
|
||||
run.setNoiseCount(result.noiseCount());
|
||||
runRepository.save(run);
|
||||
return toResponse(run);
|
||||
} catch (Exception ex) {
|
||||
failRun(run.getId(), ex.getMessage());
|
||||
return getRun(run.getId());
|
||||
}
|
||||
}
|
||||
|
||||
private ClusterRunResponse executeQueuedRemoteRun(UUID runId) {
|
||||
EmbeddingClusterRun run = loadRun(runId);
|
||||
EmbeddingSelectionSpec selection = readSelection(run.getSelectionJson());
|
||||
List<SelectedEmbeddingRow> selected = null;
|
||||
long itemCount;
|
||||
|
||||
if (useCompactPythonRunMode()) {
|
||||
itemCount = selectionService.count(selection);
|
||||
} else {
|
||||
selected = selectionService.load(selection);
|
||||
itemCount = selected.size();
|
||||
}
|
||||
|
||||
if (itemCount == 0L) {
|
||||
failRun(runId, "Selection contains no completed embeddings");
|
||||
return getRun(runId);
|
||||
}
|
||||
updateItemCount(runId, itemCount);
|
||||
|
||||
if (isCancellationRequested(runId)) {
|
||||
cancelRunNow(runId, "Cluster run was cancelled before clustering started");
|
||||
return getRun(runId);
|
||||
}
|
||||
|
||||
try {
|
||||
ClusteringEngineResult result = executePythonClustering(loadRun(runId), selected);
|
||||
if (isCancellationRequested(runId)) {
|
||||
cancelRunNow(runId, "Cluster run was cancelled before results were persisted");
|
||||
return getRun(runId);
|
||||
}
|
||||
persistenceService.persist(loadRun(runId), result);
|
||||
completeRun(runId, result);
|
||||
return getRun(runId);
|
||||
} catch (Exception ex) {
|
||||
failRun(runId, ex.getMessage());
|
||||
return getRun(runId);
|
||||
}
|
||||
}
|
||||
|
||||
private ClusterRunResponse executeQueuedLocalRun(UUID runId) {
|
||||
EmbeddingSelectionSpec selection = readSelection(loadRun(runId).getSelectionJson());
|
||||
List<SelectedEmbeddingRow> selected = selectionService.load(selection);
|
||||
if (selected.isEmpty()) {
|
||||
failRun(runId, "Selection contains no completed embeddings");
|
||||
return getRun(runId);
|
||||
}
|
||||
updateItemCount(runId, (long) selected.size());
|
||||
|
||||
if (isCancellationRequested(runId)) {
|
||||
cancelRunNow(runId, "Cluster run was cancelled before clustering started");
|
||||
return getRun(runId);
|
||||
}
|
||||
|
||||
try {
|
||||
ClusteringEngineResult result = executeLocalClustering(loadRun(runId), selected);
|
||||
if (isCancellationRequested(runId)) {
|
||||
cancelRunNow(runId, "Cluster run was cancelled before results were persisted");
|
||||
return getRun(runId);
|
||||
}
|
||||
persistenceService.persist(loadRun(runId), result);
|
||||
completeRun(runId, result);
|
||||
return getRun(runId);
|
||||
} catch (Exception ex) {
|
||||
failRun(runId, ex.getMessage());
|
||||
return getRun(runId);
|
||||
}
|
||||
}
|
||||
|
||||
private void transitionToRunning(UUID runId) {
|
||||
EmbeddingClusterRun run = loadRun(runId);
|
||||
if (run.getStatus() == ClusterRunStatus.CANCELLED) {
|
||||
return;
|
||||
}
|
||||
run.setStatus(ClusterRunStatus.RUNNING);
|
||||
run.setStartedAt(OffsetDateTime.now());
|
||||
run.setFinishedAt(null);
|
||||
run.setErrorMessage(null);
|
||||
runRepository.save(run);
|
||||
}
|
||||
|
||||
private void updateItemCount(UUID runId, Long itemCount) {
|
||||
EmbeddingClusterRun run = loadRun(runId);
|
||||
run.setItemCount(itemCount);
|
||||
runRepository.save(run);
|
||||
}
|
||||
|
||||
private void completeRun(UUID runId, ClusteringEngineResult result) {
|
||||
EmbeddingClusterRun run = loadRun(runId);
|
||||
run.setStatus(ClusterRunStatus.COMPLETED);
|
||||
run.setFinishedAt(OffsetDateTime.now());
|
||||
run.setClusterCount(result.clusters().stream().filter(cluster -> !cluster.noiseCluster()).count());
|
||||
run.setNoiseCount(result.noiseCount());
|
||||
run.setErrorMessage(null);
|
||||
runRepository.save(run);
|
||||
}
|
||||
|
||||
private void cancelRunNow(UUID runId, String message) {
|
||||
EmbeddingClusterRun run = loadRun(runId);
|
||||
run.setStatus(ClusterRunStatus.CANCELLED);
|
||||
run.setFinishedAt(OffsetDateTime.now());
|
||||
run.setErrorMessage(message);
|
||||
runRepository.save(run);
|
||||
}
|
||||
|
||||
private boolean isCancellationRequested(UUID runId) {
|
||||
ClusterRunStatus status = loadRun(runId).getStatus();
|
||||
return status == ClusterRunStatus.CANCEL_REQUESTED || status == ClusterRunStatus.CANCELLED;
|
||||
}
|
||||
|
||||
private ClusteringEngineResult executePythonClustering(EmbeddingClusterRun run, List<SelectedEmbeddingRow> selected) {
|
||||
PythonClusteringClient client = pythonClusteringClient.orElseThrow(() -> new ResponseStatusException(
|
||||
HttpStatus.BAD_REQUEST,
|
||||
"Cluster run requires PYTHON_REMOTE backend but no Python client is configured"));
|
||||
|
||||
PythonClusteringResponse response = useCompactPythonRunMode()
|
||||
? client.clusterRun(new PythonRunExecutionRequest(run.getId()))
|
||||
: client.cluster(new PythonClusteringRequest(
|
||||
run.getAlgorithm(),
|
||||
readParameters(run.getParametersJson()),
|
||||
run.getReductionMethod(),
|
||||
run.getReductionDimensions(),
|
||||
selected == null ? List.of() : selected.stream()
|
||||
.map(item -> new PythonClusteringRequest.PythonClusteringItem(
|
||||
item.embeddingId(),
|
||||
item.documentId(),
|
||||
item.representationId(),
|
||||
item.embeddingVector()))
|
||||
.toList()));
|
||||
|
||||
return mapPythonResponse(response);
|
||||
}
|
||||
|
||||
private boolean useCompactPythonRunMode() {
|
||||
return pythonProperties.effectiveRequestMode() == PythonRequestMode.RUN_ID;
|
||||
}
|
||||
|
||||
private ClusteringEngineResult executeLocalClustering(EmbeddingClusterRun run, List<SelectedEmbeddingRow> selected) {
|
||||
EmbeddingClusteringEngine engine = resolveEngine(run.getAlgorithm());
|
||||
return engine.cluster(selected, new ClusteringEngineRequest(readParameters(run.getParametersJson())));
|
||||
}
|
||||
|
||||
private ClusteringEngineResult mapPythonResponse(PythonClusteringResponse response) {
|
||||
long noiseCount = response.noiseCount() == null
|
||||
? response.assignments().stream().filter(assignment -> Boolean.TRUE.equals(assignment.noise())).count()
|
||||
: response.noiseCount();
|
||||
|
||||
return new ClusteringEngineResult(
|
||||
response.clusters().stream()
|
||||
.map(cluster -> new ClusteringEngineCluster(
|
||||
cluster.clusterLabel(),
|
||||
cluster.itemCount(),
|
||||
Boolean.TRUE.equals(cluster.noiseCluster())))
|
||||
.toList(),
|
||||
response.assignments().stream()
|
||||
.map(assignment -> new ClusteringEngineAssignment(
|
||||
assignment.embeddingId(),
|
||||
assignment.documentId(),
|
||||
assignment.representationId(),
|
||||
assignment.clusterLabel(),
|
||||
assignment.distanceToCentroid(),
|
||||
assignment.membershipScore(),
|
||||
Boolean.TRUE.equals(assignment.noise())))
|
||||
.toList(),
|
||||
noiseCount);
|
||||
}
|
||||
|
||||
private EmbeddingClusterSet resolveClusterSet(String clusterSetCode) {
|
||||
if (clusterSetCode == null || clusterSetCode.isBlank()) {
|
||||
return null;
|
||||
}
|
||||
return clusterSetRepository.findByCode(clusterSetCode)
|
||||
.orElseThrow(() -> new ResponseStatusException(HttpStatus.BAD_REQUEST,
|
||||
"Cluster set not found: " + clusterSetCode));
|
||||
}
|
||||
|
||||
private void ensureRunExists(UUID runId) {
|
||||
if (!runRepository.existsById(runId)) {
|
||||
throw new ResponseStatusException(HttpStatus.NOT_FOUND, "Cluster run not found: " + runId);
|
||||
}
|
||||
}
|
||||
|
||||
private void failRun(UUID runId, String message) {
|
||||
EmbeddingClusterRun run = loadRun(runId);
|
||||
run.setStatus(ClusterRunStatus.FAILED);
|
||||
run.setFinishedAt(OffsetDateTime.now());
|
||||
run.setErrorMessage(message);
|
||||
runRepository.save(run);
|
||||
}
|
||||
|
||||
private EmbeddingClusteringEngine resolveEngine(ClusteringAlgorithm algorithm) {
|
||||
return clusteringEngines.stream()
|
||||
.filter(engine -> engine.algorithm() == algorithm)
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new ResponseStatusException(HttpStatus.BAD_REQUEST,
|
||||
"No clustering engine registered for algorithm " + algorithm));
|
||||
}
|
||||
|
||||
private EmbeddingSelectionSpec readSelection(String json) {
|
||||
try {
|
||||
return objectMapper.readValue(json, EmbeddingSelectionSpec.class);
|
||||
} catch (JsonProcessingException e) {
|
||||
throw new ResponseStatusException(HttpStatus.INTERNAL_SERVER_ERROR,
|
||||
"Cannot parse stored clustering selection", e);
|
||||
}
|
||||
}
|
||||
|
||||
private Map<String, Object> readParameters(String json) {
|
||||
if (json == null || json.isBlank()) {
|
||||
return Map.of();
|
||||
}
|
||||
try {
|
||||
Map<String, Object> values = objectMapper.readValue(json, PARAMETERS_TYPE);
|
||||
return values == null ? Map.of() : values;
|
||||
} catch (JsonProcessingException e) {
|
||||
throw new ResponseStatusException(HttpStatus.INTERNAL_SERVER_ERROR,
|
||||
"Cannot parse clustering parameters", e);
|
||||
}
|
||||
}
|
||||
|
||||
private String writeJson(Object value) {
|
||||
try {
|
||||
return objectMapper.writeValueAsString(value);
|
||||
} catch (JsonProcessingException e) {
|
||||
throw new ResponseStatusException(HttpStatus.INTERNAL_SERVER_ERROR,
|
||||
"Cannot serialize clustering payload", e);
|
||||
}
|
||||
}
|
||||
|
||||
private ClusteringExecutionBackend resolveExecutionBackend(CreateClusterRunRequest request) {
|
||||
if (request.executionBackend() != null) {
|
||||
if (request.executionBackend() == ClusteringExecutionBackend.JAVA_LOCAL
|
||||
&& request.reduction() != null
|
||||
&& request.reduction().method() != null
|
||||
&& request.reduction().method() != ReductionMethod.NONE) {
|
||||
throw new ResponseStatusException(HttpStatus.BAD_REQUEST,
|
||||
"JAVA_LOCAL backend does not support PCA/UMAP reduction; use PYTHON_REMOTE");
|
||||
}
|
||||
return request.executionBackend();
|
||||
}
|
||||
if (request.reduction() != null && request.reduction().method() != null && request.reduction().method() != ReductionMethod.NONE) {
|
||||
return ClusteringExecutionBackend.PYTHON_REMOTE;
|
||||
}
|
||||
return switch (request.algorithm()) {
|
||||
case KMEANS -> ClusteringExecutionBackend.JAVA_LOCAL;
|
||||
default -> ClusteringExecutionBackend.PYTHON_REMOTE;
|
||||
};
|
||||
}
|
||||
|
||||
private ReductionMethod resolveReductionMethod(ReductionConfig reduction) {
|
||||
if (reduction == null || reduction.method() == null) {
|
||||
return ReductionMethod.NONE;
|
||||
}
|
||||
return reduction.method();
|
||||
}
|
||||
|
||||
private Integer resolveReductionDimensions(ReductionConfig reduction) {
|
||||
return reduction == null ? null : reduction.targetDimensions();
|
||||
}
|
||||
|
||||
private <T> T firstOrNull(Iterable<T> values) {
|
||||
if (values == null) {
|
||||
return null;
|
||||
}
|
||||
for (T value : values) {
|
||||
return value;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private String firstOrNullString(Iterable<String> values) {
|
||||
return firstOrNull(values);
|
||||
}
|
||||
|
||||
private ClusterRunResponse toResponse(EmbeddingClusterRun run) {
|
||||
return new ClusterRunResponse(
|
||||
run.getId(),
|
||||
run.getName(),
|
||||
run.getStatus(),
|
||||
run.getAlgorithm(),
|
||||
run.getExecutionBackend(),
|
||||
run.getReductionMethod(),
|
||||
run.getReductionDimensions(),
|
||||
run.getItemCount(),
|
||||
run.getClusterCount(),
|
||||
run.getNoiseCount(),
|
||||
run.getStartedAt(),
|
||||
run.getFinishedAt(),
|
||||
run.getErrorMessage());
|
||||
}
|
||||
|
||||
private EmbeddingClusterRun loadRun(UUID runId) {
|
||||
return runRepository.findById(runId)
|
||||
.orElseThrow(() -> new ResponseStatusException(HttpStatus.NOT_FOUND, "Cluster run not found: " + runId));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
package at.procon.dip.clustering.service;
|
||||
|
||||
import at.procon.dip.clustering.dto.ClusterSetResponse;
|
||||
import at.procon.dip.clustering.dto.CreateClusterSetRequest;
|
||||
import at.procon.dip.clustering.dto.EmbeddingSelectionSpec;
|
||||
import at.procon.dip.clustering.dto.UpdateClusterSetRequest;
|
||||
import at.procon.dip.clustering.entity.EmbeddingClusterSet;
|
||||
import at.procon.dip.clustering.repository.EmbeddingClusterSetRepository;
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.server.ResponseStatusException;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class EmbeddingClusterSetService {
|
||||
|
||||
private final EmbeddingClusterSetRepository clusterSetRepository;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
@Transactional
|
||||
public ClusterSetResponse create(CreateClusterSetRequest request) {
|
||||
if (clusterSetRepository.existsByCodeIgnoreCase(request.code())) {
|
||||
throw new ResponseStatusException(HttpStatus.CONFLICT,
|
||||
"Cluster set code already exists: " + request.code());
|
||||
}
|
||||
EmbeddingClusterSet saved = clusterSetRepository.save(EmbeddingClusterSet.builder()
|
||||
.code(request.code().trim())
|
||||
.name(request.name().trim())
|
||||
.description(request.description())
|
||||
.selectionJson(writeJson(request.selection()))
|
||||
.active(request.active() == null || request.active())
|
||||
.build());
|
||||
return toResponse(saved);
|
||||
}
|
||||
|
||||
@Transactional
|
||||
public ClusterSetResponse update(UUID id, UpdateClusterSetRequest request) {
|
||||
EmbeddingClusterSet existing = clusterSetRepository.findById(id)
|
||||
.orElseThrow(() -> new ResponseStatusException(HttpStatus.NOT_FOUND, "Cluster set not found: " + id));
|
||||
existing.setName(request.name().trim());
|
||||
existing.setDescription(request.description());
|
||||
existing.setSelectionJson(writeJson(request.selection()));
|
||||
existing.setActive(request.active() == null || request.active());
|
||||
return toResponse(clusterSetRepository.save(existing));
|
||||
}
|
||||
|
||||
public ClusterSetResponse get(UUID id) {
|
||||
return toResponse(clusterSetRepository.findById(id)
|
||||
.orElseThrow(() -> new ResponseStatusException(HttpStatus.NOT_FOUND, "Cluster set not found: " + id)));
|
||||
}
|
||||
|
||||
public List<ClusterSetResponse> list(Boolean activeOnly) {
|
||||
List<EmbeddingClusterSet> sets = activeOnly == null
|
||||
? clusterSetRepository.findAllByOrderByCodeAsc()
|
||||
: clusterSetRepository.findAllByActiveOrderByCodeAsc(activeOnly);
|
||||
return sets.stream().map(this::toResponse).toList();
|
||||
}
|
||||
|
||||
private ClusterSetResponse toResponse(EmbeddingClusterSet entity) {
|
||||
return new ClusterSetResponse(
|
||||
entity.getId(),
|
||||
entity.getCode(),
|
||||
entity.getName(),
|
||||
entity.getDescription(),
|
||||
entity.isActive(),
|
||||
readSelection(entity.getSelectionJson()),
|
||||
entity.getCreatedAt(),
|
||||
entity.getUpdatedAt()
|
||||
);
|
||||
}
|
||||
|
||||
private EmbeddingSelectionSpec readSelection(String json) {
|
||||
try {
|
||||
return objectMapper.readValue(json, EmbeddingSelectionSpec.class);
|
||||
} catch (JsonProcessingException e) {
|
||||
throw new ResponseStatusException(HttpStatus.INTERNAL_SERVER_ERROR,
|
||||
"Cannot parse stored cluster set selection", e);
|
||||
}
|
||||
}
|
||||
|
||||
private String writeJson(Object value) {
|
||||
try {
|
||||
return objectMapper.writeValueAsString(value);
|
||||
} catch (JsonProcessingException e) {
|
||||
throw new ResponseStatusException(HttpStatus.INTERNAL_SERVER_ERROR,
|
||||
"Cannot serialize cluster set selection", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
package at.procon.dip.clustering.service;
|
||||
|
||||
import at.procon.dip.clustering.dto.EmbeddingSelectionSpec;
|
||||
import at.procon.dip.clustering.dto.SelectedEmbeddingRow;
|
||||
import at.procon.dip.clustering.repository.DocumentEmbeddingClusterSelectionRepository;
|
||||
import java.util.List;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class EmbeddingSelectionService {
|
||||
|
||||
private final DocumentEmbeddingClusterSelectionRepository selectionRepository;
|
||||
|
||||
public long count(EmbeddingSelectionSpec spec) {
|
||||
return selectionRepository.countSelection(spec);
|
||||
}
|
||||
|
||||
public List<SelectedEmbeddingRow> load(EmbeddingSelectionSpec spec) {
|
||||
return selectionRepository.findSelection(spec);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
package at.procon.dip.clustering.spi;
|
||||
|
||||
import at.procon.dip.clustering.ClusteringAlgorithm;
|
||||
import at.procon.dip.clustering.dto.ClusteringEngineRequest;
|
||||
import at.procon.dip.clustering.dto.ClusteringEngineResult;
|
||||
import at.procon.dip.clustering.dto.SelectedEmbeddingRow;
|
||||
import java.util.List;
|
||||
|
||||
public interface EmbeddingClusteringEngine {
|
||||
|
||||
ClusteringAlgorithm algorithm();
|
||||
|
||||
ClusteringEngineResult cluster(List<SelectedEmbeddingRow> items, ClusteringEngineRequest request);
|
||||
}
|
||||
|
|
@ -0,0 +1,104 @@
|
|||
package at.procon.dip.clustering.spi;
|
||||
|
||||
import at.procon.dip.clustering.ClusteringAlgorithm;
|
||||
import at.procon.dip.clustering.dto.ClusteringEngineAssignment;
|
||||
import at.procon.dip.clustering.dto.ClusteringEngineCluster;
|
||||
import at.procon.dip.clustering.dto.ClusteringEngineRequest;
|
||||
import at.procon.dip.clustering.dto.ClusteringEngineResult;
|
||||
import at.procon.dip.clustering.dto.SelectedEmbeddingRow;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.commons.math3.ml.clustering.CentroidCluster;
|
||||
import org.apache.commons.math3.ml.clustering.DoublePoint;
|
||||
import org.apache.commons.math3.ml.clustering.KMeansPlusPlusClusterer;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class KMeansClusteringEngine implements EmbeddingClusteringEngine {
|
||||
|
||||
@Override
|
||||
public ClusteringAlgorithm algorithm() {
|
||||
return ClusteringAlgorithm.KMEANS;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ClusteringEngineResult cluster(List<SelectedEmbeddingRow> items, ClusteringEngineRequest request) {
|
||||
if (items == null || items.isEmpty()) {
|
||||
throw new IllegalArgumentException("Selection contains no embeddings to cluster");
|
||||
}
|
||||
if (request == null) {
|
||||
throw new IllegalArgumentException("Missing clustering request");
|
||||
}
|
||||
|
||||
int k = request.requiredInt("k");
|
||||
if (k <= 0) {
|
||||
throw new IllegalArgumentException("KMeans requires k > 0");
|
||||
}
|
||||
if (k > items.size()) {
|
||||
throw new IllegalArgumentException("KMeans k must be <= selected item count");
|
||||
}
|
||||
|
||||
int maxIterations = request.intValue("maxIterations", 100);
|
||||
|
||||
List<IndexedPoint> points = new ArrayList<>(items.size());
|
||||
for (int i = 0; i < items.size(); i++) {
|
||||
points.add(new IndexedPoint(i, toDouble(items.get(i).embeddingVector())));
|
||||
}
|
||||
|
||||
KMeansPlusPlusClusterer<IndexedPoint> clusterer = new KMeansPlusPlusClusterer<>(k, maxIterations);
|
||||
List<CentroidCluster<IndexedPoint>> clusters = clusterer.cluster(points);
|
||||
|
||||
List<ClusteringEngineCluster> resultClusters = new ArrayList<>();
|
||||
List<ClusteringEngineAssignment> assignments = new ArrayList<>();
|
||||
|
||||
for (int label = 0; label < clusters.size(); label++) {
|
||||
CentroidCluster<IndexedPoint> cluster = clusters.get(label);
|
||||
resultClusters.add(new ClusteringEngineCluster(label, cluster.getPoints().size(), false));
|
||||
double[] centroid = cluster.getCenter().getPoint();
|
||||
for (IndexedPoint point : cluster.getPoints()) {
|
||||
SelectedEmbeddingRow item = items.get(point.index());
|
||||
assignments.add(new ClusteringEngineAssignment(
|
||||
item.embeddingId(),
|
||||
item.documentId(),
|
||||
item.representationId(),
|
||||
label,
|
||||
euclidean(point.getPoint(), centroid),
|
||||
null,
|
||||
false
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
return new ClusteringEngineResult(resultClusters, assignments, 0L);
|
||||
}
|
||||
|
||||
private double[] toDouble(float[] values) {
|
||||
double[] result = new double[values.length];
|
||||
for (int i = 0; i < values.length; i++) {
|
||||
result[i] = values[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private double euclidean(double[] a, double[] b) {
|
||||
double sum = 0.0d;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
double d = a[i] - b[i];
|
||||
sum += d * d;
|
||||
}
|
||||
return Math.sqrt(sum);
|
||||
}
|
||||
|
||||
private static class IndexedPoint extends DoublePoint {
|
||||
private final int index;
|
||||
|
||||
IndexedPoint(int index, double[] point) {
|
||||
super(point);
|
||||
this.index = index;
|
||||
}
|
||||
|
||||
int index() {
|
||||
return index;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,130 @@
|
|||
package at.procon.dip.clustering.web;
|
||||
|
||||
import at.procon.dip.clustering.ClusterRunStatus;
|
||||
import at.procon.dip.clustering.ClusteringAlgorithm;
|
||||
import at.procon.dip.clustering.ClusteringExecutionBackend;
|
||||
import at.procon.dip.clustering.dto.ClusterAssignmentViewResponse;
|
||||
import at.procon.dip.clustering.dto.ClusterMembersResponse;
|
||||
import at.procon.dip.clustering.dto.ClusterResponse;
|
||||
import at.procon.dip.clustering.dto.ClusterRunResponse;
|
||||
import at.procon.dip.clustering.dto.ClusterSetResponse;
|
||||
import at.procon.dip.clustering.dto.CreateClusterRunRequest;
|
||||
import at.procon.dip.clustering.dto.CreateClusterSetRequest;
|
||||
import at.procon.dip.clustering.dto.EmbeddingSelectionSpec;
|
||||
import at.procon.dip.clustering.dto.SelectionCountResponse;
|
||||
import at.procon.dip.clustering.dto.UpdateClusterSetRequest;
|
||||
import at.procon.dip.clustering.service.EmbeddingClusterAsyncExecutionService;
|
||||
import at.procon.dip.clustering.service.EmbeddingClusterRunService;
|
||||
import at.procon.dip.clustering.service.EmbeddingClusterSetService;
|
||||
import at.procon.dip.clustering.service.EmbeddingSelectionService;
|
||||
import at.procon.dip.domain.document.DocumentType;
|
||||
import jakarta.validation.Valid;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.springframework.format.annotation.DateTimeFormat;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.PathVariable;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.PutMapping;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/v1/dip/clustering")
|
||||
@RequiredArgsConstructor
|
||||
public class EmbeddingClusterController {
|
||||
|
||||
private final EmbeddingSelectionService selectionService;
|
||||
private final EmbeddingClusterSetService clusterSetService;
|
||||
private final EmbeddingClusterRunService runService;
|
||||
private final EmbeddingClusterAsyncExecutionService asyncExecutionService;
|
||||
|
||||
@PostMapping("/selection/count")
|
||||
public ResponseEntity<SelectionCountResponse> countSelection(@RequestBody EmbeddingSelectionSpec selection) {
|
||||
return ResponseEntity.ok(new SelectionCountResponse(selectionService.count(selection)));
|
||||
}
|
||||
|
||||
@PostMapping("/sets")
|
||||
public ResponseEntity<ClusterSetResponse> createSet(@Valid @RequestBody CreateClusterSetRequest request) {
|
||||
return ResponseEntity.ok(clusterSetService.create(request));
|
||||
}
|
||||
|
||||
@PutMapping("/sets/{id}")
|
||||
public ResponseEntity<ClusterSetResponse> updateSet(@PathVariable UUID id,
|
||||
@Valid @RequestBody UpdateClusterSetRequest request) {
|
||||
return ResponseEntity.ok(clusterSetService.update(id, request));
|
||||
}
|
||||
|
||||
@GetMapping("/sets")
|
||||
public ResponseEntity<List<ClusterSetResponse>> listSets(
|
||||
@RequestParam(name = "active", required = false) Boolean active) {
|
||||
return ResponseEntity.ok(clusterSetService.list(active));
|
||||
}
|
||||
|
||||
@GetMapping("/sets/{id}")
|
||||
public ResponseEntity<ClusterSetResponse> getSet(@PathVariable UUID id) {
|
||||
return ResponseEntity.ok(clusterSetService.get(id));
|
||||
}
|
||||
|
||||
@PostMapping("/runs")
|
||||
public ResponseEntity<ClusterRunResponse> createRun(@Valid @RequestBody CreateClusterRunRequest request) {
|
||||
return ResponseEntity.ok(runService.createRun(request));
|
||||
}
|
||||
|
||||
@GetMapping("/runs")
|
||||
public ResponseEntity<List<ClusterRunResponse>> listRuns(
|
||||
@RequestParam(name = "status", required = false) ClusterRunStatus status,
|
||||
@RequestParam(name = "algorithm", required = false) ClusteringAlgorithm algorithm,
|
||||
@RequestParam(name = "executionBackend", required = false) ClusteringExecutionBackend executionBackend,
|
||||
@RequestParam(name = "documentType", required = false) DocumentType documentType,
|
||||
@RequestParam(name = "nameLike", required = false) String nameLike,
|
||||
@RequestParam(name = "createdFrom", required = false)
|
||||
@DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME) OffsetDateTime createdFrom,
|
||||
@RequestParam(name = "createdTo", required = false)
|
||||
@DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME) OffsetDateTime createdTo) {
|
||||
return ResponseEntity.ok(runService.listRuns(
|
||||
status, algorithm, executionBackend, documentType, nameLike, createdFrom, createdTo));
|
||||
}
|
||||
|
||||
@PostMapping("/runs/{id}/start")
|
||||
public ResponseEntity<ClusterRunResponse> startRun(@PathVariable UUID id) {
|
||||
ClusterRunResponse queued = runService.queueRun(id);
|
||||
asyncExecutionService.executeRunAsync(id);
|
||||
return ResponseEntity.accepted().body(queued);
|
||||
}
|
||||
|
||||
@PostMapping("/runs/{id}/cancel")
|
||||
public ResponseEntity<ClusterRunResponse> cancelRun(@PathVariable UUID id) {
|
||||
return ResponseEntity.ok(runService.requestCancellation(id));
|
||||
}
|
||||
|
||||
@GetMapping("/runs/{id}")
|
||||
public ResponseEntity<ClusterRunResponse> getRun(@PathVariable UUID id) {
|
||||
return ResponseEntity.ok(runService.getRun(id));
|
||||
}
|
||||
|
||||
@GetMapping("/runs/{id}/clusters")
|
||||
public ResponseEntity<List<ClusterResponse>> listClusters(@PathVariable UUID id) {
|
||||
return ResponseEntity.ok(runService.listClusters(id));
|
||||
}
|
||||
|
||||
@GetMapping("/runs/{id}/assignments")
|
||||
public ResponseEntity<List<ClusterAssignmentViewResponse>> listAssignments(
|
||||
@PathVariable UUID id,
|
||||
@RequestParam(name = "includeText", defaultValue = "false") boolean includeText) {
|
||||
return ResponseEntity.ok(runService.listAssignments(id, includeText));
|
||||
}
|
||||
|
||||
@GetMapping("/runs/{runId}/clusters/{clusterId}/members")
|
||||
public ResponseEntity<List<ClusterMembersResponse>> listClusterMembers(
|
||||
@PathVariable UUID runId,
|
||||
@PathVariable UUID clusterId,
|
||||
@RequestParam(name = "includeText", defaultValue = "false") boolean includeText) {
|
||||
return ResponseEntity.ok(runService.listClusterMembers(runId, clusterId, includeText));
|
||||
}
|
||||
}
|
||||
|
|
@ -77,7 +77,7 @@ public class Document {
|
|||
@Builder.Default
|
||||
private DocumentStatus status = DocumentStatus.RECEIVED;
|
||||
|
||||
@Column(name = "title", length = 1000)
|
||||
@Column(name = "title", columnDefinition = "TEXT")
|
||||
private String title;
|
||||
|
||||
@Column(name = "summary", columnDefinition = "TEXT")
|
||||
|
|
|
|||
|
|
@ -176,6 +176,16 @@ dip:
|
|||
profile-key: disabled
|
||||
enabled: false
|
||||
|
||||
clustering:
|
||||
python:
|
||||
enabled: true
|
||||
base-url: http://localhost:8001
|
||||
cluster-path: /cluster
|
||||
cluster-run-path: /cluster-run
|
||||
request-mode: INLINE_VECTORS
|
||||
connect-timeout: 30s
|
||||
read-timeout: 30m
|
||||
|
||||
# Phase 4 generic ingestion configuration
|
||||
ingestion:
|
||||
# Master switch for arbitrary document ingestion into the DOC model
|
||||
|
|
@ -275,7 +285,7 @@ dip:
|
|||
# ted packages download configuration
|
||||
ted-download:
|
||||
# Enable/disable automatic package download
|
||||
enabled: true
|
||||
enabled: false
|
||||
# Base URL for TED Daily Packages
|
||||
base-url: https://ted.europa.eu/packages/daily/
|
||||
# Download directory for tar.gz files
|
||||
|
|
@ -304,6 +314,10 @@ dip:
|
|||
leitstand:
|
||||
enabled: false
|
||||
startup-sync-enabled: false
|
||||
startup-selective-materialization-enabled: true
|
||||
selective-materialization-person-dbk: 100920031023144811001000
|
||||
selective-materialization-person-number:
|
||||
selective-materialization-build-projection: true
|
||||
create-canonical-time-entries: true
|
||||
build-search-projection: true
|
||||
build-representations: true
|
||||
|
|
|
|||
|
|
@ -0,0 +1,107 @@
|
|||
CREATE TABLE IF NOT EXISTS doc.doc_embedding_cluster_set (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
code VARCHAR(128) NOT NULL UNIQUE,
|
||||
name VARCHAR(255) NOT NULL,
|
||||
description TEXT,
|
||||
selection_json JSONB NOT NULL,
|
||||
active BOOLEAN NOT NULL DEFAULT TRUE,
|
||||
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS doc.doc_embedding_cluster_run (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
cluster_set_id UUID,
|
||||
name VARCHAR(255) NOT NULL,
|
||||
status VARCHAR(32) NOT NULL,
|
||||
algorithm VARCHAR(64) NOT NULL,
|
||||
algorithm_version VARCHAR(64),
|
||||
selection_json JSONB NOT NULL,
|
||||
parameters_json JSONB NOT NULL,
|
||||
embedding_model_id UUID,
|
||||
prefix_profile_id UUID,
|
||||
document_type VARCHAR(64),
|
||||
document_family VARCHAR(64),
|
||||
representation_type VARCHAR(64),
|
||||
builder_key VARCHAR(255),
|
||||
item_count BIGINT,
|
||||
cluster_count BIGINT,
|
||||
noise_count BIGINT,
|
||||
started_at TIMESTAMP WITH TIME ZONE,
|
||||
finished_at TIMESTAMP WITH TIME ZONE,
|
||||
error_message TEXT,
|
||||
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
CONSTRAINT fk_doc_cluster_run_set
|
||||
FOREIGN KEY (cluster_set_id)
|
||||
REFERENCES doc.doc_embedding_cluster_set(id),
|
||||
CONSTRAINT fk_doc_cluster_run_model
|
||||
FOREIGN KEY (embedding_model_id)
|
||||
REFERENCES doc.doc_embedding_model(id),
|
||||
CONSTRAINT fk_doc_cluster_run_prefix_profile
|
||||
FOREIGN KEY (prefix_profile_id)
|
||||
REFERENCES doc.doc_embedding_prefix_profile(id)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_cluster_run_status
|
||||
ON doc.doc_embedding_cluster_run(status);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_cluster_run_algorithm
|
||||
ON doc.doc_embedding_cluster_run(algorithm);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_cluster_run_created_at
|
||||
ON doc.doc_embedding_cluster_run(created_at);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS doc.doc_embedding_cluster (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
cluster_run_id UUID NOT NULL,
|
||||
cluster_label INTEGER NOT NULL,
|
||||
display_name VARCHAR(255),
|
||||
item_count BIGINT NOT NULL,
|
||||
is_noise_cluster BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
summary_text TEXT,
|
||||
top_terms_json JSONB,
|
||||
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
CONSTRAINT fk_doc_cluster_cluster_run
|
||||
FOREIGN KEY (cluster_run_id)
|
||||
REFERENCES doc.doc_embedding_cluster_run(id)
|
||||
ON DELETE CASCADE,
|
||||
CONSTRAINT uq_doc_cluster_run_label
|
||||
UNIQUE (cluster_run_id, cluster_label)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_cluster_cluster_run
|
||||
ON doc.doc_embedding_cluster(cluster_run_id);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS doc.doc_embedding_cluster_assignment (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
cluster_run_id UUID NOT NULL,
|
||||
cluster_id UUID,
|
||||
embedding_id UUID NOT NULL,
|
||||
document_id UUID NOT NULL,
|
||||
representation_id UUID NOT NULL,
|
||||
cluster_label_raw INTEGER NOT NULL,
|
||||
membership_score DOUBLE PRECISION,
|
||||
distance_to_centroid DOUBLE PRECISION,
|
||||
is_noise BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
CONSTRAINT fk_doc_cluster_assignment_run
|
||||
FOREIGN KEY (cluster_run_id)
|
||||
REFERENCES doc.doc_embedding_cluster_run(id)
|
||||
ON DELETE CASCADE,
|
||||
CONSTRAINT fk_doc_cluster_assignment_cluster
|
||||
FOREIGN KEY (cluster_id)
|
||||
REFERENCES doc.doc_embedding_cluster(id)
|
||||
ON DELETE CASCADE,
|
||||
CONSTRAINT fk_doc_cluster_assignment_embedding
|
||||
FOREIGN KEY (embedding_id)
|
||||
REFERENCES doc.doc_embedding(id),
|
||||
CONSTRAINT uq_doc_cluster_run_embedding
|
||||
UNIQUE (cluster_run_id, embedding_id)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_cluster_assignment_run
|
||||
ON doc.doc_embedding_cluster_assignment(cluster_run_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_cluster_assignment_cluster
|
||||
ON doc.doc_embedding_cluster_assignment(cluster_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_cluster_assignment_document
|
||||
ON doc.doc_embedding_cluster_assignment(cluster_run_id, document_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_cluster_assignment_noise
|
||||
ON doc.doc_embedding_cluster_assignment(cluster_run_id, is_noise);
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
ALTER TABLE doc.doc_embedding_cluster_run
|
||||
ADD COLUMN IF NOT EXISTS execution_backend VARCHAR(64),
|
||||
ADD COLUMN IF NOT EXISTS reduction_method VARCHAR(32),
|
||||
ADD COLUMN IF NOT EXISTS reduction_dimensions INTEGER;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_cluster_run_backend
|
||||
ON doc.doc_embedding_cluster_run(execution_backend);
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
-- V31__doc_embedding_clustering_enum_constraints.sql
|
||||
-- Updates check constraints for clustering run enums after adding new algorithms and statuses.
|
||||
|
||||
ALTER TABLE doc.doc_embedding_cluster_run
|
||||
DROP CONSTRAINT IF EXISTS doc_embedding_cluster_run_algorithm_check;
|
||||
|
||||
ALTER TABLE doc.doc_embedding_cluster_run
|
||||
ADD CONSTRAINT doc_embedding_cluster_run_algorithm_check
|
||||
CHECK (algorithm IN (
|
||||
'KMEANS',
|
||||
'MINI_BATCH_KMEANS',
|
||||
'DBSCAN',
|
||||
'HDBSCAN',
|
||||
'AGGLOMERATIVE'
|
||||
));
|
||||
|
||||
ALTER TABLE doc.doc_embedding_cluster_run
|
||||
DROP CONSTRAINT IF EXISTS doc_embedding_cluster_run_status_check;
|
||||
|
||||
ALTER TABLE doc.doc_embedding_cluster_run
|
||||
ADD CONSTRAINT doc_embedding_cluster_run_status_check
|
||||
CHECK (status IN (
|
||||
'CREATED',
|
||||
'QUEUED',
|
||||
'RUNNING',
|
||||
'CANCEL_REQUESTED',
|
||||
'COMPLETED',
|
||||
'FAILED',
|
||||
'CANCELLED'
|
||||
));
|
||||
|
||||
-- Optional hardening in case these columns were also created with check constraints.
|
||||
ALTER TABLE doc.doc_embedding_cluster_run
|
||||
DROP CONSTRAINT IF EXISTS doc_embedding_cluster_run_execution_backend_check;
|
||||
|
||||
ALTER TABLE doc.doc_embedding_cluster_run
|
||||
ADD CONSTRAINT doc_embedding_cluster_run_execution_backend_check
|
||||
CHECK (execution_backend IN (
|
||||
'JAVA_LOCAL',
|
||||
'PYTHON_REMOTE'
|
||||
));
|
||||
|
||||
ALTER TABLE doc.doc_embedding_cluster_run
|
||||
DROP CONSTRAINT IF EXISTS doc_embedding_cluster_run_reduction_method_check;
|
||||
|
||||
ALTER TABLE doc.doc_embedding_cluster_run
|
||||
ADD CONSTRAINT doc_embedding_cluster_run_reduction_method_check
|
||||
CHECK (reduction_method IN (
|
||||
'NONE',
|
||||
'PCA',
|
||||
'UMAP'
|
||||
));
|
||||
Loading…
Reference in New Issue