DIP/python/dip-clustering-service/app/models.py

76 lines
1.8 KiB
Python

from __future__ import annotations
from enum import Enum
from typing import Any
from uuid import UUID
from pydantic import BaseModel, ConfigDict, Field
class ClusteringAlgorithm(str, Enum):
    """Closed set of clustering algorithm identifiers.

    The ``str`` mix-in makes every member a real ``str`` instance that
    compares equal to its value; each value is spelled the same as the
    member name.
    """

    # Members are listed in the order they are iterated.
    KMEANS = "KMEANS"
    MINI_BATCH_KMEANS = "MINI_BATCH_KMEANS"
    DBSCAN = "DBSCAN"
    HDBSCAN = "HDBSCAN"
    AGGLOMERATIVE = "AGGLOMERATIVE"
class ReductionMethod(str, Enum):
    """Closed set of dimensionality-reduction identifiers.

    The ``str`` mix-in makes every member a real ``str`` instance that
    compares equal to its value. ``NONE`` is the member the request models
    in this module use as their default.
    """

    NONE = "NONE"
    PCA = "PCA"
    UMAP = "UMAP"
class PythonClusteringItem(BaseModel):
    """One input item of a clustering request: an identified vector."""

    # Required identifier of the embedding.
    embeddingId: UUID

    # Optional identifiers; both default to None when omitted.
    documentId: UUID | None = None
    representationId: UUID | None = None

    # Required list of floats; no length constraint is declared here.
    vector: list[float]
class PythonClusteringRequest(BaseModel):
    """Request payload: an algorithm, its parameters and the items to cluster.

    ``use_enum_values=True`` stores enum fields as their plain string values
    once validated.
    """

    # Required; stored as its string value because of ``use_enum_values``.
    algorithm: ClusteringAlgorithm

    # Free-form parameters; a fresh empty dict per instance.
    parameters: dict[str, Any] = Field(default_factory=dict)

    # ``use_enum_values`` is applied during validation, and defaults are not
    # validated unless asked. ``validate_default=True`` makes an omitted
    # value come out as the string "NONE", the same type a supplied value
    # has, instead of a ``ReductionMethod`` member.
    reductionMethod: ReductionMethod = Field(
        default=ReductionMethod.NONE,
        validate_default=True,
    )

    # Optional; None when omitted.
    reductionDimensions: int | None = None

    # Required list of items; an empty list is accepted by this declaration.
    items: list[PythonClusteringItem]

    model_config = ConfigDict(use_enum_values=True)
class PythonRunExecutionRequest(BaseModel):
    """Request payload that carries only the identifier of a run."""

    # Required run identifier.
    runId: UUID
class PythonCluster(BaseModel):
    """Summary record for a single cluster."""

    # Required integer label of the cluster.
    clusterLabel: int

    # Required item count for the cluster.
    itemCount: int

    # Defaults to False when omitted.
    noiseCluster: bool = False
class PythonAssignment(BaseModel):
    """Assignment of one embedding to a cluster label."""

    # Identifiers use the same field names as ``PythonClusteringItem``.
    embeddingId: UUID
    documentId: UUID | None = None
    representationId: UUID | None = None

    # Required integer label of the assigned cluster.
    clusterLabel: int

    # Optional scores; both are None when omitted.
    distanceToCentroid: float | None = None
    membershipScore: float | None = None

    # Defaults to False when omitted.
    noise: bool = False
class PythonClusteringResponse(BaseModel):
    """Response payload: cluster summaries, assignments and a noise count."""

    # Required list of cluster summaries.
    clusters: list[PythonCluster]

    # Required list of assignments.
    assignments: list[PythonAssignment]

    # Required integer; this model does not cross-check it against
    # ``assignments``.
    noiseCount: int
class RunMetadata(BaseModel):
    """Metadata for a run: its id, algorithm, reduction settings and selection.

    ``use_enum_values=True`` stores enum fields as their plain string values
    once validated.
    """

    # Required run identifier.
    runId: UUID

    # Required; stored as its string value because of ``use_enum_values``.
    algorithm: ClusteringAlgorithm

    # Free-form parameters; a fresh empty dict per instance.
    parameters: dict[str, Any] = Field(default_factory=dict)

    # ``use_enum_values`` is applied during validation, and defaults are not
    # validated unless asked. ``validate_default=True`` makes an omitted
    # value come out as the string "NONE", the same type a supplied value
    # has, instead of a ``ReductionMethod`` member.
    reductionMethod: ReductionMethod = Field(
        default=ReductionMethod.NONE,
        validate_default=True,
    )

    # Optional; None when omitted.
    reductionDimensions: int | None = None

    # Free-form selection mapping; a fresh empty dict per instance.
    selection: dict[str, Any] = Field(default_factory=dict)

    model_config = ConfigDict(use_enum_values=True)