Source code for thoth.shared.health

"""Health check logic for Cloud Run and local deployments.

This module provides checks for Python version, critical imports (LanceDB,
sentence-transformers, MCP), storage writability, and GCS configuration.
Used by the /health endpoint and monitoring to report service readiness.
"""

import importlib.util
import json
import os
from pathlib import Path
import sys
from typing import Any

from thoth.shared.utils.logger import setup_logger

logger = setup_logger(__name__)


[docs] class HealthCheck: """Static health checks for Python, imports, storage, and GCS config. Used by the HTTP health endpoint to return a single status dict; each check returns a bool or a dict of sub-checks. Overall status is healthy only when Python version and critical imports (lancedb, mcp) pass. """
[docs] @staticmethod def check_python_version() -> bool: """Return True if Python version is in the supported range (3.10 to 3.12). Returns: True when 3.10 <= version < 3.13 (LanceDB/sentence-transformers compatibility). """ # Python 3.13+ not yet fully supported by some deps (e.g., OnnxRuntime). return (3, 10) <= sys.version_info < (3, 13)
[docs] @staticmethod def check_imports() -> dict[str, bool]: """Check that critical runtime dependencies can be imported. Returns: Dict of module name -> True if importable (lancedb, torch, sentence_transformers, mcp). """ checks = {} checks["lancedb"] = importlib.util.find_spec("lancedb") is not None checks["torch"] = importlib.util.find_spec("torch") is not None checks["sentence_transformers"] = importlib.util.find_spec("sentence_transformers") is not None checks["mcp"] = importlib.util.find_spec("mcp") is not None return checks
[docs] @staticmethod def check_storage() -> dict[str, bool]: """Check storage availability.""" checks = {} # Check data directories data_dir = Path("/app/data") checks["data_dir_exists"] = data_dir.exists() checks["data_dir_writable"] = os.access(str(data_dir), os.W_OK) if data_dir.exists() else False return checks
[docs] @staticmethod def check_gcs_config() -> dict[str, bool]: """Check that GCS env vars and credentials are configured. Returns: Dict with gcs_bucket_configured, gcp_project_configured, gcs_credentials_file_exists (when GOOGLE_APPLICATION_CREDENTIALS is set). """ checks = {} checks["gcs_bucket_configured"] = bool(os.getenv("GCS_BUCKET_NAME")) checks["gcp_project_configured"] = bool(os.getenv("GCP_PROJECT_ID")) # Check if credentials are available creds_env = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") if creds_env: creds_path = Path(creds_env) checks["gcs_credentials_file_exists"] = creds_path.exists() else: # May be using metadata server in Cloud Run checks["gcs_credentials_file_exists"] = False return checks
[docs] @classmethod def get_health_status(cls) -> dict[str, Any]: """Return a full health status dict for the /health endpoint. Aggregates Python version, import checks (lancedb, torch, sentence_transformers, mcp), storage writability, and GCS config. Overall status is 'healthy' only when Python version and critical imports (lancedb, mcp) all pass. Returns: Dict with keys: status ('healthy'|'unhealthy'), python_version, python_ok, imports, storage, gcs. """ status = { "status": "healthy", "python_version": f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}", "python_ok": cls.check_python_version(), "imports": cls.check_imports(), "storage": cls.check_storage(), "gcs": cls.check_gcs_config(), } # Determine overall health: Python OK and critical imports (lancedb, mcp) must pass. imports_status = status["imports"] critical_checks = [ status["python_ok"], (imports_status.get("lancedb", False) if isinstance(imports_status, dict) else False), (imports_status.get("mcp", False) if isinstance(imports_status, dict) else False), ] if not all(critical_checks): status["status"] = "unhealthy" return status
[docs] @classmethod def is_healthy(cls) -> bool: """Quick health check. Returns: True if service is healthy, False otherwise """ status = cls.get_health_status() return bool(status["status"] == "healthy")
def health_check_cli() -> None:
    """Print the health report as JSON and exit with a matching status code.

    Writes the full status dict to stdout, followed by a one-line verdict,
    then terminates the process via ``sys.exit``: exit code 0 when the
    overall status is 'healthy', 1 otherwise.
    """
    report = HealthCheck.get_health_status()
    healthy = report["status"] == "healthy"

    # Printing (rather than logging) is intentional: this is the CLI tool.
    print(json.dumps(report, indent=2))  # noqa: T201
    print("\n✓ Service is healthy" if healthy else "\n✗ Service is unhealthy")  # noqa: T201
    sys.exit(0 if healthy else 1)
# Script entry point: when this module is executed directly (not imported),
# run the CLI health check, which prints the report and exits the process.
if __name__ == "__main__": health_check_cli()