import asyncio
import logging
import os
import sys
from typing import Any

# Ensure we can import from backend root.
# The backend root is taken to be the parent of the directory that contains
# this file. It is appended to sys.path (so it is searched last) before the
# `common` and `config` imports below are executed.
current_dir = os.path.dirname(os.path.abspath(__file__))
backend_root = os.path.dirname(current_dir)
sys.path.append(backend_root)

from common.starrocks_connection import StarRocksConnection
from config import STARROCKS_DB, STARROCKS_HOST, STARROCKS_PASSWORD, STARROCKS_USER

# Module-level logger; the code visible in this file reports via print() and
# does not call it.
logger = logging.getLogger(__name__)

# Fully qualified table that every query in fetch_size_scale_stats reads from.
TABLE_NAME = "shared_source.magento_product_dimension_with_text_embedding"
# Number of top size_scale groups fetched when the caller passes no limit.
DEFAULT_LIMIT = 50


def _get_missing_env() -> list[str]:
    """Return the names of the StarRocks settings that are unset or empty.

    The values are the ones imported from ``config``. Names are reported in
    the fixed order HOST, DB, USER, PASSWORD; an empty list means every
    setting has a truthy value.
    """
    required_settings = (
        ("STARROCKS_HOST", STARROCKS_HOST),
        ("STARROCKS_DB", STARROCKS_DB),
        ("STARROCKS_USER", STARROCKS_USER),
        ("STARROCKS_PASSWORD", STARROCKS_PASSWORD),
    )
    return [name for name, value in required_settings if not value]


def _skip_or_warn_if_missing_env() -> bool:
    """Report missing StarRocks settings and tell the caller whether to bail out.

    Returns:
        ``False`` when every required setting is present. ``True`` when at
        least one is missing and the code is not running under pytest; in
        that case a ``[SKIP]`` line is printed first.

    When ``PYTEST_CURRENT_TEST`` is set in the environment, ``pytest.skip`` is
    called with the same message instead, which marks the current test as
    skipped by raising.
    """
    absent = _get_missing_env()
    if absent:
        notice = "Missing StarRocks env vars: " + ", ".join(absent)
        under_pytest = "PYTEST_CURRENT_TEST" in os.environ
        if under_pytest:
            # Imported lazily so running this file as a script never needs pytest.
            import pytest

            pytest.skip(notice)

        print(f"[SKIP] {notice}")
        return True

    return False


def _first_count(rows: Any, column: str) -> int:
    """Return ``rows[0][column]`` as an int, or 0 when ``rows`` is empty/falsy."""
    if not rows:
        return 0
    return int(rows[0][column])


async def fetch_size_scale_stats(limit: int = DEFAULT_LIMIT) -> dict[str, Any]:
    """Collect row-count statistics about the ``size_scale`` column.

    Four queries are issued one after another against ``TABLE_NAME``: the
    total row count, the number of distinct ``size_scale`` values, the number
    of rows whose ``size_scale`` is NULL or an empty string, and the ``limit``
    most frequent ``size_scale`` values.

    Args:
        limit: Maximum number of ``size_scale`` groups to return. It is
            coerced with ``int()`` and clamped to at least 1.

    Returns:
        A dict with integer entries ``total_rows``, ``distinct_size_scale``
        and ``null_size_scale`` (each 0 when its query yields no rows), plus
        ``top_sizes``: the rows of the grouped query, or an empty list when
        that query yields nothing.
    """
    # The int() coercion also makes the value safe to interpolate into SQL;
    # TABLE_NAME is a module constant, not caller input.
    limit = max(1, int(limit))
    db = StarRocksConnection()

    total_sql = f"SELECT COUNT(*) AS total_rows FROM {TABLE_NAME}"
    distinct_sql = (
        f"SELECT COUNT(DISTINCT size_scale) AS distinct_size_scale FROM {TABLE_NAME}"
    )
    null_sql = (
        f"SELECT COUNT(*) AS null_size_scale FROM {TABLE_NAME} "
        "WHERE size_scale IS NULL OR size_scale = ''"
    )
    top_sizes_sql = f"""
    SELECT
        size_scale,
        COUNT(*) AS row_count
    FROM {TABLE_NAME}
    GROUP BY size_scale
    ORDER BY row_count DESC
    LIMIT {limit}
    """

    total_rows = await db.execute_query_async(total_sql)
    distinct_sizes = await db.execute_query_async(distinct_sql)
    null_sizes = await db.execute_query_async(null_sql)
    top_sizes = await db.execute_query_async(top_sizes_sql)

    return {
        "total_rows": _first_count(total_rows, "total_rows"),
        "distinct_size_scale": _first_count(distinct_sizes, "distinct_size_scale"),
        "null_size_scale": _first_count(null_sizes, "null_size_scale"),
        # Give the grouped result the same empty-result guard as the scalar
        # counts, so callers can always iterate it.
        "top_sizes": top_sizes or [],
    }


def test_starrocks_size_scale_stats():
    """Smoke-test ``fetch_size_scale_stats`` against a live StarRocks instance.

    The test is skipped (under pytest) or returns early when the required
    StarRocks settings are missing.
    """
    if _skip_or_warn_if_missing_env():
        return

    async def _fetch_then_release() -> dict[str, Any]:
        # Mirror the script entry point, which awaits clear_pool() after
        # querying; do it inside the same event loop, and even on failure.
        try:
            return await fetch_size_scale_stats(limit=10)
        finally:
            await StarRocksConnection.clear_pool()

    stats = asyncio.run(_fetch_then_release())
    assert stats["total_rows"] >= 0
    assert stats["distinct_size_scale"] >= 0
    assert stats["null_size_scale"] >= 0
    # The grouped query carries LIMIT 10, so it can never return more rows.
    assert len(stats["top_sizes"] or []) <= 10


async def _run():
    """Script entry point: fetch the size_scale statistics and print a report.

    Returns without querying when required StarRocks settings are missing.
    Otherwise ``StarRocksConnection.clear_pool()`` is always awaited before
    returning, including when fetching or printing raises.
    """
    if _skip_or_warn_if_missing_env():
        return

    try:
        stats = await fetch_size_scale_stats(limit=DEFAULT_LIMIT)

        print("\n" + "=" * 80)
        print("STARROCKS SIZE_SCALE STATS")
        print("=" * 80)
        print(f"Table: {TABLE_NAME}")
        print(f"Total rows: {stats['total_rows']}")
        print(f"Distinct size_scale: {stats['distinct_size_scale']}")
        print(f"Null/empty size_scale: {stats['null_size_scale']}")
        print("\nTop size_scale by row count:")
        # `or ()` tolerates a falsy (e.g. None) grouped result.
        for row in stats["top_sizes"] or ():
            size_scale = row.get("size_scale")
            row_count = row.get("row_count")
            print(f"- {size_scale}: {row_count}")
    finally:
        # Cleanup must not be skipped when the query or the report fails.
        await StarRocksConnection.clear_pool()


if __name__ == "__main__":
    asyncio.run(_run())
