import asyncio
import os
import re
import sys
from collections import Counter
from typing import Any

# Put the parent of this file's directory on sys.path so the project-local
# imports that follow can be resolved.
current_dir = os.path.split(os.path.abspath(__file__))[0]
backend_root = os.path.split(current_dir)[0]
sys.path.append(backend_root)

from common.starrocks_connection import StarRocksConnection
from config import STARROCKS_DB, STARROCKS_HOST, STARROCKS_PASSWORD, STARROCKS_USER

# Table read by fetch_size_scale_rows(); the name is interpolated directly
# into that function's SQL text.
TABLE_NAME = "shared_source.magento_product_dimension_with_text_embedding"


def _get_missing_env() -> list[str]:
    """Return the names of required StarRocks settings that are falsy.

    The names are reported in a fixed order: host, database, user, password.
    An empty list means every setting has a truthy value.
    """
    required = (
        ("STARROCKS_HOST", STARROCKS_HOST),
        ("STARROCKS_DB", STARROCKS_DB),
        ("STARROCKS_USER", STARROCKS_USER),
        ("STARROCKS_PASSWORD", STARROCKS_PASSWORD),
    )
    return [name for name, value in required if not value]


def _skip_or_warn_if_missing_env() -> bool:
    """Report missing StarRocks settings, if any.

    Returns:
        False when nothing is missing. Otherwise a "[SKIP]" line is printed
        and True is returned so the caller can bail out.

    When the PYTEST_CURRENT_TEST environment variable is present, the
    message is handed to ``pytest.skip`` before the print is reached.
    """
    absent = _get_missing_env()
    if absent:
        names = ", ".join(absent)
        message = f"Missing StarRocks env vars: {names}"
        if "PYTEST_CURRENT_TEST" in os.environ:
            # pytest is only imported on this branch.
            import pytest

            pytest.skip(message)

        print(f"[SKIP] {message}")
        return True

    return False


def _split_size_scale(size_scale: str | None) -> list[str]:
    if not size_scale:
        return []
    return [token.strip() for token in size_scale.split("|") if token.strip()]


def _normalize_numeric_token(token: str) -> str | None:
    if not token:
        return None
    token = token.strip().lower()
    token = re.sub(r"cm$", "", token)
    if re.fullmatch(r"\d+(\.\d+)?", token):
        return token
    return None


async def fetch_size_scale_rows() -> list[dict[str, Any]]:
    """Query the row count for each distinct ``size_scale`` value.

    Returns:
        The result of ``StarRocksConnection.execute_query_async`` for the
        grouped query, which selects ``size_scale`` and ``row_count``.
    """
    query = f"""
    SELECT
        size_scale,
        COUNT(*) AS row_count
    FROM {TABLE_NAME}
    GROUP BY size_scale
    """
    connection = StarRocksConnection()
    return await connection.execute_query_async(query)


def _build_numeric_summary(rows: list[dict[str, Any]]) -> Counter[str]:
    """Total the row counts per numeric size token.

    Args:
        rows: Mappings that provide ``size_scale`` and ``row_count``; a
            missing or falsy ``row_count`` is treated as 0.

    Returns:
        A Counter keyed by normalized numeric token. A row's count is added
        once for every token occurrence in its ``size_scale`` that
        normalizes to that key.
    """
    totals: Counter[str] = Counter()
    for record in rows:
        raw_scale = record.get("size_scale")
        weight = int(record.get("row_count") or 0)
        normalized = map(_normalize_numeric_token, _split_size_scale(raw_scale))
        for size in normalized:
            # Non-numeric tokens normalize to None and are ignored.
            if size:
                totals[size] += weight
    return totals


def _print_summary(counter: Counter[str]) -> None:
    def _sort_key(val: str) -> float:
        try:
            return float(val)
        except ValueError:
            return float("inf")

    tokens_sorted = sorted(counter.keys(), key=_sort_key)

    print("\n" + "=" * 80)
    print("NUMERIC SIZE TOKENS")
    print("=" * 80)
    print(f"Total unique numeric sizes: {len(tokens_sorted)}")
    print("\nAll numeric sizes (sorted):")
    print(", ".join(tokens_sorted))

    print("\nCounts (descending):")
    for token, count in counter.most_common():
        print(f"- {token}: {count}")


async def _run() -> None:
    """Fetch size-scale counts, summarize the numeric tokens, and print them.

    Returns immediately when required StarRocks settings are missing.
    Otherwise ``StarRocksConnection.clear_pool()`` is awaited once the work
    is done, including when the query or the summary raises.
    """
    if _skip_or_warn_if_missing_env():
        return

    try:
        rows = await fetch_size_scale_rows()
        numeric_counter = _build_numeric_summary(rows)
        _print_summary(numeric_counter)
    finally:
        # Run the pool cleanup on the failure path too, not only on success.
        # NOTE(review): assumes clear_pool() is safe to call when the query
        # failed before any connection was pooled — confirm.
        await StarRocksConnection.clear_pool()


# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(_run())
