Commit eea82e60 authored by Vũ Hoàng Anh's avatar Vũ Hoàng Anh

feat: refactor AI Tagging to output semantic natural language tags and fix unicode logging

parent b3888b0b
......@@ -285,18 +285,19 @@ CANIFA_LINES = [
]
# Attempt to dynamically fetch product lines from DB to override the static list
# Best-effort override of the static CANIFA_LINES list with the distinct
# product_line values currently in the DB.  Any failure (pool exhausted,
# table missing, ...) is logged and the static fallback list is kept.
# NOTE(review): this runs at import time and will block module load if the
# DB is slow — consider deferring it to first use.
_conn = None
_cur = None
try:
    _conn = get_pooled_connection_compat()
    _cur = _conn.cursor()
    # Only PG_TABLE (a module constant) is interpolated; no user input
    # reaches this SQL string.
    _cur.execute(
        f"SELECT DISTINCT product_line FROM {PG_TABLE} "
        "WHERE product_line IS NOT NULL AND product_line != ''"
    )
    _db_lines = [r[0] for r in _cur.fetchall()]
    if _db_lines:
        CANIFA_LINES = _db_lines
        # Lazy %-style args so formatting is skipped when INFO is disabled.
        logger.info("✅ Loaded %d product lines dynamically from DB", len(CANIFA_LINES))
except Exception as e:
    logger.warning("⚠️ Could not load product lines from DB, using fallback. Error: %s", e)
finally:
    # Always return the pooled connection — the original closed _cur/_conn
    # only on the success path, leaking them whenever the query raised.
    try:
        if _cur is not None:
            _cur.close()
        if _conn is not None:
            _conn.close()
    except Exception:
        pass
# Removed dynamic loading on import to prevent blocking thread when DB is slow
# try:
# _conn = get_pooled_connection_compat()
# _cur = _conn.cursor()
# _cur.execute(f"SELECT DISTINCT product_line FROM {PG_TABLE} WHERE product_line IS NOT NULL AND product_line != ''")
# _db_lines = [r[0] for r in _cur.fetchall()]
# _cur.close()
# _conn.close()
# if _db_lines:
# CANIFA_LINES = _db_lines
# logger.info(f"✅ Loaded {len(CANIFA_LINES)} product lines dynamically from DB")
# except Exception as e:
# logger.warning(f"⚠️ Could not load product lines from DB, using fallback. Error: {e}")
AI_SEARCH_SYSTEM = """Bạn là AI phân tích ý định tìm kiếm sản phẩm thời trang Canifa.
......@@ -945,7 +946,7 @@ async def batch_generate_tags():
conn = get_pooled_connection_compat()
cur = conn.cursor()
# Find up to 500 products that either don't have the tags column populated or it's empty
cur.execute(f"SELECT internal_ref_code FROM {PG_TABLE} WHERE tags IS NULL OR tags = '[]'::jsonb LIMIT 500")
cur.execute(f"SELECT internal_ref_code FROM {PG_TABLE} WHERE tags IS NULL LIMIT 500")
rows = cur.fetchall()
if not rows:
......
This diff is collapsed.
......@@ -28,10 +28,36 @@ class UltraDescriptionDB:
"""Create ultra_descriptions table if it doesn't exist."""
if cls._initialized:
return
from config import USE_LOCAL_SQLITE
conn = None
try:
conn = get_pooled_connection_compat()
cur = conn.cursor()
if USE_LOCAL_SQLITE:
# SQLite: Create full table at once with all columns
cur.execute(f"""
CREATE TABLE IF NOT EXISTS {TABLE} (
id INTEGER PRIMARY KEY AUTOINCREMENT,
internal_ref_code TEXT NOT NULL,
magento_ref_code TEXT,
base_ref_code TEXT,
product_name TEXT,
product_image_url TEXT,
product_line TEXT,
description_data TEXT NOT NULL,
phase TEXT DEFAULT 'enriched',
status INTEGER DEFAULT 0,
clean_description TEXT DEFAULT '',
tags TEXT DEFAULT '[]',
ai_matches TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
else:
# PostgreSQL: Original logic
cur.execute(f"""
CREATE TABLE IF NOT EXISTS {TABLE} (
id SERIAL PRIMARY KEY,
......@@ -45,35 +71,18 @@ class UltraDescriptionDB:
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_ultra_desc_ref_code
ON {TABLE}(internal_ref_code);
-- Migration: add columns if table already existed
ALTER TABLE {TABLE} ADD COLUMN IF NOT EXISTS status SMALLINT DEFAULT 0;
ALTER TABLE {TABLE} ADD COLUMN IF NOT EXISTS clean_description TEXT DEFAULT '';
ALTER TABLE {TABLE} ADD COLUMN IF NOT EXISTS tags JSONB DEFAULT '[]'::jsonb;
-- Migration v2: magento_ref_code support
ALTER TABLE {TABLE} ADD COLUMN IF NOT EXISTS magento_ref_code VARCHAR(100);
ALTER TABLE {TABLE} ADD COLUMN IF NOT EXISTS base_ref_code VARCHAR(50);
ALTER TABLE {TABLE} ADD COLUMN IF NOT EXISTS ai_matches JSONB;
""")
# Drop old unique constraint on internal_ref_code (v2 migration: allow multiple colors per style)
try:
cur.execute(f"ALTER TABLE {TABLE} DROP CONSTRAINT IF EXISTS ultra_descriptions_internal_ref_code_key")
except Exception:
pass
# Non-unique index on internal_ref_code (for grouping)
try:
cur.execute(f"CREATE INDEX IF NOT EXISTS idx_ultra_desc_internal_code ON {TABLE}(internal_ref_code)")
except Exception:
pass
# Unique index on magento_ref_code (partial, ignores NULLs)
try:
cur.execute(f"""CREATE UNIQUE INDEX IF NOT EXISTS idx_ultra_desc_magento_code
ON {TABLE}(magento_ref_code) WHERE magento_ref_code IS NOT NULL;""")
except Exception:
pass # already exists
cur.close()
conn.commit()
cls._initialized = True
logger.info("✅ Table %s ready", TABLE)
logger.info("✅ Table %s ready (Mock: %s)", TABLE, USE_LOCAL_SQLITE)
except Exception as e:
logger.error("Error creating ultra_descriptions table: %s", e)
finally:
......@@ -134,9 +143,9 @@ class UltraDescriptionDB:
json.dumps(description_data, ensure_ascii=False), phase, clean_description),
)
row = cur.fetchone()
cur.close()
row_id = row[0] if row else None
conn.commit() # critical: without this INSERT/UPDATE rolls back on conn.close()
cur.close() # Close cursor FIRST
conn.commit()
logger.info("💾 Saved ultra desc: %s / magento=%s (id=%s)", internal_ref_code, magento_ref_code, row_id)
return row_id
except Exception as e:
......@@ -565,7 +574,7 @@ class UltraDescriptionDB:
try:
conn = get_pooled_connection_compat()
cur = conn.cursor()
cur.execute(f"SELECT COUNT(*) FROM {TABLE} WHERE tags IS NULL OR tags = '[]'::jsonb")
cur.execute(f"SELECT COUNT(*) FROM {TABLE} WHERE tags IS NULL")
count = cur.fetchone()[0]
cur.close()
return count
......@@ -618,11 +627,11 @@ class UltraDescriptionDB:
conn.close()
# Auto-init table on import
try:
UltraDescriptionDB.ensure_table()
except Exception:
pass
# Removed auto-init on import to prevent blocking thread when DB is slow
# try:
# UltraDescriptionDB.ensure_table()
# except Exception:
# pass
# ═══════════════════════════════════════════════════════════════
......@@ -848,8 +857,8 @@ class DescFieldConfig:
conn.close()
# Auto-init field config table
try:
DescFieldConfig.ensure_table()
except Exception:
pass
# Removed auto-init on import to prevent blocking thread when DB is slow
# try:
# DescFieldConfig.ensure_table()
# except Exception:
# pass
"""Quick one-off inspection of the 'season' column in a MySQL dump.

Reads the dump file, locates the column list of the INSERT statement to
find the position of 'season', then samples the first 50 value tuples and
prints a frequency table of the season field.
"""
import re
from collections import Counter

# Decoding errors are ignored: we only scan the ASCII SQL skeleton.
with open('test_db.magento_product_dimension_with_text_embedding.sql', encoding='utf-8', errors='ignore') as dump:
    sql_text = dump.read()

# Locate the INSERT column list so we know which tuple slot is 'season'.
header = re.search(r'INSERT INTO[^(]+\(([^)]+)\)', sql_text, re.DOTALL)
if header is None:
    print("Could not find INSERT column list")
    season_idx = -1
else:
    columns = [name.strip().strip('`') for name in header.group(1).split(',')]
    season_idx = columns.index('season') if 'season' in columns else -1
    print(f"Column order found. 'season' is at index: {season_idx}")
    print(f"Columns around season: {columns[max(0,season_idx-2):season_idx+3]}")

# Extract every "(...)" value tuple terminated by ',' or ';'.
value_rows = re.findall(r'\(([^;]+?)\)(?:,|\s*;)', sql_text, re.DOTALL)
print(f"\nTotal value rows found: {len(value_rows)}")

if season_idx >= 0 and value_rows:
    sampled = []
    for tuple_text in value_rows[:50]:  # sample first 50 rows
        # Split on commas that sit outside single-quoted strings
        # (NOTE(review): does not handle escaped quotes inside values).
        fields = re.split(r",(?=(?:[^']*'[^']*')*[^']*$)", tuple_text.strip())
        if len(fields) > season_idx:
            sampled.append(fields[season_idx].strip().strip("'"))
    tally = Counter(sampled)
    print(f"\nSeason values (sample from first 50 rows):")
    print("-" * 40)
    for season, freq in sorted(tally.items(), key=lambda pair: -pair[1]):
        print(f"{freq:5d} | '{season}'")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment