fix: add size_scale to results, fix description NULL (and truncate it to 200 chars), strengthen anti-hallucination prompt rules

12c89dc6 · Vũ Hoàng Anh · 5bae1bdf · 12c89dc6 · 12c89dc6 · 12c89dc6
Commit 12c89dc6 authored Feb 10, 2026 by Vũ Hoàng Anh
4 changed files
--- a/backend/agent/helper.py
+++ b/backend/agent/helper.py
@@ -137,8 +137,12 @@ def format_product_results(products: list[dict]) -> list[dict]:
                "sale_price": int(sale_price) if sale_price else int(original_price),
                "url": web_url,  # First color's URL
                "thumbnail_image_url": thumb_url,  # First color's thumbnail
-                "description": p.get("description_text", ""),
+                "description": (p.get("description_text") or "")[:200],
            }
+            # Include sizes if available (pipe-separated → list)
+            size_scale = p.get("size_scale")
+            if size_scale:
+                product_entry["sizes"] = [s.strip() for s in size_scale.split("|") if s.strip()]
            # Include quantity_sold if available (for best seller)
            qty_sold = p.get("quantity_sold")
            if qty_sold is not None:

--- a/backend/agent/system_prompt.txt
+++ b/backend/agent/system_prompt.txt
@@ -1142,11 +1142,18 @@ Cả hai màu này đều rất sang, anh xem thử mẫu nào thích nhé!"
 ```
 ### ⚠️ QUY TẮC SỐNG CÒN (ANTI-HALLUCINATION):
-1. **KHÔNG BAO GIỜ COPY VÍ DỤ MINH HỌA**: Tuyệt đối không được lấy thông tin "Nam, 1m72, 70kg" từ ví dụ để điền vào nếu khách không nói.
+1. **KHÔNG BAO GIỜ COPY VÍ DỤ MINH HỌA**: Tất cả ví dụ trong prompt này CHỈ LÀ MINH HỌA cách viết format. Tuyệt đối không lấy thông tin từ ví dụ ("Nam, 1m72, 70kg"...) để điền vào insight.
-2. **KHÔNG BIẾT THÌ GHI "Chưa rõ"**:
+2. **SUY LUẬN ĐƯỢC — NHƯNG PHẢI CÓ CĂN CỨ RÕ RÀNG**:
-   - Nếu khách chưa nói chiều cao/cân nặng → Ghi "Chưa rõ".
+   - ✅ **CĂN CỨ ĐỦ MẠNH** để suy luận giới tính:
-   - Nếu khách chưa nói giới tính → Ghi "Chưa rõ".
+     - Khách tự xưng: "anh muốn..." → Nam | "chị cần..." → Nữ
-3. **CHỈ UPDATE KHI CÓ THÔNG TIN THỰC TẾ**: Chỉ ghi nhận thông tin khách thực sự cung cấp trong chat.
+     - Nói rõ: "tôi là nam", "mình là nữ"
+     - Ngữ cảnh tích lũy qua nhiều turn: "mua cho chồng" + "anh ấy thích..." → USER là Nữ
+   - ❌ **CĂN CỨ KHÔNG ĐỦ** (CẤM suy luận giới tính):
+     - Chỉ có chiều cao/cân nặng: "1m6, 50kg" → KHÔNG biết Nam hay Nữ → Ghi "Chưa rõ"
+     - Chỉ hỏi sản phẩm 1 giới: "tìm áo nam" → filter SP = men, NHƯNG USER vẫn "Chưa rõ" (có thể mua hộ)
+     - Chỉ có 1 câu đầu tiên chưa đủ context → Ghi "Chưa rõ", chờ thêm thông tin
+3. **CHƯA ĐỦ CĂN CỨ THÌ GHI "Chưa rõ"** — Luôn mặc định "Chưa rõ" cho mọi trường chưa có thông tin chắc chắn (giới tính, style, size, chiều cao...)
+4. **KHÔNG VỘI KẾT LUẬN TỪ CÂU ĐẦU TIÊN**: Câu hỏi đầu tiên thường thiếu context → ưu tiên hỏi thêm, KHÔNG vội gán thông tin. Sau 2-3 turn tích lũy đủ context mới cập nhật.
 ---
@@ -1391,9 +1398,9 @@ Turn 4: User nói 'xem mẫu khác' → Bot cần tìm váy đen khác, tránh 3
    "ai_response": "Dạ bạn cho mình biết thêm: vợ bạn thích màu gì, size bao nhiêu, và giá tầm bao nhiêu để mình tư vấn chính xác nhé?",
    "product_ids": [],
    "user_insight": {{
-        "USER": "Nam, Adult, có vợ.",
+        "USER": "Chưa rõ giới tính. Adult, có vợ.",
-        "TARGET": "Vợ (Nữ).",
+        "TARGET": "Vợ (Nữ, Adult).",
-        "GOAL": "Tìm váy.",
+        "GOAL": "Tìm váy cho vợ.",
        "CONSTRAINS": "Chưa có.",
        "LATEST_PRODUCT_INTEREST": "Váy cho vợ",
        "NEXT": "Cần hỏi: màu sắc, size, ngân sách.",

--- a/backend/agent/tools/product_search_helpers.py
+++ b/backend/agent/tools/product_search_helpers.py
@@ -185,7 +185,9 @@ async def build_starrocks_query(params, query_vector: list[float] | None = None)
                age_by_product,
                gender_by_product,
                product_line_vn,
-                quantity_sold
+                quantity_sold,
+                description_text,
+                size_scale
            FROM shared_source.magento_product_dimension_with_text_embedding
            WHERE {where_str}
            ORDER BY {order_by}
@@ -281,6 +283,7 @@ async def build_starrocks_query(params, query_vector: list[float] | None = None)
            product_line_vn,
            product_line_en,
            description_text,
+            size_scale,
            quantity_sold,
            is_new_product,
            approx_cosine_similarity(vector, {v_str}) as similarity_score
@@ -311,6 +314,7 @@ async def build_starrocks_query(params, query_vector: list[float] | None = None)
        MAX_BY(age_by_product, similarity_score) as age_by_product,
        MAX_BY(product_line_vn, similarity_score) as product_line_vn,
        MAX_BY(quantity_sold, similarity_score) as quantity_sold,
+        MAX_BY(size_scale, similarity_score) as size_scale,
        MAX(similarity_score) as max_score{extra_agg}
    FROM filtered_matches
    GROUP BY product_color_code, internal_ref_code

--- a/backend/tests/check_schema.py
+++ b/backend/tests/check_schema.py
 import asyncio
 import json
-import os
 import sys
+sys.stdout.reconfigure(encoding='utf-8')
-# Add parent directory to path to import common
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from common.starrocks_connection import get_db_connection
 async def main():
    db = get_db_connection()
-    try:
+    rows = await db.execute_query_async(
-        # 1. Describe table
+        "SELECT size_scale, LEFT(description_text, 100) as desc_short "
-        schema = await db.execute_query_async("DESCRIBE shared_source.magento_product_dimension_with_text_embedding")
+        "FROM shared_source.magento_product_dimension_with_text_embedding "
+        "WHERE description_text IS NOT NULL AND description_text != '' "
-        # 2. Get 1 sample row to see actual values
+        "LIMIT 5"
-        sample = await db.execute_query_async("SELECT * FROM shared_source.magento_product_dimension_with_text_embedding LIMIT 1")
+    )
+    for r in rows:
-        output = {
+        print(json.dumps(r, ensure_ascii=False))
-            "schema": schema,
-            "sample": sample[0] if sample else None
-        }
-        # Remove vector from output to keep it clean
-        if output["sample"] and "vector" in output["sample"]:
-            del output["sample"]["vector"]
-        with open("schema_dump.json", "w", encoding="utf-8") as f:
-            json.dump(output, f, ensure_ascii=False, indent=2)
-        print("Schema dumped to schema_dump.json")
-    except Exception as e:
-        print(f"Error: {e}")
-if __name__ == '__main__':
+asyncio.run(main())
-    asyncio.run(main())