import os
import json
import pymysql
from openai import OpenAI
import time

# ==========================================
# 🔐 CONFIGURATION (read from the environment)
# ==========================================
# Secrets must never be committed to source control: the API key and the
# database password are taken from environment variables only. Any key or
# password that was previously hard-coded in this file should be treated as
# leaked and rotated.
#
# OPENAI_API_KEY is None when the variable is unset; SR_PASS falls back to an
# empty password. Non-secret connection settings keep their previous values
# as defaults so an existing deployment only has to export the two secrets.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
SR_HOST = os.environ.get("SR_HOST", "172.16.2.100")
SR_PORT = int(os.environ.get("SR_PORT", "9030"))  # the driver expects an int port
SR_USER = os.environ.get("SR_USER", "anhvh")
SR_PASS = os.environ.get("SR_PASS", "")
SR_DB = os.environ.get("SR_DB", "shared_source")

# Chunking / embedding parameters
CHUNK_SIZE = 500      # characters per chunk
CHUNK_OVERLAP = 50    # characters shared between consecutive chunks
EMBEDDING_MODEL = "text-embedding-3-small"  # 1536 dimensions

# Module-level OpenAI client, built once at import time from OPENAI_API_KEY
# and used by get_embedding() for every embedding request.
client = OpenAI(api_key=OPENAI_API_KEY)

def get_embedding(text):
    """Return the embedding vector for *text*, or ``None`` on any failure.

    Newlines in *text* are replaced by spaces before the request is sent to
    the OpenAI embeddings endpoint using ``EMBEDDING_MODEL``. This is a
    best-effort call: every exception (including a non-string *text*) is
    reported on stdout and turned into a ``None`` result instead of being
    raised to the caller.
    """
    try:
        single_line = text.replace("\n", " ")
        response = client.embeddings.create(
            model=EMBEDDING_MODEL,
            input=[single_line],
        )
        vector = response.data[0].embedding
    except Exception as err:
        print(f"❌ Lỗi Embedding: {err}")
        vector = None
    return vector

def connect_starrocks():
    """Open and return a new PyMySQL connection built from the ``SR_*`` settings.

    The connection uses the ``utf8mb4`` charset and ``DictCursor`` as its
    cursor class. The caller is responsible for closing the connection.
    """
    connection_options = {
        "host": SR_HOST,
        "port": SR_PORT,
        "user": SR_USER,
        "password": SR_PASS,
        "database": SR_DB,
        "charset": 'utf8mb4',
        "cursorclass": pymysql.cursors.DictCursor,
    }
    return pymysql.connect(**connection_options)

def chunk_text(text, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split *text* into overlapping character windows.

    Args:
        text: The string to split.
        size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk shares with the previous
            one; must be smaller than *size*.

    Returns:
        A list of chunks in document order. Consecutive chunks start
        ``size - overlap`` characters apart, the last chunk may be shorter
        than *size*, and an empty *text* yields an empty list.

    Raises:
        ValueError: If *size* is not positive or *overlap* is not smaller
            than *size*. Either case would make the window stop advancing
            (or move backwards) and loop forever.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if overlap >= size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than size ({size})"
        )

    step = size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, any further window would
        # contain only characters already covered by this one, so stop here
        # rather than emit a redundant tail chunk.
        if end >= len(text):
            break
    return chunks

def _split_section_title(section):
    """Return ``(title, content)`` for one raw section of the input file.

    When the first line of the stripped section contains ``FILE:``, that line
    (with the marker removed) becomes the title and the remaining lines the
    content. Otherwise the default title is used and the section is returned
    unchanged as the content.
    """
    lines = section.strip().split("\n")
    if "FILE:" in lines[0]:
        return lines[0].replace("FILE:", "").strip(), "\n".join(lines[1:])
    return "Canifa Knowledge", section


def ingest(input_file=r"d:\cnf\chatbot_canifa\backend\datadb\tonghop.txt"):
    """Read a knowledge file, embed it chunk by chunk and insert it into StarRocks.

    The file is split into sections on a line of ``=`` characters, each
    section is cut into chunks with ``chunk_text``, every chunk of at least
    20 non-blank characters is embedded with ``get_embedding`` and inserted
    into ``shared_source.canifa_knowledge`` together with JSON metadata.
    Rows are committed every 10 inserts and once more at the end.

    Args:
        input_file: Path of the UTF-8 text file to ingest. Defaults to the
            original hard-coded location so existing callers are unaffected.

    Returns:
        None. Progress and errors are reported on stdout; a missing input
        file is reported and the function returns without touching the
        database.
    """
    if not os.path.exists(input_file):
        print(f"❌ Không tìm thấy file: {input_file}")
        return

    print(f"📖 Đang đọc file {input_file}...")
    with open(input_file, "r", encoding="utf-8") as f:
        full_content = f.read()

    # Sections of the combined file are delimited by a line of '=' characters.
    sections = full_content.split("================================================================================")

    # Recorded in each row's metadata; equals "tonghop.txt" for the default path.
    source_name = os.path.basename(input_file)
    sql = "INSERT INTO shared_source.canifa_knowledge (id, content, metadata, embedding) VALUES (%s, %s, %s, %s)"

    total_chunks = 0
    record_id = int(time.time())  # base ID; incremented after every successful insert

    db = connect_starrocks()
    try:
        # The cursor and connection are released even if a step below raises.
        with db.cursor() as cursor:
            for section in sections:
                if not section.strip():
                    continue

                title, content = _split_section_title(section)
                print(f"🚀 Đang xử lý section: {title}")

                for i, chunk in enumerate(chunk_text(content)):
                    if len(chunk.strip()) < 20:
                        continue  # skip fragments too short to be useful

                    vector = get_embedding(chunk)
                    if not vector:
                        continue  # embedding failed; already reported

                    metadata = {
                        "title": title,
                        "chunk_idx": i,
                        "source": source_name,
                        "timestamp": time.time(),
                    }

                    # Best effort per row: a failed insert/commit is reported
                    # and ingestion continues with the next chunk.
                    try:
                        cursor.execute(
                            sql,
                            (
                                record_id,
                                chunk,
                                json.dumps(metadata, ensure_ascii=False),
                                str(vector),
                            ),
                        )
                        record_id += 1
                        total_chunks += 1
                        if total_chunks % 10 == 0:
                            db.commit()
                            print(f"✅ Đã nạp {total_chunks} chunks...")
                    except Exception as e:
                        print(f"❌ Lỗi SQL: {e}")

        db.commit()
    finally:
        db.close()
    print(f"🎊 HOÀN THÀNH! Tổng cộng đã nạp {total_chunks} vào StarRocks.")

# Run the ingestion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    ingest()
