"""
批次查詢估價師證書資料 - Playwright + Claude 視覺辨識驗證碼
結果先存入 staging 表，同名情況需人工確認
"""
import sys
sys.stdout.reconfigure(encoding='utf-8')

from playwright.sync_api import sync_playwright
import time
import os
import json
import random
import sqlite3
from datetime import datetime
from pathlib import Path

# 設定
SCRIPT_DIR = Path(__file__).parent
DATA_DIR = SCRIPT_DIR / 'data'
DB_PATH = DATA_DIR / 'appraisers.db'

CAPTCHA_IMAGE = SCRIPT_DIR / 'captcha_to_read.png'
CAPTCHA_ANSWER = SCRIPT_DIR / 'captcha_answer.txt'
PROGRESS_FILE = SCRIPT_DIR / 'query_progress.json'

TIMEOUT_SECONDS = 120  # 等待驗證碼的最長時間
MIN_DELAY = 10  # 最小延遲秒數
MAX_DELAY = 30  # 最大延遲秒數
BATCH_SIZE = 25  # 每批查詢數量，避免記憶體累積

def init_staging_table():
    """建立 staging 表"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    cursor.execute('''
        CREATE TABLE IF NOT EXISTS certificate_staging (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            query_name TEXT NOT NULL,
            name TEXT,
            english_name TEXT,
            moi_certificate_number TEXT,
            exam_certificate_number TEXT,
            certificate_status TEXT,
            query_time DATETIME DEFAULT CURRENT_TIMESTAMP,
            is_processed INTEGER DEFAULT 0,
            matched_appraiser_id INTEGER,
            notes TEXT
        )
    ''')

    conn.commit()
    conn.close()
    print('✓ Staging 表已準備就緒')

def get_pending_names(limit=None):
    """取得待查詢的姓名清單（支援斷點續傳）"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    # 取得已在 staging 表中的姓名（已查詢過，不論是否處理）
    cursor.execute('SELECT DISTINCT query_name FROM certificate_staging')
    already_queried = set(row[0] for row in cursor.fetchall())

    # 取得主表中已有證書的姓名
    cursor.execute('SELECT DISTINCT name FROM appraisers WHERE moi_certificate_number IS NOT NULL')
    already_has_cert = set(row[0] for row in cursor.fetchall())

    # 合併排除名單
    exclude_names = already_queried | already_has_cert

    # 取得所有待查詢的姓名
    cursor.execute('SELECT DISTINCT name FROM appraisers ORDER BY name')
    all_names = [row[0] for row in cursor.fetchall()]

    # 過濾掉已處理的
    pending_names = [n for n in all_names if n not in exclude_names]

    conn.close()

    if exclude_names:
        print(f'  已跳過 {len(exclude_names)} 個已查詢/已有資料的姓名')

    if limit:
        return pending_names[:limit]
    return pending_names

def save_to_staging(query_name, result):
    """將查詢結果存入 staging 表"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    cursor.execute('''
        INSERT INTO certificate_staging
        (query_name, name, english_name, moi_certificate_number,
         exam_certificate_number, certificate_status)
        VALUES (?, ?, ?, ?, ?, ?)
    ''', (
        query_name,
        result.get('姓名'),
        result.get('英文姓名'),
        result.get('估價師證書字號'),
        result.get('考試院證書字號'),
        result.get('證書狀態')
    ))

    conn.commit()
    conn.close()

def cleanup():
    """清理暫存檔案"""
    for f in [CAPTCHA_IMAGE, CAPTCHA_ANSWER]:
        if os.path.exists(f):
            os.remove(f)

def wait_for_captcha_answer(timeout=TIMEOUT_SECONDS):
    """等待 Claude 辨識驗證碼並寫入答案"""
    print(f'  等待驗證碼輸入（最長 {timeout} 秒）...')
    start = time.time()
    while time.time() - start < timeout:
        if os.path.exists(CAPTCHA_ANSWER):
            with open(CAPTCHA_ANSWER, 'r') as f:
                answer = f.read().strip()
            if answer:
                # 清除答案檔，準備下一次
                os.remove(CAPTCHA_ANSWER)
                return answer
        time.sleep(0.5)
    return None

def query_single(page, name):
    """在現有 session 中查詢單一估價師"""
    cleanup()

    print(f'\n{"="*50}')
    print(f'查詢: {name}')
    print(f'{"="*50}')

    # 重新載入查詢頁面
    page.goto('https://resim.moi.gov.tw/Home/AssessIndex')
    page.wait_for_load_state('networkidle')

    # 點擊證書資料查詢
    page.click('text=證書資料查詢')
    page.wait_for_timeout(1500)

    # 輸入姓名
    page.fill('#assess_name', name)

    # 截圖驗證碼
    page.locator('#vCodeView').screenshot(path=str(CAPTCHA_IMAGE))
    print(f'  驗證碼已截圖: {CAPTCHA_IMAGE}')
    print('  >>> 等待 Claude 辨識... <<<')

    # 等待驗證碼答案
    captcha = wait_for_captcha_answer()

    if not captcha:
        print('  ❌ 等待驗證碼超時')
        return None, 'timeout'

    print(f'  收到驗證碼: {captcha}')
    page.fill('#codenumber', captcha)

    # 點擊查詢
    page.click('#subb')
    page.wait_for_timeout(3000)

    # 檢查結果
    page_content = page.content()

    if '驗證碼錯誤' in page_content or '驗證碼輸入錯誤' in page_content:
        print('  ❌ 驗證碼錯誤')
        return None, 'captcha_error'

    if '查無資料' in page_content:
        print('  ⚠ 查無資料')
        return None, 'not_found'

    # 解析結果
    results = []
    rows = page.locator('table tr').all()
    for row in rows:
        cells = row.locator('td').all()
        if len(cells) >= 5:
            texts = [cell.text_content().strip() for cell in cells]
            # 確認是資料列（不是表頭）
            if texts[0] and not texts[0].startswith('姓名'):
                result = {
                    '姓名': texts[0],
                    '英文姓名': texts[1],
                    '估價師證書字號': texts[2],
                    '考試院證書字號': texts[3],
                    '證書狀態': texts[4]
                }
                results.append(result)

    if results:
        print(f'  ✓ 找到 {len(results)} 筆結果')
        for r in results:
            print(f'    - {r["姓名"]} ({r["英文姓名"]}): {r["估價師證書字號"]}')
        return results, 'success'
    else:
        print('  ⚠ 無法解析結果')
        return None, 'parse_error'

def save_progress(completed, failed, pending):
    """儲存進度"""
    with open(PROGRESS_FILE, 'w', encoding='utf-8') as f:
        json.dump({
            'last_update': datetime.now().isoformat(),
            'completed': completed,
            'failed': failed,
            'pending': len(pending)
        }, f, ensure_ascii=False, indent=2)

def batch_query(names, start_index=0):
    """批次查詢（分批重啟 browser 避免記憶體累積）"""
    total = len(names)
    completed = []
    failed = []

    print(f'\n開始批次查詢，共 {total} 人')
    print(f'分批大小: {BATCH_SIZE}，隨機延遲: {MIN_DELAY}-{MAX_DELAY} 秒')
    print('='*60)

    # 分批處理
    for batch_start in range(start_index, total, BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, total)
        batch_names = names[batch_start:batch_end]
        batch_num = batch_start // BATCH_SIZE + 1
        total_batches = (total + BATCH_SIZE - 1) // BATCH_SIZE

        print(f'\n{"="*60}')
        print(f'開始批次 {batch_num}/{total_batches}（第 {batch_start+1}-{batch_end} 人）')
        print(f'{"="*60}')

        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=['--disable-dev-shm-usage', '--disable-gpu']
            )
            page = browser.new_page()

            for i, name in enumerate(batch_names):
                global_idx = batch_start + i + 1
                print(f'\n[{global_idx}/{total}] 處理中...')

                # 最多重試 2 次
                for attempt in range(2):
                    results, status = query_single(page, name)

                    if status == 'success' and results:
                        # 存入 staging
                        for result in results:
                            save_to_staging(name, result)
                        completed.append(name)
                        break
                    elif status == 'captcha_error' and attempt < 1:
                        print('  重試中...')
                        continue
                    else:
                        failed.append({'name': name, 'status': status})
                        break

                # 儲存進度
                save_progress(completed, failed, names[global_idx:])

                # 隨機延遲（除了最後一個）
                if global_idx < total:
                    delay = random.uniform(MIN_DELAY, MAX_DELAY)
                    print(f'\n  ⏳ 等待 {delay:.1f} 秒...')
                    time.sleep(delay)

            browser.close()
            print(f'\n✓ 批次 {batch_num} 完成，browser 已關閉釋放記憶體')

        # 批次間休息
        if batch_end < total:
            print(f'\n⏳ 批次間休息 5 秒...')
            time.sleep(5)

    # 最終報告
    print('\n' + '='*60)
    print('批次查詢完成！')
    print(f'  成功: {len(completed)} 人')
    print(f'  失敗: {len(failed)} 人')
    if failed:
        print('  失敗清單:')
        for f in failed:
            print(f'    - {f["name"]}: {f["status"]}')

    return completed, failed

def show_staging_status():
    """顯示 staging 表狀態"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    cursor.execute('SELECT COUNT(*) FROM certificate_staging WHERE is_processed = 0')
    pending = cursor.fetchone()[0]

    cursor.execute('SELECT COUNT(*) FROM certificate_staging WHERE is_processed = 1')
    processed = cursor.fetchone()[0]

    print(f'\n=== Staging 表狀態 ===')
    print(f'待處理: {pending}')
    print(f'已處理: {processed}')

    if pending > 0:
        print('\n待處理記錄:')
        cursor.execute('''
            SELECT query_name, name, english_name, moi_certificate_number, query_time
            FROM certificate_staging
            WHERE is_processed = 0
            ORDER BY query_time DESC
            LIMIT 20
        ''')
        for row in cursor.fetchall():
            print(f'  {row[0]} -> {row[1]} ({row[2]}): {row[3]}')

    conn.close()

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='批次查詢估價師證書')
    parser.add_argument('--limit', type=int, default=None, help='查詢人數限制（不指定則查全部）')
    parser.add_argument('--status', action='store_true', help='顯示 staging 狀態')
    args = parser.parse_args()

    init_staging_table()

    if args.status:
        show_staging_status()
    else:
        # limit=0 或 limit=None 都代表查全部
        limit = args.limit if args.limit and args.limit > 0 else None
        names = get_pending_names(limit=limit)
        print(f'取得 {len(names)} 個待查詢姓名')
        batch_query(names)
        show_staging_status()
