#!/usr/bin/env python3
"""
MOPS 公告 LLM 結構化解析器
==========================
使用 Claude CLI 將原始公告內容解析成結構化 JSON。

使用方式：
    python parse_announcements.py              # 解析所有未解析的公告
    python parse_announcements.py --limit 10   # 只解析 10 筆
    python parse_announcements.py --reparse    # 重新解析所有公告
    python parse_announcements.py --id 123     # 解析特定 ID
    python parse_announcements.py --dry-run    # 測試模式，不寫入資料庫
    python parse_announcements.py --workers 4  # 4 個並行處理（建議 3-5）
"""

import sys
sys.stdout.reconfigure(encoding='utf-8')

import sqlite3
import subprocess
import json
import argparse
import logging
from pathlib import Path
from datetime import datetime
from typing import Optional, Dict, Any, List, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import time

# ===== 設定 =====
BASE_DIR = Path(__file__).parent
DB_PATH = BASE_DIR / "data" / "mops_announcements.db"
LOG_DIR = BASE_DIR / "logs"

# ===== JSON Schema =====
# 定義 LLM 輸出的結構化格式
PARSE_SCHEMA = {
    "type": "object",
    "properties": {
        "category": {
            "type": "string",
            "enum": ["A.不動產", "B.有價證券", "C.財務", "D.公司治理", "E.投資人關係", "F.法律", "G.併購重組", "Z.其他"],
            "description": "公告主類型"
        },
        "subcategory": {
            "type": "string",
            "enum": [
                "A1.取得", "A2.處分", "A3.使用權資產", "A4.都更合建", "A5.工程發包",
                "B1.取得", "B2.處分", "B3.增資私募", "B4.轉投資", "B5.公司債",
                "C1.背書保證", "C2.資金貸與", "C3.財報營收", "C4.減資", "C5.股利發放",
                "D1.董監事", "D2.經理人", "D3.股東會", "D4.會計發言人",
                "E1.法說會", "E2.重大訊息",
                "F1.訴訟仲裁", "F2.裁罰",
                "G1.合併收購", "G2.分割",
                "Z1.其他"
            ],
            "description": "子類型代碼"
        },
        "summary": {
            "type": "string",
            "description": "一句話摘要（30字內）"
        },
        "transaction": {
            "type": "object",
            "properties": {
                "type": {"type": "string", "description": "交易類型：取得/處分/租賃/都更/合建/增資/其他"},
                "amount": {"type": "number", "description": "交易金額（數字，單位：元）"},
                "currency": {"type": "string", "description": "幣別：TWD/USD/JPY/RMB/EUR"},
                "unit_price": {"type": "string", "description": "單價描述（如：620元/坪/月）"}
            }
        },
        "target": {
            "type": "object",
            "description": "標的物資訊（不動產相關才需要）",
            "properties": {
                "description": {"type": "string", "description": "標的物描述"},
                "location": {"type": "string", "description": "地點（縣市區）"},
                "type": {"type": "string", "description": "類型：土地/建物/廠房/辦公室/停車位"},
                "area": {"type": "string", "description": "面積（含單位）"}
            }
        },
        "counterparties": {
            "type": "array",
            "description": "交易對象列表（可能有多個）",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string", "description": "交易對象名稱"},
                    "is_related_party": {"type": "boolean", "description": "是否為關係人"},
                    "relationship": {"type": "string", "description": "關係說明（母公司/子公司/董事等）"}
                }
            }
        },
        "appraisal": {
            "type": "object",
            "description": "估價資訊",
            "properties": {
                "has_appraiser": {"type": "boolean", "description": "是否有委託估價師"},
                "appraisers": {
                    "type": "array",
                    "description": "估價事務所列表，每個事務所包含其估價師和金額",
                    "items": {
                        "type": "object",
                        "properties": {
                            "firm": {"type": "string", "description": "估價事務所名稱"},
                            "names": {"type": "array", "items": {"type": "string"}, "description": "該事務所簽名估價師姓名列表"},
                            "amount": {"type": "number", "description": "該事務所估價金額（元）"}
                        }
                    }
                },
                "result": {"type": "string", "description": "估價結果描述（如：共同負擔比率35%）"}
            }
        },
        "key_terms": {
            "type": "object",
            "description": "重要條款",
            "properties": {
                "lease_period": {"type": "string", "description": "租期"},
                "monthly_rent": {"type": "number", "description": "月租金（元）"},
                "build_ratio": {"type": "string", "description": "合建分配比例"},
                "purpose": {"type": "string", "description": "取得/處分目的"},
                "other": {"type": "string", "description": "其他重要條款"}
            }
        }
    },
    "required": ["category", "subcategory", "summary"]
}

# ===== LLM Prompt =====
SYSTEM_PROMPT = """你是一個專業的證券公告分析師。請將公告內容解析成結構化 JSON。

## 分類規則

主類型：
- A.不動產：土地、建物、廠房、使用權資產、都更、合建、租賃
- B.有價證券：股票、基金、私募、增資、轉投資、公司債
- C.財務：背書保證、資金貸與、財報、減資
- D.公司治理：董監事、經理人、股東會、會計主管、發言人
- E.投資人關係：法說會、重大訊息澄清
- F.法律：訴訟、仲裁、裁罰
- G.併購重組：合併、收購、分割
- Z.其他：無法歸類

子類型：
- A1.取得, A2.處分, A3.使用權資產, A4.都更合建, A5.工程發包
- B1.取得, B2.處分, B3.增資私募, B4.轉投資, B5.公司債
- C1.背書保證, C2.資金貸與, C3.財報營收, C4.減資
- D1.董監事, D2.經理人, D3.股東會, D4.會計發言人
- E1.法說會, E2.重大訊息
- F1.訴訟仲裁, F2.裁罰
- G1.合併收購, G2.分割
- Z1.其他

## 重要規則

1. 金額一律轉換成數字（元），外幣需標明幣別
2. 如果欄位資訊不存在或為「不適用」，該欄位設為 null
3. summary 用繁體中文，30字內，格式：「[公司]做了[什麼事]」
4. 仔細辨識是否有委託估價師，估價師欄位標示「不適用」代表沒有委託
5. 關係人交易要標註 is_related_party: true

## 估價師結構（重要）

appraisal.appraisers 是以「事務所」為單位的陣列，每個事務所包含：
- firm: 事務所名稱
- names: 該事務所簽名的估價師姓名列表（可能多人）
- amount: 該事務所的估價金額

範例：A事務所（估價1.2億，甲乙兩位估價師簽名）、B事務所（估價1.3億，丙估價師簽名）
```json
{
  "appraisers": [
    {"firm": "A事務所", "names": ["甲", "乙"], "amount": 120000000},
    {"firm": "B事務所", "names": ["丙"], "amount": 130000000}
  ]
}
```

## 交易對象結構

counterparties 是陣列，可能有多個交易對象，每個包含 name, is_related_party, relationship
"""

def setup_logging():
    """設定日誌"""
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    log_file = LOG_DIR / f"parser_{datetime.now().strftime('%Y%m')}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        handlers=[
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler(sys.stdout)
        ]
    )
    return logging.getLogger(__name__)

logger = setup_logging()


def parse_with_claude(subject: str, content: str) -> Optional[Dict[str, Any]]:
    """使用 Claude CLI 解析公告"""

    # 準備輸入（截取前 3000 字，避免太長）
    input_text = f"""## 主旨
{subject}

## 內容
{content[:3000]}
"""

    try:
        # 呼叫 Claude CLI - 透過 stdin 傳入資料
        cmd = [
            "claude", "-p",
            "--model", "sonnet",  # 明確指定使用 Sonnet 模型
            "--output-format", "json",
            "--json-schema", json.dumps(PARSE_SCHEMA),
            "--system-prompt", SYSTEM_PROMPT,
            "--max-budget-usd", "0.05",  # 限制單次成本
        ]

        result = subprocess.run(
            cmd,
            input=input_text,
            capture_output=True,
            text=True,
            timeout=120,
            encoding='utf-8',
            shell=True
        )

        if result.returncode != 0:
            logger.warning(f"Claude CLI 錯誤: {result.stderr}")
            return None

        if not result.stdout or not result.stdout.strip():
            logger.warning(f"Claude CLI 回傳空白。stderr: {result.stderr[:200] if result.stderr else 'EMPTY'}")
            return None

        # 解析輸出
        output = json.loads(result.stdout)

        # Claude CLI JSON 輸出格式：
        # - 使用 --json-schema 時，結構化輸出在 "structured_output" 欄位
        # - 一般模式時，文字輸出在 "result" 欄位
        if "structured_output" in output and output["structured_output"]:
            return output["structured_output"]
        elif "result" in output and output["result"]:
            # 嘗試解析 result 為 JSON
            try:
                return json.loads(output["result"])
            except json.JSONDecodeError:
                logger.warning(f"result 欄位非 JSON: {output['result'][:100]}")
                return None

        logger.warning("Claude CLI 輸出無有效資料")
        return None

    except subprocess.TimeoutExpired:
        logger.warning("Claude CLI 超時")
        return None
    except json.JSONDecodeError as e:
        logger.warning(f"JSON 解析錯誤: {e}")
        return None
    except Exception as e:
        logger.warning(f"解析錯誤: {e}")
        return None


def get_unparsed_announcements(conn: sqlite3.Connection, limit: int = None) -> list:
    """取得尚未解析的公告"""
    cursor = conn.cursor()

    query = """
        SELECT id, stock_code, company_name, subject, content
        FROM announcements
        WHERE parsed_data IS NULL
        ORDER BY announce_date DESC
    """

    if limit:
        query += f" LIMIT {limit}"

    cursor.execute(query)
    return cursor.fetchall()


def save_parsed_data(conn: sqlite3.Connection, announcement_id: int, parsed: Dict[str, Any]):
    """儲存解析結果"""
    cursor = conn.cursor()

    cursor.execute("""
        UPDATE announcements
        SET parsed_data = ?,
            category = ?,
            subcategory = ?,
            parsed_at = ?
        WHERE id = ?
    """, (
        json.dumps(parsed, ensure_ascii=False),
        parsed.get("category"),
        parsed.get("subcategory"),
        datetime.now().isoformat(),
        announcement_id
    ))

    conn.commit()


def parse_single(conn: sqlite3.Connection, announcement_id: int, dry_run: bool = False):
    """解析單一公告"""
    cursor = conn.cursor()
    cursor.execute("""
        SELECT id, stock_code, company_name, subject, content
        FROM announcements WHERE id = ?
    """, (announcement_id,))

    row = cursor.fetchone()
    if not row:
        logger.error(f"找不到公告 ID: {announcement_id}")
        return

    ann_id, stock_code, company_name, subject, content = row

    logger.info(f"解析: [{stock_code}] {company_name} - {subject[:40]}")

    parsed = parse_with_claude(subject, content)

    if parsed:
        logger.info(f"  類型: {parsed.get('category')} / {parsed.get('subcategory')}")
        logger.info(f"  摘要: {parsed.get('summary')}")

        if not dry_run:
            save_parsed_data(conn, ann_id, parsed)
            logger.info(f"  已儲存")
        else:
            logger.info(f"  [Dry-run] 不儲存")
            print(json.dumps(parsed, ensure_ascii=False, indent=2))
    else:
        logger.warning(f"  解析失敗")


def parse_single_task(task: Tuple[int, int, str, str, str, int]) -> Tuple[int, str, str, Optional[Dict[str, Any]]]:
    """
    執行緒任務：解析單一公告

    Args:
        task: (index, ann_id, stock_code, subject, content, total)

    Returns:
        (ann_id, stock_code, subject, parsed_result)
    """
    index, ann_id, stock_code, subject, content, total = task

    logger.info(f"[{index}/{total}] [{stock_code}] {subject[:40]}")

    parsed = parse_with_claude(subject, content)

    if parsed:
        logger.info(f"  → {parsed.get('category')} | {parsed.get('summary', '')[:30]}")
    else:
        logger.warning(f"  → 解析失敗")

    return (ann_id, stock_code, subject, parsed)


def parse_batch(conn: sqlite3.Connection, limit: int = None, dry_run: bool = False, workers: int = 1):
    """批次解析公告（支援並行處理）"""
    announcements = get_unparsed_announcements(conn, limit)

    logger.info(f"待解析公告: {len(announcements)} 筆")

    if workers > 1:
        logger.info(f"並行模式: {workers} workers")

    if not announcements:
        logger.info("沒有待解析的公告")
        return

    success = 0
    failed = 0
    total = len(announcements)

    # 準備任務列表
    tasks = [
        (i, row[0], row[1], row[3], row[4], total)  # index, ann_id, stock_code, subject, content, total
        for i, row in enumerate(announcements, 1)
    ]

    if workers == 1:
        # 循序處理（原本的行為）
        for task in tasks:
            ann_id, stock_code, subject, parsed = parse_single_task(task)

            if parsed:
                if not dry_run:
                    save_parsed_data(conn, ann_id, parsed)
                success += 1
            else:
                failed += 1
    else:
        # 並行處理
        results = []

        with ThreadPoolExecutor(max_workers=workers) as executor:
            # 提交任務（間隔 0.5 秒，降低 rate limit 風險）
            future_to_task = {}
            for i, task in enumerate(tasks):
                future = executor.submit(parse_single_task, task)
                future_to_task[future] = task
                if i < len(tasks) - 1:  # 最後一個不用等
                    time.sleep(0.5)

            # 收集結果
            for future in as_completed(future_to_task):
                try:
                    result = future.result()
                    results.append(result)
                except Exception as e:
                    logger.error(f"執行緒錯誤: {e}")
                    failed += 1

        # 循序儲存（避免 SQLite 並發問題）
        logger.info(f"\n儲存結果中...")
        for ann_id, stock_code, subject, parsed in results:
            if parsed:
                if not dry_run:
                    save_parsed_data(conn, ann_id, parsed)
                success += 1
            else:
                failed += 1

    logger.info(f"\n完成: 成功 {success} 筆，失敗 {failed} 筆")


def main():
    parser = argparse.ArgumentParser(description='MOPS 公告 LLM 結構化解析器')
    parser.add_argument('--limit', type=int, help='限制解析筆數')
    parser.add_argument('--reparse', action='store_true', help='重新解析所有公告')
    parser.add_argument('--id', type=int, help='解析特定公告 ID')
    parser.add_argument('--dry-run', action='store_true', help='測試模式，不寫入資料庫')
    parser.add_argument('--workers', type=int, default=1, help='並行處理數（預設 1，建議 3-5）')

    args = parser.parse_args()

    if not DB_PATH.exists():
        logger.error(f"資料庫不存在: {DB_PATH}")
        sys.exit(1)

    conn = sqlite3.connect(DB_PATH)

    # 確保欄位存在
    cursor = conn.cursor()
    for col in ["parsed_data", "category", "subcategory", "parsed_at"]:
        try:
            cursor.execute(f"ALTER TABLE announcements ADD COLUMN {col} TEXT")
        except sqlite3.OperationalError:
            pass

    if args.reparse:
        # 清除所有解析結果
        cursor.execute("UPDATE announcements SET parsed_data = NULL, category = NULL, subcategory = NULL, parsed_at = NULL")
        conn.commit()
        logger.info("已清除所有解析結果")

    if args.id:
        parse_single(conn, args.id, args.dry_run)
    else:
        parse_batch(conn, args.limit, args.dry_run, args.workers)

    conn.close()


if __name__ == "__main__":
    main()
