"""
學術期刊論文監控系統 - PDF 解析模組

使用 MinerU OCR 將 PDF 轉換為 Markdown，並解析論文結構。

功能：
- PDF → Markdown 轉換（整合 mineru-ocr skill，支援 GPU 加速）
- 論文結構解析（標題、摘要、章節、參考文獻）
- 全文內容提取供 LLM 分析
- 批次處理支援

使用前提：
- MinerU 已安裝（conda env: mineru）
- PDF 已下載到 data/pdfs/ 目錄

整合 mineru-ocr skill：
- GPU 模式：3-5 秒/頁
- CPU 模式：15-20 秒/頁
"""

import sys
sys.stdout.reconfigure(encoding='utf-8')

import json
import logging
import re
import sqlite3
from pathlib import Path
from typing import Optional, Literal

from config import DATABASE_PATH, DATA_DIR, PDF_STORAGE_DIR, PARSED_PAPERS_DIR

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Directory settings
PDF_DIR = PDF_STORAGE_DIR  # external path (not synced to the cloud)
PARSED_DIR = PARSED_PAPERS_DIR  # Research_zoo/papers/parsed/
# Create the parsed-output directory (and missing parents) at import time.
PARSED_DIR.mkdir(parents=True, exist_ok=True)

# Path to the mineru-ocr skill scripts, resolved relative to the parent of
# the directory that contains this file.
SKILL_DIR = Path(__file__).parent.parent / "claude-code-settings-sync" / "skills" / "mineru-ocr" / "scripts"

# Common section headings of academic papers (used for structure parsing).
# Maps a section key to the regular expressions that identify its heading.
# extract_sections() tries them with re.match(..., re.IGNORECASE) against
# each stripped line; sections are tried in the order listed here and the
# first section with a matching pattern wins.
SECTION_PATTERNS = {
    "abstract": [
        r"^#+\s*abstract",
        r"^abstract\s*$",
    ],
    "introduction": [
        r"^#+\s*\d*\.?\s*introduction",
        r"^introduction\s*$",
        r"^#+\s*\d*\.?\s*背景",
    ],
    "literature_review": [
        r"^#+\s*\d*\.?\s*literature\s*review",
        r"^#+\s*\d*\.?\s*related\s*work",
        r"^#+\s*\d*\.?\s*background",
        r"^#+\s*\d*\.?\s*文獻回顧",
    ],
    "methodology": [
        r"^#+\s*\d*\.?\s*method",
        r"^#+\s*\d*\.?\s*data\s*(and|&)?\s*method",
        r"^#+\s*\d*\.?\s*research\s*design",
        r"^#+\s*\d*\.?\s*研究方法",
    ],
    "results": [
        r"^#+\s*\d*\.?\s*results?",
        r"^#+\s*\d*\.?\s*findings?",
        r"^#+\s*\d*\.?\s*empirical\s*results?",
        r"^#+\s*\d*\.?\s*研究結果",
    ],
    "discussion": [
        r"^#+\s*\d*\.?\s*discussion",
        r"^#+\s*\d*\.?\s*討論",
    ],
    "conclusion": [
        r"^#+\s*\d*\.?\s*conclusion",
        r"^#+\s*\d*\.?\s*結論",
    ],
    # Reference headings are matched without the optional number prefix.
    "references": [
        r"^#+\s*references?",
        r"^references?\s*$",
        r"^#+\s*bibliography",
        r"^#+\s*參考文獻",
    ],
}


# ============================================================
# MinerU 整合（透過 mineru-ocr skill）
# ============================================================

def _import_skill_module(module_name: str):
    """Load ``<module_name>.py`` from the mineru-ocr skill directory.

    The file is executed as a fresh module object on every call; this
    function does not register it in ``sys.modules``.

    Args:
        module_name: File stem of the skill script to load.

    Returns:
        The executed module object.

    Raises:
        ImportError: If the script does not exist under ``SKILL_DIR``.
    """
    from importlib.util import module_from_spec, spec_from_file_location

    source_file = SKILL_DIR / f"{module_name}.py"
    if source_file.exists():
        module_spec = spec_from_file_location(module_name, source_file)
        loaded = module_from_spec(module_spec)
        module_spec.loader.exec_module(loaded)
        return loaded

    raise ImportError(f"找不到 skill 模組: {source_file}")


def check_mineru_available() -> tuple[bool, str]:
    """Check whether the MinerU command-line tool can be found.

    The skill's own ``mineru_converter.check_mineru_available()`` is used
    when the skill module can be imported. Otherwise the function probes the
    ``mineru`` conda environment of the common conda distributions in the
    user's home directory (both the Windows ``Scripts/mineru.exe`` layout and
    the POSIX ``bin/mineru`` layout) and finally searches ``PATH``.

    Returns:
        tuple: ``(available, message)``. In the fallback path ``message`` is
        the executable path when available, or an installation hint when not.
    """
    try:
        mineru_converter = _import_skill_module("mineru_converter")
        return mineru_converter.check_mineru_available()
    except ImportError:
        # Skill not loadable: fall through to the direct checks below.
        pass

    from shutil import which

    home = Path.home()
    candidates = []
    for distribution in ("miniforge3", "miniconda3", "anaconda3"):
        env_root = home / distribution / "envs" / "mineru"
        # Windows conda envs keep executables in Scripts/, POSIX ones in bin/.
        candidates.append(env_root / "Scripts" / "mineru.exe")
        candidates.append(env_root / "bin" / "mineru")

    for path in candidates:
        if path.exists():
            return (True, str(path))

    mineru_path = which("mineru")
    if mineru_path:
        return (True, mineru_path)

    return (False, "MinerU 未安裝。請在 conda 環境中安裝：mamba create -n mineru python=3.10 && pip install mineru")


def check_gpu_available() -> dict:
    """Report GPU availability using the skill's ``gpu_detector`` module.

    Returns:
        A dict with the keys ``available``, ``device_name``,
        ``memory_free_gb`` and ``recommendation``. When an ImportError is
        raised while loading or running the detector, a placeholder dict
        with ``available`` set to False is returned instead.
    """
    status = {
        "available": False,
        "device_name": "N/A",
        "memory_free_gb": 0.0,
        "recommendation": "無法載入 GPU 偵測模組",
    }

    try:
        detector = _import_skill_module("gpu_detector")
        detected = detector.detect_gpu()
        advice = detector.get_recommendation()
        status = {
            **{key: detected[key] for key in ("available", "device_name", "memory_free_gb")},
            "recommendation": advice,
        }
    except ImportError:
        # Keep the placeholder built above.
        pass

    return status


def convert_pdf_to_markdown(
    pdf_path: Path,
    output_dir: Optional[Path] = None,
    device: Literal["auto", "cuda", "cpu"] = "auto",
    lang: str = "en",
    timeout: int = 1800,
    validate_quality: bool = True,
) -> dict:
    """Convert a PDF to Markdown through the mineru-ocr skill.

    If the skill module raises ImportError, the simplified
    ``_convert_pdf_fallback`` is used instead. Any other exception is
    captured and reported through the ``error`` field rather than raised.

    Args:
        pdf_path: Path of the PDF file.
        output_dir: Output directory (defaults to ``PARSED_DIR``).
        device: Device selection ("auto" | "cuda" | "cpu").
        lang: Language code passed to the converter.
        timeout: Timeout in seconds (default 1800).
        validate_quality: Whether the converter should validate its output.

    Returns:
        A dict with the keys ``success``, ``markdown_path``, ``content``,
        ``elapsed_time``, ``device_used``, ``pages`` and ``error``.
    """
    outcome = {
        "success": False,
        "markdown_path": "",
        "content": "",
        "elapsed_time": 0,
        "device_used": "",
        "pages": 0,
        "error": "",
    }

    target_dir = PARSED_DIR if output_dir is None else output_dir

    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        outcome["error"] = f"PDF 檔案不存在: {pdf_path}"
        return outcome

    try:
        converter = _import_skill_module("mineru_converter")

        logger.info(f"使用 mineru-ocr skill 轉換: {pdf_path}")
        logger.info(f"裝置模式: {device}")

        skill_result = converter.convert_pdf(
            pdf_path=str(pdf_path),
            output_dir=str(target_dir),
            device=device,
            lang=lang,
            timeout=timeout,
            validate_quality=validate_quality,
        )

        if not skill_result["success"]:
            outcome["error"] = skill_result.get("error", "轉換失敗")
        else:
            # Read the Markdown first so a read failure leaves `outcome` untouched.
            markdown_file = Path(skill_result["markdown_file"])
            markdown_text = markdown_file.read_text(encoding="utf-8")

            outcome.update(
                success=True,
                markdown_path=str(markdown_file),
                content=markdown_text,
                elapsed_time=skill_result.get("elapsed_time", 0),
                device_used=skill_result.get("device_used", ""),
                pages=skill_result.get("pages", 0),
            )

    except ImportError as e:
        # Skill unavailable: use the simplified converter.
        logger.warning(f"無法載入 skill 模組，使用簡化版本: {e}")
        return _convert_pdf_fallback(pdf_path, target_dir, lang, timeout)
    except Exception as e:
        outcome["error"] = f"轉換錯誤: {e}"

    return outcome


def _convert_pdf_fallback(
    pdf_path: Path,
    output_dir: Path,
    lang: str,
    timeout: int,
) -> dict:
    """Simplified PDF conversion used when the skill cannot be loaded.

    Runs the ``mineru`` executable as a subprocess and then looks for the
    generated Markdown file in the known output locations.

    Args:
        pdf_path: PDF file to convert (``str`` is accepted as well).
        output_dir: Directory passed to ``mineru -o`` (``str`` is accepted
            as well).
        lang: Language code passed to ``mineru -l``.
        timeout: Maximum run time of the subprocess in seconds.

    Returns:
        A dict with the same keys as ``convert_pdf_to_markdown``;
        ``device_used`` is always reported as ``"cpu"``. Failures are
        reported through ``error`` instead of being raised.
    """
    import subprocess
    import time

    result = {
        "success": False,
        "markdown_path": "",
        "content": "",
        "elapsed_time": 0,
        "device_used": "cpu",
        "pages": 0,
        "error": "",
    }

    # Normalise up front: the "/" joins below require Path objects, and a
    # plain string would otherwise only fail after the conversion has run.
    pdf_path = Path(pdf_path)
    output_dir = Path(output_dir)

    # Locate MinerU
    available, mineru_path = check_mineru_available()
    if not available:
        # In the unavailable case the second element is the hint message.
        result["error"] = mineru_path
        return result

    pdf_name = pdf_path.stem

    try:
        # Monotonic clock: elapsed time is unaffected by system clock changes.
        start_time = time.monotonic()

        cmd = [mineru_path, "-p", str(pdf_path), "-o", str(output_dir), "-l", lang]
        process = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
            text=True,
            encoding="utf-8",
            errors="ignore",
        )

        result["elapsed_time"] = round(time.monotonic() - start_time, 2)

        if process.returncode != 0:
            result["error"] = f"MinerU 執行失敗: {process.stderr}"
            return result

        # Locate the Markdown file; the first existing candidate wins.
        candidates = [
            output_dir / pdf_name / "auto" / f"{pdf_name}.md",
            output_dir / pdf_name / f"{pdf_name}.md",
            output_dir / f"{pdf_name}.md",
        ]
        md_path = next((path for path in candidates if path.exists()), None)

        if md_path is None:
            result["error"] = "轉換完成但找不到 Markdown 檔案"
            return result

        content = md_path.read_text(encoding="utf-8")

        result["success"] = True
        result["markdown_path"] = str(md_path)
        result["content"] = content

    except subprocess.TimeoutExpired:
        result["error"] = f"MinerU 執行超時（超過 {timeout} 秒）"
    except Exception as e:
        result["error"] = f"轉換錯誤: {e}"

    return result


def batch_convert_pdfs(
    pdf_paths: list,
    output_dir: Optional[Path] = None,
    device: Literal["auto", "cuda", "cpu"] = "auto",
    lang: str = "en",
) -> dict:
    """Convert several PDFs, preferring the skill's batch processor.

    When the skill's ``batch_processor`` raises ImportError, each PDF is
    converted one after another with ``convert_pdf_to_markdown``.

    Args:
        pdf_paths: List of PDF file paths.
        output_dir: Output directory (defaults to ``PARSED_DIR``).
        device: Device selection.
        lang: Language code.

    Returns:
        In the sequential fallback, a dict with ``total``, ``success``,
        ``failed`` and ``results``; otherwise whatever the skill's
        ``batch_convert`` returns.
    """
    target_dir = PARSED_DIR if output_dir is None else output_dir

    try:
        processor = _import_skill_module("batch_processor")
        return processor.batch_convert(
            pdf_paths=[str(path) for path in pdf_paths],
            output_dir=str(target_dir),
            device=device,
            lang=lang,
        )
    except ImportError:
        # Fallback: process the files sequentially.
        outcomes = [
            convert_pdf_to_markdown(path, target_dir, device, lang)
            for path in pdf_paths
        ]
        succeeded = sum(1 for item in outcomes if item["success"])

        return {
            "total": len(pdf_paths),
            "success": succeeded,
            "failed": len(outcomes) - succeeded,
            "results": outcomes,
        }


# ============================================================
# 論文結構解析
# ============================================================

def _match_section_heading(text: str) -> Optional[str]:
    """Return the first SECTION_PATTERNS key whose pattern matches *text*, else None."""
    for section_name, patterns in SECTION_PATTERNS.items():
        if any(re.match(pattern, text, re.IGNORECASE) for pattern in patterns):
            return section_name
    return None


def extract_sections(markdown_content: str) -> dict:
    """Split a paper's Markdown into its well-known sections.

    A line that matches one of ``SECTION_PATTERNS`` starts that section.
    Every following line, including headings that match no pattern, belongs
    to it until the next recognised heading. Lines before the first
    recognised heading are not assigned to any section. When the same
    section is started more than once, the non-empty pieces are kept in
    document order and joined with a blank line, so a later occurrence no
    longer overwrites the earlier text.

    Args:
        markdown_content: Full Markdown text.

    Returns:
        {
            "title": str,              # first "#" heading that is not a section heading
            "abstract": str,
            "introduction": str,
            "literature_review": str,
            "methodology": str,
            "results": str,
            "discussion": str,
            "conclusion": str,
            "references": str,
            "full_text": str,          # the unmodified input
        }
        Sections that are not found stay empty strings.
    """
    sections = {
        "title": "",
        "abstract": "",
        "introduction": "",
        "literature_review": "",
        "methodology": "",
        "results": "",
        "discussion": "",
        "conclusion": "",
        "references": "",
        "full_text": markdown_content,
    }

    lines = markdown_content.split("\n")

    # Title: the first heading line that is not itself a section heading.
    for line in lines:
        stripped = line.strip()
        if stripped.startswith("#") and _match_section_heading(stripped) is None:
            sections["title"] = re.sub(r"^#+\s*", "", stripped)
            break

    # Text pieces collected per section, in document order.
    pieces: dict[str, list[str]] = {}

    def _store(section_name: Optional[str], content_lines: list) -> None:
        """Record the stripped text of a finished section if it is non-empty."""
        text = "\n".join(content_lines).strip()
        if section_name and text:
            pieces.setdefault(section_name, []).append(text)

    current_section = None
    current_content: list[str] = []

    for line in lines:
        new_section = _match_section_heading(line.strip().lower())

        if new_section:
            # A recognised heading closes the section that was open so far.
            _store(current_section, current_content)
            current_section = new_section
            current_content = []
        elif current_section:
            current_content.append(line)

    # Close the last open section.
    _store(current_section, current_content)

    for section_name, texts in pieces.items():
        sections[section_name] = "\n\n".join(texts)

    return sections


def extract_key_findings(sections: dict) -> list:
    """Pick sentences that look like findings from the parsed sections.

    The ``results``, ``conclusion`` and ``abstract`` sections are scanned in
    that order. Text is split on ``.``, ``!`` or ``?`` followed by
    whitespace; a sentence is kept when it is at least 20 characters long
    and contains one of the signal words (case-insensitive substring match).

    Args:
        sections: The return value of ``extract_sections()``.

    Returns:
        At most 10 matching sentences, in the order they were found.
    """
    signal_words = (
        "find", "found", "show", "shows", "demonstrate",
        "result", "suggest", "indicate", "reveal", "discover",
        "significant", "important", "conclusion", "evidence",
    )
    min_length = 20  # shorter fragments are discarded
    max_findings = 10

    collected = []
    for name in ("results", "conclusion", "abstract"):
        text = sections.get(name, "")
        if not text:
            continue

        # Naive sentence split; the terminator before whitespace is dropped.
        candidates = (piece.strip() for piece in re.split(r"[.!?]\s+", text))
        collected.extend(
            candidate
            for candidate in candidates
            if len(candidate) >= min_length
            and any(word in candidate.lower() for word in signal_words)
        )

    return collected[:max_findings]


def extract_methodology_summary(sections: dict) -> str:
    """Return a short summary of the methodology section.

    Args:
        sections: The return value of ``extract_sections()``.

    Returns:
        An empty string when there is no methodology text; the text itself
        when it has at most 500 characters; otherwise its first 500
        characters followed by ``"..."``.
    """
    limit = 500
    text = sections.get("methodology", "")
    if not text:
        return ""

    return text if len(text) <= limit else text[:limit] + "..."


# ============================================================
# 資料庫整合
# ============================================================

def save_parsed_paper(paper_id: int, parsed_data: dict) -> bool:
    """Store the parsed result of a paper in the ``paper_parsed`` table.

    The table is created if it does not exist. An existing row for the same
    ``paper_id`` is replaced. ``key_findings`` is stored as a JSON string.
    The ``literature_review`` section is not persisted because the table has
    no column for it.

    Args:
        paper_id: Paper ID.
        parsed_data: Parsed data; missing keys are stored as empty values.

    Returns:
        True on success; False when the database file does not exist or any
        error occurs (the error is logged, not raised).
    """
    if not DATABASE_PATH.exists():
        logger.error("資料庫不存在")
        return False

    conn = None
    try:
        conn = sqlite3.connect(DATABASE_PATH)
        cursor = conn.cursor()

        # Create the table if needed. REFERENCES is an SQL keyword, so the
        # column of that name must be quoted; unquoted it is a syntax error.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS paper_parsed (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                paper_id INTEGER UNIQUE,
                markdown_path TEXT,
                title TEXT,
                abstract TEXT,
                introduction TEXT,
                methodology TEXT,
                results TEXT,
                discussion TEXT,
                conclusion TEXT,
                "references" TEXT,
                key_findings TEXT,
                full_text TEXT,
                parsed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (paper_id) REFERENCES papers(id)
            )
        """)

        # Insert or replace (paper_id is UNIQUE, so a re-parse replaces the row).
        cursor.execute("""
            INSERT OR REPLACE INTO paper_parsed
            (paper_id, markdown_path, title, abstract, introduction,
             methodology, results, discussion, conclusion, "references",
             key_findings, full_text)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            paper_id,
            parsed_data.get("markdown_path", ""),
            parsed_data.get("title", ""),
            parsed_data.get("abstract", ""),
            parsed_data.get("introduction", ""),
            parsed_data.get("methodology", ""),
            parsed_data.get("results", ""),
            parsed_data.get("discussion", ""),
            parsed_data.get("conclusion", ""),
            parsed_data.get("references", ""),
            json.dumps(parsed_data.get("key_findings", [])),
            parsed_data.get("full_text", ""),
        ))

        conn.commit()
        return True

    except Exception as e:
        logger.error(f"儲存解析結果失敗: {e}")
        return False

    finally:
        # Close the connection on the error path as well.
        if conn is not None:
            conn.close()


def get_parsed_paper(paper_id: int) -> Optional[dict]:
    """Fetch the stored parse result of a paper.

    Args:
        paper_id: Paper ID.

    Returns:
        The ``paper_parsed`` row as a dict, with ``key_findings`` decoded
        from JSON when it is non-empty. None when the database file does
        not exist, no row matches, or an error occurs (the error is logged,
        not raised).
    """
    if not DATABASE_PATH.exists():
        return None

    conn = None
    try:
        conn = sqlite3.connect(DATABASE_PATH)
        conn.row_factory = sqlite3.Row  # rows become convertible to dict
        cursor = conn.cursor()

        cursor.execute("""
            SELECT * FROM paper_parsed WHERE paper_id = ?
        """, (paper_id,))

        row = cursor.fetchone()
        if row is None:
            return None

        data = dict(row)
        if data.get("key_findings"):
            data["key_findings"] = json.loads(data["key_findings"])
        return data

    except Exception as e:
        logger.error(f"取得解析資料失敗: {e}")
        return None

    finally:
        # Close the connection even when the query or JSON decoding fails.
        if conn is not None:
            conn.close()


# ============================================================
# 完整解析流程
# ============================================================

def parse_paper_pdf(
    pdf_path: str,
    paper_id: Optional[int] = None,
    device: Literal["auto", "cuda", "cpu"] = "auto",
) -> dict:
    """Run the complete parsing pipeline for one paper PDF.

    Steps: convert the PDF to Markdown, split it into sections, extract key
    findings and a methodology summary, and, when ``paper_id`` is given,
    store the result in the database. A failed database save is logged but
    does not turn the overall result into a failure.

    Args:
        pdf_path: Path of the PDF file.
        paper_id: Paper ID (optional, used for database storage).
        device: Device passed on to ``convert_pdf_to_markdown``
            ("auto" | "cuda" | "cpu").

    Returns:
        {
            "success": bool,
            "markdown_path": str,
            "sections": dict,
            "key_findings": list,
            "methodology_summary": str,
            "error": str,
        }
    """
    result = {
        "success": False,
        "markdown_path": "",
        "sections": {},
        "key_findings": [],
        "methodology_summary": "",
        "error": "",
    }

    # Step 1: PDF -> Markdown
    logger.info(f"解析 PDF: {pdf_path}")
    conversion = convert_pdf_to_markdown(Path(pdf_path), device=device)

    if not conversion["success"]:
        result["error"] = conversion["error"]
        return result

    result["markdown_path"] = conversion["markdown_path"]

    # Step 2: extract sections
    sections = extract_sections(conversion["content"])
    result["sections"] = sections

    # Step 3: extract key findings
    result["key_findings"] = extract_key_findings(sections)

    # Step 4: extract methodology summary
    result["methodology_summary"] = extract_methodology_summary(sections)

    # Step 5: store in the database when a paper_id was supplied.
    # Compare against None so that an id of 0 is not silently skipped.
    if paper_id is not None:
        parsed_data = {
            "markdown_path": result["markdown_path"],
            **sections,
            "key_findings": result["key_findings"],
        }
        if not save_parsed_paper(paper_id, parsed_data):
            logger.warning("paper_id=%s: parsed result was not saved to the database", paper_id)

    result["success"] = True
    return result


# ============================================================
# CLI
# ============================================================

def main():
    """Command-line entry point: check MinerU, check the GPU, or parse a PDF."""
    import argparse

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(message)s",
    )

    parser = argparse.ArgumentParser(description="論文 PDF 解析工具（整合 mineru-ocr skill）")
    parser.add_argument("--check", action="store_true", help="檢查 MinerU 是否可用")
    parser.add_argument("--gpu", action="store_true", help="檢查 GPU 狀態")
    parser.add_argument("--parse", type=str, metavar="PDF", help="解析指定 PDF 檔案")
    parser.add_argument("--device", choices=["auto", "cuda", "cpu"], default="auto",
                        help="裝置選擇（預設: auto）")
    parser.add_argument("--paper-id", type=int, help="關聯的論文 ID（用於資料庫儲存）")
    args = parser.parse_args()

    if args.check:
        available, message = check_mineru_available()
        if available:
            print(f"✅ MinerU 可用: {message}")
        else:
            print(f"❌ MinerU 不可用: {message}")
        return

    if args.gpu:
        print("=" * 60)
        print("GPU 狀態檢查")
        print("=" * 60)
        gpu_info = check_gpu_available()
        if gpu_info["available"]:
            print("✅ GPU 可用")
            print(f"   型號: {gpu_info['device_name']}")
            print(f"   可用記憶體: {gpu_info['memory_free_gb']:.1f} GB")
        else:
            print("❌ GPU 不可用")
        print()
        print(gpu_info["recommendation"])
        print("=" * 60)
        return

    if args.parse:
        print(f"解析 PDF: {args.parse}")
        print(f"裝置模式: {args.device}")

        # Show the GPU status first
        gpu_info = check_gpu_available()
        if gpu_info["available"]:
            print(f"GPU: {gpu_info['device_name']} ({gpu_info['memory_free_gb']:.1f} GB 可用)")
        else:
            print("GPU: 不可用，將使用 CPU")

        # Forward --device so the requested device is actually used.
        result = parse_paper_pdf(args.parse, args.paper_id, device=args.device)

        if result["success"]:
            print("\n✅ 解析成功")
            print(f"   Markdown 路徑: {result['markdown_path']}")
            print(f"   標題: {result['sections'].get('title', 'N/A')[:50]}...")
            print(f"   摘要長度: {len(result['sections'].get('abstract', ''))} 字元")
            print(f"   關鍵發現: {len(result['key_findings'])} 項")
        else:
            print(f"\n❌ 解析失敗: {result['error']}")
        return

    # No action selected: print the usage text.
    print("用法:")
    print("  --check           檢查 MinerU 是否可用")
    print("  --gpu             檢查 GPU 狀態")
    print("  --parse PDF       解析指定 PDF 檔案")
    print("  --device MODE     裝置選擇（auto/cuda/cpu，預設: auto）")
    print("  --paper-id N      關聯的論文 ID")
    print()
    print("整合 mineru-ocr skill：")
    print("  - GPU 模式: 3-5 秒/頁")
    print("  - CPU 模式: 15-20 秒/頁")


if __name__ == "__main__":
    main()
