"""
批次 OCR 處理腳本
依序處理多個資料夾的 PDF，完成後發送 Telegram 通知
"""
import sys
# Switch stdout to UTF-8 before anything is printed: this script prints
# non-ASCII status text (Chinese and emoji), which must be encodable no
# matter what the console's default encoding is.
sys.stdout.reconfigure(encoding='utf-8')

import os
import subprocess
import time
from pathlib import Path
from datetime import datetime

# Telegram 通知
def send_telegram(message: str):
    """Send *message* through the external Telegram notifier script.

    The notifier is started as a child Python process in UTF-8 mode with the
    message as its only argument and a 30-second timeout. Delivery is
    best-effort: every failure (launch error, timeout, non-zero exit code)
    is printed and swallowed so that a broken notifier never aborts the
    OCR batch.

    Args:
        message: Text handed to the notifier script.
    """
    script_path = Path("C:/Users/User/Documents/GitHub/Research_zoo/projects/telegram-notifier/notifier.py")
    try:
        result = subprocess.run(
            ["python", "-X", "utf8", str(script_path), message],
            # The child writes UTF-8 (-X utf8), so decode its output as UTF-8
            # instead of the parent's locale encoding; errors="replace" keeps
            # stray bytes from raising while the output is collected.
            capture_output=True, text=True, encoding="utf-8", errors="replace",
            timeout=30
        )
    except Exception as e:
        print(f"Telegram 錯誤: {e}")
        return

    # A non-zero exit means the notification was not delivered; say so
    # instead of failing silently.
    if result.returncode != 0:
        detail = (result.stderr or "").strip()
        print(f"Telegram 錯誤: exit code {result.returncode} {detail}".rstrip())

# OCR 單一檔案
def ocr_file(pdf_path: Path, output_dir: Path) -> bool:
    """Convert one PDF by running the external MinerU converter script.

    The converter is started as a child Python process in UTF-8 mode with a
    600-second timeout.

    Args:
        pdf_path: PDF file passed to the converter.
        output_dir: Directory passed to the converter's ``-o`` option.

    Returns:
        True when the converter exits with code 0; False on a non-zero exit
        code, a timeout, or any error while launching the process.
    """
    converter = Path("C:/Users/User/.claude/skills/mineru-ocr/scripts/mineru_converter.py")
    try:
        result = subprocess.run(
            ["python", "-X", "utf8", str(converter), str(pdf_path), "-o", str(output_dir), "-d", "auto"],
            # The child writes UTF-8 (-X utf8); decoding with the parent's
            # locale encoding could raise and make a successful conversion
            # look like a failure.
            capture_output=True, text=True, encoding="utf-8", errors="replace",
            timeout=600
        )
    except Exception as e:
        print(f"OCR 錯誤: {e}")
        return False

    if result.returncode == 0:
        return True

    # Surface the last stderr line so a failed file can be diagnosed.
    stderr_lines = (result.stderr or "").strip().splitlines()
    if stderr_lines:
        print(f"OCR 錯誤: {stderr_lines[-1]}")
    return False

# 處理一個資料夾
def process_folder(name: str, pdf_dir: Path, md_dir: Path, missing_only: list = None):
    """OCR the PDFs of one folder and return the run statistics.

    Args:
        name: Label used in the console banner and in the result.
        pdf_dir: Directory containing the source PDFs.
        md_dir: Output directory handed to ``ocr_file`` for every PDF.
        missing_only: Optional list of file names (relative to ``pdf_dir``)
            to process. ``None`` means every ``*.pdf`` in ``pdf_dir``; an
            empty list means nothing is processed.

    Returns:
        A dict with the keys ``name``, ``total`` (files attempted),
        ``success`` (files converted), ``failed`` (names of the files that
        failed) and ``elapsed`` (duration in minutes).
    """
    print(f"\n{'='*50}")
    print(f"處理 {name}")
    print(f"{'='*50}")

    # Compare against None rather than truthiness: an explicit empty list
    # must not fall through to "process the whole folder".
    if missing_only is not None:
        pdfs = [pdf_dir / f for f in missing_only]
    else:
        # glob() yields entries in no particular order; sort so the progress
        # output is reproducible between runs.
        pdfs = sorted(pdf_dir.glob("*.pdf"))

    total = len(pdfs)
    failed = []

    # A monotonic clock keeps the duration immune to system clock changes.
    start_time = time.monotonic()

    for i, pdf in enumerate(pdfs, 1):
        print(f"[{i}/{total}] {pdf.name}")
        if ocr_file(pdf, md_dir):
            print("  OK")
        else:
            failed.append(pdf.name)
            print("  FAILED")

    elapsed = (time.monotonic() - start_time) / 60

    return {
        "name": name,
        "total": total,
        "success": total - len(failed),
        "failed": failed,
        "elapsed": elapsed
    }

def main():
    """Run the OCR batch over all configured folders and report via Telegram.

    Sends a start notification, processes each folder in order (sending one
    notification per folder), then prints and sends an overall summary.
    Names of files that failed are appended to the folder notification and
    to the summary; when nothing fails the messages are unchanged.
    """
    base = Path("D:/Github/real-estate-reports")

    # (folder name, explicit file names to process, or None for every PDF).
    # For cathay only the five files listed here are processed.
    jobs = [
        ("cathay", ["2017_Q3_news.pdf", "2018_Q1_news.pdf", "2019_Q2_news.pdf",
                    "2019_Q3_news.pdf", "2021_Q2_news.pdf"]),
        ("land-price-index", None),
        ("sinyi-global", None),
    ]

    # NOTE: the counts in this message are hard-coded and are not derived
    # from the folders' actual contents.
    send_telegram("🔄 開始批次 OCR 處理\n- cathay (5)\n- land-price-index (22)\n- sinyi-global (16)")

    results = []
    for name, missing in jobs:
        r = process_folder(name, base / name / "pdf", base / name / "md", missing)
        results.append(r)

        report = f"✅ {name} 完成\n成功: {r['success']}/{r['total']}\n耗時: {r['elapsed']:.1f} 分鐘"
        if r['failed']:
            # Name the failed files so they can be retried.
            report += "\n失敗: " + ", ".join(r['failed'])
        send_telegram(report)

    # Overall summary
    total_files = sum(r['total'] for r in results)
    total_success = sum(r['success'] for r in results)
    total_time = sum(r['elapsed'] for r in results)
    detail_lines = "\n".join(
        f"- {r['name']}: {r['success']}/{r['total']}" for r in results
    )

    summary = f"""🎉 批次 OCR 全部完成！

總計: {total_success}/{total_files} 個檔案
總耗時: {total_time:.1f} 分鐘

詳細:
{detail_lines}"""

    all_failed = [f"- {r['name']}/{fname}" for r in results for fname in r['failed']]
    if all_failed:
        summary += "\n\n失敗:\n" + "\n".join(all_failed)

    print(summary)
    send_telegram(summary)

# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
