"""
MOPS 公司資料批次抓取工具
支援斷點續抓、進度追蹤、錯誤處理
"""
import sys
# Switch stdout to UTF-8 before anything is printed; presumably needed so the
# Chinese status messages render on consoles with a non-UTF-8 default encoding.
sys.stdout.reconfigure(encoding='utf-8')

import json
import time
from datetime import datetime, timedelta
from pathlib import Path

# 匯入 scraper 的功能
from scraper import scrape_company, save_snapshot, get_latest, compare_data, get_data_dir

def load_companies() -> list:
    """Load the company list from ``companies.json`` in the data directory.

    Returns:
        The value stored under the ``companies`` key of the JSON document,
        or an empty list when the file is missing or has no such key.
    """
    source = get_data_dir() / "companies.json"

    if source.exists():
        with open(source, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        return payload.get('companies', [])

    # No list on disk yet: point the user at the sync script named in the message.
    print("錯誤：請先執行 sync_companies.py 同步公司清單")
    return []

def load_progress() -> dict:
    """Load the saved batch progress, or a fresh record if none exists.

    Values read from ``batch_progress.json`` are merged over a complete
    default record, so the returned dict always contains the keys
    ``completed``, ``failed``, ``started_at`` and ``last_updated`` even when
    the file on disk lacks some of them.

    Returns:
        The progress dict.

    Raises:
        ValueError: If the file exists but its top-level JSON value is not
            an object.
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    progress = {
        "completed": [],
        "failed": [],
        "started_at": None,
        "last_updated": None
    }

    progress_file = get_data_dir() / "batch_progress.json"
    if progress_file.exists():
        with open(progress_file, 'r', encoding='utf-8') as f:
            stored = json.load(f)
        # Fail loudly instead of handing callers something they would crash
        # on later with an unrelated-looking TypeError/KeyError.
        if not isinstance(stored, dict):
            raise ValueError(f"進度檔格式錯誤: {progress_file}")
        progress.update(stored)

    return progress

def save_progress(progress: dict):
    """Persist *progress* to ``batch_progress.json`` in the data directory.

    ``progress["last_updated"]`` is set to the current local time (ISO 8601)
    before writing, so the dict passed in is mutated.

    The JSON is first written to a sibling ``.tmp`` file and then moved over
    the real file, so an interruption in the middle of the write cannot leave
    a truncated progress file behind. A stale ``.tmp`` file left by such an
    interruption is simply overwritten by the next save.

    Args:
        progress: The progress record to store; must be JSON-serialisable.
    """
    progress_file = get_data_dir() / "batch_progress.json"
    tmp_file = progress_file.with_name(progress_file.name + ".tmp")

    progress["last_updated"] = datetime.now().isoformat()
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump(progress, f, ensure_ascii=False, indent=2)

    # Swap the fully written file into place in one step.
    tmp_file.replace(progress_file)

def format_duration(seconds: float) -> str:
    """Render a duration given in seconds as a short human-readable string.

    Durations under a minute are shown as whole seconds, durations under an
    hour as minutes with one decimal, and anything longer as hours with one
    decimal.
    """
    if seconds < 60:
        return f"{seconds:.0f} 秒"

    # Minutes and hours share the same one-decimal layout; only the divisor
    # and the unit label differ.
    if seconds < 3600:
        divisor, unit = 60, "分鐘"
    else:
        divisor, unit = 3600, "小時"
    return f"{seconds / divisor:.1f} {unit}"

def batch_scrape(start_index: int = 0, limit: int = None, delay: float = 2.0,
                 skip_existing: bool = True, filter_type: str = None):
    """
    Scrape company data in batch, recording progress so a run can be resumed.

    For every selected company the latest stored snapshot is fetched, the
    company is scraped again, the two are compared and a new snapshot is
    saved. Successes and failures are recorded in the progress file, which is
    written every 10 companies and once more when the loop ends, including
    when it ends because of Ctrl+C or an unexpected error.

    Args:
        start_index: 0-based index into the (filtered) company list to start from.
        limit: Maximum number of companies to scrape (None = no limit;
            0 selects nothing).
        delay: Seconds to sleep between two consecutive scrapes.
        skip_existing: Skip companies already recorded as completed.
        filter_type: 'listed', 'otc', or None for no filtering.
    """
    companies = load_companies()
    if not companies:
        return

    # Optional filtering by company type.
    if filter_type == 'listed':
        # Listed-company codes are usually 4 digits, 1xxx-9xxx.
        companies = [c for c in companies if len(c['code']) == 4 and c['code'][0] in '123456789']
    elif filter_type == 'otc':
        # NOTE(review): this only checks the code length, so listed companies
        # pass this filter as well — confirm the intended OTC rule.
        companies = [c for c in companies if len(c['code']) == 4]

    progress = load_progress()
    completed_set = set(progress["completed"])
    failed_set = set(progress["failed"])

    # Work out which companies this run will handle.
    to_scrape = []
    for c in companies[start_index:]:
        # Compare against None explicitly so that limit=0 means "nothing"
        # rather than being mistaken for "no limit".
        if limit is not None and len(to_scrape) >= limit:
            break
        if skip_existing and c['code'] in completed_set:
            continue
        to_scrape.append(c)

    total = len(to_scrape)
    if total == 0:
        print("沒有需要抓取的公司")
        print(f"已完成: {len(completed_set)} 間")
        print(f"失敗: {len(failed_set)} 間")
        return

    print("\n=== 批次抓取開始 ===")
    print(f"待抓取: {total} 間公司")
    print(f"已完成: {len(completed_set)} 間")
    print(f"間隔: {delay} 秒")
    # 25 is a hard-coded allowance (presumably seconds per scrape) used only
    # for this upfront estimate; the in-loop ETA uses measured times.
    print(f"預估時間: {format_duration(total * (delay + 25))}")
    print("-" * 50)

    if not progress["started_at"]:
        progress["started_at"] = datetime.now().isoformat()

    start_time = time.time()
    success_count = 0
    fail_count = 0

    try:
        for i, company in enumerate(to_scrape):
            code = company['code']
            name = company.get('name', '')

            # ETA from the average time per company so far.
            elapsed = time.time() - start_time
            if i > 0:
                eta_str = format_duration(elapsed / i * (total - i))
            else:
                eta_str = "計算中..."

            print(f"\n[{i+1}/{total}] {code} {name} (ETA: {eta_str})")

            try:
                # Look up any previous snapshot before scraping.
                existing = get_latest(code)
                is_first = existing is None

                new_data = scrape_company(code)

                # Treat a result with fewer than 5 data entries as a failure.
                if not new_data.get('data') or len(new_data['data']) < 5:
                    raise Exception("抓取資料不完整")

                # Compare against the previous snapshot, if there is one.
                has_changes = True
                if existing:
                    changes = compare_data(existing, new_data)
                    has_changes = len(changes) > 0
                    if has_changes:
                        print(f"  -> 偵測到 {len(changes)} 項變更")
                    else:
                        print("  -> 資料無變更")
                else:
                    print("  -> 首次抓取")

                save_snapshot(code, new_data, has_changes=has_changes, is_first=is_first)

                # Record the success once only; re-scrapes (skip_existing=False)
                # must not create duplicate entries.
                if code not in completed_set:
                    progress["completed"].append(code)
                    completed_set.add(code)
                # A company that failed earlier and succeeds now is no longer
                # failed; otherwise a later retry would scrape it again.
                if code in failed_set:
                    progress["failed"] = [c for c in progress["failed"] if c != code]
                    failed_set.discard(code)
                success_count += 1

                print(f"  -> 完成 ({success_count} 成功 / {fail_count} 失敗)")

            except Exception as e:
                # Per-company boundary: one bad company must not stop the batch.
                print(f"  -> 失敗: {e}")
                if code not in failed_set:
                    progress["failed"].append(code)
                    failed_set.add(code)
                fail_count += 1

            # Periodic checkpoint.
            if (i + 1) % 10 == 0:
                save_progress(progress)

            # Pause between requests, but not after the last one.
            if i < total - 1:
                time.sleep(delay)
    finally:
        # Always persist what was done, including when the loop is cut short
        # by Ctrl+C or an unexpected error, so the run can be resumed.
        save_progress(progress)

    # Summary.
    total_time = time.time() - start_time
    print("\n" + "=" * 50)
    print("=== 批次抓取完成 ===")
    print(f"成功: {success_count} 間")
    print(f"失敗: {fail_count} 間")
    print(f"總耗時: {format_duration(total_time)}")
    print(f"平均: {total_time/total:.1f} 秒/間")

def show_progress():
    """Print a summary of the saved batch progress.

    Shows the total number of companies, how many distinct companies are
    completed (with percentage), how many failed and how many remain,
    followed by the start / last-update timestamps when present and up to
    the first 20 failed company codes.
    """
    progress = load_progress()
    companies = load_companies()

    total = len(companies)
    # Count distinct codes so duplicate entries cannot inflate the numbers.
    completed = len(set(progress["completed"]))
    failed = len(progress["failed"])
    # Clamp at zero: the progress file may hold codes not in the current list.
    remaining = max(total - completed, 0)
    # Guard against an empty or missing company list (total == 0).
    percent = completed / total * 100 if total else 0.0

    print("\n=== 抓取進度 ===")
    print(f"公司總數: {total} 間")
    print(f"已完成: {completed} 間 ({percent:.1f}%)")
    print(f"失敗: {failed} 間")
    print(f"剩餘: {remaining} 間")

    if progress["started_at"]:
        print(f"\n開始時間: {progress['started_at']}")
    if progress["last_updated"]:
        print(f"最後更新: {progress['last_updated']}")

    if failed > 0:
        print("\n失敗的公司代碼:")
        for code in progress["failed"][:20]:
            print(f"  - {code}")
        if failed > 20:
            print(f"  ... 還有 {failed - 20} 間")

def retry_failed(delay: float = 2.0):
    """Retry every company recorded as failed in the progress file.

    Each failed code is scraped again. Codes that now succeed are recorded
    as completed; codes that fail again stay in the failed list. The progress
    file is written when the loop ends, including when it is interrupted, in
    which case the codes not yet retried are kept as failed.

    Args:
        delay: Seconds to sleep between two consecutive retries.
    """
    progress = load_progress()
    # De-duplicate while keeping order so no company is retried twice.
    failed = list(dict.fromkeys(progress["failed"]))

    if not failed:
        print("沒有失敗的公司需要重試")
        return

    print(f"\n=== 重試失敗的公司 ({len(failed)} 間) ===")

    completed_set = set(progress["completed"])
    still_failed = []
    success_count = 0
    processed = 0  # number of codes whose retry has finished

    try:
        for i, code in enumerate(failed):
            print(f"\n[{i+1}/{len(failed)}] 重試 {code}")

            try:
                existing = get_latest(code)
                is_first = existing is None

                new_data = scrape_company(code)

                # Treat a result with fewer than 5 data entries as a failure.
                if not new_data.get('data') or len(new_data['data']) < 5:
                    raise Exception("抓取資料不完整")

                has_changes = True
                if existing:
                    changes = compare_data(existing, new_data)
                    has_changes = len(changes) > 0

                save_snapshot(code, new_data, has_changes=has_changes, is_first=is_first)
                # Avoid duplicate entries in the completed list.
                if code not in completed_set:
                    progress["completed"].append(code)
                    completed_set.add(code)
                success_count += 1
                print("  -> 成功")

            except Exception as e:
                # Per-company boundary: keep going with the remaining codes.
                print(f"  -> 仍然失敗: {e}")
                still_failed.append(code)

            processed = i + 1

            # Pause between requests, but not after the last one.
            if i < len(failed) - 1:
                time.sleep(delay)
    finally:
        # Persist the outcome even when interrupted: codes that failed again
        # plus any codes that were never reached remain in the failed list.
        progress["failed"] = still_failed + failed[processed:]
        save_progress(progress)

    print(f"\n重試完成: {success_count} 成功, {len(still_failed)} 仍失敗")

def reset_progress():
    """Delete the saved batch progress file, if there is one, and report it."""
    target = get_data_dir() / "batch_progress.json"

    # Nothing to remove on a fresh data directory; the message is printed
    # either way.
    already_clean = not target.exists()
    if not already_clean:
        target.unlink()

    print("進度已重置")

def main():
    """Command-line entry point: parse the arguments and run one action.

    The maintenance flags are checked in the order ``--progress``,
    ``--retry``, ``--reset``; the first one present wins. With none of them,
    a batch scrape is started using the remaining options.
    """
    import argparse

    cli = argparse.ArgumentParser(description='MOPS 公司資料批次抓取工具')

    # Options controlling a batch scrape.
    cli.add_argument('--start', '-s', type=int, default=0, help='從第幾間開始（0-based）')
    cli.add_argument('--limit', '-l', type=int, help='最多抓幾間')
    cli.add_argument('--delay', '-d', type=float, default=2.0, help='每次抓取間隔秒數')
    cli.add_argument('--no-skip', action='store_true', help='不跳過已抓取的公司')

    # Alternative actions.
    cli.add_argument('--progress', '-p', action='store_true', help='顯示目前進度')
    cli.add_argument('--retry', action='store_true', help='重試失敗的公司')
    cli.add_argument('--reset', action='store_true', help='重置進度')

    cli.add_argument('--filter', choices=['listed', 'otc'], help='過濾公司類型')

    opts = cli.parse_args()

    if opts.progress:
        show_progress()
        return

    if opts.retry:
        retry_failed(delay=opts.delay)
        return

    if opts.reset:
        reset_progress()
        return

    # Default action: run the batch scrape.
    skip_done = not opts.no_skip
    batch_scrape(
        start_index=opts.start,
        limit=opts.limit,
        delay=opts.delay,
        skip_existing=skip_done,
        filter_type=opts.filter
    )

# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()
