"""
MOPS 公司基本資料抓取器（API 版本）
使用 HTTP API 直接抓取，速度比 Playwright 快 17 倍

友善抓取策略：
- 隨機延遲 1-2 秒
- 最多 2 個併發
- 每 200 間休息 30 秒
- 遇錯指數退避
"""
import sys
sys.stdout.reconfigure(encoding='utf-8')

import json
import time
import random
import argparse
from pathlib import Path
from datetime import datetime
from typing import Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup

# Settings
API_URL = 'https://mopsov.twse.com.tw/mops/web/ajax_t05st03'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
TIMEOUT = 15

# Polite-scraping parameters
MIN_DELAY = 1.0  # minimum delay per request (seconds)
MAX_DELAY = 2.0  # maximum delay per request (seconds)
MAX_WORKERS = 2  # number of concurrent workers
BATCH_SIZE = 200  # companies per batch
BATCH_REST = 30  # rest between batches (seconds)
MAX_RETRIES = 3  # maximum retry attempts per company

# Important fields (flagged during change detection)
IMPORTANT_FIELDS = [
    '董事長', '總經理', '發言人', '代理發言人',
    '簽證會計師1', '簽證會計師2', '簽證會計師事務所'
]

def get_data_dir():
    return Path(__file__).parent / "data"

def fetch_company(code: str, session: requests.Session) -> Optional[dict]:
    """Fetch basic data for one company; returns None when MOPS has no record."""
    data = {
        'encodeURIComponent': '1',
        'step': '1',
        'firstin': '1',
        'off': '1',
        'TYPEK': 'all',
        'co_id': code
    }

    response = session.post(API_URL, data=data, timeout=TIMEOUT)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # If there is no data table, MOPS has nothing for this code
    table = soup.find('table', class_='hasBorder')
    if not table:
        return None

    # Parse the table: walk each row's cells pairwise; a <th> holds the field
    # label and the cell right after it holds the value. Keep the first value per key.
    result = {}
    for row in table.find_all('tr'):
        cells = row.find_all(['th', 'td'])
        i = 0
        while i < len(cells):
            if cells[i].name == 'th':
                key = cells[i].get_text(strip=True)
                if i + 1 < len(cells):
                    value = cells[i + 1].get_text(strip=True)
                    if key and value and key not in result:
                        result[key] = value
                    i += 2
                else:
                    i += 1
            else:
                i += 1

    return result if result else None
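
# Illustrative shape of the dict fetch_company() returns; keys are the Chinese
# field labels scraped from the MOPS table (values below are placeholders):
#   {'公司名稱': '...', '產業類別': '...', '董事長': '...', '總經理': '...'}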

def fetch_with_retry(code: str, session: requests.Session) -> tuple:
    """帶重試的抓取"""
    for attempt in range(MAX_RETRIES):
        try:
            # Random delay so requests are not sent in lockstep
            delay = random.uniform(MIN_DELAY, MAX_DELAY)
            time.sleep(delay)

            data = fetch_company(code, session)
            return (code, data, None)
        except Exception as e:
            if attempt < MAX_RETRIES - 1:
                # Exponential backoff: wait 5, 10, 20... seconds before retrying
                wait = (2 ** attempt) * 5
                time.sleep(wait)
            else:
                return (code, None, str(e))

    return (code, None, "Max retries exceeded")

def detect_changes(old_data: dict, new_data: dict) -> list:
    """偵測變更"""
    changes = []
    for key in new_data:
        old_val = old_data.get(key, '')
        new_val = new_data.get(key, '')
        if old_val != new_val:
            is_important = key in IMPORTANT_FIELDS
            changes.append({
                'field': key,
                'old': old_val,
                'new': new_val,
                'important': is_important
            })
    return changes
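
# Example with hypothetical values:
#   detect_changes({'董事長': 'A'}, {'董事長': 'B'})
#   -> [{'field': '董事長', 'old': 'A', 'new': 'B', 'important': True}]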

def save_company_data(code: str, data: dict, data_dir: Path) -> dict:
    """儲存公司資料（與現有格式相容）"""
    latest_file = data_dir / f"{code}_latest.json"
    history_file = data_dir / f"{code}_history.json"

    now = datetime.now().isoformat()

    # Load the previous snapshot, if any
    old_data = None
    is_first = True
    if latest_file.exists():
        is_first = False
        with open(latest_file, 'r', encoding='utf-8') as f:
            old_data = json.load(f).get('data', {})

    # Detect changes against the previous snapshot
    has_changes = True
    changes = []
    if old_data:
        changes = detect_changes(old_data, data)
        has_changes = len(changes) > 0

    # Write the latest snapshot
    latest = {
        'company_code': code,
        'fetched_at': now,
        'data': data
    }
    with open(latest_file, 'w', encoding='utf-8') as f:
        json.dump(latest, f, ensure_ascii=False, indent=2)

    # Update the history file
    history = []
    if history_file.exists():
        with open(history_file, 'r', encoding='utf-8') as f:
            history = json.load(f)

    # Hybrid storage: keep a full record on the first fetch or when something
    # changed, otherwise store a lightweight 'no_change' marker
    if is_first or has_changes:
        record = {
            'company_code': code,
            'fetched_at': now,
            'record_type': 'full',
            'data': data
        }
    else:
        record = {
            'company_code': code,
            'fetched_at': now,
            'record_type': 'no_change'
        }

    history.insert(0, record)
    history = history[:50]  # keep at most 50 records

    with open(history_file, 'w', encoding='utf-8') as f:
        json.dump(history, f, ensure_ascii=False, indent=2)

    return {
        'is_first': is_first,
        'has_changes': has_changes,
        'changes': changes
    }

def update_index(code: str, data: dict, data_dir: Path):
    """更新索引"""
    index_file = data_dir / "index.json"

    index = {}
    if index_file.exists():
        with open(index_file, 'r', encoding='utf-8') as f:
            index = json.load(f)

    index[code] = {
        'name': data.get('公司名稱', ''),
        'industry': data.get('產業類別', ''),
        'last_fetched': datetime.now().isoformat()
    }

    with open(index_file, 'w', encoding='utf-8') as f:
        json.dump(index, f, ensure_ascii=False, indent=2)
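
# Files written under data/ (company code 1101 shown as an example):
#   1101_latest.json   - latest snapshot: {company_code, fetched_at, data}
#   1101_history.json  - up to 50 history records, record_type 'full' or 'no_change'
#   index.json         - code -> {name, industry, last_fetched}
#   api_progress.json  - completed/failed codes and timestamps, used for resuming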

def load_companies(data_dir: Path) -> dict:
    """載入公司清單"""
    companies_file = data_dir / "companies.json"
    if not companies_file.exists():
        print("錯誤：companies.json 不存在，請先執行 sync_companies.py")
        return {}

    with open(companies_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # companies.json format: {"total": N, "companies": [{"code": "1101", "name": "..."}, ...]}
    if isinstance(data, dict) and 'companies' in data:
        return {c['code']: c['name'] for c in data['companies']}
    return data

def load_progress(data_dir: Path) -> dict:
    """載入進度"""
    progress_file = data_dir / "api_progress.json"
    if progress_file.exists():
        with open(progress_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    return {
        'completed': [],
        'failed': [],
        'started_at': None,
        'last_updated': None
    }

def save_progress(progress: dict, data_dir: Path):
    """儲存進度"""
    progress_file = data_dir / "api_progress.json"
    progress['last_updated'] = datetime.now().isoformat()
    with open(progress_file, 'w', encoding='utf-8') as f:
        json.dump(progress, f, ensure_ascii=False, indent=2)

def batch_scrape(limit: int = None, reset: bool = False, skip_existing: bool = True):
    """批次抓取"""
    data_dir = get_data_dir()
    data_dir.mkdir(exist_ok=True)

    # Load the company list
    companies = load_companies(data_dir)
    if not companies:
        return

    # Load progress (or start fresh when --reset is given)
    progress = load_progress(data_dir)
    if reset:
        progress = {'completed': [], 'failed': [], 'started_at': None, 'last_updated': None}

    if not progress['started_at']:
        progress['started_at'] = datetime.now().isoformat()

    # Collect companies that already have data on disk (skipped by default)
    existing_files = set()
    if skip_existing:
        for f in data_dir.glob('*_latest.json'):
            code = f.stem.replace('_latest', '')
            existing_files.add(code)

    # Filter down to companies still pending
    completed_set = set(progress['completed'])
    pending = [code for code in companies.keys()
               if code not in completed_set and code not in existing_files]

    if limit:
        pending = pending[:limit]

    total = len(pending)
    if total == 0:
        print("所有公司都已抓取完成！")
        return

    print(f"\n=== API 批次抓取開始 ===")
    print(f"公司總數: {len(companies)} 間")
    print(f"已有資料: {len(existing_files)} 間（跳過）")
    print(f"待抓取: {total} 間公司")
    print(f"併發數: {MAX_WORKERS}")
    print(f"延遲: {MIN_DELAY}-{MAX_DELAY} 秒")
    print(f"每 {BATCH_SIZE} 間休息 {BATCH_REST} 秒")
    print(f"預估時間: {total * 1.5 / 60:.1f} 分鐘")
    print("-" * 50)
    print()

    # Create a shared HTTP session
    session = requests.Session()
    session.headers.update({'User-Agent': USER_AGENT})

    success = 0
    failed = 0
    start_time = time.time()

    # Process in batches
    for batch_start in range(0, total, BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, total)
        batch = pending[batch_start:batch_end]
        batch_num = batch_start // BATCH_SIZE + 1
        total_batches = (total + BATCH_SIZE - 1) // BATCH_SIZE

        print(f"[批次 {batch_num}/{total_batches}] 抓取 {len(batch)} 間...")

        # Fetch concurrently with a small thread pool
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {executor.submit(fetch_with_retry, code, session): code for code in batch}

            for future in as_completed(futures):
                code, data, error = future.result()

                if error:
                    print(f"  {code}: 失敗 - {error}")
                    progress['failed'].append(code)
                    failed += 1
                elif data:
                    # Save the data and update the index
                    result = save_company_data(code, data, data_dir)
                    update_index(code, data, data_dir)
                    progress['completed'].append(code)
                    success += 1

                    # Progress line with a rough ETA
                    name = data.get('公司名稱', '')[:12]
                    status = "首次" if result['is_first'] else ("有變更" if result['has_changes'] else "無變更")
                    elapsed = time.time() - start_time
                    rate = success / elapsed if elapsed > 0 else 0
                    eta = (total - success - failed) / rate / 60 if rate > 0 else 0

                    print(f"  {code} {name}: {status} ({success + failed}/{total}, ETA: {eta:.1f}分)")
                else:
                    print(f"  {code}: 無資料")
                    progress['failed'].append(code)
                    failed += 1

        # Persist progress after each batch
        save_progress(progress, data_dir)

        # Rest between batches
        if batch_end < total:
            print(f"\n  Resting {BATCH_REST} s...\n")
            time.sleep(BATCH_REST)

    # Final statistics
    elapsed = time.time() - start_time
    print()
    print("=" * 50)
    print(f"=== 抓取完成 ===")
    print(f"成功: {success} 間")
    print(f"失敗: {failed} 間")
    print(f"耗時: {elapsed / 60:.1f} 分鐘")
    print(f"速度: {success / elapsed:.2f} 間/秒")

def show_progress():
    """顯示進度"""
    data_dir = get_data_dir()
    progress = load_progress(data_dir)
    companies = load_companies(data_dir)

    total = len(companies)
    completed = len(progress['completed'])
    failed = len(progress['failed'])
    remaining = total - completed

    print(f"\n=== API 抓取進度 ===")
    print(f"公司總數: {total} 間")
    print(f"已完成: {completed} 間 ({completed * 100 / total:.1f}%)")
    print(f"失敗: {failed} 間")
    print(f"剩餘: {remaining} 間")

    if progress['started_at']:
        print(f"\nStarted at: {progress['started_at']}")
    if progress['last_updated']:
        print(f"Last updated: {progress['last_updated']}")

    if failed > 0:
        print(f"\n失敗的公司代碼:")
        for code in progress['failed'][:20]:
            print(f"  - {code}")
        if failed > 20:
            print(f"  ... 還有 {failed - 20} 間")

def retry_failed():
    """重試失敗的公司"""
    data_dir = get_data_dir()
    progress = load_progress(data_dir)

    failed = progress['failed']
    if not failed:
        print("沒有失敗的公司需要重試")
        return

    print(f"重試 {len(failed)} 間失敗的公司...")

    # Clear the failed list before retrying
    progress['failed'] = []
    save_progress(progress, data_dir)

    # Re-fetch the failed companies sequentially
    session = requests.Session()
    session.headers.update({'User-Agent': USER_AGENT})

    success = 0
    still_failed = 0

    for code in failed:
        code, data, error = fetch_with_retry(code, session)

        if data:
            result = save_company_data(code, data, data_dir)
            update_index(code, data, data_dir)
            progress['completed'].append(code)
            success += 1
            print(f"  {code}: 成功")
        else:
            progress['failed'].append(code)
            still_failed += 1
            print(f"  {code}: 仍然失敗 - {error}")

    save_progress(progress, data_dir)
    print(f"\n重試完成: 成功 {success} 間, 仍失敗 {still_failed} 間")

def main():
    parser = argparse.ArgumentParser(description='MOPS company data scraper (API version)')
    parser.add_argument('--progress', '-p', action='store_true', help='show scraping progress')
    parser.add_argument('--limit', '-l', type=int, help='limit the number of companies to scrape')
    parser.add_argument('--reset', action='store_true', help='reset progress')
    parser.add_argument('--retry', '-r', action='store_true', help='retry failed companies')

    args = parser.parse_args()

    if args.progress:
        show_progress()
    elif args.retry:
        retry_failed()
    else:
        batch_scrape(limit=args.limit, reset=args.reset)

if __name__ == '__main__':
    main()
