#!/usr/bin/env python3
"""
建設公司資料抓取腳本
====================
從 TWSE/TPEX OpenAPI 抓取所有建設業公司資料。

使用方式：
    python scripts/fetch_companies.py
"""

import sys
sys.stdout.reconfigure(encoding='utf-8')

import json
import sqlite3
import requests
import time
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional

# ===== 路徑設定 =====
BASE_DIR = Path(__file__).parent.parent
DATA_DIR = BASE_DIR / "data"
RAW_DIR = DATA_DIR / "raw"
DB_PATH = DATA_DIR / "companies.db"

# 確保目錄存在
DATA_DIR.mkdir(exist_ok=True)
RAW_DIR.mkdir(exist_ok=True)

# ===== API 端點 =====
TWSE_COMPANY_API = "https://openapi.twse.com.tw/v1/opendata/t187ap03_L"
TPEX_COMPANY_API = "https://www.tpex.org.tw/openapi/v1/mopsfin_t187ap03_O"
TWSE_FINANCIAL_API = "https://openapi.twse.com.tw/v1/exchangeReport/BWIBBU_ALL"

# 建設業產業代碼
CONSTRUCTION_INDUSTRY_CODE = "14"


def create_database():
    """建立資料庫 Schema"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    # 建設公司基本資料
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS companies (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            stock_code TEXT UNIQUE NOT NULL,
            company_name TEXT NOT NULL,
            company_short_name TEXT,
            market_type TEXT NOT NULL,

            -- 經營團隊
            chairman TEXT,
            ceo TEXT,
            founder TEXT,
            founder_story TEXT,
            management_team TEXT,

            -- 公司基本資料
            establishment_date TEXT,
            listing_date TEXT,
            capital INTEGER,
            accounting_firm TEXT,

            -- 建築風格與定位
            building_style TEXT,
            target_market TEXT,
            geographic_focus TEXT,
            representative_projects TEXT,

            -- 連結
            official_website TEXT,
            ir_page TEXT,

            -- 元資料
            created_at TEXT DEFAULT CURRENT_TIMESTAMP,
            updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
            research_status TEXT DEFAULT 'pending'
        )
    ''')

    # 財務快照
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS financial_snapshots (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            stock_code TEXT NOT NULL,
            snapshot_date TEXT NOT NULL,
            close_price REAL,
            pe_ratio REAL,
            dividend_yield REAL,
            pb_ratio REAL,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(stock_code, snapshot_date)
        )
    ''')

    # 重大訊息（從 mops-collector 同步）
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS announcements (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            stock_code TEXT NOT NULL,
            announcement_date TEXT NOT NULL,
            title TEXT NOT NULL,
            content TEXT,
            has_appraiser_info INTEGER DEFAULT 0,
            appraiser_firms TEXT,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
    ''')

    # 建立索引
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_companies_stock_code ON companies(stock_code)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_financial_stock_code ON financial_snapshots(stock_code)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_announcements_stock_code ON announcements(stock_code)')

    conn.commit()
    conn.close()
    print(f"[OK] 資料庫建立完成：{DB_PATH}")


def fetch_twse_companies() -> List[Dict[str, Any]]:
    """抓取上市公司資料"""
    print("[INFO] 抓取上市公司資料...")

    try:
        response = requests.get(TWSE_COMPANY_API, timeout=30)
        response.raise_for_status()
        data = response.json()

        # 儲存原始資料
        raw_file = RAW_DIR / f"twse_companies_{datetime.now().strftime('%Y%m%d')}.json"
        with open(raw_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        # 過濾建設業（欄位名稱是「產業別」）
        construction_companies = [
            company for company in data
            if company.get('產業別') == CONSTRUCTION_INDUSTRY_CODE
        ]

        print(f"[OK] 上市公司總數：{len(data)}，建設業：{len(construction_companies)}")
        return construction_companies

    except Exception as e:
        print(f"[ERROR] 抓取上市公司失敗：{e}")
        return []


def fetch_tpex_companies() -> List[Dict[str, Any]]:
    """抓取上櫃公司資料"""
    print("[INFO] 抓取上櫃公司資料...")

    try:
        response = requests.get(TPEX_COMPANY_API, timeout=30)
        response.raise_for_status()
        data = response.json()

        # 儲存原始資料
        raw_file = RAW_DIR / f"tpex_companies_{datetime.now().strftime('%Y%m%d')}.json"
        with open(raw_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        # 過濾建設業（TPEX 用英文欄位名「SecuritiesIndustryCode」）
        construction_companies = [
            company for company in data
            if company.get('SecuritiesIndustryCode') == CONSTRUCTION_INDUSTRY_CODE
        ]

        print(f"[OK] 上櫃公司總數：{len(data)}，建設業：{len(construction_companies)}")
        return construction_companies

    except Exception as e:
        print(f"[ERROR] 抓取上櫃公司失敗：{e}")
        return []


def fetch_financial_data() -> Dict[str, Dict[str, Any]]:
    """抓取財務指標資料"""
    print("[INFO] 抓取財務指標資料...")

    try:
        response = requests.get(TWSE_FINANCIAL_API, timeout=30)
        response.raise_for_status()
        data = response.json()

        # 轉換為 dict，以股票代號為 key
        financial_dict = {}
        for item in data:
            stock_code = item.get('證券代號', '').strip()
            if stock_code:
                financial_dict[stock_code] = {
                    'pe_ratio': item.get('本益比', ''),
                    'dividend_yield': item.get('殖利率(%)', ''),
                    'pb_ratio': item.get('股價淨值比', '')
                }

        print(f"[OK] 財務資料筆數：{len(financial_dict)}")
        return financial_dict

    except Exception as e:
        print(f"[ERROR] 抓取財務資料失敗：{e}")
        return {}


def parse_capital(capital_str: str) -> Optional[int]:
    """解析資本額字串"""
    if not capital_str:
        return None
    try:
        # 移除逗號和空格
        cleaned = capital_str.replace(',', '').replace(' ', '').strip()
        return int(float(cleaned))
    except:
        return None


def save_companies(twse_companies: List[Dict], tpex_companies: List[Dict], financial_data: Dict):
    """儲存公司資料到資料庫"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    today = datetime.now().strftime('%Y-%m-%d')
    inserted = 0
    updated = 0

    # 處理上市公司
    for company in twse_companies:
        stock_code = company.get('公司代號', '').strip()
        if not stock_code:
            continue

        # 檢查是否已存在
        cursor.execute('SELECT id FROM companies WHERE stock_code = ?', (stock_code,))
        exists = cursor.fetchone()

        capital = parse_capital(company.get('實收資本額', ''))

        if exists:
            # 更新
            cursor.execute('''
                UPDATE companies SET
                    company_name = ?,
                    company_short_name = ?,
                    chairman = ?,
                    ceo = ?,
                    establishment_date = ?,
                    listing_date = ?,
                    capital = ?,
                    accounting_firm = ?,
                    updated_at = ?
                WHERE stock_code = ?
            ''', (
                company.get('公司名稱', ''),
                company.get('公司簡稱', ''),
                company.get('董事長', ''),
                company.get('總經理', ''),
                company.get('成立日期', ''),
                company.get('上市日期', ''),
                capital,
                company.get('簽證會計師事務所', ''),
                today,
                stock_code
            ))
            updated += 1
        else:
            # 新增
            cursor.execute('''
                INSERT INTO companies (
                    stock_code, company_name, company_short_name, market_type,
                    chairman, ceo, establishment_date, listing_date,
                    capital, accounting_firm, created_at, updated_at
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                stock_code,
                company.get('公司名稱', ''),
                company.get('公司簡稱', ''),
                '上市',
                company.get('董事長', ''),
                company.get('總經理', ''),
                company.get('成立日期', ''),
                company.get('上市日期', ''),
                capital,
                company.get('簽證會計師事務所', ''),
                today,
                today
            ))
            inserted += 1

    # 處理上櫃公司（TPEX 使用英文欄位名稱）
    for company in tpex_companies:
        stock_code = company.get('SecuritiesCompanyCode', '').strip()
        if not stock_code:
            continue

        cursor.execute('SELECT id FROM companies WHERE stock_code = ?', (stock_code,))
        exists = cursor.fetchone()

        capital = parse_capital(company.get('Paidin.Capital.NTDollars', ''))

        if exists:
            cursor.execute('''
                UPDATE companies SET
                    company_name = ?,
                    company_short_name = ?,
                    chairman = ?,
                    ceo = ?,
                    establishment_date = ?,
                    listing_date = ?,
                    capital = ?,
                    accounting_firm = ?,
                    updated_at = ?
                WHERE stock_code = ?
            ''', (
                company.get('CompanyName', ''),
                company.get('CompanyAbbreviation', ''),
                company.get('Chairman', ''),
                company.get('GeneralManager', ''),
                company.get('DateOfIncorporation', ''),
                company.get('DateOfListing', ''),
                capital,
                company.get('AccountingFirm', ''),
                today,
                stock_code
            ))
            updated += 1
        else:
            cursor.execute('''
                INSERT INTO companies (
                    stock_code, company_name, company_short_name, market_type,
                    chairman, ceo, establishment_date, listing_date,
                    capital, accounting_firm, created_at, updated_at
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                stock_code,
                company.get('CompanyName', ''),
                company.get('CompanyAbbreviation', ''),
                '上櫃',
                company.get('Chairman', ''),
                company.get('GeneralManager', ''),
                company.get('DateOfIncorporation', ''),
                company.get('DateOfListing', ''),
                capital,
                company.get('AccountingFirm', ''),
                today,
                today
            ))
            inserted += 1

    # 儲存財務資料
    financial_inserted = 0
    for stock_code, fin_data in financial_data.items():
        # 只處理建設業公司
        cursor.execute('SELECT id FROM companies WHERE stock_code = ?', (stock_code,))
        if not cursor.fetchone():
            continue

        try:
            pe = float(fin_data['pe_ratio']) if fin_data['pe_ratio'] else None
            dy = float(fin_data['dividend_yield']) if fin_data['dividend_yield'] else None
            pb = float(fin_data['pb_ratio']) if fin_data['pb_ratio'] else None

            cursor.execute('''
                INSERT OR REPLACE INTO financial_snapshots
                (stock_code, snapshot_date, pe_ratio, dividend_yield, pb_ratio)
                VALUES (?, ?, ?, ?, ?)
            ''', (stock_code, today, pe, dy, pb))
            financial_inserted += 1
        except:
            pass

    conn.commit()
    conn.close()

    print(f"[OK] 資料儲存完成：新增 {inserted}，更新 {updated}，財務快照 {financial_inserted}")


def calculate_data_quality(company: Dict) -> Dict:
    """計算資料品質層級

    Tier 定義：
    - verified: 經人工審核確認或來自官方 API
    - ai_generated: AI 自動產生，待人工驗證
    - raw: 原始資料，未加工
    """
    # 定義各欄位的資料來源類型
    api_fields = ['chairman', 'ceo', 'capital', 'establishment_date', 'listing_date',
                  'accounting_firm', 'company_name', 'company_short_name', 'market_type']
    research_fields = ['founder', 'founder_story', 'building_style', 'target_market',
                       'geographic_focus', 'representative_projects', 'management_team']

    quality = {}
    verified_count = 0
    ai_count = 0
    total_key_fields = len(api_fields) + len(research_fields)

    # API 欄位：有值即為 verified
    for field in api_fields:
        value = company.get(field)
        if value and str(value).strip():
            quality[field] = {'tier': 'verified', 'source': 'twse_tpex_api'}
            verified_count += 1
        else:
            quality[field] = {'tier': 'raw', 'source': 'missing'}

    # 研究欄位：根據 research_status 和欄位值判斷
    research_status = company.get('research_status', 'pending')

    for field in research_fields:
        value = company.get(field)
        if value and str(value).strip():
            # 有值的研究欄位
            if research_status == 'completed':
                quality[field] = {'tier': 'verified', 'source': 'manual_research'}
                verified_count += 1
            else:
                quality[field] = {'tier': 'ai_generated', 'source': 'auto_fill'}
                ai_count += 1
        else:
            quality[field] = {'tier': 'raw', 'source': 'missing'}

    # 計算整體層級
    verified_ratio = verified_count / total_key_fields if total_key_fields > 0 else 0

    if research_status == 'completed' and verified_ratio >= 0.6:
        overall_tier = 'verified'
    elif verified_ratio >= 0.3 or ai_count > 0:
        overall_tier = 'ai_generated'
    else:
        overall_tier = 'raw'

    return {
        'overall_tier': overall_tier,
        'verified_count': verified_count,
        'ai_generated_count': ai_count,
        'completeness': round(verified_ratio * 100, 1),
        'fields': quality
    }


def generate_json_for_web():
    """生成 JSON 供網頁使用"""
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()

    # 取得所有公司
    cursor.execute('''
        SELECT c.*,
               f.pe_ratio, f.dividend_yield, f.pb_ratio, f.snapshot_date
        FROM companies c
        LEFT JOIN (
            SELECT stock_code, pe_ratio, dividend_yield, pb_ratio, snapshot_date,
                   ROW_NUMBER() OVER (PARTITION BY stock_code ORDER BY snapshot_date DESC) as rn
            FROM financial_snapshots
        ) f ON c.stock_code = f.stock_code AND f.rn = 1
        ORDER BY c.stock_code
    ''')

    companies = []
    tier_stats = {'verified': 0, 'ai_generated': 0, 'raw': 0}

    for row in cursor.fetchall():
        company = dict(row)
        # 轉換資本額為億元
        if company['capital']:
            company['capital_billion'] = round(company['capital'] / 100000000, 2)
        else:
            company['capital_billion'] = None

        # 計算資料品質
        data_quality = calculate_data_quality(company)
        company['data_tier'] = data_quality['overall_tier']
        company['data_quality'] = data_quality

        tier_stats[data_quality['overall_tier']] += 1
        companies.append(company)

    conn.close()

    # 輸出 JSON
    output_path = BASE_DIR / "web" / "js" / "companies.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(companies, f, ensure_ascii=False, indent=2)

    print(f"[OK] JSON 輸出完成：{output_path}（{len(companies)} 家公司）")
    print(f"    資料層級：Verified={tier_stats['verified']}, AI={tier_stats['ai_generated']}, Raw={tier_stats['raw']}")
    return companies


def main():
    """主程式"""
    print("=" * 60)
    print("台灣上市櫃建設公司資料抓取")
    print("=" * 60)
    print()

    # 1. 建立資料庫
    create_database()
    print()

    # 2. 抓取公司資料
    twse_companies = fetch_twse_companies()
    time.sleep(1)  # 避免請求過快

    tpex_companies = fetch_tpex_companies()
    time.sleep(1)

    # 3. 抓取財務資料
    financial_data = fetch_financial_data()
    print()

    # 4. 儲存到資料庫
    save_companies(twse_companies, tpex_companies, financial_data)
    print()

    # 5. 生成 JSON
    companies = generate_json_for_web()
    print()

    # 6. 顯示摘要
    print("=" * 60)
    print("抓取完成摘要")
    print("=" * 60)
    print(f"上市建設公司：{len(twse_companies)} 家")
    print(f"上櫃建設公司：{len(tpex_companies)} 家")
    print(f"總計：{len(companies)} 家")
    print()

    # 顯示公司清單
    print("公司清單：")
    print("-" * 60)
    for c in companies:
        market = c.get('market_type', '')
        name = c.get('company_short_name') or c.get('company_name', '')
        code = c.get('stock_code', '')
        chairman = c.get('chairman', '')
        capital = c.get('capital_billion', 0) or 0
        print(f"  {code} {name:<20} [{market}] 董事長：{chairman:<10} 資本額：{capital:>6.1f}億")


if __name__ == '__main__':
    main()
