#!/usr/bin/env python3
"""
Claude Code JSONL 檔案結構分析器

詳細分析 JSONL 檔案中各種內容的組成，
並說明哪些可以壓縮或刪除。
"""
import sys
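# Force UTF-8 stdout so the emoji used in the report print correctly on
# consoles whose default encoding is not UTF-8 (e.g. Windows code pages).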
sys.stdout.reconfigure(encoding='utf-8')

import json
from pathlib import Path
from collections import defaultdict

def analyze_jsonl(filepath: Path) -> dict:
    """分析單個 JSONL 檔案的結構"""

    stats = {
        "total_bytes": 0,
        "total_lines": 0,
        "by_type": defaultdict(lambda: {"count": 0, "bytes": 0}),
        "metadata_fields": defaultdict(lambda: {"count": 0, "bytes": 0}),
        "content_breakdown": {
            "user_instruction": 0,      # 用戶的指令文字
            "tool_result_content": 0,   # 工具執行結果（讀檔內容等）
            "thinking": 0,              # AI 的思考過程
            "assistant_text": 0,        # AI 給用戶看的輸出
            "tool_use": 0,              # 工具呼叫定義
            "progress": 0,              # 執行進度
            "metadata": 0,              # 重複的 metadata 欄位
            "other": 0,
        },
        "metadata_examples": {},        # 記錄 metadata 範例
        "tool_result_examples": [],     # 記錄 tool_result 範例
    }

    # These fields repeat on every line, so they are pure "overhead".
    repeated_metadata_fields = [
        "parentUuid", "isSidechain", "userType", "cwd", "sessionId",
        "version", "gitBranch", "uuid", "timestamp", "requestId",
        "thinkingMetadata", "permissionMode", "slug", "toolUseID",
        "parentToolUseID", "isSnapshotUpdate"
    ]
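    # For reference, a single JSONL record is assumed to look roughly like this
    # (structure inferred from the fields parsed below; values are illustrative):
    #   {"type": "assistant", "uuid": "...", "sessionId": "...", "cwd": "...",
    #    "timestamp": "...",
    #    "message": {"content": [
    #        {"type": "thinking", "thinking": "..."},
    #        {"type": "text", "text": "..."},
    #        {"type": "tool_use", "name": "...", "input": {}}]}}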

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            stats["total_lines"] += 1
            line_bytes = len(line.encode('utf-8'))
            stats["total_bytes"] += line_bytes

            try:
                obj = json.loads(line)
                msg_type = obj.get('type', 'unknown')
                stats["by_type"][msg_type]["count"] += 1
                stats["by_type"][msg_type]["bytes"] += line_bytes

                # Measure the metadata overhead on this line
                metadata_bytes = 0
                for field in repeated_metadata_fields:
                    if field in obj:
                        field_str = json.dumps({field: obj[field]}, ensure_ascii=False)
                        field_bytes = len(field_str.encode('utf-8'))
                        metadata_bytes += field_bytes
                        stats["metadata_fields"][field]["count"] += 1
                        stats["metadata_fields"][field]["bytes"] += field_bytes

                        # Record one example value per field
                        if field not in stats["metadata_examples"]:
                            stats["metadata_examples"][field] = obj[field]

                stats["content_breakdown"]["metadata"] += metadata_bytes

                # Break down the actual content by message type
                if msg_type == 'progress':
                    stats["content_breakdown"]["progress"] += line_bytes

                elif msg_type == 'user':
                    message = obj.get('message', {})
                    content = message.get('content', '')

                    # Distinguish user instructions from tool_result blocks
                    if isinstance(content, list):
                        for item in content:
                            if isinstance(item, dict) and item.get('type') == 'tool_result':
                                result_content = item.get('content', '')
                                result_bytes = len(str(result_content).encode('utf-8'))
                                stats["content_breakdown"]["tool_result_content"] += result_bytes

                                # Keep a few examples
                                if len(stats["tool_result_examples"]) < 3:
                                    stats["tool_result_examples"].append({
                                        "tool_use_id": item.get('tool_use_id', ''),
                                        "content_preview": str(result_content)[:200],
                                        "bytes": result_bytes
                                    })
                            else:
                                stats["content_breakdown"]["user_instruction"] += len(str(item).encode('utf-8'))
                    else:
                        content_bytes = len(str(content).encode('utf-8'))
                        stats["content_breakdown"]["user_instruction"] += content_bytes

                elif msg_type == 'assistant':
                    message = obj.get('message', {})
                    content = message.get('content', [])

                    if isinstance(content, list):
                        for item in content:
                            if isinstance(item, dict):
                                item_str = json.dumps(item, ensure_ascii=False)
                                item_bytes = len(item_str.encode('utf-8'))
                                item_type = item.get('type', '')

                                if item_type == 'thinking':
                                    stats["content_breakdown"]["thinking"] += item_bytes
                                elif item_type == 'text':
                                    stats["content_breakdown"]["assistant_text"] += item_bytes
                                elif item_type == 'tool_use':
                                    stats["content_breakdown"]["tool_use"] += item_bytes
                                else:
                                    stats["content_breakdown"]["other"] += item_bytes

                else:
                    stats["content_breakdown"]["other"] += line_bytes - metadata_bytes

            except json.JSONDecodeError:
                stats["content_breakdown"]["other"] += line_bytes

    return stats


def print_analysis(filepath: Path, stats: dict):
    """印出分析結果"""

    print("=" * 70)
    print(f"📁 檔案: {filepath.name}")
    print(f"   大小: {stats['total_bytes']:,} bytes ({stats['total_bytes']/1024/1024:.2f} MB)")
    print(f"   行數: {stats['total_lines']:,}")
    print("=" * 70)

    # Breakdown by message type
    print("\n📊 By message type:")
    print("-" * 70)
    print(f"{'Type':<25} {'Count':>8} {'Size':>12} {'Pct':>8}")
    print("-" * 70)

    for msg_type, data in sorted(stats["by_type"].items(), key=lambda x: x[1]["bytes"], reverse=True):
        pct = data["bytes"] / stats["total_bytes"] * 100
        size_str = f"{data['bytes']/1024:.1f} KB"
        print(f"{msg_type:<25} {data['count']:>8} {size_str:>12} {pct:>7.1f}%")

    # Content breakdown
    print("\n📋 Content breakdown (key insight):")
    print("-" * 70)
    print(f"{'Content type':<35} {'Size':>12} {'Pct':>8} {'Action':<15}")
    print("-" * 70)

    breakdown = stats["content_breakdown"]
    total = stats["total_bytes"]

    categories = [
        ("🔄 Progress (執行進度)", breakdown["progress"], "❌ 可刪除"),
        ("📦 Tool Result (讀檔結果)", breakdown["tool_result_content"], "⚠️ 可壓縮"),
        ("🏷️ Metadata (重複欄位)", breakdown["metadata"], "⚠️ 可壓縮"),
        ("🤔 Thinking (AI思考)", breakdown["thinking"], "❌ 可刪除"),
        ("🔧 Tool Use (工具呼叫)", breakdown["tool_use"], "⚠️ 可壓縮"),
        ("💬 User Instruction (你的指令)", breakdown["user_instruction"], "✅ 保留"),
        ("📝 Assistant Text (AI輸出)", breakdown["assistant_text"], "✅ 保留"),
        ("❓ Other", breakdown["other"], "—"),
    ]

    for name, bytes_val, suggestion in sorted(categories, key=lambda x: x[1], reverse=True):
        if bytes_val > 0:
            pct = bytes_val / total * 100
            size_str = f"{bytes_val/1024:.1f} KB"
            print(f"{name:<35} {size_str:>12} {pct:>7.1f}% {suggestion:<15}")

    # Details of the repeated metadata fields
    print("\n🏷️ Repeated metadata fields:")
    print("-" * 70)
    print("These fields appear on every single line:")
    print()

    for field, data in sorted(stats["metadata_fields"].items(), key=lambda x: x[1]["bytes"], reverse=True)[:8]:
        example = stats["metadata_examples"].get(field, "")
        example_str = str(example)[:50] + "..." if len(str(example)) > 50 else str(example)
        print(f"  • {field}: {data['bytes']/1024:.1f} KB ({data['count']}次)")
        print(f"    範例: {example_str}")

    # Tool result examples
    if stats["tool_result_examples"]:
        print("\n📦 Tool result examples (file-read / execution output):")
        print("-" * 70)
        for i, ex in enumerate(stats["tool_result_examples"][:2], 1):
            print(f"  {i}. ID: {ex['tool_use_id'][:30]}...")
            print(f"     大小: {ex['bytes']/1024:.1f} KB")
            print(f"     內容預覽: {ex['content_preview'][:100]}...")
            print()

    # Estimate potential space savings
    print("\n💡 Compression benefit estimate:")
    print("-" * 70)

    deletable = breakdown["progress"] + breakdown["thinking"]
    compressible = breakdown["tool_result_content"] + breakdown["metadata"] + breakdown["tool_use"]
    important = breakdown["user_instruction"] + breakdown["assistant_text"]

    print(f"  可直接刪除: {deletable/1024:.1f} KB ({deletable/total*100:.1f}%)")
    print(f"  可壓縮內容: {compressible/1024:.1f} KB ({compressible/total*100:.1f}%)")
    print(f"  重要保留:   {important/1024:.1f} KB ({important/total*100:.1f}%)")
    print()
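    # Heuristic assumption: compressible content (tool results, metadata,
    # tool_use blocks) is taken to shrink to roughly 30% of its original size,
    # so about 70% of it counts toward the space saved.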
    print(f"  預估壓縮後: {important/1024:.1f} ~ {(important + compressible*0.3)/1024:.1f} KB")
    print(f"  節省空間:   {(deletable + compressible*0.7)/total*100:.1f}% ~ {(total - important)/total*100:.1f}%")

    return {
        "original": stats["total_bytes"],
        "deletable": deletable,
        "compressible": compressible,
        "important": important,
    }


def main():
    samples_dir = Path(__file__).parent / "samples"

    if not samples_dir.exists():
        print(f"❌ 找不到 samples 目錄: {samples_dir}")
        return

    all_results = []

    for jsonl_file in sorted(samples_dir.glob("*.jsonl")):
        print("\n")
        stats = analyze_jsonl(jsonl_file)
        result = print_analysis(jsonl_file, stats)
        result["filename"] = jsonl_file.name
        all_results.append(result)
        print("\n" + "=" * 70 + "\n")

    # Overall summary
    if all_results:
        print("\n" + "🎯" * 35)
        print("\n📊 Summary across all files:")
        print("-" * 70)
        print(f"{'File':<35} {'Original':>10} {'Delete':>10} {'Compress':>10} {'Keep':>10}")
        print("-" * 70)

        total_original = 0
        total_deletable = 0
        total_compressible = 0
        total_important = 0

        for r in all_results:
            print(f"{r['filename'][:35]:<35} {r['original']/1024:>9.0f}K {r['deletable']/1024:>9.0f}K {r['compressible']/1024:>9.0f}K {r['important']/1024:>9.0f}K")
            total_original += r["original"]
            total_deletable += r["deletable"]
            total_compressible += r["compressible"]
            total_important += r["important"]

        print("-" * 70)
        print(f"{'合計':<35} {total_original/1024:>9.0f}K {total_deletable/1024:>9.0f}K {total_compressible/1024:>9.0f}K {total_important/1024:>9.0f}K")
        print()
        print(f"💾 總原始大小: {total_original/1024/1024:.2f} MB")
        print(f"❌ 可刪除: {total_deletable/1024/1024:.2f} MB ({total_deletable/total_original*100:.1f}%)")
        print(f"⚠️ 可壓縮: {total_compressible/1024/1024:.2f} MB ({total_compressible/total_original*100:.1f}%)")
        print(f"✅ 重要保留: {total_important/1024/1024:.2f} MB ({total_important/total_original*100:.1f}%)")
        print()
        print(f"🎯 預估最小保留: {total_important/1024/1024:.2f} MB (只保留指令+輸出)")
        print(f"🎯 預估壓縮後: {(total_important + total_compressible*0.3)/1024/1024:.2f} MB (含壓縮後的工具結果)")


if __name__ == "__main__":
    main()
