"""
文字提取器 - 從研究筆記或任意文字檔案提取內容

功能：
1. 解析 Markdown 檔案的 YAML frontmatter
2. 提取「快速摘要」區塊內容（summary 模式）
3. 提取完整文章內容（full 模式）
4. 將 Markdown 轉換為適合語音朗讀的純文字
"""

import re
import yaml
from pathlib import Path
from dataclasses import dataclass
from typing import Optional, Tuple
from datetime import date

from .config import SUMMARY_START_MARKER, FULL_REPORT_MARKER

# Extraction modes understood by TextExtractor.
EXTRACT_MODE_SUMMARY = "summary"  # extract only the quick-summary section
EXTRACT_MODE_FULL = "full"        # extract the full article body


@dataclass
class ResearchNote:
    """Result of extracting one note file, as built by ``TextExtractor.extract``."""
    # Path of the file the content was read from.
    file_path: Path
    # Frontmatter ``title``; the extractor falls back to the file stem.
    title: str
    # Frontmatter ``id``; the extractor falls back to the file stem.
    note_id: str
    # Frontmatter ``created_at``; the extractor falls back to today's date.
    # NOTE(review): the frontmatter value is passed through as parsed by YAML,
    # so it may not always be a ``date`` instance - confirm against callers.
    created_at: date
    # Speech-ready plain text (intro + converted content + outro).
    summary_text: str
    # The Markdown that was extracted before conversion (the summary section
    # in summary mode, the filtered article body in full mode).
    summary_markdown: str
    # Parsed YAML frontmatter (empty dict when the file has none).
    yaml_metadata: dict
    extract_mode: str = EXTRACT_MODE_SUMMARY  # mode used for the extraction


class TextExtractor:
    """文字提取器 - 支援快速摘要和全文兩種模式"""

    def __init__(self, mode: str = EXTRACT_MODE_SUMMARY):
        """
        初始化提取器

        Args:
            mode: 提取模式，'summary' 或 'full'
        """
        self.mode = mode

        # Markdown 清理規則
        self.cleanup_patterns = [
            # 移除 Markdown 連結，保留文字
            (r'\[([^\]]+)\]\([^)]+\)', r'\1'),
            # 移除圖片
            (r'!\[([^\]]*)\]\([^)]+\)', r''),
            # 移除粗體標記
            (r'\*\*([^*]+)\*\*', r'\1'),
            # 移除斜體標記
            (r'\*([^*]+)\*', r'\1'),
            (r'_([^_]+)_', r'\1'),
            # 移除行內程式碼
            (r'`([^`]+)`', r'\1'),
            # 移除程式碼區塊（保留說明）
            (r'```[\s\S]*?```', r''),
            # 移除 HTML 標籤
            (r'<[^>]+>', r''),
            # 移除標題的 # 符號
            (r'^#{1,6}\s+', r'', re.MULTILINE),
            # 移除表格分隔線
            (r'\|[-:]+\|[-:|\s]+\|', r''),
            # 移除多餘的空行
            (r'\n{3,}', r'\n\n'),
        ]

        # 特殊符號轉換（讓語音更自然）
        self.symbol_replacements = [
            # 表格符號
            ('|', '，'),
            # 星星評分
            (r'⭐{5}', '五星'),
            (r'⭐{4}', '四星'),
            (r'⭐{3}', '三星'),
            (r'⭐{2}', '二星'),
            (r'⭐{1}', '一星'),
            # Emoji 替換
            ('✅', ''),
            ('❌', ''),
            ('⚠️', '注意，'),
            ('🎯', ''),
            ('📋', ''),
            ('📖', ''),
            ('⬇️', ''),
            ('🔊', ''),
            ('📝', ''),
            ('🟡', ''),
            ('🟢', ''),
            ('🔴', ''),
            # 項目符號
            ('- ', '，'),
            ('* ', '，'),
        ]

    def extract(self, file_path: Path) -> ResearchNote:
        """
        Read a note or plain-text file and build a ResearchNote from it.

        Args:
            file_path: Path of the file to read (decoded as UTF-8).

        Returns:
            A ResearchNote holding the extracted Markdown, the speech-ready
            text and the parsed frontmatter.

        Raises:
            ValueError: In summary mode, when the file has no summary section.
        """
        content = file_path.read_text(encoding='utf-8')

        # Parse the YAML frontmatter, if any
        yaml_data, body = self._parse_frontmatter(content)
        if not isinstance(yaml_data, dict):
            # Frontmatter that parses to a scalar or list carries no usable keys.
            yaml_data = {}

        # Any mode other than 'full' is treated as summary mode
        if self.mode == EXTRACT_MODE_FULL:
            extracted_md = self._extract_full_content(body)
            intro_text = "全文朗讀"
        else:
            extracted_md = self._extract_summary_section(body)
            intro_text = "快速摘要"

        # A key that is present but empty ("title:") parses to None; treat it
        # like a missing key so the spoken intro never says "None".
        raw_title = yaml_data.get('title')
        title = file_path.stem if raw_title in (None, '') else str(raw_title)

        raw_id = yaml_data.get('id')
        note_id = file_path.stem if raw_id in (None, '') else str(raw_id)

        created_at = yaml_data.get('created_at') or date.today()

        speech_text = self._convert_to_speech_text(extracted_md, title, intro_text)

        return ResearchNote(
            file_path=file_path,
            title=title,
            note_id=note_id,
            created_at=created_at,
            summary_text=speech_text,
            summary_markdown=extracted_md,
            yaml_metadata=yaml_data,
            extract_mode=self.mode,
        )

    def _parse_frontmatter(self, content: str) -> Tuple[dict, str]:
        """解析 YAML frontmatter"""
        if not content.startswith('---'):
            return {}, content

        # 找到 frontmatter 結束位置
        end_match = re.search(r'\n---\n', content[3:])
        if not end_match:
            return {}, content

        yaml_text = content[3:end_match.start() + 3]
        body = content[end_match.end() + 3:]

        try:
            yaml_data = yaml.safe_load(yaml_text) or {}
        except yaml.YAMLError:
            yaml_data = {}

        return yaml_data, body

    def _extract_summary_section(self, body: str) -> str:
        """提取快速摘要區塊"""
        # 找到快速摘要開始位置
        start_match = re.search(
            r'##\s*📋?\s*快速摘要',
            body,
            re.IGNORECASE
        )

        if not start_match:
            # 嘗試其他可能的標題格式
            start_match = re.search(r'##\s*快速摘要', body, re.IGNORECASE)

        if not start_match:
            raise ValueError("找不到「快速摘要」區塊")

        start_pos = start_match.start()

        # 找到完整報告開始位置作為結束點
        end_match = re.search(
            r'##\s*📖?\s*完整研究報告',
            body[start_pos:],
            re.IGNORECASE
        )

        if end_match:
            end_pos = start_pos + end_match.start()
        else:
            # 如果找不到完整報告區塊，取到下一個 ## 或文件結尾
            next_section = re.search(r'\n##\s+[^#]', body[start_pos + 10:])
            if next_section:
                end_pos = start_pos + 10 + next_section.start()
            else:
                end_pos = len(body)

        summary = body[start_pos:end_pos].strip()

        # 移除最後的分隔線
        summary = re.sub(r'\n---\s*$', '', summary)

        return summary

    def _extract_full_content(self, body: str) -> str:
        """
        提取完整文章內容

        Args:
            body: 去除 frontmatter 後的文章內容

        Returns:
            清理後的完整內容
        """
        text = body.strip()

        # 移除語音測試區（如果有的話）
        text = re.sub(
            r'##\s*🎧?\s*語音測試區[\s\S]*?(?=\n##\s+[^#]|\n---\s*\n|\Z)',
            '',
            text,
            flags=re.IGNORECASE
        )

        # 移除音訊嵌入區塊
        text = re.sub(r'>\s*🔊.*?(?=\n[^>]|\Z)', '', text, flags=re.DOTALL)

        # 移除 Multi-Agent 協作記錄區塊（通常放在最後，使用者不需要聽）
        text = re.sub(
            r'##\s*🤖?\s*Multi-Agent.*?(?=\n##\s+[^#]|\Z)',
            '',
            text,
            flags=re.IGNORECASE | re.DOTALL
        )

        # 移除模板版本記錄區塊
        text = re.sub(
            r'##\s*📝?\s*模板版本.*?(?=\n##\s+[^#]|\Z)',
            '',
            text,
            flags=re.IGNORECASE | re.DOTALL
        )

        # 移除後續行動區塊（待辦事項）
        text = re.sub(
            r'##\s*🔧?\s*後續行動.*?(?=\n##\s+[^#]|\Z)',
            '',
            text,
            flags=re.IGNORECASE | re.DOTALL
        )

        # 移除參考資料區塊（大量連結不適合朗讀）
        text = re.sub(
            r'##\s*參考資料.*?(?=\n##\s+[^#]|\Z)',
            '',
            text,
            flags=re.IGNORECASE | re.DOTALL
        )

        # 移除資料來源與引用區塊（表格不適合朗讀）
        text = re.sub(
            r'##\s*資料來源與引用.*?(?=\n##\s+[^#]|\Z)',
            '',
            text,
            flags=re.IGNORECASE | re.DOTALL
        )

        return text.strip()

    def _convert_to_speech_text(self, markdown: str, title: str, intro_type: str = "快速摘要") -> str:
        """將 Markdown 轉換為適合語音朗讀的文字"""
        text = markdown

        # 應用清理規則
        for pattern, replacement, *flags in self.cleanup_patterns:
            flag = flags[0] if flags else 0
            text = re.sub(pattern, replacement, text, flags=flag)

        # 應用符號替換
        for old, new in self.symbol_replacements:
            if old.startswith(r'⭐'):
                text = re.sub(old, new, text)
            else:
                text = text.replace(old, new)

        # 處理表格（轉換為列表形式）
        text = self._convert_tables(text)

        # 處理標題層級（加入停頓）
        text = self._process_headings(text)

        # 加入開場白
        intro = f"以下是「{title}」的{intro_type}。\n\n"

        # 加入結尾（根據類型調整）
        if intro_type == "全文朗讀":
            outro = "\n\n以上是全文朗讀的全部內容。"
        else:
            outro = "\n\n以上是快速摘要的全部內容。如需了解更多細節，請閱讀完整研究報告。"

        # 清理多餘空白
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = re.sub(r'[ \t]+', ' ', text)
        text = text.strip()

        return intro + text + outro

    def _convert_tables(self, text: str) -> str:
        """將表格轉換為適合朗讀的格式"""
        lines = text.split('\n')
        result = []
        in_table = False
        headers = []

        for line in lines:
            line = line.strip()

            # 檢測表格行
            if line.startswith('|') and line.endswith('|'):
                cells = [c.strip() for c in line.strip('|').split('|')]

                # 跳過分隔線
                if all(re.match(r'^[-:]+$', c) for c in cells):
                    continue

                if not in_table:
                    # 第一行是標題
                    headers = cells
                    in_table = True
                else:
                    # 資料行：轉換為「標題：值」格式
                    parts = []
                    for i, cell in enumerate(cells):
                        if i < len(headers) and cell:
                            parts.append(f"{headers[i]}是{cell}")
                    if parts:
                        result.append("，".join(parts) + "。")
            else:
                if in_table:
                    in_table = False
                    headers = []
                result.append(line)

        return '\n'.join(result)

    def _process_headings(self, text: str) -> str:
        """處理標題，加入適當的停頓"""
        # 三級標題
        text = re.sub(r'^### (.+)$', r'\n\1：\n', text, flags=re.MULTILINE)
        # 二級標題
        text = re.sub(r'^## (.+)$', r'\n\n\1\n\n', text, flags=re.MULTILINE)

        return text


def extract_summary(file_path: str | Path, mode: str = EXTRACT_MODE_SUMMARY) -> ResearchNote:
    """
    Convenience wrapper: extract content from a file in one call.

    Args:
        file_path: Path of the file to read.
        mode: Extraction mode, 'summary' or 'full'.

    Returns:
        The ResearchNote produced by a TextExtractor configured with ``mode``.
    """
    return TextExtractor(mode=mode).extract(Path(file_path))


def extract_full_text(file_path: str | Path) -> ResearchNote:
    """
    Convenience wrapper: extract the full text of a file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The ResearchNote produced in 'full' mode.
    """
    full_mode_extractor = TextExtractor(mode=EXTRACT_MODE_FULL)
    return full_mode_extractor.extract(Path(file_path))


# Backward-compatible alias for the class's previous name.
SummaryExtractor = TextExtractor