"""
File discovery utilities for Aurora framework.

This module provides general-purpose file discovery mechanisms that work
with any directory structure. No dataset-specific code is included.

Example:
    >>> files = FileDiscovery.discover(Path("data/"), pattern="*.pkl")
    >>> files = FileDiscovery.discover(Path("data/"), pattern="**/*.json", recursive=True)
"""

from __future__ import annotations

import json
import pickle
import re
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union


class FileDiscovery:
    """
    General-purpose file discovery utility.

    Provides static methods for finding files by glob pattern, by peeking at
    their content, and for grouping/filtering lists of paths. The class holds
    no state; it is used purely as a namespace.

    Example:
        >>> files = FileDiscovery.discover(Path("data/"), pattern="*.pkl")
        >>> grouped = FileDiscovery.group_by_pattern(files, {"hcc": r"hcc_.*"})
    """

    @staticmethod
    def discover(
        directory: Union[Path, str],
        pattern: str = "*",
        recursive: bool = False,
        extensions: Optional[List[str]] = None,
    ) -> List[Path]:
        """
        Find files matching pattern in directory.

        Args:
            directory: Base directory to search
            pattern: Glob pattern (e.g., "*.pkl", "results_*.json"). It is
                handed to ``Path.glob``; a pattern that itself contains
                ``**`` descends into subdirectories regardless of
                ``recursive``.
            recursive: If True, search subdirectories. The pattern is
                prefixed with ``**/`` unless it already starts with ``**``.
            extensions: Filter by extensions (e.g., [".pkl", ".json"]).
                The leading dot is optional; a single string is accepted as
                a one-element list. Comparison is case-sensitive.

        Returns:
            List of matching file paths (directories are excluded), sorted
            by file name, with the full path as a tie-breaker so that
            same-named files from different subdirectories come back in a
            deterministic order.

        Raises:
            FileNotFoundError: If directory doesn't exist
            ValueError: If the path exists but is not a directory

        Example:
            >>> files = FileDiscovery.discover(Path("data/"), "*.pkl")
            >>> files = FileDiscovery.discover(Path("data/"), "**/*.json", recursive=True)
        """
        directory = Path(directory)

        if not directory.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")

        if not directory.is_dir():
            raise ValueError(f"Path is not a directory: {directory}")

        # Recursion is expressed through the glob pattern itself.
        if recursive and not pattern.startswith("**"):
            pattern = f"**/{pattern}"

        # Keep regular files only; glob also yields directories.
        files = [path for path in directory.glob(pattern) if path.is_file()]

        if extensions:
            # A bare string would otherwise be iterated character by character.
            if isinstance(extensions, str):
                extensions = [extensions]
            # Normalize to a leading dot; a set gives O(1) membership tests.
            wanted = {
                ext if ext.startswith(".") else f".{ext}"
                for ext in extensions
            }
            files = [path for path in files if path.suffix in wanted]

        # Name first (documented order), full path second (determinism).
        return sorted(files, key=lambda path: (path.name, str(path)))

    @staticmethod
    def discover_by_content(
        directory: Union[Path, str],
        required_fields: List[str],
        sample_size: int = 1,
        extensions: Optional[List[str]] = None,
    ) -> List[Path]:
        """
        Find files containing specific fields (peek inside).

        Supports JSON and pickle files. Only the top level of ``directory``
        is searched. A file matches when its first ``sample_size`` record(s)
        are all dicts containing every required field. A file whose top-level
        object is a dict is treated as a single record.

        Files that cannot be loaded (corrupt, unsupported extension,
        unreadable) are skipped rather than raising.

        Args:
            directory: Base directory to search
            required_fields: Fields that must be present (e.g., ["Predictions", "Labels"])
            sample_size: Number of leading records to check per file
                (applies to list-shaped files; expected to be >= 1)
            extensions: File extensions to check (default: [".pkl", ".json"])

        Returns:
            List of files containing all required fields, in the same order
            that ``discover`` returns them (sorted by name)

        Raises:
            FileNotFoundError: If directory doesn't exist
            ValueError: If the path exists but is not a directory

        Example:
            >>> files = FileDiscovery.discover_by_content(
            ...     Path("data/"),
            ...     required_fields=["Predictions", "Labels", "Uncertainties"]
            ... )
        """
        directory = Path(directory)
        extensions = extensions or [".pkl", ".json"]

        candidates = FileDiscovery.discover(
            directory, pattern="*", extensions=extensions
        )

        # discover() already returns a sorted list and filtering preserves
        # order, so no second sort is needed.
        return [
            filepath
            for filepath in candidates
            if FileDiscovery._file_has_fields(filepath, required_fields, sample_size)
        ]

    @staticmethod
    def _file_has_fields(
        filepath: Path,
        required_fields: List[str],
        sample_size: int,
    ) -> bool:
        """
        Check if file contains required fields in its sampled records.

        Args:
            filepath: File to inspect (.json or .pkl)
            required_fields: Keys every sampled record must contain
            sample_size: Number of leading records to inspect when the file
                holds a list

        Returns:
            True if every sampled record is a dict with all required fields;
            False if the file cannot be loaded, holds neither a list nor a
            dict, or has no records to sample.
        """
        try:
            data = FileDiscovery._load_file(filepath)
        except Exception:
            # Best-effort peek: unpickling and JSON parsing can fail in many
            # ways (corrupt data, missing classes, I/O errors). An unreadable
            # file is simply "not a match".
            return False

        # Handle different data structures
        if isinstance(data, list):
            records = data[:sample_size]
        elif isinstance(data, dict):
            records = [data]
        else:
            return False

        if not records:
            return False

        # Every sampled record must qualify, not just the first one.
        return all(
            isinstance(record, dict)
            and all(field in record for field in required_fields)
            for record in records
        )

    @staticmethod
    def _load_file(filepath: Path) -> Any:
        """
        Load file based on extension.

        Args:
            filepath: Path to a ``.json`` or ``.pkl`` file (suffix matched
                case-insensitively)

        Returns:
            The deserialized object

        Raises:
            ValueError: If the extension is neither .json nor .pkl
        """
        suffix = filepath.suffix.lower()

        if suffix == ".json":
            # Explicit UTF-8 so results do not depend on the platform's
            # default locale encoding.
            with open(filepath, "r", encoding="utf-8") as f:
                return json.load(f)

        if suffix == ".pkl":
            # SECURITY: unpickling can execute arbitrary code. Only point
            # discovery at directories whose contents are trusted.
            with open(filepath, "rb") as f:
                return pickle.load(f)

        raise ValueError(f"Unsupported file type: {filepath.suffix}")

    @staticmethod
    def group_by_pattern(
        files: List[Path],
        pattern_groups: Dict[str, str],
    ) -> Dict[str, List[Path]]:
        """
        Group files by matching patterns.

        Each pattern is applied to the file *name* (not the full path) with
        ``re.match``, i.e. anchored at the start of the name but not at the
        end. A file is added to every group whose pattern matches, so groups
        may overlap; files matching no pattern are omitted.

        Args:
            files: List of files to group
            pattern_groups: Mapping of group name to regex pattern

        Returns:
            Dict mapping every group name to its matching files (possibly an
            empty list), preserving the input order of ``files``

        Raises:
            re.error: If a pattern is not a valid regular expression

        Example:
            >>> grouped = FileDiscovery.group_by_pattern(files, {
            ...     "hcc": r"hcc_.*\\.json",
            ...     "cade": r"cade_.*\\.json",
            ... })
        """
        # Compile once up front instead of once per (file, pattern) pair.
        compiled = {
            group_name: re.compile(pattern)
            for group_name, pattern in pattern_groups.items()
        }
        result: Dict[str, List[Path]] = {name: [] for name in compiled}

        for filepath in files:
            filename = filepath.name
            for group_name, regex in compiled.items():
                if regex.match(filename):
                    result[group_name].append(filepath)

        return result

    @staticmethod
    def filter_files(
        files: List[Path],
        predicate: Callable[[Path], bool],
    ) -> List[Path]:
        """
        Filter files using a predicate function.

        Args:
            files: List of files to filter
            predicate: Function that takes Path and returns bool

        Returns:
            New list with the files for which ``predicate`` returned a truthy
            value, in their original order

        Example:
            >>> # Keep only files modified today
            >>> recent = FileDiscovery.filter_files(files, lambda p: p.stat().st_mtime > today)
        """
        return [f for f in files if predicate(f)]
