"""
Aurora Analyzer - Main API for Aurora Framework

This module provides the high-level interface for computing Aurora metrics
on experiment results. It handles grouping, metric computation, and result
aggregation.

Usage:
    from aurora.ingestion import PickleResultsLoader
    from aurora.analyzer import AuroraAnalyzer

    # Load data
    loader = PickleResultsLoader(auto_validate=True)
    collection = loader.load("results.pkl")

    # Create analyzer
    analyzer = AuroraAnalyzer(collection)

    # Compute base metrics
    metrics = analyzer.compute_base_metrics(group_by=["dataset", "monthly_label_budget"])
    print(metrics)
"""

from typing import List, Dict, Optional, Union, Tuple
import pandas as pd
import numpy as np
import numpy.typing as npt
from collections import defaultdict

from .schema import ResultsCollection, ExperimentResult
from .metrics import ReliabilityMetrics, StabilityMetrics, DrawdownMetrics
from .uncertainty import UncertaintyQualityMetrics


class AuroraAnalyzer:
    """
    Main analyzer for Aurora framework

    Provides high-level API for computing Aurora metrics on experiment results.

    Three core metric dimensions:
    1. **Reliability**: F1, FNR, FPR (traditional performance)
    2. **Stability**: CV[F1], CV[FNR], CV[FPR] (temporal consistency)
    3. **Drawdown**: Min[F1], Max[F1] (worst/best case)

    Example:
        >>> from aurora.ingestion import PickleResultsLoader
        >>> from aurora.analyzer import AuroraAnalyzer
        >>>
        >>> # Load and validate data
        >>> loader = PickleResultsLoader(auto_validate=True)
        >>> collection = loader.load("results.pkl")
        >>>
        >>> # Create analyzer
        >>> analyzer = AuroraAnalyzer(collection)
        >>>
        >>> # Compute base metrics (without rejection)
        >>> metrics = analyzer.compute_base_metrics(
        ...     group_by=["dataset", "monthly_label_budget", "base_name"]
        ... )
        >>>
        >>> # View results
        >>> print(metrics)
        >>> #                                          F1    FNR    FPR  CV[F1]  CV[FNR]  CV[FPR]  Min[F1]  Max[F1]
        >>> # Dataset   Budget  Base-Name
        >>> # androzoo  200     DeepDrebin-MSP        0.89  0.08   0.03   0.05    0.12     0.15     0.85     0.92
    """

    def __init__(self, collection: ResultsCollection):
        """
        Bind the analyzer to a results collection

        The collection is stored as-is on ``self.collection``; this constructor
        performs no validation and makes no copy.

        Args:
            collection: ResultsCollection containing experiment results
                       (should be validated before passing to analyzer)

        Example:
            >>> analyzer = AuroraAnalyzer(collection)
        """
        # Every other method reads its data through this attribute.
        self.collection = collection

    def compute_base_metrics(
        self,
        group_by: Optional[List[str]] = None,
        metrics: Optional[List[str]] = None,
        include_monthly: bool = False
    ) -> pd.DataFrame:
        """
        Compute base Aurora metrics (without rejection simulation)

        For each group, computes:
        - Reliability: F1, FNR, FPR (aggregated across all months)
        - Stability: CV[F1], CV[FNR], CV[FPR] (temporal variation)
        - Drawdown: Min[F1], Max[F1] (worst/best month)

        Args:
            group_by: List of fields to group by. Can include both core fields
                     (e.g., "dataset", "monthly_label_budget", "base_name") and
                     hyperparameters (e.g., "Num-Epochs", "Random-Seed").
                     If None, treats entire collection as one group.
            metrics: List of metric names to compute. If None, computes all.
                    Available: ['F1', 'FNR', 'FPR', 'CV[F1]', 'CV[FNR]', 'CV[FPR]',
                               'Min[F1]', 'Max[F1]']
            include_monthly: If True, also include per-month metrics in result

        Returns:
            DataFrame with MultiIndex (group keys) and columns (metrics)

        Example:
            >>> # Group by dataset and budget
            >>> metrics = analyzer.compute_base_metrics(
            ...     group_by=["dataset", "monthly_label_budget"]
            ... )
            >>>
            >>> # Group by dataset, budget, and hyperparameters
            >>> metrics = analyzer.compute_base_metrics(
            ...     group_by=["dataset", "monthly_label_budget", "Num-Epochs", "Random-Seed"]
            ... )
            >>>
            >>> # Compute only specific metrics
            >>> metrics = analyzer.compute_base_metrics(
            ...     group_by=["dataset"],
            ...     metrics=["F1", "CV[F1]", "Min[F1]"]
            ... )
        """
        # Default grouping: by base_name only
        if group_by is None:
            group_by = ["base_name"]

        # Default metrics: all base metrics (uncertainty not included by default)
        if metrics is None:
            metrics = ['F1', 'FNR', 'FPR', 'CV[F1]', 'CV[FNR]', 'CV[FPR]', 'Min[F1]', 'Max[F1]']

        # Validate metric names
        valid_metrics = {'F1', 'FNR', 'FPR', 'CV[F1]', 'CV[FNR]', 'CV[FPR]', 'Min[F1]', 'Max[F1]', 'AURC', 'E-AURC'}
        for metric in metrics:
            if metric not in valid_metrics:
                raise ValueError(f"Invalid metric: {metric}. Valid metrics: {valid_metrics}")

        # Group results
        grouped = self.collection.group_by(*group_by)

        # Compute metrics for each group
        results = []

        for group_key, group_results in grouped.items():
            # Ensure group_key is always a tuple
            if not isinstance(group_key, tuple):
                group_key = (group_key,)

            # Extract predictions and labels for this group
            # Sort by test_month to ensure temporal order
            sorted_results = sorted(group_results, key=lambda r: r.test_month)

            predictions = [r.predictions for r in sorted_results]
            labels = [r.labels for r in sorted_results]

            # Compute requested metrics
            row = {}

            # Reliability metrics (aggregated)
            if any(m in metrics for m in ['F1', 'FNR', 'FPR']):
                reliability = ReliabilityMetrics.compute_aggregated(predictions, labels)
                for metric in ['F1', 'FNR', 'FPR']:
                    if metric in metrics:
                        row[metric] = reliability[metric]

            # Stability and drawdown require per-month metrics
            if any(m.startswith('CV[') or m.startswith('Min[') or m.startswith('Max[') for m in metrics):
                monthly = ReliabilityMetrics.compute_per_month(predictions, labels)

                # Stability metrics
                if any(m.startswith('CV[') for m in metrics):
                    stability = StabilityMetrics.compute_cv_metrics(monthly)
                    for metric in ['CV[F1]', 'CV[FNR]', 'CV[FPR]']:
                        if metric in metrics:
                            row[metric] = stability[metric]

                # Drawdown metrics (for F1)
                if 'Min[F1]' in metrics or 'Max[F1]' in metrics:
                    drawdown = DrawdownMetrics.compute_minmax(monthly['F1'])
                    if 'Min[F1]' in metrics:
                        row['Min[F1]'] = drawdown['Min[F1]']
                    if 'Max[F1]' in metrics:
                        row['Max[F1]'] = drawdown['Max[F1]']

                # Include monthly data if requested
                if include_monthly:
                    row['Monthly F1'] = monthly['F1']
                    row['Monthly FNR'] = monthly['FNR']
                    row['Monthly FPR'] = monthly['FPR']

            # Uncertainty quality metrics (require uncertainties)
            if 'AURC' in metrics or 'E-AURC' in metrics:
                # Check if uncertainties are available
                if hasattr(sorted_results[0], 'uncertainties_month_ahead'):
                    uncertainties = [r.uncertainties_month_ahead for r in sorted_results]

                    if 'AURC' in metrics and 'E-AURC' in metrics:
                        # Compute both efficiently
                        aurc, eaurc = UncertaintyQualityMetrics.compute_both(
                            predictions, labels, uncertainties
                        )
                        row['AURC'] = aurc
                        row['E-AURC'] = eaurc
                    elif 'AURC' in metrics:
                        aurc = UncertaintyQualityMetrics.compute_aurc(
                            predictions, labels, uncertainties
                        )
                        row['AURC'] = aurc
                    elif 'E-AURC' in metrics:
                        eaurc = UncertaintyQualityMetrics.compute_eaurc(
                            predictions, labels, uncertainties
                        )
                        row['E-AURC'] = eaurc
                else:
                    # No uncertainties available - set to NaN
                    if 'AURC' in metrics:
                        row['AURC'] = np.nan
                    if 'E-AURC' in metrics:
                        row['E-AURC'] = np.nan

            # Add group key to row
            row['_group_key'] = group_key
            results.append(row)

        # Convert to DataFrame
        if len(results) == 0:
            # Empty collection
            df = pd.DataFrame(columns=['_group_key'] + metrics)
        else:
            df = pd.DataFrame(results)

        # Set multi-index from group keys
        if len(results) > 0:
            # Split group key into separate columns
            group_key_df = pd.DataFrame(
                [row['_group_key'] for row in results],
                columns=group_by
            )
            df = pd.concat([group_key_df, df.drop('_group_key', axis=1)], axis=1)
            df = df.set_index(group_by)

        # Ensure columns are in consistent order
        metric_cols = [m for m in metrics if m in df.columns]
        if include_monthly:
            monthly_cols = [c for c in df.columns if c.startswith('Monthly')]
            df = df[metric_cols + monthly_cols]
        else:
            df = df[metric_cols]

        return df

    def get_group_summary(self, group_by: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Get summary statistics for each group (without computing metrics)

        Useful for understanding the structure of your data before analysis.

        Args:
            group_by: Fields to group by (same as compute_base_metrics)

        Returns:
            DataFrame with columns:
            - n_results: Number of experiment results
            - n_months: Number of unique months
            - month_range: String describing month range (e.g., "0-22")
            - total_samples: Total number of test samples across all months

        Example:
            >>> summary = analyzer.get_group_summary(group_by=["dataset", "monthly_label_budget"])
            >>> print(summary)
            >>> #                      n_results  n_months  month_range  total_samples
            >>> # Dataset   Budget
            >>> # androzoo  200            23        23        0-22         145000
            >>> # androzoo  400            23        23        0-22         145000
        """
        if group_by is None:
            group_by = ["base_name"]

        grouped = self.collection.group_by(*group_by)

        results = []
        for group_key, group_results in grouped.items():
            if not isinstance(group_key, tuple):
                group_key = (group_key,)

            months = sorted(set(r.test_month for r in group_results))
            total_samples = sum(len(r.predictions) for r in group_results)

            row = {
                '_group_key': group_key,
                'n_results': len(group_results),
                'n_months': len(months),
                'month_range': f"{min(months)}-{max(months)}" if months else "N/A",
                'total_samples': total_samples
            }
            results.append(row)

        df = pd.DataFrame(results)

        # Set multi-index
        if len(results) > 0:
            group_key_df = pd.DataFrame(
                [row['_group_key'] for row in results],
                columns=group_by
            )
            df = pd.concat([group_key_df, df.drop('_group_key', axis=1)], axis=1)
            df = df.set_index(group_by)

        return df

    def compare_groups(
        self,
        group_by: List[str],
        sort_by: str = 'F1',
        ascending: bool = False,
        top_n: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Compute metrics and compare groups, sorted by a specific metric

        Args:
            group_by: Fields to group by
            sort_by: Metric to sort by (default: 'F1')
            ascending: Sort order (default: False = descending = best first)
            top_n: If specified, return only top N groups

        Returns:
            DataFrame sorted by the specified metric

        Example:
            >>> # Find best configurations by F1 score
            >>> best = analyzer.compare_groups(
            ...     group_by=["dataset", "base_name"],
            ...     sort_by="F1",
            ...     top_n=10
            ... )
            >>>
            >>> # Find most stable configurations
            >>> stable = analyzer.compare_groups(
            ...     group_by=["dataset", "base_name"],
            ...     sort_by="CV[F1]",
            ...     ascending=True,  # Lower CV = more stable
            ...     top_n=10
            ... )
        """
        # Compute all metrics
        df = self.compute_base_metrics(group_by=group_by)

        # Validate sort_by column exists
        if sort_by not in df.columns:
            raise ValueError(f"Metric '{sort_by}' not in computed metrics: {list(df.columns)}")

        # Sort
        df_sorted = df.sort_values(by=sort_by, ascending=ascending)

        # Return top N if specified
        if top_n is not None:
            df_sorted = df_sorted.head(top_n)

        return df_sorted

    def get_monthly_trends(
        self,
        group_by: Optional[List[str]] = None
    ) -> pd.DataFrame:
        """
        Get per-month metrics for trend analysis

        Returns a long-form DataFrame with one row per (group, month) combination.
        Useful for plotting temporal trends.

        Args:
            group_by: Fields to group by

        Returns:
            DataFrame with columns:
            - group columns (from group_by)
            - test_month
            - F1, FNR, FPR (monthly metrics)
            - n_samples (number of test samples that month)

        Example:
            >>> trends = analyzer.get_monthly_trends(group_by=["dataset", "base_name"])
            >>>
            >>> # Plot with pandas
            >>> import matplotlib.pyplot as plt
            >>> for (dataset, base_name), group_df in trends.groupby(["dataset", "base_name"]):
            ...     plt.plot(group_df["test_month"], group_df["F1"], label=f"{dataset}-{base_name}")
            >>> plt.legend()
            >>> plt.show()
        """
        if group_by is None:
            group_by = ["base_name"]

        grouped = self.collection.group_by(*group_by)

        rows = []
        for group_key, group_results in grouped.items():
            if not isinstance(group_key, tuple):
                group_key = (group_key,)

            # Sort by month
            sorted_results = sorted(group_results, key=lambda r: r.test_month)

            # Compute metrics for each month
            for result in sorted_results:
                reliability = ReliabilityMetrics.compute_aggregated(
                    [result.predictions],
                    [result.labels]
                )

                row = dict(zip(group_by, group_key))
                row.update({
                    'test_month': result.test_month,
                    'F1': reliability['F1'],
                    'FNR': reliability['FNR'],
                    'FPR': reliability['FPR'],
                    'n_samples': len(result.predictions)
                })
                rows.append(row)

        return pd.DataFrame(rows)

    def filter_collection(
        self,
        **kwargs
    ) -> 'AuroraAnalyzer':
        """
        Create new analyzer with filtered collection

        Args:
            **kwargs: Keyword arguments for filtering
                     - Core fields: dataset="androzoo", monthly_label_budget=200
                     - Hyperparameters: prefix with "hp_", e.g., hp_Num_Epochs=30

        Returns:
            New AuroraAnalyzer with filtered collection

        Example:
            >>> # Filter for specific dataset
            >>> androzoo_analyzer = analyzer.filter_collection(dataset="androzoo")
            >>>
            >>> # Filter by hyperparameter
            >>> epoch30_analyzer = analyzer.filter_collection(hp_Num_Epochs=30)
            >>>
            >>> # Chain filters
            >>> filtered = analyzer.filter_collection(
            ...     dataset="androzoo",
            ...     monthly_label_budget=200,
            ...     hp_Num_Epochs=30
            ... )
        """
        def filter_fn(result: ExperimentResult) -> bool:
            for key, value in kwargs.items():
                if key.startswith('hp_'):
                    # Hyperparameter filter
                    hp_key = key[3:].replace('_', '-')  # hp_Num_Epochs -> Num-Epochs
                    if result.get_hyperparameter(hp_key) != value:
                        return False
                else:
                    # Core field filter
                    if getattr(result, key, None) != value:
                        return False
            return True

        filtered_collection = self.collection.filter(filter_fn)
        return AuroraAnalyzer(filtered_collection)

    def compute_uncertainty_metrics(
        self,
        group_by: Optional[List[str]] = None,
        uncertainty_field: str = "uncertainties_month_ahead"
    ) -> pd.DataFrame:
        """
        Compute uncertainty quality metrics (AURC, E-AURC)

        Args:
            group_by: Fields to group by (same as compute_base_metrics)
            uncertainty_field: Which uncertainty field to use
                              ("uncertainties_month_ahead" or "uncertainties_past_month")

        Returns:
            DataFrame with columns: AURC, E-AURC

        Example:
            >>> uncertainty_metrics = analyzer.compute_uncertainty_metrics(
            ...     group_by=["dataset", "base_name"]
            ... )
            >>> print(uncertainty_metrics.sort_values('E-AURC'))
        """
        if group_by is None:
            group_by = ["base_name"]

        grouped = self.collection.group_by(*group_by)

        results = []
        for group_key, group_results in grouped.items():
            if not isinstance(group_key, tuple):
                group_key = (group_key,)

            # Extract data
            sorted_results = sorted(group_results, key=lambda r: r.test_month)
            predictions = [r.predictions for r in sorted_results]
            labels = [r.labels for r in sorted_results]

            # Check if uncertainties available
            if hasattr(sorted_results[0], uncertainty_field):
                uncertainties = [getattr(r, uncertainty_field) for r in sorted_results]

                # Compute both AURC and E-AURC
                aurc, eaurc = UncertaintyQualityMetrics.compute_both(
                    predictions, labels, uncertainties
                )

                row = {
                    '_group_key': group_key,
                    'AURC': aurc,
                    'E-AURC': eaurc
                }
            else:
                # No uncertainties
                row = {
                    '_group_key': group_key,
                    'AURC': np.nan,
                    'E-AURC': np.nan
                }

            results.append(row)

        # Convert to DataFrame
        df = pd.DataFrame(results)

        if len(results) > 0:
            group_key_df = pd.DataFrame(
                [row['_group_key'] for row in results],
                columns=group_by
            )
            df = pd.concat([group_key_df, df.drop('_group_key', axis=1)], axis=1)
            df = df.set_index(group_by)

        return df

    def get_risk_coverage_curves(
        self,
        group_by: Optional[List[str]] = None,
        uncertainty_field: str = "uncertainties_month_ahead",
        n_bins: int = 100
    ) -> Dict[Tuple, Dict[str, npt.NDArray]]:
        """
        Compute risk-coverage curves for plotting

        Args:
            group_by: Fields to group by
            uncertainty_field: Which uncertainty field to use
            n_bins: Number of points in the curve

        Returns:
            Dictionary mapping group_key -> curve_data
            where curve_data has keys: 'coverage', 'risk', 'n_samples'

        Example:
            >>> curves = analyzer.get_risk_coverage_curves(
            ...     group_by=["dataset", "base_name"]
            ... )
            >>>
            >>> # Plot curves
            >>> import matplotlib.pyplot as plt
            >>> for group_key, curve in curves.items():
            ...     plt.plot(curve['coverage'], curve['risk'], label=str(group_key))
            >>> plt.xlabel('Coverage')
            >>> plt.ylabel('Risk')
            >>> plt.legend()
            >>> plt.show()
        """
        if group_by is None:
            group_by = ["base_name"]

        grouped = self.collection.group_by(*group_by)

        curves = {}
        for group_key, group_results in grouped.items():
            if not isinstance(group_key, tuple):
                group_key = (group_key,)

            # Extract data
            sorted_results = sorted(group_results, key=lambda r: r.test_month)
            predictions = [r.predictions for r in sorted_results]
            labels = [r.labels for r in sorted_results]

            # Check if uncertainties available
            if hasattr(sorted_results[0], uncertainty_field):
                uncertainties = [getattr(r, uncertainty_field) for r in sorted_results]

                # Compute curve
                curve_data = UncertaintyQualityMetrics.compute_risk_coverage_curve(
                    predictions, labels, uncertainties, n_bins=n_bins
                )

                curves[group_key] = curve_data

        return curves

    def compile_results(
        self,
        group_by: List[str] = ["monthly_label_budget", "sampler_mode", "base_name"],
        metrics: List[str] = None,
        format_as_percentages: bool = True
    ) -> pd.DataFrame:
        """
        Compile results across multiple datasets into unified hierarchical table

        Creates a table with:
        - Row index: hierarchical grouping (e.g., Budget -> Sampler -> Method)
        - Columns: multi-level (Dataset, Metric) pairs

        This replaces the old manual `compile_results()` function with integrated logic.

        Args:
            group_by: Fields to group by (defines row hierarchy)
            metrics: Metrics to include (defaults to all base metrics)
            format_as_percentages: If True, format F1/FNR/FPR as percentages

        Returns:
            DataFrame with MultiIndex rows and MultiIndex columns

        Example:
            >>> # Compile across all datasets
            >>> table = analyzer.compile_results(
            ...     group_by=["monthly_label_budget", "sampler_mode", "base_name"]
            ... )
            >>> print(table)
            >>>                                         androzoo                    apigraph
            >>>                                         F1     CV[F1]  Min[F1]    F1     CV[F1]  Min[F1]
            >>> Budget Sampler      Method
            >>> 200    random       DeepDrebin-MSP      89.2%  5%      85.1%      87.3%  7%      82.4%
            >>> 200    random       SVM (C=1.0)         85.1%  3%      83.2%      84.0%  4%      81.5%
        """
        if metrics is None:
            metrics = ['F1', 'FNR', 'FPR', 'CV[F1]', 'CV[FNR]', 'CV[FPR]', 'Min[F1]', 'Max[F1]']

        # Get unique datasets
        datasets = sorted(self.collection.get_unique_values("dataset").get("dataset", []))

        # Compute metrics for each dataset separately
        dataset_metrics = {}
        for dataset in datasets:
            # Filter to this dataset
            filtered = self.collection.filter(lambda r: r.dataset == dataset)
            filtered_analyzer = AuroraAnalyzer(filtered)

            # Compute metrics with grouping
            dataset_metrics[dataset] = filtered_analyzer.compute_base_metrics(
                group_by=group_by,
                metrics=metrics
            )

        # Combine into hierarchical columns
        combined_dfs = []
        for dataset, df in dataset_metrics.items():
            # Create multi-level column index (dataset, metric)
            df.columns = pd.MultiIndex.from_product([[dataset], df.columns])
            combined_dfs.append(df)

        # Concatenate horizontally (align by index)
        result = pd.concat(combined_dfs, axis=1)

        # Format as percentages if requested
        if format_as_percentages:
            for col in result.columns:
                dataset, metric = col
                if metric in ["F1", "FNR", "FPR", "Min[F1]", "Max[F1]"]:
                    result[col] = result[col].apply(
                        lambda x: f"{x*100:.1f}%" if pd.notna(x) else None
                    )
                elif metric in ["CV[F1]", "CV[FNR]", "CV[FPR]"]:
                    result[col] = result[col].apply(
                        lambda x: f"{x*100:.0f}%" if pd.notna(x) else None
                    )

        return result

    def select_best_methods(
        self,
        group_by: List[str] = ["monthly_label_budget", "sampler_mode"],
        select_by: str = "F1",
        ascending: bool = False,
        month_cutoff: Optional[int] = None,
        return_remaining_period: bool = False
    ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
        """
        Select best method for each configuration (parameterizable)

        This replaces the old `compile_best_methods_for_experiment()` function
        with a cleaner, more flexible interface.

        Args:
            group_by: Fields defining each configuration (e.g., budget + sampler)
            select_by: Metric to use for selection (e.g., "F1", "E-AURC", "Min[F1]")
            ascending: If True, select lowest value; if False, select highest
            month_cutoff: If provided, split analysis at this month
            return_remaining_period: If True and month_cutoff set, return metrics
                                     for both periods

        Returns:
            If return_remaining_period=False: DataFrame with best methods
            If return_remaining_period=True: (best_initial, best_remaining) tuple

        Example:
            >>> # Select best by F1 score (default)
            >>> best = analyzer.select_best_methods(
            ...     group_by=["monthly_label_budget", "sampler_mode"],
            ...     select_by="F1"
            ... )
            >>>
            >>> # Select best by stability (lowest CV[F1])
            >>> best = analyzer.select_best_methods(
            ...     select_by="CV[F1]",
            ...     ascending=True  # Lower is better
            ... )
            >>>
            >>> # Select best by uncertainty quality
            >>> best = analyzer.select_best_methods(
            ...     select_by="E-AURC",
            ...     ascending=True  # Lower is better
            ... )
            >>>
            >>> # Split analysis by time period
            >>> best_early, best_late = analyzer.select_best_methods(
            ...     month_cutoff=5,
            ...     return_remaining_period=True
            ... )
        """
        # Filter by month if cutoff specified
        if month_cutoff is not None:
            initial_collection = self.collection.filter(lambda r: r.test_month <= month_cutoff)
            initial_analyzer = AuroraAnalyzer(initial_collection)
        else:
            initial_analyzer = self

        # Compute metrics including the selection metric
        grouping_with_method = group_by + ["base_name"]

        # Determine which metrics to compute
        metrics_to_compute = ['F1', 'FNR', 'FPR', 'CV[F1]', 'CV[FNR]', 'CV[FPR]', 'Min[F1]', 'Max[F1]']
        if select_by in ['AURC', 'E-AURC']:
            metrics_to_compute.extend(['AURC', 'E-AURC'])

        # Get full metrics table
        full_metrics = initial_analyzer.compile_results(
            group_by=grouping_with_method,
            metrics=metrics_to_compute,
            format_as_percentages=False  # Keep as numbers for selection
        )

        # Select best method for each configuration
        best_methods = {}
        datasets = sorted(self.collection.get_unique_values("dataset").get("dataset", []))

        for dataset in datasets:
            # Get this dataset's metrics
            dataset_cols = [col for col in full_metrics.columns if col[0] == dataset]
            dataset_metrics = full_metrics[dataset_cols]

            # Drop the dataset level from column index
            dataset_metrics.columns = dataset_metrics.columns.droplevel(0)

            # Group by configuration (excluding base_name)
            for config in dataset_metrics.index.droplevel(-1).unique():
                # Get all methods for this configuration
                if isinstance(config, tuple):
                    methods_for_config = dataset_metrics.loc[config]
                else:
                    methods_for_config = dataset_metrics.loc[[config]]

                # Select best by specified metric
                if select_by in methods_for_config.columns:
                    best_idx = methods_for_config[select_by].idxmax() if not ascending else methods_for_config[select_by].idxmin()

                    # Store the best method (including base_name)
                    key = (dataset,) + (config if isinstance(config, tuple) else (config,))
                    best_row = methods_for_config.loc[best_idx].copy()
                    best_row["base_name"] = best_idx  # base_name is the index (method name)
                    best_methods[key] = best_row

        # Convert to DataFrame
        best_df = pd.DataFrame.from_dict(best_methods, orient='index')
        best_df.index.names = ["dataset"] + group_by

        # Format as percentages
        for col in best_df.columns:
            if col in ["F1", "FNR", "FPR", "Min[F1]", "Max[F1]"]:
                best_df[col] = best_df[col].apply(lambda x: f"{x*100:.1f}%" if pd.notna(x) else None)
            elif col in ["CV[F1]", "CV[FNR]", "CV[FPR]"]:
                best_df[col] = best_df[col].apply(lambda x: f"{x*100:.0f}%" if pd.notna(x) else None)

        # If not splitting by time, return single result
        if not return_remaining_period or month_cutoff is None:
            return best_df

        # Compute metrics for remaining period
        remaining_collection = self.collection.filter(lambda r: r.test_month > month_cutoff)
        remaining_analyzer = AuroraAnalyzer(remaining_collection)

        # Filter to only the best methods
        best_method_names = best_df["base_name"].unique()
        remaining_filtered = remaining_collection.filter(lambda r: r.base_name in best_method_names)
        remaining_analyzer = AuroraAnalyzer(remaining_filtered)

        # Compile metrics for remaining period
        remaining_df = remaining_analyzer.compile_results(
            group_by=grouping_with_method,
            metrics=metrics_to_compute,
            format_as_percentages=True
        )

        return best_df, remaining_df

    def __repr__(self) -> str:
        """String representation"""
        n_results = len(self.collection)
        datasets = self.collection.get_unique_values("dataset").get("dataset", [])
        return (
            f"AuroraAnalyzer(\n"
            f"  results={n_results},\n"
            f"  datasets={sorted(datasets)},\n"
            f"  experiment='{self.collection.metadata.experiment_name}'\n"
            f")"
        )
