Confidence intervals for strength metric 🔬

Confidence intervals for strength metric 🔬

In this tutorial, we show how to add confidence intervals to the strength metric via bootstrapping.

import pandas as pd
import feedback_forensics as ff
import pathlib

# Load a results set (e.g. Arena data) into a dataset handler.
dataset_name = "chatbot_arena.json"
data_path = pathlib.Path("../../data/output/results_sets/feedback-forensics-results-paper")
dataset = ff.DatasetHandler()
dataset.add_data_from_path(data_path / dataset_name)
df = dataset.first_handler.df

# Annotator metadata and annotator metrics as reported by the handler.
annotator_metadata = dataset.get_available_annotators()
metrics = dataset.get_annotator_metrics()

# Rank every annotator by its strength metric, strongest first, then keep
# both ends of the ranking.
strength_metrics = metrics["chatbot_arena"]["metrics"]["strength"]
annotators = list(strength_metrics.keys())
top_annotators = sorted(annotators, key=strength_metrics.__getitem__, reverse=True)
top5_annotators = top_annotators[:5]
# Reversed so that the lowest-strength annotator comes first.
bottom5_annotators = list(reversed(top_annotators[-5:]))

def get_annotator_key(in_row_name: str) -> str:
    for annotator_key, metadata in annotator_metadata.items():
        if metadata["annotator_in_row_name"] in in_row_name:
            return annotator_key
    return None

# Resolve each selected annotator name to its annotator key, grouped by category.
annotators = {
    group_name: {
        annotator_name: {"key": get_annotator_key(annotator_name), "name": annotator_name}
        for annotator_name in group_members
    }
    for group_name, group_members in (
        ("top5", top5_annotators),
        ("bottom5", bottom5_annotators),
    )
}

# The annotator whose variant is "default_annotator" supplies the human data.
default_annotator_key = next(
    (key for key, info in annotator_metadata.items() if info["variant"] == "default_annotator"),
    None,
)
if default_annotator_key is None:
    raise LookupError("No annotator with variant 'default_annotator' found in annotator metadata")
human_data = df[default_annotator_key]

for category, annotator_subset in annotators.items():
    for annotator_name, annotator_info in annotator_subset.items():
        annotator_key = annotator_info["key"]
        if annotator_key is None:
            # Fail with a readable message instead of the bare `KeyError: None`
            # that `df[None]` would produce.
            raise KeyError(f"No annotator key found for annotator '{annotator_name}'")
        annotator_data = df[annotator_key]
        annotator_info["data"] = annotator_data

        # Explicit check rather than `assert`, which is stripped under `python -O`.
        if len(human_data) != len(annotator_data):
            raise ValueError("Human and annotator data have different lengths")

        # Create a combined dataset of [human, annotator] pairs. Iterating both
        # columns in lockstep avoids one positional `.iloc` lookup per row.
        # NOTE(review): assumes the columns hold plain Python objects (e.g. vote
        # strings), so iteration yields the same values as `.iloc[i]` — confirm.
        combined_data = [
            [human_vote, annotator_vote]
            for human_vote, annotator_vote in zip(human_data, annotator_data)
        ]
        annotator_info["combined_data"] = combined_data

/home/docs/checkouts/readthedocs.org/user_builds/feedback-forensics/envs/latest/lib/python3.11/site-packages/alpaca_eval/utils.py:20: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
  import pkg_resources

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[1], line 9
      7 dataset = ff.DatasetHandler()
      8 data_path = pathlib.Path("../../data/output/results_sets/feedback-forensics-results-paper")
----> 9 dataset.add_data_from_path(data_path / dataset_name)
     10 df = dataset.first_handler.df
     12 annotator_metadata = dataset.get_available_annotators()

File ~/checkouts/readthedocs.org/user_builds/feedback-forensics/envs/latest/lib/python3.11/site-packages/feedback_forensics/data/handler.py:352, in DatasetHandler.add_data_from_path(self, path, name)
    350     name = str(path).split("/")[-1].split(".")[0]
    351 handler = ColumnHandler(cache=self.cache, avail_datasets=self.avail_datasets)
--> 352 handler.load_data_from_path(path)
    353 self.add_col_handler(name, handler)

File ~/checkouts/readthedocs.org/user_builds/feedback-forensics/envs/latest/lib/python3.11/site-packages/feedback_forensics/data/handler.py:206, in ColumnHandler.load_data_from_path(self, dataset_path)
    203 """Load data from a given path."""
    204 dataset_path = Path(dataset_path)
--> 206 base_votes_dict = get_votes_dict(dataset_path, cache=self.cache)
    207 votes_dict = add_virtual_annotators(
    208     base_votes_dict,
    209     cache=self.cache,
   (...)    212     target_models=[],
    213 )
    215 self.load_from_votes_dict(votes_dict)

File ~/checkouts/readthedocs.org/user_builds/feedback-forensics/envs/latest/lib/python3.11/site-packages/feedback_forensics/data/loader.py:56, in get_votes_dict(results_path, cache)
     53     cache = {}
     55 if not results_path.exists():
---> 56     raise FileNotFoundError(f"Results directory not found in path '{results_path}'")
     58 if "votes_dict" in cache and results_path in cache["votes_dict"]:
     59     return cache["votes_dict"][results_path]

FileNotFoundError: Results directory not found in path '../../data/output/results_sets/feedback-forensics-results-paper/chatbot_arena.json'

import sklearn.metrics
import scipy.stats
import numpy as np
import time

def get_strength_metric(human_annotations, trait_annotations, axis=None):
    """Custom version of strength metric that is compatible with scipy bootstrapping.

    Takes different input from the main metric implementation in ff.app.metrics:
    the human and trait annotations are passed as two separate, element-wise
    aligned sequences.

    The result is ``kappa * relevance``, where

    * relevance is the fraction of trait annotations equal to ``"text_a"`` or
      ``"text_b"``, and
    * kappa is Cohen's kappa between the human and trait annotations,
      restricted to the positions where the trait annotation is relevant.

    Args:
        human_annotations: Sequence of human annotations.
        trait_annotations: Sequence of trait annotations, aligned element-wise
            with ``human_annotations``.
        axis: Not used by this implementation; kept so the signature stays
            unchanged for existing callers.

    Returns:
        The strength value ``kappa * relevance``, or ``0.0`` when none of the
        trait annotations is relevant.

    Raises:
        ValueError: If ``trait_annotations`` is empty.
    """
    trait_array = np.asarray(trait_annotations)
    if len(trait_array) == 0:
        raise ValueError("Cannot compute the strength metric on empty annotations")

    # Boolean mask selecting the positions where the trait annotation is relevant.
    relevant_mask = np.isin(trait_array, ["text_a", "text_b"])
    n_relevant = int(relevant_mask.sum())

    # Without any relevant annotation the relevance is zero, so the product is
    # defined as zero here rather than computing kappa on empty arrays.
    if n_relevant == 0:
        return 0.0

    relevance = n_relevant / len(trait_array)

    # Agreement is measured only on the relevant positions, using the same
    # mask for both sequences so they stay aligned.
    kappa = sklearn.metrics.cohen_kappa_score(
        np.asarray(human_annotations)[relevant_mask],
        trait_array[relevant_mask],
    )

    return kappa * relevance





# Settings passed to scipy.stats.bootstrap for every annotator. `paired=True`
# treats corresponding human/trait elements as pairs (see the
# scipy.stats.bootstrap documentation).
bootstrap_options = dict(
    confidence_level=0.95,
    n_resamples=100,
    vectorized=False,
    paired=True,
    axis=0,
    method="percentile",
)

# For each selected annotator: compute the strength metric once, then estimate
# a confidence interval for it, timing both steps.
for category, annotator_subset in annotators.items():
    for annotator_name, annotator_data in annotator_subset.items():
        print(f"Processing '{annotator_name}' from '{category}'")

        # Only the first 10000 pairs are used; split them back into two
        # aligned lists.
        combined_data = annotator_data["combined_data"][:10000]
        human_annotations = [pair[0] for pair in combined_data]
        trait_annotations = [pair[1] for pair in combined_data]

        start_time = time.time()
        annotator_data["strength_metric"] = get_strength_metric(
            human_annotations, trait_annotations
        )
        end_time = time.time()
        print(f"Time taken for single metric: {end_time - start_time:.2f} seconds")

        print("Starting bootstrap")
        start_time = time.time()
        annotator_data["strength_metric_confidence_interval"] = scipy.stats.bootstrap(
            (human_annotations, trait_annotations),
            get_strength_metric,
            **bootstrap_options,
        )
        end_time = time.time()
        print(f"Time taken: {end_time - start_time:.2f} seconds")

Processing 'is more verbose' from 'top5'
Time taken for single metric: 0.02 seconds
Starting bootstrap
Time taken: 1.67 seconds
Processing 'has more structured formatting' from 'top5'
Time taken for single metric: 0.02 seconds
Starting bootstrap
Time taken: 1.29 seconds
Processing 'makes more confident statements' from 'top5'
Time taken for single metric: 0.02 seconds
Starting bootstrap
Time taken: 1.14 seconds
Processing 'is more factually correct' from 'top5'
Time taken for single metric: 0.02 seconds
Starting bootstrap
Time taken: 0.92 seconds
Processing 'more strictly follows the requested output format' from 'top5'
Time taken for single metric: 0.02 seconds
Starting bootstrap
Time taken: 0.98 seconds
Processing 'is more concise' from 'bottom5'
Time taken for single metric: 0.02 seconds
Starting bootstrap
Time taken: 1.67 seconds
Processing 'has a more avoidant tone' from 'bottom5'
Time taken for single metric: 0.01 seconds
Starting bootstrap
Time taken: 0.34 seconds
Processing 'refuses to answer the question' from 'bottom5'
Time taken for single metric: 0.01 seconds
Starting bootstrap
Time taken: 0.31 seconds
Processing 'ends with a follow-up question' from 'bottom5'
Time taken for single metric: 0.01 seconds
Starting bootstrap
Time taken: 0.40 seconds
Processing 'is more polite' from 'bottom5'
Time taken for single metric: 0.01 seconds
Starting bootstrap
Time taken: 0.89 seconds

# Print one table row per annotator: the strength value and the lower and
# upper bounds of its 95% confidence interval.
print("Annotator name | Strength | Low (CI 95%) | High (CI 95%)")
print("---|---|---|---")

for annotator_subset in annotators.values():
    for annotator_name, annotator_data in annotator_subset.items():
        cfdnc_interval = annotator_data["strength_metric_confidence_interval"].confidence_interval
        print(
            f"{annotator_name} | {annotator_data['strength_metric']:.2f} "
            f"| {cfdnc_interval.low:.2f} | {cfdnc_interval.high:.2f}"
        )

Annotator name | Strength | Low (CI 95%) | High (CI 95%)
---|---|---|---
is more verbose | 0.14 | 0.12 | 0.16
has more structured formatting | 0.13 | 0.12 | 0.15
makes more confident statements | 0.12 | 0.11 | 0.13
is more factually correct | 0.11 | 0.10 | 0.12
more strictly follows the requested output format | 0.09 | 0.07 | 0.10
is more concise | -0.14 | -0.16 | -0.12
has a more avoidant tone | -0.05 | -0.05 | -0.04
refuses to answer the question | -0.04 | -0.05 | -0.04
ends with a follow-up question | -0.01 | -0.02 | -0.00
is more polite | -0.00 | -0.01 | 0.01