# Source code for gpt_engineer.benchmark.benchmarks.apps.load

"""
Module for loading APPS evaluation tasks.

This module provides functionality to load tasks for evaluating GPT-based models
on smaller, more focused tasks. It defines a set of tasks with predefined prompts
and assertions to benchmark the performance of AI models.

Functions
---------
load_apps : function
    Loads the APPS benchmark, which consists of a series of coding problems.
"""
from pathlib import Path
from subprocess import TimeoutExpired
from typing import Union

from datasets import Dataset, DatasetDict, load_dataset, load_from_disk

from gpt_engineer.benchmark.bench_config import AppsConfig
from gpt_engineer.benchmark.benchmarks.apps.problem import Problem
from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
from gpt_engineer.core.files_dict import FilesDict
from gpt_engineer.core.prompt import Prompt

DATASET_PATH = Path(__file__).parent / "dataset"


class AppsAssertion:
    """
    Assertion that runs a command against generated files and checks its stdout.

    The expected text and the captured stdout are both compared with every
    space and newline removed, so differences in that whitespace never cause
    a failure.

    Parameters
    ----------
    expected : str
        Output the command is expected to print.
    command : str
        Command handed to the execution environment's ``popen``.
    """

    # Seconds a command may run before the assertion counts as failed.
    TIMEOUT_SECONDS = 2

    def __init__(self, expected: str, command: str):
        self.expected_output = self._format(expected)
        self.command = command

    def evaluate(self, assertable: Assertable) -> bool:
        """
        Run the command on ``assertable.files`` and check the printed output.

        Parameters
        ----------
        assertable : Assertable
            Object whose ``files`` are uploaded to a fresh execution
            environment before the command is started.

        Returns
        -------
        bool
            True if the normalized expected output occurs anywhere in the
            normalized stdout; False otherwise, including when the command
            exceeds ``TIMEOUT_SECONDS``.
        """
        # Create new execution environment for every run to avoid side effects
        env = DiskExecutionEnv()
        env.upload(assertable.files)
        pro = env.popen(self.command)
        try:
            stdout, _ = pro.communicate(timeout=self.TIMEOUT_SECONDS)
        except TimeoutExpired:
            # communicate() does not stop the child when the timeout expires;
            # kill and reap it so timed-out programs do not keep running and
            # accumulate across assertions. wait() is used instead of a second
            # communicate() so we never block on pipes again.
            pro.kill()
            pro.wait()
            print("Execution Timeout")
            return False

        # Replace undecodable bytes instead of raising, so a program that
        # prints non-UTF-8 output fails this assertion rather than aborting
        # the whole benchmark run.
        actual_output = self._format(stdout.decode("utf-8", errors="replace"))

        # Substring match: extra output around the expected text still passes.
        return self.expected_output in actual_output

    def _format(self, string: str) -> str:
        """Return ``string`` with all spaces and newline characters removed."""
        return string.replace(" ", "").replace("\n", "")


def _get_dataset() -> Union[Dataset, DatasetDict]:
    """
    Return the APPS dataset, using the copy stored at ``DATASET_PATH`` if present.

    When loading from disk raises ``FileNotFoundError``, the dataset is fetched
    with ``load_dataset("codeparrot/apps")`` and saved to ``DATASET_PATH``
    before being returned.
    """
    local_path = str(DATASET_PATH)

    try:
        cached = load_from_disk(local_path)
    except FileNotFoundError:
        print("Dataset not found locally, downloading...")
    else:
        return cached

    # No local copy: fetch it and store it at the same path for later calls.
    downloaded = load_dataset("codeparrot/apps", trust_remote_code=True)
    downloaded.save_to_disk(local_path)
    return downloaded



# [docs]  (documentation-page link marker; not part of the module's code)
def load_apps(config: AppsConfig) -> Benchmark:
    """
    Loads the APPS benchmark, which consists of a series of coding problems.

    Parameters
    ----------
    config : AppsConfig
        Selects what is loaded. For each of the "test" and "train" splits, the
        rows whose position ``i`` in the split satisfies
        ``<split>_start_index <= i < <split>_end_index`` become problems, and
        ``examples_per_problem`` caps the number of assertions built per task.

    Returns
    -------
    Benchmark
        A Benchmark object containing a list of Task objects for the APPS evaluation.
    """
    dataset = _get_dataset()

    problems = []
    for dataset_type in ["test", "train"]:
        # Read the window bounds once per split instead of once per row.
        start_index = getattr(config, dataset_type + "_start_index")
        end_index = getattr(config, dataset_type + "_end_index")
        problems += [
            Problem(
                id=problem["problem_id"],
                question=problem["question"],
                input_output=problem["input_output"],
                starter_code=problem["starter_code"],
            )
            for index, problem in enumerate(dataset[dataset_type])
            if start_index <= index < end_index
        ]

    tasks = []
    for problem in problems:
        prompt = Prompt(
            problem.question
            + "\nThe program, including its inputs, should be run from the command "
            "line like 'python main \"input1 input2 etc \"', with all inputs inside "
            "the quotation marks. The program should not read inputs from stdin."
        )

        # One assertion per example, limited to `examples_per_problem`.
        example_count = min(len(problem.outputs), config.examples_per_problem)

        # NOTE(review): the input is placed between double quotes without any
        # escaping. If the command ends up being interpreted by a shell,
        # characters such as `"`, `$` or backticks in an input would alter the
        # command - confirm against the execution environment.
        assertions = {
            f"correct output {i}": AppsAssertion(
                expected=problem.outputs[i],
                command="python main.py" + ' "' + problem.inputs[i] + '"',
            ).evaluate
            for i in range(example_count)
        }

        tasks.append(
            Task(
                name=str(problem.id),
                initial_code=FilesDict({"main.py": problem.starter_code}),
                command=None,  # Explicitly setting `None` because each assertion specifies its command
                prompt=prompt,
                assertions=assertions,
            )
        )

    return Benchmark(
        name="apps",
        tasks=tasks,
    )