# Source code for gpt_engineer.benchmark.benchmarks.mbpp.load

"""
Module for loading MBPP evaluation tasks.

This module provides functionality to load tasks for evaluating GPT-based models
on smaller, more focused tasks. It defines a set of tasks with predefined prompts
and assertions to benchmark the performance of AI models.

Functions
---------
load_mbpp : function
    Loads the MBPP benchmark, which consists of a series of coding problems.
"""
from pathlib import Path
from subprocess import TimeoutExpired
from typing import Union

from datasets import Dataset, DatasetDict, load_dataset, load_from_disk

from gpt_engineer.benchmark.bench_config import MbppConfig
from gpt_engineer.benchmark.benchmarks.mbpp.problem import Problem
from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
from gpt_engineer.core.files_dict import FilesDict
from gpt_engineer.core.prompt import Prompt

DATASET_PATH = Path(__file__).parent / "dataset"


class MbppAssertion:
    """
    A single MBPP test assertion that can be evaluated against generated code.

    Parameters
    ----------
    assertion : str
        Python source text that is appended after the generated ``main.py``
        before the combined program is executed.
    """

    def __init__(self, assertion: str):
        self.assertion = assertion

    def evaluate(self, assertable: Assertable) -> bool:
        """
        Run the generated ``main.py`` followed by the stored assertion.

        Parameters
        ----------
        assertable : Assertable
            Object whose ``files`` mapping contains the generated ``main.py``.

        Returns
        -------
        bool
            True when the program finishes within the 2 second timeout and
            writes nothing to stderr; False on timeout or any stderr output.
        """
        generated_code = assertable.files["main.py"]
        code_with_assertion = f"{generated_code}\n{self.assertion}"

        # Create new execution environment for every run to avoid side effects
        env = DiskExecutionEnv()
        env.upload(FilesDict({"main.py": code_with_assertion}))
        pro = env.popen("python main.py")

        try:
            _, stderr = pro.communicate(timeout=2)
        except TimeoutExpired:
            print("Execution Timeout")
            # communicate() does not stop the child when the timeout expires;
            # kill and reap it so timed-out programs are not left running.
            pro.kill()
            pro.wait()
            return False

        # Only the emptiness of stderr matters, so the raw output is tested
        # directly; decoding it could raise on non-UTF-8 program output.
        return not stderr


def _get_dataset() -> Union[Dataset, DatasetDict]:
    """
    Return the MBPP dataset, using the on-disk copy when one exists.

    The copy stored at ``DATASET_PATH`` is tried first. If loading it raises
    ``FileNotFoundError``, the "sanitized" configuration of "mbpp" is fetched
    with ``load_dataset``, saved to ``DATASET_PATH`` and returned.

    Returns
    -------
    Union[Dataset, DatasetDict]
        The dataset loaded from disk or freshly downloaded.
    """
    local_copy = str(DATASET_PATH)

    try:
        cached = load_from_disk(local_copy)
    except FileNotFoundError:
        print("Dataset not found locally, downloading...")
    else:
        return cached

    # No local copy yet: download once and persist it for later calls.
    downloaded = load_dataset("mbpp", "sanitized", trust_remote_code=True)
    downloaded.save_to_disk(local_copy)
    return downloaded



# [docs]
def load_mbpp(config: MbppConfig) -> Benchmark:
    """
    Loads the MBPP benchmark, which consists of a series of coding problems.

    Parameters
    ----------
    config : MbppConfig
        Benchmark configuration. Its ``test_len`` and ``train_len`` attributes
        cap how many problems are taken from the "test" and "train" splits.

    Returns
    -------
    Benchmark
        A Benchmark object containing a list of Task objects for the MBPP evaluation.
    """
    dataset = _get_dataset()

    problems = []
    for dataset_type in ["test", "train"]:
        limit = getattr(config, dataset_type + "_len")
        # Pairing rows with range(limit) stops reading the split as soon as
        # the cap is reached (and yields nothing for a cap <= 0), instead of
        # walking every row and filtering by index.
        problems += [
            Problem(
                source_file=problem["source_file"],
                task_id=problem["task_id"],
                prompt=problem["prompt"],
                code=problem["code"],
                test_imports=problem["test_imports"],
                test_list=problem["test_list"],
            )
            for _, problem in zip(range(limit), dataset[dataset_type])
        ]

    tasks = []
    for problem in problems:
        # NOTE: the instruction suffix is kept verbatim so the prompts sent to
        # the model are unchanged from earlier benchmark runs.
        prompt = Prompt(
            problem.prompt
            + "Please extend given function without changing it's declaration including arguments."
        )

        tasks.append(
            Task(
                name=str(problem.task_id),
                initial_code=FilesDict({"main.py": problem.starting_code}),
                command=None,  # Explicitly setting `None` because each assertion runs code
                prompt=prompt,
                # One named check per dataset test; each runs in its own process.
                assertions={
                    f"correct assertion {i}": MbppAssertion(
                        assertion=assertion
                    ).evaluate
                    for i, assertion in enumerate(problem.test_list)
                },
            )
        )

    return Benchmark(
        name="mbpp",
        tasks=tasks,
    )