
Commit

fix can't pickle bug
haesleinhuepf committed Mar 23, 2024
1 parent 312c5e5 commit 628fd26
Showing 2 changed files with 62 additions and 49 deletions.
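Background on the fix: check_correctness previously defined unsafe_execute as a nested function and handed it to multiprocessing.Process. When worker processes are started with the "spawn" method (the default on Windows and macOS), the target callable and its arguments must be pickled, and a function defined inside another function cannot be pickled, so evaluation fails with a "can't pickle" error. The commit therefore moves unsafe_execute to module level and passes its inputs explicitly via args. A minimal, illustrative sketch of the failure mode and the fix (not part of the commit; all names below are made up):

import multiprocessing

def run_nested():
    def nested(result):                      # local function: cannot be pickled
        result.append("ran")
    manager = multiprocessing.Manager()
    result = manager.list()
    p = multiprocessing.Process(target=nested, args=(result,))
    p.start()                                # under "spawn" this raises "Can't pickle local object ..."
    p.join()

def module_level(result):                    # module-level function: picklable
    result.append("ran")

def run_module_level():
    manager = multiprocessing.Manager()
    result = manager.list()
    p = multiprocessing.Process(target=module_level, args=(result,))
    p.start()                                # works under "spawn"; target and args are pickled and sent over
    p.join()
    print(list(result))                      # ['ran']

if __name__ == "__main__":
    run_module_level()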
107 changes: 58 additions & 49 deletions human_eval/execution.py
@@ -9,68 +9,77 @@
import signal
import tempfile

def unsafe_execute(problem, completion, result, timeout):

    with create_tempdir():

        # These system calls are needed when cleaning up tempdir.
        import os
        import shutil
        rmtree = shutil.rmtree
        rmdir = os.rmdir
        chdir = os.chdir

        # Disable functionalities that can make destructive changes to the test.
        reliability_guard()

        # Construct the check program and run it.
        check_program = (
            problem["prompt"] + completion + "\n" +
            problem["test"] + "\n" +
            f"check({problem['entry_point']})"
        )

        try:
            exec_globals = {}
            with swallow_io():
                with time_limit(timeout):
                    # WARNING
                    # This program exists to execute untrusted model-generated code. Although
                    # it is highly unlikely that model-generated code will do something overtly
                    # malicious in response to this test suite, model-generated code may act
                    # destructively due to a lack of model capability or alignment.
                    # Users are strongly encouraged to sandbox this evaluation suite so that it
                    # does not perform destructive actions on their host or network. For more
                    # information on how OpenAI sandboxes its code, see the accompanying paper.
                    # Once you have read this disclaimer and taken appropriate precautions,
                    # uncomment the following line and proceed at your own risk:
                    exec(check_program, exec_globals)
            result.append("passed")
        except TimeoutException:
            result.append("timed out")
        except BaseException as e:
            result.append(f"failed: {e}")

        # Needed for cleaning up.
        shutil.rmtree = rmtree
        os.rmdir = rmdir
        os.chdir = chdir


def check_correctness(problem: Dict, completion: str, timeout: float,
                      completion_id: Optional[int] = None) -> Dict:
    """
    Evaluates the functional correctness of a completion by running the test
    suite provided in the problem.

    :param completion_id: an optional completion ID so we can match
        the results later even if execution finishes asynchronously.
    """

    def unsafe_execute():

        with create_tempdir():

            # These system calls are needed when cleaning up tempdir.
            import os
            import shutil
            rmtree = shutil.rmtree
            rmdir = os.rmdir
            chdir = os.chdir

            # Disable functionalities that can make destructive changes to the test.
            reliability_guard()

            # Construct the check program and run it.
            check_program = (
                problem["prompt"] + completion + "\n" +
                problem["test"] + "\n" +
                f"check({problem['entry_point']})"
            )

            try:
                exec_globals = {}
                with swallow_io():
                    with time_limit(timeout):
                        # WARNING
                        # This program exists to execute untrusted model-generated code. Although
                        # it is highly unlikely that model-generated code will do something overtly
                        # malicious in response to this test suite, model-generated code may act
                        # destructively due to a lack of model capability or alignment.
                        # Users are strongly encouraged to sandbox this evaluation suite so that it
                        # does not perform destructive actions on their host or network. For more
                        # information on how OpenAI sandboxes its code, see the accompanying paper.
                        # Once you have read this disclaimer and taken appropriate precautions,
                        # uncomment the following line and proceed at your own risk:
                        # exec(check_program, exec_globals)
                result.append("passed")
            except TimeoutException:
                result.append("timed out")
            except BaseException as e:
                result.append(f"failed: {e}")

            # Needed for cleaning up.
            shutil.rmtree = rmtree
            os.rmdir = rmdir
            os.chdir = chdir

    manager = multiprocessing.Manager()
    result = manager.list()

    p = multiprocessing.Process(target=unsafe_execute)
    # p = multiprocessing.Process(target=unsafe_execute)
    p = multiprocessing.Process(
        target=unsafe_execute,
        args=(
            problem,
            completion,
            result,
            timeout
        ),
    )
    p.start()
    p.join(timeout=timeout + 1)
    if p.is_alive():
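For reference, a hypothetical invocation of the patched check_correctness might look like the sketch below. The toy problem is invented for illustration and only uses the fields referenced in the diff (prompt, test, entry_point) plus a made-up task_id; the exact shape of the returned dictionary is truncated in the diff above.

# Hypothetical usage sketch; the toy problem below is made up.
problem = {
    "task_id": "demo/0",                                  # illustrative identifier
    "prompt": "def add(a, b):\n",
    "test": "def check(candidate):\n    assert candidate(1, 2) == 3\n",
    "entry_point": "add",
}
completion = "    return a + b\n"

outcome = check_correctness(problem, completion, timeout=3.0, completion_id=0)
print(outcome)   # a dict summarising whether the completion passed the test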
4 changes: 4 additions & 0 deletions notebooks/samples_gpt-3.5-turbo-1106.jsonl_results.jsonl
@@ -0,0 +1,4 @@
{"task_id": "./human-eval-bia/human_eval_bia_0.ipynb", "completion": "\n labeled_image, num_labels = skimage.measure.label(binary_image, background=0, return_num=True)\n return num_labels\n", "result": "failed: module 'signal' has no attribute 'setitimer'", "passed": false}
{"task_id": "./human-eval-bia/human_eval_bia_1.ipynb", "completion": "\nfrom skimage import filters\nimport numpy as np\n\ndef apply_otsu_threshold_and_count_postiive_pixels(image):\n threshold_value = filters.threshold_otsu(image)\n binary_image = image > threshold_value\n positive_pixels_count = np.sum(binary_image)\n return binary_image, positive_pixels_count\n", "result": "failed: module 'signal' has no attribute 'setitimer'", "passed": false}
{"task_id": "./human-eval-bia/human_eval_bia_0.ipynb", "completion": "\n # Convert the image to uint8\n binary_image = binary_image.astype(np.uint8)\n\n # Perform connected component labeling\n labeled_image, num_labels = skimage.measure.label(binary_image, return_num=True)\n\n # Return the count of labeled objects\n return num_labels\n", "result": "failed: module 'signal' has no attribute 'setitimer'", "passed": false}
{"task_id": "./human-eval-bia/human_eval_bia_1.ipynb", "completion": "\nimport skimage.filters\n\ndef apply_otsu_threshold_and_count_postiive_pixels(image):\n \"\"\"\n Takes an image, applies Otus's threshold method to it to create a binary image and \n counts the positive pixels.\n \"\"\"\n # Apply Otsu's thresholding to create a binary image\n threshold = skimage.filters.threshold_otsu(image)\n binary_image = image > threshold\n \n # Count positive pixels\n positive_pixels_count = np.sum(binary_image)\n\n return positive_pixels_count\n", "result": "failed: module 'signal' has no attribute 'setitimer'", "passed": false}
