Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add OpenAIBatch backend and refactor RequestProcessor to be compatible #28

Merged
merged 19 commits on Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 26 additions & 21 deletions build.py → build_pkg.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,55 @@
import os
import subprocess
import shutil
import subprocess
import sys
from pathlib import Path


def run_command(command, cwd=None):
    """Execute *command* through the shell and return the CompletedProcess.

    Args:
        command: Shell command string to execute.
        cwd: Optional working directory for the command.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero
            (because check=True).
    """
    return subprocess.run(command, shell=True, cwd=cwd, check=True)


def npm_install():
    """Install Node.js dependencies for the dataset-viewer frontend.

    Raises:
        subprocess.CalledProcessError: If ``npm install`` exits non-zero.
    """
    print("Running npm install")
    run_command("npm install", cwd="bespoke-dataset-viewer")


def nextjs_build():
print("Running Next.js build")
run_command("npm run build", cwd="bespoke-dataset-viewer")
print("Copying build artifacts to static folder")

# Source and target directories
source_base = Path("bespoke-dataset-viewer")
target_base = Path("src/bespokelabs/curator/viewer/static")

# Ensure target directory exists
if target_base.exists():
shutil.rmtree(target_base)
target_base.mkdir(parents=True, exist_ok=True)

# Copy only the necessary files, excluding node_modules
files_to_copy = [
'.next',
'app',
'components',
'lib',
'public',
'types',
'package.json',
'package-lock.json',
'next.config.ts',
'next-env.d.ts',
'tsconfig.json',
'postcss.config.mjs',
'tailwind.config.ts',
'components.json'
".next",
"app",
"components",
"lib",
"public",
"types",
"package.json",
"package-lock.json",
"next.config.ts",
"next-env.d.ts",
"tsconfig.json",
"postcss.config.mjs",
"tailwind.config.ts",
"components.json",
]

for item in files_to_copy:
source = source_base / item
target = target_base / item

if source.exists():
if source.is_file():
shutil.copy2(source, target)
Expand All @@ -60,6 +62,7 @@ def nextjs_build():
else:
print(f"Warning: {source} not found")


def run_pytest():
print("Running pytest")
try:
Expand All @@ -68,11 +71,13 @@ def run_pytest():
print("Pytest failed. Aborting build.")
sys.exit(1)


def main():
    """Full build pipeline: install JS deps, build the viewer, then run tests.

    run_pytest() aborts the process (sys.exit) on test failure, so the final
    success message only prints when every stage succeeded.
    """
    npm_install()
    nextjs_build()
    run_pytest()
    print("Build completed successfully.")


# Diff residue showed main() duplicated (old EOF line + new line); the script
# entry point should invoke the build pipeline exactly once.
if __name__ == "__main__":
    main()
18 changes: 5 additions & 13 deletions examples/camel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from typing import List

import pandas as pd
from pydantic import BaseModel, Field

from bespokelabs import curator
Expand All @@ -24,18 +23,14 @@ class QAs(BaseModel):


subject_prompter = curator.Prompter(
prompt_func=lambda: {
"user_prompt": f"Generate a diverse list of 3 subjects. Keep it high-level (e.g. Math, Science)."
},
prompt_func=lambda: f"Generate a diverse list of 3 subjects. Keep it high-level (e.g. Math, Science).",
parse_func=lambda _, subjects: [subject for subject in subjects.subjects],
model_name="gpt-4o-mini",
response_format=Subjects,
)
subject_dataset = subject_prompter()
subsubject_prompter = curator.Prompter(
prompt_func=lambda subject: {
"user_prompt": f"For the given subject {subject}. Generate 3 diverse subsubjects. No explanation."
},
prompt_func=lambda subject: f"For the given subject {subject}. Generate 3 diverse subsubjects. No explanation.",
parse_func=lambda subject, subsubjects: [
{"subject": subject["subject"], "subsubject": subsubject.subject}
for subsubject in subsubjects.subjects
Expand All @@ -46,9 +41,7 @@ class QAs(BaseModel):
subsubject_dataset = subsubject_prompter(subject_dataset)

qa_prompter = curator.Prompter(
prompt_func=lambda subsubject: {
"user_prompt": f"For the given subsubject {subsubject}. Generate 3 diverse questions and answers. No explanation."
},
prompt_func=lambda subsubject: f"For the given subsubject {subsubject}. Generate 3 diverse questions and answers. No explanation.",
model_name="gpt-4o-mini",
response_format=QAs,
parse_func=lambda subsubject, qas: [
Expand All @@ -63,6 +56,5 @@ class QAs(BaseModel):
)
qa_dataset = qa_prompter(subsubject_dataset)

qa_hf_dataset = qa_dataset.to_huggingface()
qa_hf_dataset.map(lambda row: {"answer": row["answer"].strip()}, num_proc=2)
print(qa_hf_dataset.to_pandas())
qa_dataset.map(lambda row: {"answer": row["answer"].strip()}, num_proc=2)
print(qa_dataset.to_pandas())
26 changes: 26 additions & 0 deletions examples/distill.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from bespokelabs import curator
from datasets import load_dataset
import logging

# Show progress/info logs from the curator pipeline.
logging.basicConfig(level=logging.INFO)

# Take the first 3,000 WildChat conversations to keep the distillation run small.
dataset = load_dataset("allenai/WildChat", split="train")
dataset = dataset.select(range(3_000))


def prompt_func(row):
    """Use the first turn of the row's conversation as the prompt text."""
    first_turn = row["conversation"][0]
    return first_turn["content"]


def parse_func(row, response):
    """Pair the original first-turn instruction with the model's new response."""
    conversation = row["conversation"]
    return {
        "instruction": conversation[0]["content"],
        "new_response": response,
    }


# Prompter that re-answers each sampled instruction with gpt-4o-mini.
# batch=True selects the batch backend added in this PR — presumably the
# OpenAI Batch API; confirm against the Prompter/backend implementation.
distill_prompter = curator.Prompter(
    prompt_func=prompt_func, parse_func=parse_func, model_name="gpt-4o-mini", batch=True
)

# Run the prompter over the dataset and print a sanity-check sample.
distilled_dataset = distill_prompter(dataset)
print(distilled_dataset)
print(distilled_dataset[0])
6 changes: 2 additions & 4 deletions examples/poetry.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
from bespokelabs import curator

# Diff residue left both the old dict-style prompt_func (duplicate keyword
# argument — a SyntaxError) and both old/new print lines; keep only the
# post-merge version, where prompt_func returns a plain string.
poet = curator.Prompter(
    prompt_func=lambda: "Write a poem about the beauty of computer science",
    model_name="gpt-4o-mini",
)

poem = poet()
print(poem["response"][0])
Loading