Commit 2b25734

Merge pull request opendatahub-io#200 from HumairAK/parameterize-samplesize

chore: adjust sdg pcd ratio
HumairAK authored Nov 21, 2024
2 parents 3baa602 + ea6ff46 commit 2b25734
Showing 6 changed files with 74 additions and 24 deletions.
5 changes: 4 additions & 1 deletion pipeline.py
@@ -119,6 +119,7 @@ def pipeline(
sdg_scale_factor: int = 2, # Renamed upstream https://github.com/instructlab/instructlab/blob/f7d40f6ed5112d59132dd832bd332fa6fbbe7010/src/instructlab/configuration.py#L279-L290
sdg_pipeline: str = SDG_PIPELINE,
sdg_max_batch_len: int = MAX_BATCH_LEN,
sdg_sample_size: float = 1.0,
# Training phase
train_nproc_per_node: int = 3,
train_nnodes: int = 2,
@@ -154,6 +155,7 @@ def pipeline(
sdg_scale_factor: SDG parameter. The total number of instructions to be generated.
sdg_pipeline: SDG parameter. Data generation pipeline to use. Available: 'simple', 'full', or a valid path to a directory of pipeline workflow YAML files. Note that 'full' requires a larger teacher model, Mixtral-8x7b.
sdg_max_batch_len: SDG parameter. Maximum tokens per gpu for each batch that will be handled in a single step.
sdg_sample_size: SDG parameter. Represents the sdg skills recipe sampling size as a percentage in decimal form.
train_nproc_per_node: Training parameter. Number of GPUs per each node/worker to use for training.
train_nnodes: Training parameter. Number of nodes/workers to train on.
@@ -204,6 +206,7 @@ def pipeline(
pipeline=sdg_pipeline,
repo_branch=sdg_repo_branch,
repo_pr=sdg_repo_pr,
sdg_sampling_size=sdg_sample_size,
)
sdg_task.set_env_variable("HOME", "/tmp")
sdg_task.set_env_variable("HF_HOME", "/tmp")
@@ -554,7 +557,7 @@ def gen_standalone():
# The list of executor names to extract details from to generate the standalone script
executors = {
"exec-data-processing-op": 'data_processing_op(max_seq_len={MAX_SEQ_LEN}, max_batch_len={MAX_BATCH_LEN}, sdg_path="{DATA_PVC_SDG_PATH}", model_path="{DATA_PVC_MODEL_PATH}", skills_path="{PREPROCESSED_DATA_SKILLS_PATH}", knowledge_path="{PREPROCESSED_DATA_KNOWLEDGE_PATH}")',
"exec-sdg-op": 'sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}")',
"exec-sdg-op": 'sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", sdg_sampling_size={sdg_sampling_size})',
"exec-git-clone-op": {},
"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="{REPO_GRANITE_7B_IMAGE}", model_path="{DATA_PVC_MODEL_PATH}")',
"exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="{MT_BENCH_SCORES_PATH}",output_path="{MT_BENCH_OUTPUT_PATH}",models_folder="{CANDIDATE_MODEL_PATH_PREFIX}",models_path_prefix="{CANDIDATE_MODEL_PATH_PREFIX}", max_workers="{MAX_WORKERS}", merge_system_user_message={MERGE_SYSTEM_USER_MESSAGE})',
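The new `sdg_sample_size` input surfaces as a regular pipeline parameter, so it can be overridden at submission time. A minimal sketch, assuming the pipeline function above is importable as `pipeline.pipeline` and that a KFP endpoint is reachable (both the import path and the host URL are illustrative assumptions, not part of this commit):

```python
# Hedged sketch: override the new parameter when submitting a run via the KFP SDK.
import kfp

from pipeline import pipeline  # assumed module/function name for the pipeline above

client = kfp.Client(host="https://my-kfp-endpoint.example.com")  # hypothetical endpoint
client.create_run_from_pipeline_func(
    pipeline,
    arguments={
        # Mix in only 70% of the precomputed skills recipe instead of the default 100%.
        "sdg_sample_size": 0.7,
    },
)
```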
28 changes: 20 additions & 8 deletions pipeline.yaml
@@ -15,6 +15,7 @@
# sdg_repo_branch: str
# sdg_repo_pr: int
# sdg_repo_url: str [Default: 'https://github.com/instructlab/taxonomy.git']
# sdg_sample_size: float [Default: 1.0]
# sdg_scale_factor: int [Default: 2.0]
# train_effective_batch_size_phase_1: int [Default: 3840.0]
# train_effective_batch_size_phase_2: int [Default: 3840.0]
@@ -565,6 +566,10 @@ components:
defaultValue: /data/sdg
isOptional: true
parameterType: STRING
sdg_sampling_size:
defaultValue: 1.0
isOptional: true
parameterType: NUMBER_DOUBLE
taxonomy_path:
defaultValue: /data/taxonomy
isOptional: true
@@ -1541,11 +1546,12 @@ deploymentSpec:
\ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n pipeline:\
\ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \
\ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\
,\n):\n from os import getenv, path\n\n import openai\n import\
\ yaml\n from instructlab.sdg import generate_data\n from instructlab.sdg.utils.taxonomy\
\ import read_taxonomy\n\n SAMPLING_SIZE = 70\n\n def set_precomputed_skills_data_ratio(sampling_size):\n\
\ skills_recipe = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\
\n if path.exists(skills_recipe):\n with open(skills_recipe,\
,\n sdg_sampling_size: float = 1.0,\n):\n from os import getenv, path\n\
\n import openai\n import yaml\n from instructlab.sdg import generate_data\n\
\ from instructlab.sdg.utils.taxonomy import read_taxonomy\n\n def\
\ set_precomputed_skills_data_ratio(sampling_size: float):\n skills_recipe\
\ = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\n \
\ if path.exists(skills_recipe):\n with open(skills_recipe,\
\ \"r\") as file:\n skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\
\n skills_yaml[\"datasets\"][0][\"sampling_size\"] = sampling_size\n\
\n with open(skills_recipe, \"w\", encoding=\"utf-8\") as file:\n\
@@ -1558,9 +1564,7 @@ deploymentSpec:
\ openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n taxonomy_base\
\ = \"main\" if repo_branch or (repo_pr and int(repo_pr) > 0) else \"empty\"\
\n\n print(\"Generating synthetic dataset for:\")\n print()\n print(read_taxonomy(taxonomy_path,\
\ taxonomy_base))\n\n # Temporary measure to limit the amount of precomputed\
\ skills data used to construct the SDG dataset.\n # Need during development\
\ to decrease training loop times and the cost of model quality.\n set_precomputed_skills_data_ratio(sampling_size=SAMPLING_SIZE)\n\
\ taxonomy_base))\n\n set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)\n\
\n # generate_data has a magic word for its taxonomy_base argument -\
\ 'empty'\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
\ generate_data(\n client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\
@@ -2054,6 +2058,8 @@ root:
componentInputParameter: sdg_repo_branch
repo_pr:
componentInputParameter: sdg_repo_pr
sdg_sampling_size:
componentInputParameter: sdg_sample_size
taskInfo:
name: sdg-op
sdg-to-artifact-op:
@@ -2167,6 +2173,12 @@ root:
description: SDG parameter. Points to a taxonomy git repository
isOptional: true
parameterType: STRING
sdg_sample_size:
defaultValue: 1.0
description: SDG parameter. Represents the sdg skills recipe sampling size
as a percentage in decimal form.
isOptional: true
parameterType: NUMBER_DOUBLE
sdg_scale_factor:
defaultValue: 2.0
description: SDG parameter. The total number of instructions to be generated.
9 changes: 3 additions & 6 deletions sdg/components.py
@@ -35,6 +35,7 @@ def sdg_op(
repo_pr: Optional[int],
taxonomy_path: str = "/data/taxonomy",
sdg_path: str = "/data/sdg",
sdg_sampling_size: float = 1.0,
):
from os import getenv, path

@@ -43,9 +44,7 @@ def sdg_op(
from instructlab.sdg import generate_data
from instructlab.sdg.utils.taxonomy import read_taxonomy

SAMPLING_SIZE = 70

def set_precomputed_skills_data_ratio(sampling_size):
def set_precomputed_skills_data_ratio(sampling_size: float):
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"
if path.exists(skills_recipe):
with open(skills_recipe, "r") as file:
@@ -76,9 +75,7 @@ def set_precomputed_skills_data_ratio(sampling_size):
print()
print(read_taxonomy(taxonomy_path, taxonomy_base))

# Temporary measure to limit the amount of precomputed skills data used to construct the SDG dataset.
# Need during development to decrease training loop times and the cost of model quality.
set_precomputed_skills_data_ratio(sampling_size=SAMPLING_SIZE)
set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)

# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
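For reference, the parameterized helper in this diff boils down to the following self-contained sketch. The recipe path and the `datasets[0]["sampling_size"]` assignment come straight from the hunks above; the closing `yaml.dump` write-back is assumed, since that line is truncated in the visible diff:

```python
# Standalone sketch of the recipe rewrite performed by set_precomputed_skills_data_ratio.
from os import path

import yaml

SKILLS_RECIPE = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"


def set_precomputed_skills_data_ratio(sampling_size: float) -> None:
    if path.exists(SKILLS_RECIPE):
        with open(SKILLS_RECIPE, "r") as f:
            skills_yaml = yaml.load(f, Loader=yaml.Loader)

        # The first dataset entry in the recipe holds the precomputed skills data;
        # its sampling_size controls how much of it is mixed into the SDG dataset.
        skills_yaml["datasets"][0]["sampling_size"] = sampling_size

        with open(SKILLS_RECIPE, "w", encoding="utf-8") as f:
            yaml.dump(skills_yaml, f)  # assumed write-back, truncated in the hunk


set_precomputed_skills_data_ratio(sampling_size=0.7)  # e.g. keep 70% of the skills data
```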
3 changes: 3 additions & 0 deletions standalone/README.md
@@ -412,6 +412,9 @@ evaluation
* `--sdg-serving-model-ca-cert-cm-key`: Name of the Key in the Kubernetes ConfigMap containing the SDG serving model CA cert.
`SDG_SERVING_MODEL_CA_CERT_CM_KEY` environment variable can be used as well. **Optional**
* `--sdg-serving-model-api-key`: The API key for the model to use for Synthetic Data Generation. **Optional**
* `--sdg-sampling-size`: Allows you to tune how much data is used from the default data skills recipe. The sampling
size represents the percentage of the sample to take; a value of 0.5 means 50%. This is useful for development
purposes, when testing the whole iLab pipeline and model performance is not a concern. **Optional**
* `--judge-serving-model-endpoint`: Serving endpoint for evaluation. e.g:
http://serving.kubeflow.svc.cluster.local:8080/v1 - **Optional**
* `--judge-serving-model-name`: The name of the model to use for evaluation. **Optional**
32 changes: 24 additions & 8 deletions standalone/standalone.py
@@ -60,6 +60,7 @@
SDG_CA_CERT_ENV_VAR_NAME = "SDG_CA_CERT_PATH"
SDG_CA_CERT_PATH = "/tmp/cert"
SDG_CA_CERT_CM_KEY = "ca-bundle.crt"
DEFAULT_SDG_SAMPLING_SIZE = 1.0

# SDG DATA PREPROCESSING (before doing training, data has to be converted)
MAX_SEQ_LEN = 4096
@@ -826,6 +827,16 @@ def show(
help="Name of the Key in the Kubernetes ConfigMap containing the SDG serving model CA cert.",
default=SDG_CA_CERT_CM_KEY,
)
@click.option(
"--sdg-sampling-size",
type=float,
envvar="SDG_SERVING_MODEL_SAMPLING_SIZE",
help="Allows you to tune how much data is used from the default data skills recipe. "
"The sampling size represents the percentage of the sample to take, a value of 0.5 "
"specifies a 50% value. This is useful for development purposes, when testing the "
"whole iLab pipeline and model performance is not a concern.",
default=DEFAULT_SDG_SAMPLING_SIZE,
)
@click.option(
"--force-pull",
help=(
@@ -908,6 +919,7 @@ def run(
sdg_serving_model_api_key: typing.Optional[str] = None,
sdg_serving_model_ca_cert: typing.Optional[str] = None,
sdg_serving_model_ca_cert_cm_key: typing.Optional[str] = None,
sdg_sampling_size: typing.Optional[float] = None,
force_pull: typing.Optional[bool] = False,
training_1_epoch_num: int = 7,
training_1_effective_batch_size: int = 3840,
@@ -966,6 +978,7 @@ def run(
sdg_in_cluster (bool): Run SDG in the cluster. Default is retrieve SDG Data from an object store.
sdg_pipeline (str): The pipeline type used for SDG, value must be 'simple', 'full', or a
valid path to a directory.
sdg_sampling_size (float): Represents the sdg skills recipe sampling size in decimal form.
Returns:
None
"""
@@ -998,6 +1011,7 @@ def run(
ctx.obj["sdg_serving_model_api_key"] = sdg_serving_model_api_key
ctx.obj["sdg_serving_model_ca_cert"] = sdg_serving_model_ca_cert
ctx.obj["sdg_serving_model_ca_cert_cm_key"] = sdg_serving_model_ca_cert_cm_key
ctx.obj["sdg_sampling_size"] = sdg_sampling_size
ctx.obj["force_pull"] = force_pull
ctx.obj["training_1_epoch_num"] = training_1_epoch_num
ctx.obj["training_1_effective_batch_size"] = training_1_effective_batch_size
@@ -1088,6 +1102,7 @@ def create_sdg_container(
exec_git_clone_op_repo_branch: str = "",
exec_git_clone_op_repo_pr: str = "",
sdg_pipeline: str = SDG_DEFAULT_PIPELINE,
sdg_sampling_size: float = DEFAULT_SDG_SAMPLING_SIZE,
) -> kubernetes.client.V1Container:
"""
Creates a Kubernetes V1Job container for generating synthetic data.
@@ -1115,6 +1130,7 @@ def sdg_op(
repo_pr: Optional[int],
taxonomy_path: str = "/data/taxonomy",
sdg_path: str = "/data/sdg",
sdg_sampling_size: float = 1.0,
):
from os import getenv, path
@@ -1123,9 +1139,7 @@ def sdg_op(
from instructlab.sdg import generate_data
from instructlab.sdg.utils.taxonomy import read_taxonomy
SAMPLING_SIZE = 70
def set_precomputed_skills_data_ratio(sampling_size):
def set_precomputed_skills_data_ratio(sampling_size: float):
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"
if path.exists(skills_recipe):
with open(skills_recipe, "r") as file:
@@ -1156,9 +1170,7 @@ def set_precomputed_skills_data_ratio(sampling_size):
print()
print(read_taxonomy(taxonomy_path, taxonomy_base))
# Temporary measure to limit the amount of precomputed skills data used to construct the SDG dataset.
# Need during development to decrease training loop times and the cost of model quality.
set_precomputed_skills_data_ratio(sampling_size=SAMPLING_SIZE)
set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)
# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
@@ -1176,7 +1188,7 @@ def set_precomputed_skills_data_ratio(sampling_size):
)
"""
exec_sdg_op_args = f"""
sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ''}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}")
sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", sdg_sampling_size={sdg_sampling_size})
"""

return kubernetes.client.V1Container(
@@ -1215,6 +1227,7 @@ def create_data_job(
sdg_pipeline: str = SDG_DEFAULT_PIPELINE,
sdg_serving_model_ca_cert: str = None,
sdg_serving_model_ca_cert_cm_key: str = None,
sdg_sampling_size: float = DEFAULT_SDG_SAMPLING_SIZE,
) -> kubernetes.client.V1Job:
"""
Create a Kubernetes Job object.
@@ -1236,7 +1249,7 @@
valid path to a directory.
sdg_serving_model_ca_cert (str): The serving model CA cert for SDG.
sdg_serving_model_ca_cert_cm_key (str): The name of the Key in the Kubernetes ConfigMap.
sdg_sampling_size (float): Represents the sdg skills recipe sampling size in decimal form.
Returns:
kubernetes.client.V1Job: A Kubernetes Job object configured with the specified parameters.
@@ -1474,6 +1487,7 @@ def data_processing(train_args: TrainingArgs) -> None:
exec_git_clone_op_repo_branch=taxonomy_repo_branch,
exec_git_clone_op_repo_pr=taxonomy_repo_pr,
sdg_pipeline=sdg_pipeline,
sdg_sampling_size=sdg_sampling_size,
)

if sdg_serving_model_ca_cert:
@@ -2873,6 +2887,7 @@ def sdg(
taxonomy_repo_pr = ctx.obj["taxonomy_repo_pr"]
taxonomy_repo_branch = ctx.obj["taxonomy_repo_branch"]
sdg_pipeline = ctx.obj["sdg_pipeline"]
sdg_sampling_size = ctx.obj["sdg_sampling_size"]

v1 = kubernetes.client.CoreV1Api()
# Secret details validation here!
@@ -3014,6 +3029,7 @@ def decode_base64(data):
sdg_pipeline=sdg_pipeline,
sdg_serving_model_ca_cert=sdg_serving_model_ca_cert,
sdg_serving_model_ca_cert_cm_key=sdg_serving_model_ca_cert_cm_key,
sdg_sampling_size=sdg_sampling_size,
)

if dry_run:
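The plumbing above follows the script's existing pattern: the `run` command stashes the option on `ctx.obj`, and the `sdg` subcommand reads it back before building the data Job. A much-simplified sketch of that pattern follows; this is not the real standalone.py CLI, and the command names and wiring are condensed for illustration only:

```python
# Minimal click sketch of the option flow used by the standalone script.
import click

DEFAULT_SDG_SAMPLING_SIZE = 1.0


@click.group()
@click.option(
    "--sdg-sampling-size",
    type=float,
    envvar="SDG_SERVING_MODEL_SAMPLING_SIZE",
    default=DEFAULT_SDG_SAMPLING_SIZE,
)
@click.pass_context
def run(ctx: click.Context, sdg_sampling_size: float):
    # Mirror of run(): persist CLI options on the shared context object.
    ctx.ensure_object(dict)
    ctx.obj["sdg_sampling_size"] = sdg_sampling_size


@run.command()
@click.pass_context
def sdg(ctx: click.Context):
    # Mirror of the sdg subcommand: read the option back and pass it on.
    sampling_size = ctx.obj["sdg_sampling_size"]
    click.echo(f"would create the SDG data Job with sdg_sampling_size={sampling_size}")


if __name__ == "__main__":
    run(obj={})
```

Invoked as, for example, `python sketch.py --sdg-sampling-size 0.5 sdg`, which mirrors the documented `--sdg-sampling-size` flag (or the `SDG_SERVING_MODEL_SAMPLING_SIZE` environment variable shown in the option definition above).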