Commit 2b25734

Merge pull request opendatahub-io#200 from HumairAK/parameterize-samplesize

chore: adjust sdg pcd ratio
HumairAK authored Nov 21, 2024
2 parents 3baa602 + ea6ff46 commit 2b25734
Showing 6 changed files with 74 additions and 24 deletions.
5 changes: 4 additions & 1 deletion pipeline.py
@@ -119,6 +119,7 @@ def pipeline(
sdg_scale_factor: int = 2, # Renamed upstream https://github.com/instructlab/instructlab/blob/f7d40f6ed5112d59132dd832bd332fa6fbbe7010/src/instructlab/configuration.py#L279-L290
sdg_pipeline: str = SDG_PIPELINE,
sdg_max_batch_len: int = MAX_BATCH_LEN,
sdg_sample_size: float = 1.0,
# Training phase
train_nproc_per_node: int = 3,
train_nnodes: int = 2,
@@ -154,6 +155,7 @@ def pipeline(
sdg_scale_factor: SDG parameter. The total number of instructions to be generated.
sdg_pipeline: SDG parameter. Data generation pipeline to use. Available: 'simple', 'full', or a valid path to a directory of pipeline workflow YAML files. Note that 'full' requires a larger teacher model, Mixtral-8x7b.
sdg_max_batch_len: SDG parameter. Maximum tokens per gpu for each batch that will be handled in a single step.
sdg_sample_size: SDG parameter. Represents the sdg skills recipe sampling size as a percentage in decimal form.
train_nproc_per_node: Training parameter. Number of GPUs per each node/worker to use for training.
train_nnodes: Training parameter. Number of nodes/workers to train on.
@@ -204,6 +206,7 @@ def pipeline(
pipeline=sdg_pipeline,
repo_branch=sdg_repo_branch,
repo_pr=sdg_repo_pr,
sdg_sampling_size=sdg_sample_size,
)
sdg_task.set_env_variable("HOME", "/tmp")
sdg_task.set_env_variable("HF_HOME", "/tmp")
@@ -554,7 +557,7 @@ def gen_standalone():
# The list of executor names to extract details from to generate the standalone script
executors = {
"exec-data-processing-op": 'data_processing_op(max_seq_len={MAX_SEQ_LEN}, max_batch_len={MAX_BATCH_LEN}, sdg_path="{DATA_PVC_SDG_PATH}", model_path="{DATA_PVC_MODEL_PATH}", skills_path="{PREPROCESSED_DATA_SKILLS_PATH}", knowledge_path="{PREPROCESSED_DATA_KNOWLEDGE_PATH}")',
"exec-sdg-op": 'sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}")',
"exec-sdg-op": 'sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", sdg_sampling_size={sdg_sampling_size})',
"exec-git-clone-op": {},
"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="{REPO_GRANITE_7B_IMAGE}", model_path="{DATA_PVC_MODEL_PATH}")',
"exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="{MT_BENCH_SCORES_PATH}",output_path="{MT_BENCH_OUTPUT_PATH}",models_folder="{CANDIDATE_MODEL_PATH_PREFIX}",models_path_prefix="{CANDIDATE_MODEL_PATH_PREFIX}", max_workers="{MAX_WORKERS}", merge_system_user_message={MERGE_SYSTEM_USER_MESSAGE})',
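The new `sdg_sample_size` input surfaces as a regular pipeline parameter, so it can be overridden at submission time. A minimal sketch, assuming the pipeline function above is importable as `pipeline.pipeline` and that a KFP endpoint is reachable (both the import path and the host URL are illustrative assumptions, not part of this commit):

```python
# Hedged sketch: override the new parameter when submitting a run via the KFP SDK.
import kfp

from pipeline import pipeline  # assumed module/function name for the pipeline above

client = kfp.Client(host="https://my-kfp-endpoint.example.com")  # hypothetical endpoint
client.create_run_from_pipeline_func(
    pipeline,
    arguments={
        # Mix in only 70% of the precomputed skills recipe instead of the default 100%.
        "sdg_sample_size": 0.7,
    },
)
```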
28 changes: 20 additions & 8 deletions pipeline.yaml
@@ -15,6 +15,7 @@
# sdg_repo_branch: str
# sdg_repo_pr: int
# sdg_repo_url: str [Default: 'https://github.com/instructlab/taxonomy.git']
# sdg_sample_size: float [Default: 1.0]
# sdg_scale_factor: int [Default: 2.0]
# train_effective_batch_size_phase_1: int [Default: 3840.0]
# train_effective_batch_size_phase_2: int [Default: 3840.0]
@@ -565,6 +566,10 @@ components:
defaultValue: /data/sdg
isOptional: true
parameterType: STRING
sdg_sampling_size:
defaultValue: 1.0
isOptional: true
parameterType: NUMBER_DOUBLE
taxonomy_path:
defaultValue: /data/taxonomy
isOptional: true
@@ -1541,11 +1546,12 @@ deploymentSpec:
\ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n pipeline:\
\ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \
\ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\
,\n):\n from os import getenv, path\n\n import openai\n import\
\ yaml\n from instructlab.sdg import generate_data\n from instructlab.sdg.utils.taxonomy\
\ import read_taxonomy\n\n SAMPLING_SIZE = 70\n\n def set_precomputed_skills_data_ratio(sampling_size):\n\
\ skills_recipe = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\
\n if path.exists(skills_recipe):\n with open(skills_recipe,\
,\n sdg_sampling_size: float = 1.0,\n):\n from os import getenv, path\n\
\n import openai\n import yaml\n from instructlab.sdg import generate_data\n\
\ from instructlab.sdg.utils.taxonomy import read_taxonomy\n\n def\
\ set_precomputed_skills_data_ratio(sampling_size: float):\n skills_recipe\
\ = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\n \
\ if path.exists(skills_recipe):\n with open(skills_recipe,\
\ \"r\") as file:\n skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\
\n skills_yaml[\"datasets\"][0][\"sampling_size\"] = sampling_size\n\
\n with open(skills_recipe, \"w\", encoding=\"utf-8\") as file:\n\
@@ -1558,9 +1564,7 @@ deploymentSpec:
\ openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n taxonomy_base\
\ = \"main\" if repo_branch or (repo_pr and int(repo_pr) > 0) else \"empty\"\
\n\n print(\"Generating synthetic dataset for:\")\n print()\n print(read_taxonomy(taxonomy_path,\
\ taxonomy_base))\n\n # Temporary measure to limit the amount of precomputed\
\ skills data used to construct the SDG dataset.\n # Need during development\
\ to decrease training loop times and the cost of model quality.\n set_precomputed_skills_data_ratio(sampling_size=SAMPLING_SIZE)\n\
\ taxonomy_base))\n\n set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)\n\
\n # generate_data has a magic word for its taxonomy_base argument -\
\ 'empty'\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
\ generate_data(\n client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\
@@ -2054,6 +2058,8 @@ root:
componentInputParameter: sdg_repo_branch
repo_pr:
componentInputParameter: sdg_repo_pr
sdg_sampling_size:
componentInputParameter: sdg_sample_size
taskInfo:
name: sdg-op
sdg-to-artifact-op:
@@ -2167,6 +2173,12 @@ root:
description: SDG parameter. Points to a taxonomy git repository
isOptional: true
parameterType: STRING
sdg_sample_size:
defaultValue: 1.0
description: SDG parameter. Represents the sdg skills recipe sampling size
as a percentage in decimal form.
isOptional: true
parameterType: NUMBER_DOUBLE
sdg_scale_factor:
defaultValue: 2.0
description: SDG parameter. The total number of instructions to be generated.
9 changes: 3 additions & 6 deletions sdg/components.py
@@ -35,6 +35,7 @@ def sdg_op(
repo_pr: Optional[int],
taxonomy_path: str = "/data/taxonomy",
sdg_path: str = "/data/sdg",
sdg_sampling_size: float = 1.0,
):
from os import getenv, path

@@ -43,9 +44,7 @@ def sdg_op(
from instructlab.sdg import generate_data
from instructlab.sdg.utils.taxonomy import read_taxonomy

SAMPLING_SIZE = 70

def set_precomputed_skills_data_ratio(sampling_size):
def set_precomputed_skills_data_ratio(sampling_size: float):
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"
if path.exists(skills_recipe):
with open(skills_recipe, "r") as file:
@@ -76,9 +75,7 @@ def set_precomputed_skills_data_ratio(sampling_size):
print()
print(read_taxonomy(taxonomy_path, taxonomy_base))

# Temporary measure to limit the amount of precomputed skills data used to construct the SDG dataset.
# Need during development to decrease training loop times and the cost of model quality.
set_precomputed_skills_data_ratio(sampling_size=SAMPLING_SIZE)
set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)

# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
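For reference, the parameterized helper in this diff boils down to the following self-contained sketch. The recipe path and the `datasets[0]["sampling_size"]` assignment come straight from the hunks above; the closing `yaml.dump` write-back is assumed, since that line is truncated in the visible diff:

```python
# Standalone sketch of the recipe rewrite performed by set_precomputed_skills_data_ratio.
from os import path

import yaml

SKILLS_RECIPE = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"


def set_precomputed_skills_data_ratio(sampling_size: float) -> None:
    if path.exists(SKILLS_RECIPE):
        with open(SKILLS_RECIPE, "r") as f:
            skills_yaml = yaml.load(f, Loader=yaml.Loader)

        # The first dataset entry in the recipe holds the precomputed skills data;
        # its sampling_size controls how much of it is mixed into the SDG dataset.
        skills_yaml["datasets"][0]["sampling_size"] = sampling_size

        with open(SKILLS_RECIPE, "w", encoding="utf-8") as f:
            yaml.dump(skills_yaml, f)  # assumed write-back, truncated in the hunk


set_precomputed_skills_data_ratio(sampling_size=0.7)  # e.g. keep 70% of the skills data
```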
3 changes: 3 additions & 0 deletions standalone/README.md
@@ -412,6 +412,9 @@ evaluation
* `--sdg-serving-model-ca-cert-cm-key`: Name of the Key in the Kubernetes ConfigMap containing the SDG serving model CA cert.
`SDG_SERVING_MODEL_CA_CERT_CM_KEY` environment variable can be used as well. **Optional**
* `--sdg-serving-model-api-key`: The API key for the model to use for Synthetic Data Generation. **Optional**
* `--sdg-sampling-size`: Allows you to tune how much data is used from the default data skills recipe. The sampling
size represents the percentage of the sample to take; a value of 0.5 means 50%. This is useful for development
purposes, when testing the whole iLab pipeline and model performance is not a concern. **Optional**
* `--judge-serving-model-endpoint`: Serving endpoint for evaluation. e.g:
http://serving.kubeflow.svc.cluster.local:8080/v1 - **Optional**
* `--judge-serving-model-name`: The name of the model to use for evaluation. **Optional**
32 changes: 24 additions & 8 deletions standalone/standalone.py
@@ -60,6 +60,7 @@
SDG_CA_CERT_ENV_VAR_NAME = "SDG_CA_CERT_PATH"
SDG_CA_CERT_PATH = "/tmp/cert"
SDG_CA_CERT_CM_KEY = "ca-bundle.crt"
DEFAULT_SDG_SAMPLING_SIZE = 1.0

# SDG DATA PREPROCESSING (before doing training, data has to be converted)
MAX_SEQ_LEN = 4096
@@ -826,6 +827,16 @@ def show(
help="Name of the Key in the Kubernetes ConfigMap containing the SDG serving model CA cert.",
default=SDG_CA_CERT_CM_KEY,
)
@click.option(
"--sdg-sampling-size",
type=float,
envvar="SDG_SERVING_MODEL_SAMPLING_SIZE",
help="Allows you to tune how much data is used from the default data skills recipe. "
"The sampling size represents the percentage of the sample to take, a value of 0.5 "
"specifies a 50% value. This is useful for development purposes, when testing the "
"whole iLab pipeline and model performance is not a concern.",
default=DEFAULT_SDG_SAMPLING_SIZE,
)
@click.option(
"--force-pull",
help=(
@@ -908,6 +919,7 @@ def run(
sdg_serving_model_api_key: typing.Optional[str] = None,
sdg_serving_model_ca_cert: typing.Optional[str] = None,
sdg_serving_model_ca_cert_cm_key: typing.Optional[str] = None,
sdg_sampling_size: typing.Optional[float] = None,
force_pull: typing.Optional[bool] = False,
training_1_epoch_num: int = 7,
training_1_effective_batch_size: int = 3840,
@@ -966,6 +978,7 @@ def run(
sdg_in_cluster (bool): Run SDG in the cluster. Default is retrieve SDG Data from an object store.
sdg_pipeline (str): The pipeline type used for SDG, value must be 'simple', 'full', or a
valid path to a directory.
sdg_sampling_size (float): Represents the sdg skills recipe sampling size in decimal form.
Returns:
None
"""
@@ -998,6 +1011,7 @@ def run(
ctx.obj["sdg_serving_model_api_key"] = sdg_serving_model_api_key
ctx.obj["sdg_serving_model_ca_cert"] = sdg_serving_model_ca_cert
ctx.obj["sdg_serving_model_ca_cert_cm_key"] = sdg_serving_model_ca_cert_cm_key
ctx.obj["sdg_sampling_size"] = sdg_sampling_size
ctx.obj["force_pull"] = force_pull
ctx.obj["training_1_epoch_num"] = training_1_epoch_num
ctx.obj["training_1_effective_batch_size"] = training_1_effective_batch_size
@@ -1088,6 +1102,7 @@ def create_sdg_container(
exec_git_clone_op_repo_branch: str = "",
exec_git_clone_op_repo_pr: str = "",
sdg_pipeline: str = SDG_DEFAULT_PIPELINE,
sdg_sampling_size: float = DEFAULT_SDG_SAMPLING_SIZE,
) -> kubernetes.client.V1Container:
"""
Creates a Kubernetes V1Job container for generating synthetic data.
@@ -1115,6 +1130,7 @@ def sdg_op(
repo_pr: Optional[int],
taxonomy_path: str = "/data/taxonomy",
sdg_path: str = "/data/sdg",
sdg_sampling_size: float = 1.0,
):
from os import getenv, path
@@ -1123,9 +1139,7 @@ def sdg_op(
from instructlab.sdg import generate_data
from instructlab.sdg.utils.taxonomy import read_taxonomy
SAMPLING_SIZE = 70
def set_precomputed_skills_data_ratio(sampling_size):
def set_precomputed_skills_data_ratio(sampling_size: float):
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"
if path.exists(skills_recipe):
with open(skills_recipe, "r") as file:
@@ -1156,9 +1170,7 @@ def set_precomputed_skills_data_ratio(sampling_size):
print()
print(read_taxonomy(taxonomy_path, taxonomy_base))
# Temporary measure to limit the amount of precomputed skills data used to construct the SDG dataset.
# Need during development to decrease training loop times and the cost of model quality.
set_precomputed_skills_data_ratio(sampling_size=SAMPLING_SIZE)
set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)
# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
@@ -1176,7 +1188,7 @@ def set_precomputed_skills_data_ratio(sampling_size):
)
"""
exec_sdg_op_args = f"""
sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ''}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}")
sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", sdg_sampling_size={sdg_sampling_size})
"""

return kubernetes.client.V1Container(
@@ -1215,6 +1227,7 @@ def create_data_job(
sdg_pipeline: str = SDG_DEFAULT_PIPELINE,
sdg_serving_model_ca_cert: str = None,
sdg_serving_model_ca_cert_cm_key: str = None,
sdg_sampling_size: float = DEFAULT_SDG_SAMPLING_SIZE,
) -> kubernetes.client.V1Job:
"""
Create a Kubernetes Job object.
@@ -1236,7 +1249,7 @@
valid path to a directory.
sdg_serving_model_ca_cert (str): The serving model CA cert for SDG.
sdg_serving_model_ca_cert_cm_key (str): The name of the Key in the Kubernetes ConfigMap.
sdg_sampling_size (float): Represents the sdg skills recipe sampling size in decimal form.
Returns:
kubernetes.client.V1Job: A Kubernetes Job object configured with the specified parameters.
@@ -1474,6 +1487,7 @@ def data_processing(train_args: TrainingArgs) -> None:
exec_git_clone_op_repo_branch=taxonomy_repo_branch,
exec_git_clone_op_repo_pr=taxonomy_repo_pr,
sdg_pipeline=sdg_pipeline,
sdg_sampling_size=sdg_sampling_size,
)

if sdg_serving_model_ca_cert:
@@ -2873,6 +2887,7 @@ def sdg(
taxonomy_repo_pr = ctx.obj["taxonomy_repo_pr"]
taxonomy_repo_branch = ctx.obj["taxonomy_repo_branch"]
sdg_pipeline = ctx.obj["sdg_pipeline"]
sdg_sampling_size = ctx.obj["sdg_sampling_size"]

v1 = kubernetes.client.CoreV1Api()
# Secret details validation here!
@@ -3014,6 +3029,7 @@ def decode_base64(data):
sdg_pipeline=sdg_pipeline,
sdg_serving_model_ca_cert=sdg_serving_model_ca_cert,
sdg_serving_model_ca_cert_cm_key=sdg_serving_model_ca_cert_cm_key,
sdg_sampling_size=sdg_sampling_size,
)

if dry_run:
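The plumbing above follows the script's existing pattern: the `run` command stashes the option on `ctx.obj`, and the `sdg` subcommand reads it back before building the data Job. A much-simplified sketch of that pattern follows; this is not the real standalone.py CLI, and the command names and wiring are condensed for illustration only:

```python
# Minimal click sketch of the option flow used by the standalone script.
import click

DEFAULT_SDG_SAMPLING_SIZE = 1.0


@click.group()
@click.option(
    "--sdg-sampling-size",
    type=float,
    envvar="SDG_SERVING_MODEL_SAMPLING_SIZE",
    default=DEFAULT_SDG_SAMPLING_SIZE,
)
@click.pass_context
def run(ctx: click.Context, sdg_sampling_size: float):
    # Mirror of run(): persist CLI options on the shared context object.
    ctx.ensure_object(dict)
    ctx.obj["sdg_sampling_size"] = sdg_sampling_size


@run.command()
@click.pass_context
def sdg(ctx: click.Context):
    # Mirror of the sdg subcommand: read the option back and pass it on.
    sampling_size = ctx.obj["sdg_sampling_size"]
    click.echo(f"would create the SDG data Job with sdg_sampling_size={sampling_size}")


if __name__ == "__main__":
    run(obj={})
```

Invoked as, for example, `python sketch.py --sdg-sampling-size 0.5 sdg`, which mirrors the documented `--sdg-sampling-size` flag (or the `SDG_SERVING_MODEL_SAMPLING_SIZE` environment variable shown in the option definition above).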