Skip to content

Commit

Permalink
WIP: fix-66
Browse files Browse the repository at this point in the history
Signed-off-by: Michael Clifford <[email protected]>
  • Loading branch information
MichaelClifford committed Oct 8, 2024
1 parent 0e32654 commit b4d58d2
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 72 deletions.
148 changes: 84 additions & 64 deletions pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -852,35 +852,39 @@ deploymentSpec:
\ int = 2,\n) -> NamedTuple(\"outputs\", manifest=str, name=str):\n import\
\ inspect\n\n Outputs = NamedTuple(\"outputs\", manifest=str, name=str)\n\
\ name = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n\n \
\ image = \"quay.io/shanand/test-train:0.0.4\"\n\n manifest = inspect.cleandoc(\n\
\ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n\
\ metadata:\n name: {name}\n spec:\n nprocPerNode:\
\ \\\\\"{nproc_per_node}\\\\\"\n pytorchReplicaSpecs:\n \
\ Master:\n replicas: 1\n restartPolicy: OnFailure\n\
\ template:\n metadata:\n annotations:\n\
\ sidecar.istio.io/inject: 'false'\n spec:\n\
\ containers:\n - args:\n \
\ - |\n mkdir -p /output/model;\n\
\ mkdir -p /output/data;\n \
\ python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir\
\ /output/model --data_output_dir /input_data/processed_data\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n \
\ name: output\n env:\n - name:\
\ NNODES\n value: \\\\\"{nnodes}\\\\\"\n \
\ - name: NPROC_PER_NODE\n value:\
\ \\\\\"{nproc_per_node}\\\\\"\n resources:\n \
\ requests:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ limits:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\
\ - name: input-data\n persistentVolumeClaim:\n\
\ image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
\ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\
\\\"\n pytorchReplicaSpecs:\n Master:\n \
\ replicas: 1\n restartPolicy: OnFailure\n template:\n\
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n export XDG_CACHE_HOME=/tmp\n \
\ export TRITON_CACHE_DIR=/tmp\n \
\ mkdir -p /output/model;\n mkdir -p\
\ /output/data;\n python3.11 -m torch.distributed.launch\
\ --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank $(RANK)\
\ --rdzv_endpoint $(MASTER_ADDR):$(MASTER_PORT) /opt/app-root/lib/python3.11/site-packages/instructlab/training/main_ds.py\
\ --model_name_or_path={path_to_model} --data_path=/ilab/dataset/data.jsonl\
\ --output_dir=/input_data/processed_data --num_epochs=2 --effective_batch_size=3840\
\ --learning_rate=2e-6 --num_warmup_steps=800 --save_samples=0 --log_level=INFO\
\ --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD\n\
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath:\
\ /input_data\n name: input-data\n \
\ readOnly: true\n - mountPath: /input_model\n\
\ name: model\n readOnly:\
\ true\n - mountPath: /output\n \
\ name: output\n resources:\n \
\ requests:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ limits:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n \
\ - name: input-data\n persistentVolumeClaim:\n\
\ claimName: {input_pvc_name}\n \
\ - name: model\n persistentVolumeClaim:\n \
\ claimName: {model_pvc_name}\n - name:\
Expand All @@ -890,9 +894,15 @@ deploymentSpec:
\ template:\n metadata:\n annotations:\n\
\ sidecar.istio.io/inject: 'false'\n spec:\n\
\ containers:\n - args:\n \
\ - |\n mkdir -p /tmp/model;\n \
\ python3.11 -u run_main_ds.py --model_path {path_to_model}\
\ --ckpt_output_dir /tmp/model --data_output_dir /input_data/processed_data\n\
\ - |\n export XDG_CACHE_HOME=/tmp\n\
\ export TRITON_CACHE_DIR=/tmp\n \
\ mkdir -p /tmp/model;\n python3.11\
\ -m torch.distributed.launch --nnodes {nnodes} --nproc_per_node {nproc_per_node}\
\ --node_rank $(RANK) --rdzv_endpoint $(MASTER_ADDR):$(MASTER_PORT) /opt/app-root/lib/python3.11/site-packages/instructlab/training/main_ds.py\
\ --model_name_or_path={path_to_model} --data_path=/ilab/dataset/data.jsonl\
\ --output_dir=/input_data/processed_data --num_epochs=2 --effective_batch_size=3840\
\ --learning_rate=2e-6 --num_warmup_steps=800 --save_samples=0 --log_level=INFO\
\ --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD\n\
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
Expand Down Expand Up @@ -949,35 +959,39 @@ deploymentSpec:
\ int = 2,\n) -> NamedTuple(\"outputs\", manifest=str, name=str):\n import\
\ inspect\n\n Outputs = NamedTuple(\"outputs\", manifest=str, name=str)\n\
\ name = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n\n \
\ image = \"quay.io/shanand/test-train:0.0.4\"\n\n manifest = inspect.cleandoc(\n\
\ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n\
\ metadata:\n name: {name}\n spec:\n nprocPerNode:\
\ \\\\\"{nproc_per_node}\\\\\"\n pytorchReplicaSpecs:\n \
\ Master:\n replicas: 1\n restartPolicy: OnFailure\n\
\ template:\n metadata:\n annotations:\n\
\ sidecar.istio.io/inject: 'false'\n spec:\n\
\ containers:\n - args:\n \
\ - |\n mkdir -p /output/model;\n\
\ mkdir -p /output/data;\n \
\ python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir\
\ /output/model --data_output_dir /input_data/processed_data\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n \
\ name: output\n env:\n - name:\
\ NNODES\n value: \\\\\"{nnodes}\\\\\"\n \
\ - name: NPROC_PER_NODE\n value:\
\ \\\\\"{nproc_per_node}\\\\\"\n resources:\n \
\ requests:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ limits:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\
\ - name: input-data\n persistentVolumeClaim:\n\
\ image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
\ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\
\\\"\n pytorchReplicaSpecs:\n Master:\n \
\ replicas: 1\n restartPolicy: OnFailure\n template:\n\
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n export XDG_CACHE_HOME=/tmp\n \
\ export TRITON_CACHE_DIR=/tmp\n \
\ mkdir -p /output/model;\n mkdir -p\
\ /output/data;\n python3.11 -m torch.distributed.launch\
\ --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank $(RANK)\
\ --rdzv_endpoint $(MASTER_ADDR):$(MASTER_PORT) /opt/app-root/lib/python3.11/site-packages/instructlab/training/main_ds.py\
\ --model_name_or_path={path_to_model} --data_path=/ilab/dataset/data.jsonl\
\ --output_dir=/input_data/processed_data --num_epochs=2 --effective_batch_size=3840\
\ --learning_rate=2e-6 --num_warmup_steps=800 --save_samples=0 --log_level=INFO\
\ --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD\n\
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath:\
\ /input_data\n name: input-data\n \
\ readOnly: true\n - mountPath: /input_model\n\
\ name: model\n readOnly:\
\ true\n - mountPath: /output\n \
\ name: output\n resources:\n \
\ requests:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ limits:\n cpu: 2\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n \
\ - name: input-data\n persistentVolumeClaim:\n\
\ claimName: {input_pvc_name}\n \
\ - name: model\n persistentVolumeClaim:\n \
\ claimName: {model_pvc_name}\n - name:\
Expand All @@ -987,9 +1001,15 @@ deploymentSpec:
\ template:\n metadata:\n annotations:\n\
\ sidecar.istio.io/inject: 'false'\n spec:\n\
\ containers:\n - args:\n \
\ - |\n mkdir -p /tmp/model;\n \
\ python3.11 -u run_main_ds.py --model_path {path_to_model}\
\ --ckpt_output_dir /tmp/model --data_output_dir /input_data/processed_data\n\
\ - |\n export XDG_CACHE_HOME=/tmp\n\
\ export TRITON_CACHE_DIR=/tmp\n \
\ mkdir -p /tmp/model;\n python3.11\
\ -m torch.distributed.launch --nnodes {nnodes} --nproc_per_node {nproc_per_node}\
\ --node_rank $(RANK) --rdzv_endpoint $(MASTER_ADDR):$(MASTER_PORT) /opt/app-root/lib/python3.11/site-packages/instructlab/training/main_ds.py\
\ --model_name_or_path={path_to_model} --data_path=/ilab/dataset/data.jsonl\
\ --output_dir=/input_data/processed_data --num_epochs=2 --effective_batch_size=3840\
\ --learning_rate=2e-6 --num_warmup_steps=800 --save_samples=0 --log_level=INFO\
\ --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD\n\
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
Expand Down
15 changes: 7 additions & 8 deletions training/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def pytorchjob_manifest_op(
Outputs = NamedTuple("outputs", manifest=str, name=str)
name = f"train-{phase_name}-{name_suffix.rstrip('-sdg')}"

image = "quay.io/shanand/test-train:0.0.4"
image = "registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989"

manifest = inspect.cleandoc(
f"""
Expand All @@ -117,9 +117,11 @@ def pytorchjob_manifest_op(
containers:
- args:
- |
export XDG_CACHE_HOME=/tmp
export TRITON_CACHE_DIR=/tmp
mkdir -p /output/model;
mkdir -p /output/data;
python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir /output/model --data_output_dir /input_data/processed_data
python3.11 -m torch.distributed.launch --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank $(RANK) --rdzv_endpoint $(MASTER_ADDR):$(MASTER_PORT) /opt/app-root/lib/python3.11/site-packages/instructlab/training/main_ds.py --model_name_or_path={path_to_model} --data_path=/ilab/dataset/data.jsonl --output_dir=/input_data/processed_data --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800 --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD
command:
- /bin/bash
- '-c'
Expand All @@ -135,11 +137,6 @@ def pytorchjob_manifest_op(
readOnly: true
- mountPath: /output
name: output
env:
- name: NNODES
value: \\"{nnodes}\\"
- name: NPROC_PER_NODE
value: \\"{nproc_per_node}\\"
resources:
requests:
cpu: 2
Expand Down Expand Up @@ -168,8 +165,10 @@ def pytorchjob_manifest_op(
containers:
- args:
- |
export XDG_CACHE_HOME=/tmp
export TRITON_CACHE_DIR=/tmp
mkdir -p /tmp/model;
python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir /tmp/model --data_output_dir /input_data/processed_data
python3.11 -m torch.distributed.launch --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank $(RANK) --rdzv_endpoint $(MASTER_ADDR):$(MASTER_PORT) /opt/app-root/lib/python3.11/site-packages/instructlab/training/main_ds.py --model_name_or_path={path_to_model} --data_path=/ilab/dataset/data.jsonl --output_dir=/input_data/processed_data --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800 --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD
command:
- /bin/bash
- '-c'
Expand Down

0 comments on commit b4d58d2

Please sign in to comment.