Merge branch 'main' into dependabot/github_actions/pypa/gh-action-pypi-publish-1.8.6
microsoft · May 22, 2023 · 8f577c9 · 8f577c9
2 parents d27bcb1 + 26ff7d0
commit 8f577c9
Show file tree

Hide file tree

Showing 18 changed files with 759 additions and 449 deletions.
diff --git a/hi-ml-azure/src/health_azure/datasets.py b/hi-ml-azure/src/health_azure/datasets.py
@@ -20,7 +20,9 @@
 from azureml.dataprep.fuse.daemon import MountContext
 from azureml.exceptions._azureml_exception import UserErrorException
 
-from health_azure.utils import PathOrString, get_workspace, get_ml_client
+from health_azure.utils import PathOrString, get_ml_client
+
+logger = logging.getLogger(__name__)
 
 
 V1OrV2DataType = Union[FileDataset, Data]
@@ -128,11 +130,14 @@ def _get_or_create_v1_dataset(datastore_name: str, dataset_name: str, workspace:
     try:
         azureml_dataset = _retrieve_v1_dataset(dataset_name, workspace)
     except UserErrorException:
+        logger.warning(f"Dataset '{dataset_name}' was not found, or is not an AzureML SDK v1 dataset.")
+        logger.info(f"Trying to create a new dataset '{dataset_name}' from files in folder '{dataset_name}'")
         if datastore_name == "":
             raise ValueError(
                 "When creating a new dataset, a datastore name must be provided. Please specify a datastore name using "
                 "the --datastore flag"
             )
+        logger.info(f"Trying to create a new dataset '{dataset_name}' in datastore '{datastore_name}'")
         azureml_dataset = _create_v1_dataset(datastore_name, dataset_name, workspace)
     return azureml_dataset
 
@@ -352,10 +357,8 @@ def __init__(
 
     def to_input_dataset_local(
         self,
-        strictly_aml_v1: bool,
-        workspace: Workspace = None,
-        ml_client: Optional[MLClient] = None,
-    ) -> Tuple[Optional[Path], Optional[MountContext]]:
+        workspace: Workspace,
+    ) -> Tuple[Path, Optional[MountContext]]:
         """
         Return a local path to the dataset when outside of an AzureML run.
         If local_folder is supplied, then this is assumed to be a local dataset, and this is returned.
@@ -364,9 +367,6 @@ def to_input_dataset_local(
         therefore a tuple of Nones will be returned.
 
         :param workspace: The AzureML workspace to read from.
-        :param strictly_aml_v1: If True, use Azure ML SDK v1 to attempt to find or create and reigster the dataset.
-            Otherwise, attempt to use Azure ML SDK v2.
-        :param ml_client: An Azure MLClient object for interacting with Azure resources.
         :return: Tuple of (path to dataset, optional mountcontext)
         """
         status = f"Dataset '{self.name}' will be "
@@ -381,12 +381,10 @@ def to_input_dataset_local(
                 f"Unable to make dataset '{self.name} available for a local run because no AzureML "
                 "workspace has been provided. Provide a workspace, or set a folder for local execution."
             )
-        azureml_dataset = get_or_create_dataset(
+        azureml_dataset = _get_or_create_v1_dataset(
             datastore_name=self.datastore,
             dataset_name=self.name,
             workspace=workspace,
-            strictly_aml_v1=strictly_aml_v1,
-            ml_client=ml_client,
         )
         if isinstance(azureml_dataset, FileDataset):
             target_path = self.target_folder or Path(tempfile.mkdtemp())
@@ -404,7 +402,7 @@ def to_input_dataset_local(
             print(status)
             return result
         else:
-            return None, None
+            raise ValueError(f"Don't know how to handle dataset '{self.name}' of type {type(azureml_dataset)}")
 
     def to_input_dataset(
         self,
@@ -556,60 +554,31 @@ def create_dataset_configs(
     return datasets
 
 
-def find_workspace_for_local_datasets(
-    aml_workspace: Optional[Workspace], workspace_config_path: Optional[Path], dataset_configs: List[DatasetConfig]
-) -> Optional[Workspace]:
-    """
-    If any of the dataset_configs require an AzureML workspace then try to get one, otherwise return None.
-
-    :param aml_workspace: There are two optional parameters used to glean an existing AzureML Workspace. The simplest is
-        to pass it in as a parameter.
-    :param workspace_config_path: The 2nd option is to specify the path to the config.json file downloaded from the
-        Azure portal from which we can retrieve the existing Workspace.
-    :param dataset_configs: List of DatasetConfig describing the input datasets.
-    :return: Workspace if required, None otherwise.
-    """
-    workspace: Workspace = None
-    # Check whether an attempt will be made to mount or download a dataset when running locally.
-    # If so, try to get the AzureML workspace.
-    if any(dc.local_folder is None for dc in dataset_configs):
-        try:
-            workspace = get_workspace(aml_workspace, workspace_config_path)
-            logging.info(f"Found workspace for datasets: {workspace.name}")
-        except Exception as ex:
-            logging.info(f"Could not find workspace for datasets. Exception: {ex}")
-    return workspace
-
-
 def setup_local_datasets(
     dataset_configs: List[DatasetConfig],
-    strictly_aml_v1: bool,
-    aml_workspace: Optional[Workspace] = None,
-    ml_client: Optional[MLClient] = None,
-    workspace_config_path: Optional[Path] = None,
-) -> Tuple[List[Optional[Path]], List[MountContext]]:
+    workspace: Optional[Workspace],
+) -> Tuple[List[Path], List[MountContext]]:
     """
     When running outside of AzureML, setup datasets to be used locally.
 
     For each DatasetConfig, if local_folder is supplied, then this is assumed to be a local dataset, and this is
     used. Otherwise the dataset is mounted or downloaded to either the target folder or a temporary folder and that is
     used.
 
-    :param aml_workspace: There are two optional parameters used to glean an existing AzureML Workspace. The simplest is
-        to pass it in as a parameter.
-    :param workspace_config_path: The 2nd option is to specify the path to the config.json file downloaded from the
-        Azure portal from which we can retrieve the existing Workspace.
+    If a dataset does not exist, an AzureML SDK v1 dataset will be created, assuming that the dataset is given
+    in a folder of the same name (for example, if a dataset is given as "mydataset", then it is created from the files
+    in folder "mydataset" in the datastore).
+
+    :param workspace: The AzureML workspace to work with. Can be None if the list of datasets is empty, or if
+        the datasets are available locally.
     :param dataset_configs: List of DatasetConfig describing the input data assets.
-    :param strictly_aml_v1: If True, use Azure ML SDK v1. Otherwise, attempt to use Azure ML SDK v2.
-    :param ml_client: An MLClient object for interacting with AML v2 datastores.
-    :return: Pair of: list of optional paths to the input datasets, list of mountcontexts, one for each mounted dataset.
+    :return: Pair of: list of paths to the input datasets, list of mountcontexts, one for each mounted dataset.
     """
-    workspace = find_workspace_for_local_datasets(aml_workspace, workspace_config_path, dataset_configs)
-    mounted_input_datasets: List[Optional[Path]] = []
+    mounted_input_datasets: List[Path] = []
     mount_contexts: List[MountContext] = []
 
     for data_config in dataset_configs:
-        target_path, mount_context = data_config.to_input_dataset_local(strictly_aml_v1, workspace, ml_client)
+        target_path, mount_context = data_config.to_input_dataset_local(workspace)
 
         mounted_input_datasets.append(target_path)
 

diff --git a/hi-ml-azure/src/health_azure/himl.py b/hi-ml-azure/src/health_azure/himl.py
@@ -442,21 +442,20 @@ def effective_experiment_name(experiment_name: Optional[str], entry_script: Opti
 
 
 def submit_run_v2(
-    workspace: Optional[Workspace],
+    ml_client: MLClient,
     environment: EnvironmentV2,
+    entry_script: PathOrString,
+    script_params: List[str],
+    compute_target: str,
+    environment_variables: Optional[Dict[str, str]] = None,
     experiment_name: Optional[str] = None,
     input_datasets_v2: Optional[Dict[str, Input]] = None,
     output_datasets_v2: Optional[Dict[str, Output]] = None,
     snapshot_root_directory: Optional[Path] = None,
-    entry_script: Optional[PathOrString] = None,
-    script_params: Optional[List[str]] = None,
-    compute_target: Optional[str] = None,
     tags: Optional[Dict[str, str]] = None,
     docker_shm_size: str = "",
     wait_for_completion: bool = False,
     identity_based_auth: bool = False,
-    workspace_config_path: Optional[PathOrString] = None,
-    ml_client: Optional[MLClient] = None,
     hyperparam_args: Optional[Dict[str, Any]] = None,
     num_nodes: int = 1,
     pytorch_processes_per_node: Optional[int] = None,
@@ -465,26 +464,23 @@ def submit_run_v2(
     """
     Starts a v2 AML Job on a given workspace by submitting a command
 
-    :param workspace: The AzureML workspace to use.
+    :param ml_client: An Azure MLClient object for interacting with Azure resources.
     :param environment: An AML v2 Environment object.
+    :param entry_script: The script that should be run in AzureML.
+    :param script_params: A list of parameters to pass on to the script as it runs in AzureML.
+    :param compute_target: The name of a compute target in Azure ML to submit the job to.
+    :param environment_variables: The environment variables that should be set when running in AzureML.
     :param experiment_name: The name of the experiment that will be used or created. If the experiment name contains
         characters that are not valid in Azure, those will be removed.
     :param input_datasets_v2: An optional dictionary of Inputs to pass in to the command.
     :param output_datasets_v2: An optional dictionary of Outputs to pass in to the command.
     :param snapshot_root_directory: The directory that contains all code that should be packaged and sent to AzureML.
         All Python code that the script uses must be copied over.
-    :param entry_script: The script that should be run in AzureML.
-    :param script_params: A list of parameter to pass on to the script as it runs in AzureML.
-    :param compute_target: Optional name of a compute target in Azure ML to submit the job to. If None, will run
-        locally.
     :param tags: A dictionary of string key/value pairs, that will be added as metadata to the run. If set to None,
         a default metadata field will be added that only contains the commandline arguments that started the run.
     :param docker_shm_size: The Docker shared memory size that should be used when creating a new Docker image.
     :param wait_for_completion: If False (the default) return after the run is submitted to AzureML, otherwise wait for
         the completion of this run (if True).
-    :param workspace_config_path: If not provided with an AzureML Workspace, then load one given the information in this
-        config
-    :param ml_client: An Azure MLClient object for interacting with Azure resources.
     :param hyperparam_args: A dictionary of hyperparameter search args to pass into a sweep job.
     :param num_nodes: The number of nodes to use for the job in AzureML. The value must be 1 or greater.
     :param pytorch_processes_per_node: For plain PyTorch multi-GPU processing: The number of processes per node.
@@ -494,20 +490,6 @@ def submit_run_v2(
         display name will be generated by AzureML.
     :return: An AzureML Run object.
     """
-    if ml_client is None:
-        if workspace is not None:
-            ml_client = get_ml_client(
-                subscription_id=workspace.subscription_id,
-                resource_group=workspace.resource_group,
-                workspace_name=workspace.name,
-            )
-        elif workspace_config_path is not None:
-            ml_client = get_ml_client(workspace_config_path=workspace_config_path)
-        else:
-            raise ValueError("Either workspace or workspace_config_path must be specified to connect to the Workspace")
-
-    assert compute_target is not None, "No compute_target has been provided"
-    assert entry_script is not None, "No entry_script has been provided"
     snapshot_root_directory = snapshot_root_directory or Path.cwd()
     root_dir = Path(snapshot_root_directory)
 
@@ -547,6 +529,7 @@ def create_command_job(cmd: str) -> Command:
             inputs=input_datasets_v2,
             outputs=output_datasets_v2,
             environment=environment.name + "@latest",
+            environment_variables=environment_variables,
             compute=compute_target,
             experiment_name=experiment_name,
             tags=tags or {},
@@ -589,7 +572,11 @@ def create_command_job(cmd: str) -> Command:
         job_to_submit = create_command_job(cmd)
 
     returned_job = ml_client.jobs.create_or_update(job_to_submit)
-    print(f"URL to job: {returned_job.services['Studio'].endpoint}")  # type: ignore
+    print("\n==============================================================================")
+    # The ID field looks like /subscriptions/<sub>/resourceGroups/<rg>/providers/Microsoft.MachineLearningServices/..
+    print(f"Successfully queued run {(returned_job.id or '').split('/')[-1]}")
+    print(f"Run URL: {returned_job.services['Studio'].endpoint}")  # type: ignore
+    print("==============================================================================\n")
     if wait_for_completion:
         print("Waiting for the completion of the AzureML job.")
         wait_for_job_completion(ml_client, job_name=returned_job.name)
@@ -668,7 +655,7 @@ def submit_run(
 
     # These need to be 'print' not 'logging.info' so that the calling script sees them outside AzureML
     print("\n==============================================================================")
-    print(f"Successfully queued run number {run.number} (ID {run.id}) in experiment {run.experiment.name}")
+    print(f"Successfully queued run {run.id} in experiment {run.experiment.name}")
     print(f"Experiment name and run ID are available in file {RUN_RECOVERY_FILE}")
     print(f"Experiment URL: {run.experiment.get_portal_url()}")
     print(f"Run URL: {run.get_portal_url()}")
@@ -882,6 +869,18 @@ def submit_to_azure_if_needed(  # type: ignore
     # is necessary. If not, return to the caller for local execution.
     if submit_to_azureml is None:
         submit_to_azureml = AZUREML_FLAG in sys.argv[1:]
+
+    has_input_datasets = len(cleaned_input_datasets) > 0
+    if submit_to_azureml or has_input_datasets:
+        if strictly_aml_v1:
+            aml_workspace = get_workspace(aml_workspace, workspace_config_path)
+            assert aml_workspace is not None
+            print(f"Loaded AzureML workspace {aml_workspace.name}")
+        else:
+            ml_client = get_ml_client(ml_client=ml_client, workspace_config_path=workspace_config_path)
+            assert ml_client is not None
+            print(f"Created MLClient for AzureML workspace {ml_client.workspace_name}")
+
     if not submit_to_azureml:
         # Set the environment variables for local execution.
         environment_variables = {**DEFAULT_ENVIRONMENT_VARIABLES, **(environment_variables or {})}
@@ -895,16 +894,24 @@ def submit_to_azure_if_needed(  # type: ignore
         logs_folder = Path.cwd() / LOGS_FOLDER
         logs_folder.mkdir(exist_ok=True)
 
+        any_local_folders_missing = any(dataset.local_folder is None for dataset in cleaned_input_datasets)
+
+        if has_input_datasets and any_local_folders_missing and not strictly_aml_v1:
+            raise ValueError(
+                "AzureML SDK v2 does not support downloading datasets from AzureML for local execution. "
+                "Please switch to AzureML SDK v1 by setting strictly_aml_v1=True, or use "
+                "--strictly_aml_v1 on the commandline, or provide a local folder for each input dataset. "
+                "Note that you will not be able use AzureML datasets for runs outside AzureML if the datasets were "
+                "created via SDK v2."
+            )
+
         mounted_input_datasets, mount_contexts = setup_local_datasets(
             cleaned_input_datasets,
-            strictly_aml_v1,
-            aml_workspace=aml_workspace,
-            ml_client=ml_client,
-            workspace_config_path=workspace_config_path,
+            workspace=aml_workspace,
         )
 
         return AzureRunInfo(
-            input_datasets=mounted_input_datasets,
+            input_datasets=mounted_input_datasets,  # type: ignore
             output_datasets=[d.local_folder for d in cleaned_output_datasets],
             mount_contexts=mount_contexts,
             run=None,
@@ -917,9 +924,6 @@ def submit_to_azure_if_needed(  # type: ignore
         print(f"No snapshot root directory given. Uploading all files in the current directory {Path.cwd()}")
         snapshot_root_directory = Path.cwd()
 
-    workspace = get_workspace(aml_workspace, workspace_config_path)
-    print(f"Loaded AzureML workspace {workspace.name}")
-
     if conda_environment_file is None:
         conda_environment_file = find_file_in_parent_to_pythonpath(CONDA_ENVIRONMENT_FILE)
         if conda_environment_file is None:
@@ -935,8 +939,9 @@ def submit_to_azure_if_needed(  # type: ignore
 
     with append_to_amlignore(amlignore=amlignore_path, lines_to_append=lines_to_append):
         if strictly_aml_v1:
+            assert aml_workspace is not None, "An AzureML workspace should have been created already."
             run_config = create_run_configuration(
-                workspace=workspace,
+                workspace=aml_workspace,
                 compute_cluster_name=compute_cluster_name,
                 aml_environment_name=aml_environment_name,
                 conda_environment_file=conda_environment_file,
@@ -965,7 +970,7 @@ def submit_to_azure_if_needed(  # type: ignore
                 config_to_submit = script_run_config
 
             run = submit_run(
-                workspace=workspace,
+                workspace=aml_workspace,
                 experiment_name=effective_experiment_name(experiment_name, script_run_config.script),
                 script_run_config=config_to_submit,
                 tags=tags,
@@ -976,6 +981,7 @@ def submit_to_azure_if_needed(  # type: ignore
             if after_submission is not None:
                 after_submission(run)  # type: ignore
         else:
+            assert ml_client is not None, "An AzureML MLClient should have been created already."
             if conda_environment_file is None:
                 raise ValueError("Argument 'conda_environment_file' must be specified when using AzureML v2")
             environment = create_python_environment_v2(
@@ -984,17 +990,17 @@ def submit_to_azure_if_needed(  # type: ignore
             if entry_script is None:
                 entry_script = Path(sys.argv[0])
 
-            ml_client = get_ml_client(ml_client=ml_client, aml_workspace=workspace)
             registered_env = register_environment_v2(environment, ml_client)
             input_datasets_v2 = create_v2_inputs(ml_client, cleaned_input_datasets)
             output_datasets_v2 = create_v2_outputs(ml_client, cleaned_output_datasets)
 
             job = submit_run_v2(
-                workspace=workspace,
+                ml_client=ml_client,
                 input_datasets_v2=input_datasets_v2,
                 output_datasets_v2=output_datasets_v2,
                 experiment_name=experiment_name,
                 environment=registered_env,
+                environment_variables=environment_variables,
                 snapshot_root_directory=snapshot_root_directory,
                 entry_script=entry_script,
                 script_params=script_params,

diff --git a/hi-ml-azure/src/health_azure/himl_download.py b/hi-ml-azure/src/health_azure/himl_download.py
@@ -39,12 +39,7 @@ def main() -> None:  # pragma: no cover
 
     files_to_download = download_config.files_to_download
 
-    workspace = get_workspace()
-    ml_client = get_ml_client(
-        subscription_id=workspace.subscription_id,
-        resource_group=workspace.resource_group,
-        workspace_name=workspace.name,
-    )
+    ml_client = get_ml_client()
     for run_id in download_config.run:
         download_job_outputs_logs(ml_client, run_id, file_to_download_path=files_to_download, download_dir=output_dir)
         print("Successfully downloaded output and log files")

diff --git a/hi-ml-azure/src/health_azure/logging.py b/hi-ml-azure/src/health_azure/logging.py
@@ -13,7 +13,6 @@
 from health_azure.utils import ENV_LOCAL_RANK, check_is_any_of, is_global_rank_zero
 
 logging_stdout_handler: Optional[logging.StreamHandler] = None
-logging_to_file_handler: Optional[logging.StreamHandler] = None
 
 
 def logging_to_stdout(log_level: Union[int, str] = logging.INFO) -> None: