"""Util functions for AutoML Tabular pipeline."""
import json
import math
import os
import pathlib
from typing import Any, Dict, List, Optional, Tuple
import warnings
# Default trial counts. They are used as the default values of the
# stage_1/stage_2 `*_num_parallel_trials` and `stage_2_num_selected_trials`
# arguments of get_skip_evaluation_pipeline_and_parameters.
# NOTE: "TRAILS" is a misspelling of "TRIALS"; the names are kept as-is because
# other definitions in this module reference them.
_DEFAULT_NUM_PARALLEL_TRAILS = 35
_DEFAULT_STAGE_2_NUM_SELECTED_TRAILS = 5
# Presumably the number of cross-validation folds and the total number of
# distillation trials -- not referenced in the visible part of this module,
# TODO confirm against their users.
_NUM_FOLDS = 5
_DISTILL_TOTAL_TRIALS = 100
# Resource settings for the evaluation steps, grouped by the component they
# name: batch predict, batch explain and Dataflow.
# NOTE(review): these look like fallback defaults for the matching
# `evaluation_*` arguments; their consumers are outside the visible code.
_EVALUATION_BATCH_PREDICT_MACHINE_TYPE = 'n1-highmem-8'
_EVALUATION_BATCH_PREDICT_STARTING_REPLICA_COUNT = 20
_EVALUATION_BATCH_PREDICT_MAX_REPLICA_COUNT = 20
_EVALUATION_BATCH_EXPLAIN_MACHINE_TYPE = 'n1-highmem-8'
_EVALUATION_BATCH_EXPLAIN_STARTING_REPLICA_COUNT = 10
_EVALUATION_BATCH_EXPLAIN_MAX_REPLICA_COUNT = 10
_EVALUATION_DATAFLOW_MACHINE_TYPE = 'n1-standard-4'
_EVALUATION_DATAFLOW_STARTING_NUM_WORKERS = 10
_EVALUATION_DATAFLOW_MAX_NUM_WORKERS = 100
_EVALUATION_DATAFLOW_DISK_SIZE_GB = 50
# Needed because we reference the AutoML Tabular V2 pipeline.
# _GCPC_STAGING_PATH is the resolved directory four levels above this file;
# _GCPC_PREVIEW_TABULAR_PATH is its `preview/automl/tabular` sub-directory,
# where the V2 pipeline template (automl_tabular_v2_pipeline.yaml) is looked up.
_GCPC_STAGING_PATH = pathlib.Path(
__file__
).parent.parent.parent.parent.resolve()
_GCPC_PREVIEW_TABULAR_PATH = (
_GCPC_STAGING_PATH / 'preview' / 'automl' / 'tabular'
)
# TODO(b/277393122): Once we finish L2L+FTE integration, add use_fte flag
# to signify FTE usage instead of the presence of num_selected_features.
def _get_default_pipeline_params(
    project: str,
    location: str,
    root_dir: str,
    target_column: str,
    prediction_type: str,
    optimization_objective: str,
    transformations: str,
    train_budget_milli_node_hours: float,
    stage_1_num_parallel_trials: Optional[int] = None,
    stage_2_num_parallel_trials: Optional[int] = None,
    stage_2_num_selected_trials: Optional[int] = None,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    predefined_split_key: Optional[str] = None,
    timestamp_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    weight_column: Optional[str] = None,
    study_spec_parameters_override: Optional[List[Dict[str, Any]]] = None,
    optimization_objective_recall_value: Optional[float] = None,
    optimization_objective_precision_value: Optional[float] = None,
    stage_1_tuner_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    cv_trainer_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    export_additional_model_without_custom_ops: bool = False,
    stats_and_example_gen_dataflow_machine_type: Optional[str] = None,
    stats_and_example_gen_dataflow_max_num_workers: Optional[int] = None,
    stats_and_example_gen_dataflow_disk_size_gb: Optional[int] = None,
    transform_dataflow_machine_type: Optional[str] = None,
    transform_dataflow_max_num_workers: Optional[int] = None,
    transform_dataflow_disk_size_gb: Optional[int] = None,
    dataflow_subnetwork: Optional[str] = None,
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: Optional[str] = None,
    additional_experiments: Optional[Dict[str, Any]] = None,
    dataflow_service_account: Optional[str] = None,
    max_selected_features: Optional[int] = None,
    apply_feature_selection_tuning: bool = False,
    run_evaluation: bool = True,
    evaluation_batch_predict_machine_type: Optional[str] = None,
    evaluation_batch_predict_starting_replica_count: Optional[int] = None,
    evaluation_batch_predict_max_replica_count: Optional[int] = None,
    evaluation_batch_explain_machine_type: Optional[str] = None,
    evaluation_batch_explain_starting_replica_count: Optional[int] = None,
    evaluation_batch_explain_max_replica_count: Optional[int] = None,
    evaluation_dataflow_machine_type: Optional[str] = None,
    evaluation_dataflow_starting_num_workers: Optional[int] = None,
    evaluation_dataflow_max_num_workers: Optional[int] = None,
    evaluation_dataflow_disk_size_gb: Optional[int] = None,
    run_distillation: bool = False,
    distill_batch_predict_machine_type: Optional[str] = None,
    distill_batch_predict_starting_replica_count: Optional[int] = None,
    distill_batch_predict_max_replica_count: Optional[int] = None,
    stage_1_tuning_result_artifact_uri: Optional[str] = None,
    quantiles: Optional[List[float]] = None,
    enable_probabilistic_inference: bool = False,
    num_selected_features: Optional[int] = None,
    model_display_name: str = '',
    model_description: str = '',
) -> Dict[str, Any]:
  """Build the parameter values for the AutoML Tabular training pipeline.

  Arguments whose value is None are left out of the result. Two parameter
  layouts are produced, selected by `num_selected_features`:

  * None: V1 pipeline without FTE. `transformations`, the
    stats_and_example_gen / transform Dataflow settings,
    `additional_experiments` and, when enabled,
    `apply_feature_selection_tuning` and the distillation settings are
    emitted under their own names.
  * not None: V2 pipeline with FTE. `transformations` is emitted as
    `legacy_transformations_path`, the transform Dataflow settings are emitted
    as `feature_transform_engine_dataflow_*`, and empty placeholders are added
    for the custom transformation parameters. The stats_and_example_gen
    settings, `additional_experiments` and `apply_feature_selection_tuning`
    are not emitted.

  Args:
    project: The GCP project that runs the pipeline components.
    location: The GCP region that runs the pipeline components.
    root_dir: The root GCS directory for the pipeline components.
    target_column: The target column name.
    prediction_type: The type of prediction the model is to produce.
      "classification" or "regression".
    optimization_objective: For binary classification, "maximize-au-roc",
      "minimize-log-loss", "maximize-au-prc", "maximize-precision-at-recall", or
      "maximize-recall-at-precision". For multi class classification,
      "minimize-log-loss". For regression, "minimize-rmse", "minimize-mae", or
      "minimize-rmsle".
    transformations: The path to a GCS file containing the transformations to
      apply.
    train_budget_milli_node_hours: The train budget of creating this model,
      expressed in milli node hours i.e. 1,000 value in this field means 1 node
      hour.
    stage_1_num_parallel_trials: Number of parallel trials for stage 1.
    stage_2_num_parallel_trials: Number of parallel trials for stage 2.
    stage_2_num_selected_trials: Number of selected trials for stage 2.
    data_source_csv_filenames: The CSV data source.
    data_source_bigquery_table_path: The BigQuery data source.
    predefined_split_key: The predefined_split column name.
    timestamp_split_key: The timestamp_split column name.
    stratified_split_key: The stratified_split column name.
    training_fraction: The training fraction.
    validation_fraction: The validation fraction.
    test_fraction: The test fraction.
    weight_column: The weight column name.
    study_spec_parameters_override: The list for overriding study spec. The list
      should be of format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/study.proto#L181.
    optimization_objective_recall_value: Required when optimization_objective is
      "maximize-precision-at-recall". Must be between 0 and 1, inclusive.
    optimization_objective_precision_value: Required when optimization_objective
      is "maximize-recall-at-precision". Must be between 0 and 1, inclusive.
    stage_1_tuner_worker_pool_specs_override: The dictionary for overriding
      stage 1 tuner worker pool spec. The dictionary should be of format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    cv_trainer_worker_pool_specs_override: The dictionary for overriding stage
      cv trainer worker pool spec. The dictionary should be of format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    export_additional_model_without_custom_ops: Whether to export additional
      model without custom TensorFlow operators.
    stats_and_example_gen_dataflow_machine_type: The dataflow machine type for
      stats_and_example_gen component.
    stats_and_example_gen_dataflow_max_num_workers: The max number of Dataflow
      workers for stats_and_example_gen component.
    stats_and_example_gen_dataflow_disk_size_gb: Dataflow worker's disk size in
      GB for stats_and_example_gen component.
    transform_dataflow_machine_type: The dataflow machine type for transform
      component.
    transform_dataflow_max_num_workers: The max number of Dataflow workers for
      transform component.
    transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for
      transform component.
    dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty
      the default subnetwork will be used. Example:
      https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Specifies whether Dataflow workers use public IP
      addresses.
    encryption_spec_key_name: The KMS key name.
    additional_experiments: Use this field to config private preview features.
    dataflow_service_account: Custom service account to run dataflow jobs.
    max_selected_features: Number of features to select for training.
    apply_feature_selection_tuning: Tuning feature selection rate if true.
    run_evaluation: Whether to run evaluation in the training pipeline. The
      evaluation_* settings are only emitted when this is true.
    evaluation_batch_predict_machine_type: The prediction server machine type
      for batch predict components during evaluation.
    evaluation_batch_predict_starting_replica_count: The initial number of
      prediction server for batch predict components during evaluation.
    evaluation_batch_predict_max_replica_count: The max number of prediction
      server for batch predict components during evaluation.
    evaluation_batch_explain_machine_type: The prediction server machine type
      for batch explain components during evaluation.
    evaluation_batch_explain_starting_replica_count: The initial number of
      prediction server for batch explain components during evaluation.
    evaluation_batch_explain_max_replica_count: The max number of prediction
      server for batch explain components during evaluation.
    evaluation_dataflow_machine_type: The dataflow machine type for evaluation
      components.
    evaluation_dataflow_starting_num_workers: The initial number of Dataflow
      workers for evaluation components.
    evaluation_dataflow_max_num_workers: The max number of Dataflow workers for
      evaluation components.
    evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for
      evaluation components.
    run_distillation: Whether to run distill in the training pipeline. The
      distill_* settings are only emitted when this is true.
    distill_batch_predict_machine_type: The prediction server machine type for
      batch predict component in the model distillation.
    distill_batch_predict_starting_replica_count: The initial number of
      prediction server for batch predict component in the model distillation.
    distill_batch_predict_max_replica_count: The max number of prediction server
      for batch predict component in the model distillation.
    stage_1_tuning_result_artifact_uri: The stage 1 tuning result artifact GCS
      URI.
    quantiles: Quantiles to use for probabilistic inference. Up to 5 quantiles
      are allowed of values between 0 and 1, exclusive. Represents the quantiles
      to use for that objective. Quantiles must be unique.
    enable_probabilistic_inference: If probabilistic inference is enabled, the
      model will fit a distribution that captures the uncertainty of a
      prediction. At inference time, the predictive distribution is used to make
      a point prediction that minimizes the optimization objective. For example,
      the mean of a predictive distribution is the point prediction that
      minimizes RMSE loss. If quantiles are specified, then the quantiles of the
      distribution are also returned.
    num_selected_features: Number of selected features for feature selection,
      defaults to None, in which case all features are used. If specified,
      run_distillation cannot be enabled. enable_probabilistic_inference is
      reportedly unsupported in that mode as well, but that is not validated
      here.
    model_display_name: The display name of the uploaded Vertex model.
    model_description: The description for the uploaded model.

  Returns:
    Dictionary mapping pipeline parameter names to their values, without the
    entries whose value is None.

  Raises:
    ValueError: If run_distillation is set together with num_selected_features.
  """
  # Every candidate parameter is collected first (insertion order matters for
  # the resulting dict) and the None entries are dropped once at the end.
  candidates: Dict[str, Any] = {
      'project': project,
      'location': location,
      'root_dir': root_dir,
      'target_column': target_column,
      'prediction_type': prediction_type,
      'data_source_csv_filenames': data_source_csv_filenames,
      'data_source_bigquery_table_path': data_source_bigquery_table_path,
      'predefined_split_key': predefined_split_key,
      'timestamp_split_key': timestamp_split_key,
      'stratified_split_key': stratified_split_key,
      'training_fraction': training_fraction,
      'validation_fraction': validation_fraction,
      'test_fraction': test_fraction,
      'optimization_objective': optimization_objective,
      'train_budget_milli_node_hours': train_budget_milli_node_hours,
      'stage_1_num_parallel_trials': stage_1_num_parallel_trials,
      'stage_2_num_parallel_trials': stage_2_num_parallel_trials,
      'stage_2_num_selected_trials': stage_2_num_selected_trials,
      'weight_column': weight_column,
      'optimization_objective_recall_value': (
          optimization_objective_recall_value
      ),
      'optimization_objective_precision_value': (
          optimization_objective_precision_value
      ),
      # Missing or empty overrides are always passed on as an empty list.
      # NOTE(review): the two worker pool overrides are annotated as dicts but
      # default to a list here; kept unchanged for compatibility.
      'study_spec_parameters_override': study_spec_parameters_override or [],
      'stage_1_tuner_worker_pool_specs_override': (
          stage_1_tuner_worker_pool_specs_override or []
      ),
      'cv_trainer_worker_pool_specs_override': (
          cv_trainer_worker_pool_specs_override or []
      ),
      'export_additional_model_without_custom_ops': (
          export_additional_model_without_custom_ops
      ),
      'dataflow_subnetwork': dataflow_subnetwork,
      'dataflow_use_public_ips': dataflow_use_public_ips,
      'dataflow_service_account': dataflow_service_account,
      'encryption_spec_key_name': encryption_spec_key_name,
      'max_selected_features': max_selected_features,
      'stage_1_tuning_result_artifact_uri': stage_1_tuning_result_artifact_uri,
      'quantiles': quantiles or [],
      'enable_probabilistic_inference': enable_probabilistic_inference,
      'model_display_name': model_display_name,
      'model_description': model_description,
  }

  if run_evaluation:
    candidates.update({
        'evaluation_batch_predict_machine_type': (
            evaluation_batch_predict_machine_type
        ),
        'evaluation_batch_predict_starting_replica_count': (
            evaluation_batch_predict_starting_replica_count
        ),
        'evaluation_batch_predict_max_replica_count': (
            evaluation_batch_predict_max_replica_count
        ),
        'evaluation_batch_explain_machine_type': (
            evaluation_batch_explain_machine_type
        ),
        'evaluation_batch_explain_starting_replica_count': (
            evaluation_batch_explain_starting_replica_count
        ),
        'evaluation_batch_explain_max_replica_count': (
            evaluation_batch_explain_max_replica_count
        ),
        'evaluation_dataflow_machine_type': evaluation_dataflow_machine_type,
        'evaluation_dataflow_starting_num_workers': (
            evaluation_dataflow_starting_num_workers
        ),
        'evaluation_dataflow_max_num_workers': (
            evaluation_dataflow_max_num_workers
        ),
        'evaluation_dataflow_disk_size_gb': evaluation_dataflow_disk_size_gb,
        'run_evaluation': run_evaluation,
    })

  if num_selected_features is None:
    # V1 pipeline without FTE
    candidates.update({
        'transformations': transformations,
        'stats_and_example_gen_dataflow_machine_type': (
            stats_and_example_gen_dataflow_machine_type
        ),
        'stats_and_example_gen_dataflow_max_num_workers': (
            stats_and_example_gen_dataflow_max_num_workers
        ),
        'stats_and_example_gen_dataflow_disk_size_gb': (
            stats_and_example_gen_dataflow_disk_size_gb
        ),
        'transform_dataflow_machine_type': transform_dataflow_machine_type,
        'transform_dataflow_max_num_workers': (
            transform_dataflow_max_num_workers
        ),
        'transform_dataflow_disk_size_gb': transform_dataflow_disk_size_gb,
        # Always emitted for V1, as an empty dict when nothing was supplied.
        'additional_experiments': additional_experiments or {},
    })
    # The flag is only emitted when it is enabled.
    if apply_feature_selection_tuning:
      candidates['apply_feature_selection_tuning'] = (
          apply_feature_selection_tuning
      )
    if run_distillation:
      candidates.update({
          'distill_batch_predict_machine_type': (
              distill_batch_predict_machine_type
          ),
          'distill_batch_predict_starting_replica_count': (
              distill_batch_predict_starting_replica_count
          ),
          'distill_batch_predict_max_replica_count': (
              distill_batch_predict_max_replica_count
          ),
          'run_distillation': run_distillation,
      })
  else:
    # V2 pipeline (with FTE)
    if run_distillation:
      raise ValueError(
          'Distillation is currently not supported'
          ' when num_selected_features is specified.'
      )
    candidates.update({
        'num_selected_features': num_selected_features,
        'dataset_level_custom_transformation_definitions': [],
        'dataset_level_transformations': [],
        'tf_auto_transform_features': {},
        'tf_custom_transformation_definitions': [],
        'legacy_transformations_path': transformations,
        'feature_transform_engine_dataflow_machine_type': (
            transform_dataflow_machine_type
        ),
        'feature_transform_engine_dataflow_max_num_workers': (
            transform_dataflow_max_num_workers
        ),
        'feature_transform_engine_dataflow_disk_size_gb': (
            transform_dataflow_disk_size_gb
        ),
    })

  return {
      name: value for name, value in candidates.items() if value is not None
  }
def get_automl_tabular_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column: str,
    prediction_type: str,
    optimization_objective: str,
    transformations: str,
    train_budget_milli_node_hours: float,
    stage_1_num_parallel_trials: Optional[int] = None,
    stage_2_num_parallel_trials: Optional[int] = None,
    stage_2_num_selected_trials: Optional[int] = None,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    predefined_split_key: Optional[str] = None,
    timestamp_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    weight_column: Optional[str] = None,
    study_spec_parameters_override: Optional[List[Dict[str, Any]]] = None,
    optimization_objective_recall_value: Optional[float] = None,
    optimization_objective_precision_value: Optional[float] = None,
    stage_1_tuner_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    cv_trainer_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    export_additional_model_without_custom_ops: bool = False,
    stats_and_example_gen_dataflow_machine_type: Optional[str] = None,
    stats_and_example_gen_dataflow_max_num_workers: Optional[int] = None,
    stats_and_example_gen_dataflow_disk_size_gb: Optional[int] = None,
    transform_dataflow_machine_type: Optional[str] = None,
    transform_dataflow_max_num_workers: Optional[int] = None,
    transform_dataflow_disk_size_gb: Optional[int] = None,
    dataflow_subnetwork: Optional[str] = None,
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: Optional[str] = None,
    additional_experiments: Optional[Dict[str, Any]] = None,
    dataflow_service_account: Optional[str] = None,
    run_evaluation: bool = True,
    evaluation_batch_predict_machine_type: Optional[str] = None,
    evaluation_batch_predict_starting_replica_count: Optional[int] = None,
    evaluation_batch_predict_max_replica_count: Optional[int] = None,
    evaluation_batch_explain_machine_type: Optional[str] = None,
    evaluation_batch_explain_starting_replica_count: Optional[int] = None,
    evaluation_batch_explain_max_replica_count: Optional[int] = None,
    evaluation_dataflow_machine_type: Optional[str] = None,
    evaluation_dataflow_starting_num_workers: Optional[int] = None,
    evaluation_dataflow_max_num_workers: Optional[int] = None,
    evaluation_dataflow_disk_size_gb: Optional[int] = None,
    run_distillation: bool = False,
    distill_batch_predict_machine_type: Optional[str] = None,
    distill_batch_predict_starting_replica_count: Optional[int] = None,
    distill_batch_predict_max_replica_count: Optional[int] = None,
    stage_1_tuning_result_artifact_uri: Optional[str] = None,
    quantiles: Optional[List[float]] = None,
    enable_probabilistic_inference: bool = False,
    num_selected_features: Optional[int] = None,
    model_display_name: str = '',
    model_description: str = '',
) -> Tuple[str, Dict[str, Any]]:
  # fmt: off
  """Get the AutoML Tabular default training pipeline and its parameter values.

  The v1 pipeline template (without FTE) is returned when num_selected_features is None; otherwise the v2 pipeline template (with FTE) is returned and `tf_transform_execution_engine` defaults to "dataflow".

  Args:
    project: The GCP project that runs the pipeline components.
    location: The GCP region that runs the pipeline components.
    root_dir: The root GCS directory for the pipeline components.
    target_column: The target column name.
    prediction_type: The type of prediction the model is to produce. "classification" or "regression".
    optimization_objective: For binary classification, "maximize-au-roc", "minimize-log-loss", "maximize-au-prc", "maximize-precision-at-recall", or "maximize-recall-at-precision". For multi class classification, "minimize-log-loss". For regression, "minimize-rmse", "minimize-mae", or "minimize-rmsle".
    transformations: The path to a GCS file containing the transformations to apply.
    train_budget_milli_node_hours: The train budget of creating this model, expressed in milli node hours i.e. 1,000 value in this field means 1 node hour.
    stage_1_num_parallel_trials: Number of parallel trials for stage 1.
    stage_2_num_parallel_trials: Number of parallel trials for stage 2.
    stage_2_num_selected_trials: Number of selected trials for stage 2.
    data_source_csv_filenames: The CSV data source.
    data_source_bigquery_table_path: The BigQuery data source.
    predefined_split_key: The predefined_split column name.
    timestamp_split_key: The timestamp_split column name.
    stratified_split_key: The stratified_split column name.
    training_fraction: The training fraction.
    validation_fraction: The validation fraction.
    test_fraction: The test fraction.
    weight_column: The weight column name.
    study_spec_parameters_override: The list for overriding study spec. The list should be of format: https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/study.proto#L181.
    optimization_objective_recall_value: Required when optimization_objective is "maximize-precision-at-recall". Must be between 0 and 1, inclusive.
    optimization_objective_precision_value: Required when optimization_objective is "maximize-recall-at-precision". Must be between 0 and 1, inclusive.
    stage_1_tuner_worker_pool_specs_override: The dictionary for overriding stage 1 tuner worker pool spec. The dictionary should be of format: https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    cv_trainer_worker_pool_specs_override: The dictionary for overriding stage cv trainer worker pool spec. The dictionary should be of format: https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    export_additional_model_without_custom_ops: Whether to export additional model without custom TensorFlow operators.
    stats_and_example_gen_dataflow_machine_type: The dataflow machine type for stats_and_example_gen component.
    stats_and_example_gen_dataflow_max_num_workers: The max number of Dataflow workers for stats_and_example_gen component.
    stats_and_example_gen_dataflow_disk_size_gb: Dataflow worker's disk size in GB for stats_and_example_gen component.
    transform_dataflow_machine_type: The dataflow machine type for transform component.
    transform_dataflow_max_num_workers: The max number of Dataflow workers for transform component.
    transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for transform component.
    dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used. Example: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Specifies whether Dataflow workers use public IP addresses.
    encryption_spec_key_name: The KMS key name.
    additional_experiments: Use this field to config private preview features.
    dataflow_service_account: Custom service account to run dataflow jobs.
    run_evaluation: Whether to run evaluation in the training pipeline.
    evaluation_batch_predict_machine_type: The prediction server machine type for batch predict components during evaluation.
    evaluation_batch_predict_starting_replica_count: The initial number of prediction server for batch predict components during evaluation.
    evaluation_batch_predict_max_replica_count: The max number of prediction server for batch predict components during evaluation.
    evaluation_batch_explain_machine_type: The prediction server machine type for batch explain components during evaluation.
    evaluation_batch_explain_starting_replica_count: The initial number of prediction server for batch explain components during evaluation.
    evaluation_batch_explain_max_replica_count: The max number of prediction server for batch explain components during evaluation.
    evaluation_dataflow_machine_type: The dataflow machine type for evaluation components.
    evaluation_dataflow_starting_num_workers: The initial number of Dataflow workers for evaluation components.
    evaluation_dataflow_max_num_workers: The max number of Dataflow workers for evaluation components.
    evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for evaluation components.
    run_distillation: Whether to run distill in the training pipeline.
    distill_batch_predict_machine_type: The prediction server machine type for batch predict component in the model distillation.
    distill_batch_predict_starting_replica_count: The initial number of prediction server for batch predict component in the model distillation.
    distill_batch_predict_max_replica_count: The max number of prediction server for batch predict component in the model distillation.
    stage_1_tuning_result_artifact_uri: The stage 1 tuning result artifact GCS URI.
    quantiles: Quantiles to use for probabilistic inference. Up to 5 quantiles are allowed of values between 0 and 1, exclusive. Represents the quantiles to use for that objective. Quantiles must be unique.
    enable_probabilistic_inference: If probabilistic inference is enabled, the model will fit a distribution that captures the uncertainty of a prediction. At inference time, the predictive distribution is used to make a point prediction that minimizes the optimization objective. For example, the mean of a predictive distribution is the point prediction that minimizes RMSE loss. If quantiles are specified, then the quantiles of the distribution are also returned.
    num_selected_features: Number of selected features for feature selection, defaults to None, in which case all features are used.
    model_display_name: The display name of the uploaded Vertex model.
    model_description: The description for the uploaded model.

  Returns:
    Tuple of pipeline_definition_path and parameter_values.
  """
  # fmt: on
  parameter_values = _get_default_pipeline_params(
      project=project,
      location=location,
      root_dir=root_dir,
      target_column=target_column,
      prediction_type=prediction_type,
      optimization_objective=optimization_objective,
      transformations=transformations,
      train_budget_milli_node_hours=train_budget_milli_node_hours,
      stage_1_num_parallel_trials=stage_1_num_parallel_trials,
      stage_2_num_parallel_trials=stage_2_num_parallel_trials,
      stage_2_num_selected_trials=stage_2_num_selected_trials,
      data_source_csv_filenames=data_source_csv_filenames,
      data_source_bigquery_table_path=data_source_bigquery_table_path,
      predefined_split_key=predefined_split_key,
      timestamp_split_key=timestamp_split_key,
      stratified_split_key=stratified_split_key,
      training_fraction=training_fraction,
      validation_fraction=validation_fraction,
      test_fraction=test_fraction,
      weight_column=weight_column,
      study_spec_parameters_override=study_spec_parameters_override,
      optimization_objective_recall_value=optimization_objective_recall_value,
      optimization_objective_precision_value=optimization_objective_precision_value,
      stage_1_tuner_worker_pool_specs_override=stage_1_tuner_worker_pool_specs_override,
      cv_trainer_worker_pool_specs_override=cv_trainer_worker_pool_specs_override,
      export_additional_model_without_custom_ops=export_additional_model_without_custom_ops,
      stats_and_example_gen_dataflow_machine_type=stats_and_example_gen_dataflow_machine_type,
      stats_and_example_gen_dataflow_max_num_workers=stats_and_example_gen_dataflow_max_num_workers,
      stats_and_example_gen_dataflow_disk_size_gb=stats_and_example_gen_dataflow_disk_size_gb,
      transform_dataflow_machine_type=transform_dataflow_machine_type,
      transform_dataflow_max_num_workers=transform_dataflow_max_num_workers,
      transform_dataflow_disk_size_gb=transform_dataflow_disk_size_gb,
      dataflow_subnetwork=dataflow_subnetwork,
      dataflow_use_public_ips=dataflow_use_public_ips,
      encryption_spec_key_name=encryption_spec_key_name,
      additional_experiments=additional_experiments,
      dataflow_service_account=dataflow_service_account,
      run_evaluation=run_evaluation,
      evaluation_batch_predict_machine_type=evaluation_batch_predict_machine_type,
      evaluation_batch_predict_starting_replica_count=evaluation_batch_predict_starting_replica_count,
      evaluation_batch_predict_max_replica_count=evaluation_batch_predict_max_replica_count,
      evaluation_batch_explain_machine_type=evaluation_batch_explain_machine_type,
      evaluation_batch_explain_starting_replica_count=evaluation_batch_explain_starting_replica_count,
      evaluation_batch_explain_max_replica_count=evaluation_batch_explain_max_replica_count,
      evaluation_dataflow_machine_type=evaluation_dataflow_machine_type,
      evaluation_dataflow_starting_num_workers=evaluation_dataflow_starting_num_workers,
      evaluation_dataflow_max_num_workers=evaluation_dataflow_max_num_workers,
      evaluation_dataflow_disk_size_gb=evaluation_dataflow_disk_size_gb,
      run_distillation=run_distillation,
      distill_batch_predict_machine_type=distill_batch_predict_machine_type,
      distill_batch_predict_starting_replica_count=distill_batch_predict_starting_replica_count,
      distill_batch_predict_max_replica_count=distill_batch_predict_max_replica_count,
      stage_1_tuning_result_artifact_uri=stage_1_tuning_result_artifact_uri,
      quantiles=quantiles,
      enable_probabilistic_inference=enable_probabilistic_inference,
      num_selected_features=num_selected_features,
      model_display_name=model_display_name,
      model_description=model_description,
  )

  if num_selected_features is None:
    # V1 pipeline without FTE: the template sits next to this module.
    pipeline_definition_path = os.path.join(
        pathlib.Path(__file__).parent.resolve(), 'automl_tabular_pipeline.yaml'
    )
  else:
    # V2 pipeline with FTE: the template lives in the preview tabular package.
    pipeline_definition_path = os.path.join(
        _GCPC_PREVIEW_TABULAR_PATH,
        'automl_tabular_v2_pipeline.yaml',
    )
    # V2 pipeline requires execution engine to be set; an existing value is
    # left untouched.
    parameter_values.setdefault('tf_transform_execution_engine', 'dataflow')

  return pipeline_definition_path, parameter_values
def input_dictionary_to_parameter(input_dict: Optional[Dict[str, Any]]) -> str:
  """Encode a JSON-style dictionary as an escaped parameter string.

  YAML component definitions have no keyword for quote escaping, so the quotes
  of a JSON argument have to be escaped by hand before it is passed along; this
  function performs that escaping.

  Args:
    input_dict: The input json dictionary. May be None or empty.

  Returns:
    The escaped JSON text of the dictionary, or an empty string when the
    dictionary is None or empty.
  """
  if input_dict:
    json_text = json.dumps(input_dict)
    # Serialising the JSON text a second time escapes its inner quotes and
    # wraps it in one pair of double quotes, which the slice strips again,
    # e.g. "foo" -> foo.
    quoted_text = json.dumps(json_text)
    return quoted_text[1:-1]
  return ''
def get_skip_evaluation_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column_name: str,
    prediction_type: str,
    optimization_objective: str,
    transformations: Dict[str, Any],
    split_spec: Dict[str, Any],
    data_source: Dict[str, Any],
    train_budget_milli_node_hours: float,
    stage_1_num_parallel_trials: int = _DEFAULT_NUM_PARALLEL_TRAILS,
    stage_2_num_parallel_trials: int = _DEFAULT_NUM_PARALLEL_TRAILS,
    stage_2_num_selected_trials: int = _DEFAULT_STAGE_2_NUM_SELECTED_TRAILS,
    weight_column_name: str = '',
    study_spec_override: Optional[Dict[str, Any]] = None,
    optimization_objective_recall_value: float = -1,
    optimization_objective_precision_value: float = -1,
    stage_1_tuner_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    cv_trainer_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    export_additional_model_without_custom_ops: bool = False,
    stats_and_example_gen_dataflow_machine_type: str = 'n1-standard-16',
    stats_and_example_gen_dataflow_max_num_workers: int = 25,
    stats_and_example_gen_dataflow_disk_size_gb: int = 40,
    transform_dataflow_machine_type: str = 'n1-standard-16',
    transform_dataflow_max_num_workers: int = 25,
    transform_dataflow_disk_size_gb: int = 40,
    dataflow_subnetwork: str = '',
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: str = '',
    additional_experiments: Optional[Dict[str, Any]] = None,
) -> Tuple[str, Dict[str, Any]]:
  """Build the AutoML Tabular training pipeline with evaluation turned off.

  Thin wrapper around get_default_pipeline_and_parameters that forwards every
  argument unchanged and pins run_evaluation and run_distillation to False.

  Args:
    project: GCP project in which the pipeline components run.
    location: GCP region in which the pipeline components run.
    root_dir: Root GCS directory used by the pipeline components.
    target_column_name: Name of the target column.
    prediction_type: Kind of prediction the model produces, either
      "classification" or "regression".
    optimization_objective: Objective to optimize. Binary classification
      accepts "maximize-au-roc", "minimize-log-loss", "maximize-au-prc",
      "maximize-precision-at-recall", or "maximize-recall-at-precision".
      Multi class classification accepts "minimize-log-loss". Regression
      accepts "minimize-rmse", "minimize-mae", or "minimize-rmsle".
    transformations: The transformations to apply.
    split_spec: The split spec.
    data_source: The data source.
    train_budget_milli_node_hours: Training budget for creating the model, in
      milli node hours; a value of 1,000 means 1 node hour.
    stage_1_num_parallel_trials: Number of parallel trials for stage 1.
    stage_2_num_parallel_trials: Number of parallel trials for stage 2.
    stage_2_num_selected_trials: Number of selected trials for stage 2.
    weight_column_name: Name of the weight column.
    study_spec_override: Dictionary overriding the study spec, in the format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/study.proto#L181.
    optimization_objective_recall_value: Required when optimization_objective
      is "maximize-precision-at-recall". Must be between 0 and 1, inclusive.
    optimization_objective_precision_value: Required when
      optimization_objective is "maximize-recall-at-precision". Must be
      between 0 and 1, inclusive.
    stage_1_tuner_worker_pool_specs_override: Dictionary overriding the stage 1
      tuner worker pool spec, in the format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    cv_trainer_worker_pool_specs_override: Dictionary overriding the stage cv
      trainer worker pool spec, in the format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    export_additional_model_without_custom_ops: Whether to export an additional
      model without custom TensorFlow operators.
    stats_and_example_gen_dataflow_machine_type: Dataflow machine type for the
      stats_and_example_gen component.
    stats_and_example_gen_dataflow_max_num_workers: Max number of Dataflow
      workers for the stats_and_example_gen component.
    stats_and_example_gen_dataflow_disk_size_gb: Dataflow worker disk size in
      GB for the stats_and_example_gen component.
    transform_dataflow_machine_type: Dataflow machine type for the transform
      component.
    transform_dataflow_max_num_workers: Max number of Dataflow workers for the
      transform component.
    transform_dataflow_disk_size_gb: Dataflow worker disk size in GB for the
      transform component.
    dataflow_subnetwork: Fully qualified Dataflow subnetwork name; when empty
      the default subnetwork is used. Example:
      https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Whether Dataflow workers use public IP addresses.
    encryption_spec_key_name: The KMS key name.
    additional_experiments: Configuration for private preview features.

  Returns:
    Tuple of pipeline_definition_path and parameter_values.
  """
  return get_default_pipeline_and_parameters(
      # Pinned by this variant: neither evaluation nor distillation runs.
      run_evaluation=False,
      run_distillation=False,
      # Project and storage.
      project=project,
      location=location,
      root_dir=root_dir,
      encryption_spec_key_name=encryption_spec_key_name,
      # Problem definition and data.
      target_column_name=target_column_name,
      weight_column_name=weight_column_name,
      prediction_type=prediction_type,
      optimization_objective=optimization_objective,
      optimization_objective_recall_value=optimization_objective_recall_value,
      optimization_objective_precision_value=optimization_objective_precision_value,
      transformations=transformations,
      split_spec=split_spec,
      data_source=data_source,
      # Budget and trial counts.
      train_budget_milli_node_hours=train_budget_milli_node_hours,
      stage_1_num_parallel_trials=stage_1_num_parallel_trials,
      stage_2_num_parallel_trials=stage_2_num_parallel_trials,
      stage_2_num_selected_trials=stage_2_num_selected_trials,
      # Training overrides and export options.
      study_spec_override=study_spec_override,
      stage_1_tuner_worker_pool_specs_override=stage_1_tuner_worker_pool_specs_override,
      cv_trainer_worker_pool_specs_override=cv_trainer_worker_pool_specs_override,
      export_additional_model_without_custom_ops=export_additional_model_without_custom_ops,
      additional_experiments=additional_experiments,
      # Dataflow settings.
      stats_and_example_gen_dataflow_machine_type=stats_and_example_gen_dataflow_machine_type,
      stats_and_example_gen_dataflow_max_num_workers=stats_and_example_gen_dataflow_max_num_workers,
      stats_and_example_gen_dataflow_disk_size_gb=stats_and_example_gen_dataflow_disk_size_gb,
      transform_dataflow_machine_type=transform_dataflow_machine_type,
      transform_dataflow_max_num_workers=transform_dataflow_max_num_workers,
      transform_dataflow_disk_size_gb=transform_dataflow_disk_size_gb,
      dataflow_subnetwork=dataflow_subnetwork,
      dataflow_use_public_ips=dataflow_use_public_ips,
  )
def get_default_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column_name: str,
    prediction_type: str,
    optimization_objective: str,
    transformations: Dict[str, Any],
    split_spec: Dict[str, Any],
    data_source: Dict[str, Any],
    train_budget_milli_node_hours: float,
    stage_1_num_parallel_trials: int = _DEFAULT_NUM_PARALLEL_TRAILS,
    stage_2_num_parallel_trials: int = _DEFAULT_NUM_PARALLEL_TRAILS,
    stage_2_num_selected_trials: int = _DEFAULT_STAGE_2_NUM_SELECTED_TRAILS,
    weight_column_name: str = '',
    study_spec_override: Optional[Dict[str, Any]] = None,
    optimization_objective_recall_value: float = -1,
    optimization_objective_precision_value: float = -1,
    stage_1_tuner_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    cv_trainer_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    export_additional_model_without_custom_ops: bool = False,
    stats_and_example_gen_dataflow_machine_type: str = 'n1-standard-16',
    stats_and_example_gen_dataflow_max_num_workers: int = 25,
    stats_and_example_gen_dataflow_disk_size_gb: int = 40,
    transform_dataflow_machine_type: str = 'n1-standard-16',
    transform_dataflow_max_num_workers: int = 25,
    transform_dataflow_disk_size_gb: int = 40,
    dataflow_subnetwork: str = '',
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: str = '',
    additional_experiments: Optional[Dict[str, Any]] = None,
    dataflow_service_account: str = '',
    run_evaluation: bool = True,
    evaluation_batch_predict_machine_type: str = _EVALUATION_BATCH_PREDICT_MACHINE_TYPE,
    evaluation_batch_predict_starting_replica_count: int = _EVALUATION_BATCH_PREDICT_STARTING_REPLICA_COUNT,
    evaluation_batch_predict_max_replica_count: int = _EVALUATION_BATCH_PREDICT_MAX_REPLICA_COUNT,
    evaluation_dataflow_machine_type: str = _EVALUATION_DATAFLOW_MACHINE_TYPE,
    evaluation_dataflow_max_num_workers: int = _EVALUATION_DATAFLOW_MAX_NUM_WORKERS,
    evaluation_dataflow_disk_size_gb: int = _EVALUATION_DATAFLOW_DISK_SIZE_GB,
    run_distillation: bool = False,
    distill_batch_predict_machine_type: str = 'n1-standard-16',
    distill_batch_predict_starting_replica_count: int = 25,
    distill_batch_predict_max_replica_count: int = 25,
) -> Tuple[str, Dict[str, Any]]:
  """Build the (deprecated) default AutoML Tabular training pipeline.

  Emits a warning pointing callers at
  get_automl_tabular_pipeline_and_parameters, derives the stage 1 / stage 2
  deadlines and per-trial time limits from the training budget and the trial
  counts, and assembles the parameter dictionary for the pipeline definition.

  Args:
    project: GCP project in which the pipeline components run.
    location: GCP region in which the pipeline components run.
    root_dir: Root GCS directory used by the pipeline components.
    target_column_name: Name of the target column.
    prediction_type: Kind of prediction the model produces, either
      "classification" or "regression".
    optimization_objective: Objective to optimize. Binary classification
      accepts "maximize-au-roc", "minimize-log-loss", "maximize-au-prc",
      "maximize-precision-at-recall", or "maximize-recall-at-precision".
      Multi class classification accepts "minimize-log-loss". Regression
      accepts "minimize-rmse", "minimize-mae", or "minimize-rmsle".
    transformations: The transformations to apply.
    split_spec: The split spec.
    data_source: The data source.
    train_budget_milli_node_hours: Training budget for creating the model, in
      milli node hours; a value of 1,000 means 1 node hour.
    stage_1_num_parallel_trials: Number of parallel trials for stage 1. A
      non-positive value is replaced by the module default.
    stage_2_num_parallel_trials: Number of parallel trials for stage 2. A
      non-positive value is replaced by the module default.
    stage_2_num_selected_trials: Number of selected trials for stage 2.
    weight_column_name: Name of the weight column.
    study_spec_override: Dictionary overriding the study spec, in the format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/study.proto#L181.
    optimization_objective_recall_value: Required when optimization_objective
      is "maximize-precision-at-recall". Must be between 0 and 1, inclusive.
    optimization_objective_precision_value: Required when
      optimization_objective is "maximize-recall-at-precision". Must be
      between 0 and 1, inclusive.
    stage_1_tuner_worker_pool_specs_override: Dictionary overriding the stage 1
      tuner worker pool spec, in the format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    cv_trainer_worker_pool_specs_override: Dictionary overriding the stage cv
      trainer worker pool spec, in the format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    export_additional_model_without_custom_ops: Whether to export an additional
      model without custom TensorFlow operators.
    stats_and_example_gen_dataflow_machine_type: Dataflow machine type for the
      stats_and_example_gen component.
    stats_and_example_gen_dataflow_max_num_workers: Max number of Dataflow
      workers for the stats_and_example_gen component.
    stats_and_example_gen_dataflow_disk_size_gb: Dataflow worker disk size in
      GB for the stats_and_example_gen component.
    transform_dataflow_machine_type: Dataflow machine type for the transform
      component.
    transform_dataflow_max_num_workers: Max number of Dataflow workers for the
      transform component.
    transform_dataflow_disk_size_gb: Dataflow worker disk size in GB for the
      transform component.
    dataflow_subnetwork: Fully qualified Dataflow subnetwork name; when empty
      the default subnetwork is used. Example:
      https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Whether Dataflow workers use public IP addresses.
    encryption_spec_key_name: The KMS key name.
    additional_experiments: Configuration for private preview features. Only
      added to the parameters when non-empty.
    dataflow_service_account: Custom service account to run dataflow jobs.
      Only added to the parameters when run_evaluation is true.
    run_evaluation: Whether to run evaluation in the training pipeline. The
      evaluation_* settings are only added to the parameters when true.
    evaluation_batch_predict_machine_type: Prediction server machine type for
      batch predict components during evaluation.
    evaluation_batch_predict_starting_replica_count: Initial number of
      prediction servers for batch predict components during evaluation.
    evaluation_batch_predict_max_replica_count: Max number of prediction
      servers for batch predict components during evaluation.
    evaluation_dataflow_machine_type: Dataflow machine type for evaluation
      components.
    evaluation_dataflow_max_num_workers: Max number of Dataflow workers for
      evaluation components.
    evaluation_dataflow_disk_size_gb: Dataflow worker disk size in GB for
      evaluation components.
    run_distillation: Whether to run distillation in the training pipeline.
      The distill_* settings are only added to the parameters when true.
    distill_batch_predict_machine_type: Prediction server machine type for the
      batch predict component in the model distillation.
    distill_batch_predict_starting_replica_count: Initial number of prediction
      servers for the batch predict component in the model distillation.
    distill_batch_predict_max_replica_count: Max number of prediction servers
      for the batch predict component in the model distillation.

  Returns:
    Tuple of pipeline_definition_path and parameter_values.
  """
  warnings.warn(
      'This method is deprecated,'
      ' please use get_automl_tabular_pipeline_and_parameters instead.'
  )

  # Non-positive trial counts fall back to the default parallelism.
  stage_1_num_parallel_trials = (
      _DEFAULT_NUM_PARALLEL_TRAILS
      if stage_1_num_parallel_trials <= 0
      else stage_1_num_parallel_trials
  )
  stage_2_num_parallel_trials = (
      _DEFAULT_NUM_PARALLEL_TRAILS
      if stage_2_num_parallel_trials <= 0
      else stage_2_num_parallel_trials
  )

  # A trial doesn't always finish within its time limit; 1.3 is an empirical
  # safety margin used wherever per-trial time is converted into a deadline.
  safety_margin = 1.3

  budget_hours = float(train_budget_milli_node_hours) / 1000.0
  budget_secs = budget_hours * 3600.0
  multiplier = stage_1_num_parallel_trials * budget_hours / 500.0
  multiplier_root = math.sqrt(multiplier)

  single_run_max_secs = int(multiplier_root * 2400.0)
  # Rounded to the nearest integer, but never fewer than one round.
  phase_2_rounds = max(
      1, int(multiplier_root * 100 / stage_2_num_parallel_trials + 0.5)
  )

  stage_1_deadline_secs = int(
      budget_secs - safety_margin * single_run_max_secs * phase_2_rounds
  )
  half_budget_secs = budget_secs * 0.5
  if stage_1_deadline_secs < half_budget_secs:
    # Stage 1 is given at least half of the budget. Phase 1 deadline is then
    # the same as phase 2 deadline, and phase 2 can't finish in time after the
    # cut, so shrink the time per trial to meet the deadline.
    stage_1_deadline_secs = int(half_budget_secs)
    single_run_max_secs = int(
        stage_1_deadline_secs / (safety_margin * phase_2_rounds)
    )

  if multiplier > 4:
    reduce_search_space_mode = 'full'
  elif multiplier > 2:
    reduce_search_space_mode = 'regular'
  else:
    reduce_search_space_mode = 'minimal'

  # The number of stage 2 trials is stage_1_num_selected_trials * _NUM_FOLDS,
  # which should equal phase_2_rounds * stage_2_num_parallel_trials; solve
  # that relation for stage_1_num_selected_trials.
  stage_1_num_selected_trials = int(
      phase_2_rounds * stage_2_num_parallel_trials / _NUM_FOLDS
  )

  stage_1_deadline_hours = stage_1_deadline_secs / 3600.0
  stage_2_deadline_hours = budget_hours - stage_1_deadline_hours

  parameter_values = {
      'project': project,
      'location': location,
      'root_dir': root_dir,
      'target_column_name': target_column_name,
      'prediction_type': prediction_type,
      'optimization_objective': optimization_objective,
      'transformations': input_dictionary_to_parameter(transformations),
      'split_spec': input_dictionary_to_parameter(split_spec),
      'data_source': input_dictionary_to_parameter(data_source),
      'stage_1_deadline_hours': stage_1_deadline_hours,
      'stage_1_num_parallel_trials': stage_1_num_parallel_trials,
      'stage_1_num_selected_trials': stage_1_num_selected_trials,
      'stage_1_single_run_max_secs': single_run_max_secs,
      'reduce_search_space_mode': reduce_search_space_mode,
      'stage_2_deadline_hours': stage_2_deadline_hours,
      'stage_2_num_parallel_trials': stage_2_num_parallel_trials,
      'stage_2_num_selected_trials': stage_2_num_selected_trials,
      # Stage 2 trials get the same per-trial time limit as stage 1.
      'stage_2_single_run_max_secs': single_run_max_secs,
      'weight_column_name': weight_column_name,
      'optimization_objective_recall_value': optimization_objective_recall_value,
      'optimization_objective_precision_value': optimization_objective_precision_value,
      'study_spec_override': input_dictionary_to_parameter(study_spec_override),
      'stage_1_tuner_worker_pool_specs_override': input_dictionary_to_parameter(
          stage_1_tuner_worker_pool_specs_override
      ),
      'cv_trainer_worker_pool_specs_override': input_dictionary_to_parameter(
          cv_trainer_worker_pool_specs_override
      ),
      'export_additional_model_without_custom_ops': export_additional_model_without_custom_ops,
      'stats_and_example_gen_dataflow_machine_type': stats_and_example_gen_dataflow_machine_type,
      'stats_and_example_gen_dataflow_max_num_workers': stats_and_example_gen_dataflow_max_num_workers,
      'stats_and_example_gen_dataflow_disk_size_gb': stats_and_example_gen_dataflow_disk_size_gb,
      'transform_dataflow_machine_type': transform_dataflow_machine_type,
      'transform_dataflow_max_num_workers': transform_dataflow_max_num_workers,
      'transform_dataflow_disk_size_gb': transform_dataflow_disk_size_gb,
      'dataflow_subnetwork': dataflow_subnetwork,
      'dataflow_use_public_ips': dataflow_use_public_ips,
      'encryption_spec_key_name': encryption_spec_key_name,
  }

  if additional_experiments:
    parameter_values['additional_experiments'] = input_dictionary_to_parameter(
        additional_experiments
    )

  if run_evaluation:
    parameter_values.update({
        'dataflow_service_account': dataflow_service_account,
        'evaluation_batch_predict_machine_type': evaluation_batch_predict_machine_type,
        'evaluation_batch_predict_starting_replica_count': evaluation_batch_predict_starting_replica_count,
        'evaluation_batch_predict_max_replica_count': evaluation_batch_predict_max_replica_count,
        'evaluation_dataflow_machine_type': evaluation_dataflow_machine_type,
        'evaluation_dataflow_max_num_workers': evaluation_dataflow_max_num_workers,
        'evaluation_dataflow_disk_size_gb': evaluation_dataflow_disk_size_gb,
        'run_evaluation': run_evaluation,
    })

  if run_distillation:
    # Enough stage-1-sized rounds to cover _DISTILL_TOTAL_TRIALS trials, each
    # padded by the safety margin, converted from seconds to hours.
    distill_stage_1_deadline_hours = (
        math.ceil(float(_DISTILL_TOTAL_TRIALS) / stage_1_num_parallel_trials)
        * single_run_max_secs
        * safety_margin
        / 3600.0
    )
    parameter_values.update({
        'distill_stage_1_deadline_hours': distill_stage_1_deadline_hours,
        'distill_batch_predict_machine_type': distill_batch_predict_machine_type,
        'distill_batch_predict_starting_replica_count': distill_batch_predict_starting_replica_count,
        'distill_batch_predict_max_replica_count': distill_batch_predict_max_replica_count,
        'run_distillation': run_distillation,
    })

  module_dir = pathlib.Path(__file__).parent.resolve()
  pipeline_definition_path = os.path.join(
      module_dir, 'deprecated/default_pipeline.json'
  )
  return pipeline_definition_path, parameter_values
def get_skip_architecture_search_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column: str,
    prediction_type: str,
    optimization_objective: str,
    transformations: str,
    train_budget_milli_node_hours: float,
    stage_1_tuning_result_artifact_uri: str,
    stage_2_num_parallel_trials: Optional[int] = None,
    stage_2_num_selected_trials: Optional[int] = None,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    predefined_split_key: Optional[str] = None,
    timestamp_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    weight_column: Optional[str] = None,
    optimization_objective_recall_value: Optional[float] = None,
    optimization_objective_precision_value: Optional[float] = None,
    cv_trainer_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    export_additional_model_without_custom_ops: bool = False,
    stats_and_example_gen_dataflow_machine_type: Optional[str] = None,
    stats_and_example_gen_dataflow_max_num_workers: Optional[int] = None,
    stats_and_example_gen_dataflow_disk_size_gb: Optional[int] = None,
    transform_dataflow_machine_type: Optional[str] = None,
    transform_dataflow_max_num_workers: Optional[int] = None,
    transform_dataflow_disk_size_gb: Optional[int] = None,
    dataflow_subnetwork: Optional[str] = None,
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: Optional[str] = None,
    additional_experiments: Optional[Dict[str, Any]] = None,
    dataflow_service_account: Optional[str] = None,
    run_evaluation: bool = True,
    evaluation_batch_predict_machine_type: Optional[str] = None,
    evaluation_batch_predict_starting_replica_count: Optional[int] = None,
    evaluation_batch_predict_max_replica_count: Optional[int] = None,
    evaluation_batch_explain_machine_type: Optional[str] = None,
    evaluation_batch_explain_starting_replica_count: Optional[int] = None,
    evaluation_batch_explain_max_replica_count: Optional[int] = None,
    evaluation_dataflow_machine_type: Optional[str] = None,
    evaluation_dataflow_starting_num_workers: Optional[int] = None,
    evaluation_dataflow_max_num_workers: Optional[int] = None,
    evaluation_dataflow_disk_size_gb: Optional[int] = None,
) -> Tuple[str, Dict[str, Any]]:
  """Build the AutoML Tabular training pipeline without architecture search.

  Thin wrapper around get_automl_tabular_pipeline_and_parameters. The stage 1
  tuning result is supplied through stage_1_tuning_result_artifact_uri, while
  the stage 1 tuning, distillation, quantile and probabilistic inference
  arguments are passed as fixed empty/None/False values.

  Args:
    project: GCP project in which the pipeline components run.
    location: GCP region in which the pipeline components run.
    root_dir: Root GCS directory used by the pipeline components.
    target_column: Name of the target column.
    prediction_type: Kind of prediction the model produces, either
      "classification" or "regression".
    optimization_objective: Objective to optimize. Binary classification
      accepts "maximize-au-roc", "minimize-log-loss", "maximize-au-prc",
      "maximize-precision-at-recall", or "maximize-recall-at-precision".
      Multi class classification accepts "minimize-log-loss". Regression
      accepts "minimize-rmse", "minimize-mae", or "minimize-rmsle".
    transformations: The transformations to apply.
    train_budget_milli_node_hours: Training budget for creating the model, in
      milli node hours; a value of 1,000 means 1 node hour.
    stage_1_tuning_result_artifact_uri: GCS URI of the stage 1 tuning result
      artifact.
    stage_2_num_parallel_trials: Number of parallel trials for stage 2.
    stage_2_num_selected_trials: Number of selected trials for stage 2.
    data_source_csv_filenames: The CSV data source.
    data_source_bigquery_table_path: The BigQuery data source.
    predefined_split_key: The predefined_split column name.
    timestamp_split_key: The timestamp_split column name.
    stratified_split_key: The stratified_split column name.
    training_fraction: The training fraction.
    validation_fraction: The validation fraction.
    test_fraction: The test fraction.
    weight_column: Name of the weight column.
    optimization_objective_recall_value: Required when optimization_objective
      is "maximize-precision-at-recall". Must be between 0 and 1, inclusive.
    optimization_objective_precision_value: Required when
      optimization_objective is "maximize-recall-at-precision". Must be
      between 0 and 1, inclusive.
    cv_trainer_worker_pool_specs_override: Dictionary overriding the stage cv
      trainer worker pool spec, in the format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    export_additional_model_without_custom_ops: Whether to export an additional
      model without custom TensorFlow operators.
    stats_and_example_gen_dataflow_machine_type: Dataflow machine type for the
      stats_and_example_gen component.
    stats_and_example_gen_dataflow_max_num_workers: Max number of Dataflow
      workers for the stats_and_example_gen component.
    stats_and_example_gen_dataflow_disk_size_gb: Dataflow worker disk size in
      GB for the stats_and_example_gen component.
    transform_dataflow_machine_type: Dataflow machine type for the transform
      component.
    transform_dataflow_max_num_workers: Max number of Dataflow workers for the
      transform component.
    transform_dataflow_disk_size_gb: Dataflow worker disk size in GB for the
      transform component.
    dataflow_subnetwork: Fully qualified Dataflow subnetwork name; when empty
      the default subnetwork is used. Example:
      https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Whether Dataflow workers use public IP addresses.
    encryption_spec_key_name: The KMS key name.
    additional_experiments: Configuration for private preview features.
    dataflow_service_account: Custom service account to run dataflow jobs.
    run_evaluation: Whether to run evaluation in the training pipeline.
    evaluation_batch_predict_machine_type: Prediction server machine type for
      batch predict components during evaluation.
    evaluation_batch_predict_starting_replica_count: Initial number of
      prediction servers for batch predict components during evaluation.
    evaluation_batch_predict_max_replica_count: Max number of prediction
      servers for batch predict components during evaluation.
    evaluation_batch_explain_machine_type: Prediction server machine type for
      batch explain components during evaluation.
    evaluation_batch_explain_starting_replica_count: Initial number of
      prediction servers for batch explain components during evaluation.
    evaluation_batch_explain_max_replica_count: Max number of prediction
      servers for batch explain components during evaluation.
    evaluation_dataflow_machine_type: Dataflow machine type for evaluation
      components.
    evaluation_dataflow_starting_num_workers: Initial number of Dataflow
      workers for evaluation components.
    evaluation_dataflow_max_num_workers: Max number of Dataflow workers for
      evaluation components.
    evaluation_dataflow_disk_size_gb: Dataflow worker disk size in GB for
      evaluation components.

  Returns:
    Tuple of pipeline_definition_path and parameter_values.
  """
  return get_automl_tabular_pipeline_and_parameters(  # pytype: disable=wrong-arg-types
      # Fixed by this variant: stage 1 results come from the given artifact,
      # and the tuning, distillation and quantile inputs are left empty.
      stage_1_tuning_result_artifact_uri=stage_1_tuning_result_artifact_uri,
      stage_1_num_parallel_trials=None,
      study_spec_parameters_override=[],
      stage_1_tuner_worker_pool_specs_override={},
      run_distillation=None,
      distill_batch_predict_machine_type=None,
      distill_batch_predict_starting_replica_count=None,
      distill_batch_predict_max_replica_count=None,
      quantiles=[],
      enable_probabilistic_inference=False,
      # Project and storage.
      project=project,
      location=location,
      root_dir=root_dir,
      encryption_spec_key_name=encryption_spec_key_name,
      # Problem definition.
      target_column=target_column,
      weight_column=weight_column,
      prediction_type=prediction_type,
      optimization_objective=optimization_objective,
      optimization_objective_recall_value=optimization_objective_recall_value,
      optimization_objective_precision_value=optimization_objective_precision_value,
      transformations=transformations,
      # Data source and splits.
      data_source_csv_filenames=data_source_csv_filenames,
      data_source_bigquery_table_path=data_source_bigquery_table_path,
      predefined_split_key=predefined_split_key,
      timestamp_split_key=timestamp_split_key,
      stratified_split_key=stratified_split_key,
      training_fraction=training_fraction,
      validation_fraction=validation_fraction,
      test_fraction=test_fraction,
      # Budget and stage 2 trial counts.
      train_budget_milli_node_hours=train_budget_milli_node_hours,
      stage_2_num_parallel_trials=stage_2_num_parallel_trials,
      stage_2_num_selected_trials=stage_2_num_selected_trials,
      # Training overrides and export options.
      cv_trainer_worker_pool_specs_override=cv_trainer_worker_pool_specs_override,
      export_additional_model_without_custom_ops=export_additional_model_without_custom_ops,
      additional_experiments=additional_experiments,
      # Dataflow settings.
      stats_and_example_gen_dataflow_machine_type=stats_and_example_gen_dataflow_machine_type,
      stats_and_example_gen_dataflow_max_num_workers=stats_and_example_gen_dataflow_max_num_workers,
      stats_and_example_gen_dataflow_disk_size_gb=stats_and_example_gen_dataflow_disk_size_gb,
      transform_dataflow_machine_type=transform_dataflow_machine_type,
      transform_dataflow_max_num_workers=transform_dataflow_max_num_workers,
      transform_dataflow_disk_size_gb=transform_dataflow_disk_size_gb,
      dataflow_subnetwork=dataflow_subnetwork,
      dataflow_use_public_ips=dataflow_use_public_ips,
      dataflow_service_account=dataflow_service_account,
      # Evaluation settings.
      run_evaluation=run_evaluation,
      evaluation_batch_predict_machine_type=evaluation_batch_predict_machine_type,
      evaluation_batch_predict_starting_replica_count=evaluation_batch_predict_starting_replica_count,
      evaluation_batch_predict_max_replica_count=evaluation_batch_predict_max_replica_count,
      evaluation_batch_explain_machine_type=evaluation_batch_explain_machine_type,
      evaluation_batch_explain_starting_replica_count=evaluation_batch_explain_starting_replica_count,
      evaluation_batch_explain_max_replica_count=evaluation_batch_explain_max_replica_count,
      evaluation_dataflow_machine_type=evaluation_dataflow_machine_type,
      evaluation_dataflow_starting_num_workers=evaluation_dataflow_starting_num_workers,
      evaluation_dataflow_max_num_workers=evaluation_dataflow_max_num_workers,
      evaluation_dataflow_disk_size_gb=evaluation_dataflow_disk_size_gb,
  )
def get_distill_skip_evaluation_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column_name: str,
    prediction_type: str,
    optimization_objective: str,
    transformations: Dict[str, Any],
    split_spec: Dict[str, Any],
    data_source: Dict[str, Any],
    train_budget_milli_node_hours: float,
    stage_1_num_parallel_trials: int = _DEFAULT_NUM_PARALLEL_TRAILS,
    stage_2_num_parallel_trials: int = _DEFAULT_NUM_PARALLEL_TRAILS,
    stage_2_num_selected_trials: int = _DEFAULT_STAGE_2_NUM_SELECTED_TRAILS,
    weight_column_name: str = '',
    study_spec_override: Optional[Dict[str, Any]] = None,
    optimization_objective_recall_value: float = -1,
    optimization_objective_precision_value: float = -1,
    stage_1_tuner_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    cv_trainer_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    export_additional_model_without_custom_ops: bool = False,
    stats_and_example_gen_dataflow_machine_type: str = 'n1-standard-16',
    stats_and_example_gen_dataflow_max_num_workers: int = 25,
    stats_and_example_gen_dataflow_disk_size_gb: int = 40,
    transform_dataflow_machine_type: str = 'n1-standard-16',
    transform_dataflow_max_num_workers: int = 25,
    transform_dataflow_disk_size_gb: int = 40,
    dataflow_subnetwork: str = '',
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: str = '',
    additional_experiments: Optional[Dict[str, Any]] = None,
    distill_batch_predict_machine_type: str = 'n1-standard-16',
    distill_batch_predict_starting_replica_count: int = 25,
    distill_batch_predict_max_replica_count: int = 25,
) -> Tuple[str, Dict[str, Any]]:
  """Get the AutoML Tabular training pipeline that distills and skips evaluation.

  Deprecated: emits a warning pointing to
  get_automl_tabular_pipeline_and_parameters, then delegates to
  get_default_pipeline_and_parameters with run_evaluation=False and
  run_distillation=True. All other arguments are forwarded unchanged.

  Args:
    project: The GCP project that runs the pipeline components.
    location: The GCP region that runs the pipeline components.
    root_dir: The root GCS directory for the pipeline components.
    target_column_name: The target column name.
    prediction_type: The type of prediction the model is to produce.
      "classification" or "regression".
    optimization_objective: For binary classification, "maximize-au-roc",
      "minimize-log-loss", "maximize-au-prc", "maximize-precision-at-recall", or
      "maximize-recall-at-precision". For multi class classification,
      "minimize-log-loss". For regression, "minimize-rmse", "minimize-mae", or
      "minimize-rmsle".
    transformations: The transformations to apply.
    split_spec: The split spec.
    data_source: The data source.
    train_budget_milli_node_hours: The train budget of creating this model,
      expressed in milli node hours i.e. 1,000 value in this field means 1 node
      hour.
    stage_1_num_parallel_trials: Number of parallel trials for stage 1.
    stage_2_num_parallel_trials: Number of parallel trials for stage 2.
    stage_2_num_selected_trials: Number of selected trials for stage 2.
    weight_column_name: The weight column name.
    study_spec_override: The dictionary for overriding study spec. The
      dictionary should be of format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/study.proto#L181.
    optimization_objective_recall_value: Required when optimization_objective is
      "maximize-precision-at-recall". Must be between 0 and 1, inclusive.
    optimization_objective_precision_value: Required when optimization_objective
      is "maximize-recall-at-precision". Must be between 0 and 1, inclusive.
    stage_1_tuner_worker_pool_specs_override: The dictionary for overriding
      stage 1 tuner worker pool spec. The dictionary should be of format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    cv_trainer_worker_pool_specs_override: The dictionary for overriding stage
      cv trainer worker pool spec. The dictionary should be of format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    export_additional_model_without_custom_ops: Whether to export additional
      model without custom TensorFlow operators.
    stats_and_example_gen_dataflow_machine_type: The dataflow machine type for
      stats_and_example_gen component.
    stats_and_example_gen_dataflow_max_num_workers: The max number of Dataflow
      workers for stats_and_example_gen component.
    stats_and_example_gen_dataflow_disk_size_gb: Dataflow worker's disk size in
      GB for stats_and_example_gen component.
    transform_dataflow_machine_type: The dataflow machine type for transform
      component.
    transform_dataflow_max_num_workers: The max number of Dataflow workers for
      transform component.
    transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for
      transform component.
    dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty
      the default subnetwork will be used. Example:
      https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Specifies whether Dataflow workers use public IP
      addresses.
    encryption_spec_key_name: The KMS key name.
    additional_experiments: Use this field to config private preview features.
    distill_batch_predict_machine_type: The prediction server machine type for
      batch predict component in the model distillation.
    distill_batch_predict_starting_replica_count: The initial number of
      prediction server for batch predict component in the model distillation.
    distill_batch_predict_max_replica_count: The max number of prediction server
      for batch predict component in the model distillation.

  Returns:
    Tuple of pipeline_definition_path and parameter_values.
  """
  # stacklevel=2 attributes the warning to the caller of this deprecated
  # helper (the code that needs to migrate) rather than to this module. The
  # default warning category is kept so the message stays visible under the
  # default warning filters.
  warnings.warn(
      'Deprecated. Please use get_automl_tabular_pipeline_and_parameters.',
      stacklevel=2,
  )

  return get_default_pipeline_and_parameters(
      project=project,
      location=location,
      root_dir=root_dir,
      target_column_name=target_column_name,
      prediction_type=prediction_type,
      optimization_objective=optimization_objective,
      transformations=transformations,
      split_spec=split_spec,
      data_source=data_source,
      train_budget_milli_node_hours=train_budget_milli_node_hours,
      stage_1_num_parallel_trials=stage_1_num_parallel_trials,
      stage_2_num_parallel_trials=stage_2_num_parallel_trials,
      stage_2_num_selected_trials=stage_2_num_selected_trials,
      weight_column_name=weight_column_name,
      study_spec_override=study_spec_override,
      optimization_objective_recall_value=optimization_objective_recall_value,
      optimization_objective_precision_value=optimization_objective_precision_value,
      stage_1_tuner_worker_pool_specs_override=stage_1_tuner_worker_pool_specs_override,
      cv_trainer_worker_pool_specs_override=cv_trainer_worker_pool_specs_override,
      export_additional_model_without_custom_ops=export_additional_model_without_custom_ops,
      stats_and_example_gen_dataflow_machine_type=stats_and_example_gen_dataflow_machine_type,
      stats_and_example_gen_dataflow_max_num_workers=stats_and_example_gen_dataflow_max_num_workers,
      stats_and_example_gen_dataflow_disk_size_gb=stats_and_example_gen_dataflow_disk_size_gb,
      transform_dataflow_machine_type=transform_dataflow_machine_type,
      transform_dataflow_max_num_workers=transform_dataflow_max_num_workers,
      transform_dataflow_disk_size_gb=transform_dataflow_disk_size_gb,
      dataflow_subnetwork=dataflow_subnetwork,
      dataflow_use_public_ips=dataflow_use_public_ips,
      encryption_spec_key_name=encryption_spec_key_name,
      additional_experiments=additional_experiments,
      distill_batch_predict_machine_type=distill_batch_predict_machine_type,
      distill_batch_predict_starting_replica_count=distill_batch_predict_starting_replica_count,
      distill_batch_predict_max_replica_count=distill_batch_predict_max_replica_count,
      # This variant always distills and never runs model evaluation.
      run_evaluation=False,
      run_distillation=True,
  )