"""Util functions for AutoML Tabular pipeline."""
import json
import os
import pathlib
from typing import Any, Dict, List, Optional, Tuple, Union
import uuid
import warnings
# Module-level defaults. None of them is referenced by the helpers defined
# immediately below; presumably they are consumed by pipeline builders further
# down in this module -- confirm before changing a value.
# NOTE(review): "TRAILS" looks like a misspelling of "TRIALS". The names are
# kept as-is because other code may reference them.
_DEFAULT_NUM_PARALLEL_TRAILS = 35
_DEFAULT_STAGE_2_NUM_SELECTED_TRAILS = 5
_NUM_FOLDS = 5
_DISTILL_TOTAL_TRIALS = 100
# Values named as evaluation defaults for batch prediction.
_EVALUATION_BATCH_PREDICT_MACHINE_TYPE = 'n1-highmem-8'
_EVALUATION_BATCH_PREDICT_STARTING_REPLICA_COUNT = 20
_EVALUATION_BATCH_PREDICT_MAX_REPLICA_COUNT = 20
# Values named as evaluation defaults for batch explanation.
_EVALUATION_BATCH_EXPLAIN_MACHINE_TYPE = 'n1-highmem-8'
_EVALUATION_BATCH_EXPLAIN_STARTING_REPLICA_COUNT = 10
_EVALUATION_BATCH_EXPLAIN_MAX_REPLICA_COUNT = 10
# Values named as evaluation defaults for Dataflow workers.
_EVALUATION_DATAFLOW_MACHINE_TYPE = 'n1-standard-4'
_EVALUATION_DATAFLOW_STARTING_NUM_WORKERS = 10
_EVALUATION_DATAFLOW_MAX_NUM_WORKERS = 100
_EVALUATION_DATAFLOW_DISK_SIZE_GB = 50
# Identifier for the BigQuery feature selection execution engine (going by the
# name; usage is not visible near this definition).
_FEATURE_SELECTION_EXECUTION_ENGINE_BIGQUERY = 'bigquery'
# Needed because we reference the AutoML Tabular V1 pipeline.
# The anchor is the directory four levels above this file, resolved to an
# absolute path.
_GCPC_STAGING_PATH = pathlib.Path(
    __file__
).parent.parent.parent.parent.resolve()
# Directory <anchor>/v1/automl/tabular as a plain string; it is joined with
# 'automl_tabular_pipeline.yaml' to locate the V1 pipeline template.
_GCPC_GA_TABULAR_PATH = str(_GCPC_STAGING_PATH / 'v1' / 'automl' / 'tabular')
def _update_parameters(
    parameter_values: Dict[str, Any], new_params: Dict[str, Any]
):
  """Copies the non-None entries of new_params into parameter_values.

  The target dictionary is modified in place; entries of new_params whose
  value is None are ignored, so they never overwrite an existing value.

  Args:
    parameter_values: The dictionary to update in place.
    new_params: Candidate entries; only those with a non-None value are
      applied.
  """
  for name, candidate in new_params.items():
    # Only None is skipped; other falsy values (0, False, '', []) are kept.
    if candidate is None:
      continue
    parameter_values[name] = candidate
def _generate_model_display_name() -> str:
  """Creates a model display name with a random UUID suffix.

  Returns:
    A string of the form 'tabular-workflow-model-<uuid4>'.
  """
  name_prefix = 'tabular-workflow-model-'
  unique_suffix = str(uuid.uuid4())
  return name_prefix + unique_suffix
def _get_default_pipeline_params(
    project: str,
    location: str,
    root_dir: str,
    target_column: str,
    prediction_type: str,
    optimization_objective: str,
    transformations: str,
    train_budget_milli_node_hours: float,
    stage_1_num_parallel_trials: Optional[int] = None,
    stage_2_num_parallel_trials: Optional[int] = None,
    stage_2_num_selected_trials: Optional[int] = None,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    predefined_split_key: Optional[str] = None,
    timestamp_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    weight_column: Optional[str] = None,
    study_spec_parameters_override: Optional[List[Dict[str, Any]]] = None,
    optimization_objective_recall_value: Optional[float] = None,
    optimization_objective_precision_value: Optional[float] = None,
    stage_1_tuner_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    cv_trainer_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    export_additional_model_without_custom_ops: bool = False,
    stats_and_example_gen_dataflow_machine_type: Optional[str] = None,
    stats_and_example_gen_dataflow_max_num_workers: Optional[int] = None,
    stats_and_example_gen_dataflow_disk_size_gb: Optional[int] = None,
    transform_dataflow_machine_type: Optional[str] = None,
    transform_dataflow_max_num_workers: Optional[int] = None,
    transform_dataflow_disk_size_gb: Optional[int] = None,
    dataflow_subnetwork: Optional[str] = None,
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: Optional[str] = None,
    additional_experiments: Optional[Dict[str, Any]] = None,
    dataflow_service_account: Optional[str] = None,
    max_selected_features: Optional[int] = None,
    apply_feature_selection_tuning: bool = False,
    run_evaluation: bool = True,
    evaluation_batch_predict_machine_type: Optional[str] = None,
    evaluation_batch_predict_starting_replica_count: Optional[int] = None,
    evaluation_batch_predict_max_replica_count: Optional[int] = None,
    evaluation_batch_explain_machine_type: Optional[str] = None,
    evaluation_batch_explain_starting_replica_count: Optional[int] = None,
    evaluation_batch_explain_max_replica_count: Optional[int] = None,
    evaluation_dataflow_machine_type: Optional[str] = None,
    evaluation_dataflow_starting_num_workers: Optional[int] = None,
    evaluation_dataflow_max_num_workers: Optional[int] = None,
    evaluation_dataflow_disk_size_gb: Optional[int] = None,
    run_distillation: bool = False,
    distill_batch_predict_machine_type: Optional[str] = None,
    distill_batch_predict_starting_replica_count: Optional[int] = None,
    distill_batch_predict_max_replica_count: Optional[int] = None,
    stage_1_tuning_result_artifact_uri: Optional[str] = None,
    quantiles: Optional[List[float]] = None,
    enable_probabilistic_inference: bool = False,
    num_selected_features: Optional[int] = None,
    model_display_name: str = '',
    model_description: str = '',
    enable_fte: bool = False,
) -> Dict[str, Any]:
  """Builds the parameter values for the AutoML Tabular training pipelines.

  Collects the arguments into one dictionary keyed by pipeline parameter name
  and drops every entry whose value is None. When enable_fte is False the V1
  pipeline parameter names are emitted; when it is True the V2 (Feature
  Transform Engine) names are emitted instead. Only the parameter values are
  returned; no pipeline definition path is computed here.

  Args:
    project: The GCP project that runs the pipeline components.
    location: The GCP region that runs the pipeline components.
    root_dir: The root GCS directory for the pipeline components.
    target_column: The target column name.
    prediction_type: The type of prediction the model is to produce.
      "classification" or "regression".
    optimization_objective: For binary classification, "maximize-au-roc",
      "minimize-log-loss", "maximize-au-prc", "maximize-precision-at-recall", or
      "maximize-recall-at-precision". For multi class classification,
      "minimize-log-loss". For regression, "minimize-rmse", "minimize-mae", or
      "minimize-rmsle".
    transformations: The path to a GCS file containing the transformations to
      apply. Emitted as 'transformations' for V1 and as
      'legacy_transformations_path' for V2.
    train_budget_milli_node_hours: The train budget of creating this model,
      expressed in milli node hours i.e. 1,000 value in this field means 1 node
      hour.
    stage_1_num_parallel_trials: Number of parallel trials for stage 1.
    stage_2_num_parallel_trials: Number of parallel trials for stage 2.
    stage_2_num_selected_trials: Number of selected trials for stage 2.
    data_source_csv_filenames: The CSV data source.
    data_source_bigquery_table_path: The BigQuery data source.
    predefined_split_key: The predefined_split column name.
    timestamp_split_key: The timestamp_split column name.
    stratified_split_key: The stratified_split column name.
    training_fraction: The training fraction.
    validation_fraction: The validation fraction.
    test_fraction: The test fraction.
    weight_column: The weight column name.
    study_spec_parameters_override: The list for overriding study spec. The list
      should be of format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/study.proto#L181.
      A falsy value is replaced with an empty list.
    optimization_objective_recall_value: Required when optimization_objective is
      "maximize-precision-at-recall". Must be between 0 and 1, inclusive.
    optimization_objective_precision_value: Required when optimization_objective
      is "maximize-recall-at-precision". Must be between 0 and 1, inclusive.
    stage_1_tuner_worker_pool_specs_override: The dictionary for overriding
      stage 1 tuner worker pool spec. The dictionary should be of format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
      A falsy value is replaced with an empty list.
    cv_trainer_worker_pool_specs_override: The dictionary for overriding stage
      cv trainer worker pool spec. The dictionary should be of format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
      A falsy value is replaced with an empty list.
    export_additional_model_without_custom_ops: Whether to export additional
      model without custom TensorFlow operators.
    stats_and_example_gen_dataflow_machine_type: The dataflow machine type for
      stats_and_example_gen component. Only emitted for V1.
    stats_and_example_gen_dataflow_max_num_workers: The max number of Dataflow
      workers for stats_and_example_gen component. Only emitted for V1.
    stats_and_example_gen_dataflow_disk_size_gb: Dataflow worker's disk size in
      GB for stats_and_example_gen component. Only emitted for V1.
    transform_dataflow_machine_type: The dataflow machine type for transform
      component. Emitted under the same name for V1 and as
      'feature_transform_engine_dataflow_machine_type' for V2.
    transform_dataflow_max_num_workers: The max number of Dataflow workers for
      transform component. Emitted under the same name for V1 and as
      'feature_transform_engine_dataflow_max_num_workers' for V2.
    transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for
      transform component. Emitted under the same name for V1 and as
      'feature_transform_engine_dataflow_disk_size_gb' for V2.
    dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty
      the default subnetwork will be used. Example:
      https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Specifies whether Dataflow workers use public IP
      addresses.
    encryption_spec_key_name: The KMS key name.
    additional_experiments: Use this field to config private preview features.
      Only emitted for V1, where a falsy value is replaced with an empty
      dictionary.
    dataflow_service_account: Custom service account to run dataflow jobs.
    max_selected_features: Number of features to select for training.
    apply_feature_selection_tuning: Tuning feature selection rate if true. Only
      emitted for V1, and only when truthy.
    run_evaluation: Whether to run evaluation in the training pipeline. The
      evaluation_* values and 'run_evaluation' itself are only emitted when
      this is truthy.
    evaluation_batch_predict_machine_type: The prediction server machine type
      for batch predict components during evaluation.
    evaluation_batch_predict_starting_replica_count: The initial number of
      prediction server for batch predict components during evaluation.
    evaluation_batch_predict_max_replica_count: The max number of prediction
      server for batch predict components during evaluation.
    evaluation_batch_explain_machine_type: The prediction server machine type
      for batch explain components during evaluation.
    evaluation_batch_explain_starting_replica_count: The initial number of
      prediction server for batch explain components during evaluation.
    evaluation_batch_explain_max_replica_count: The max number of prediction
      server for batch explain components during evaluation.
    evaluation_dataflow_machine_type: The dataflow machine type for evaluation
      components.
    evaluation_dataflow_starting_num_workers: The initial number of Dataflow
      workers for evaluation components.
    evaluation_dataflow_max_num_workers: The max number of Dataflow workers for
      evaluation components.
    evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for
      evaluation components.
    run_distillation: Whether to run distill in the training pipeline. The
      distill_* values and 'run_distillation' itself are only emitted when this
      is truthy.
    distill_batch_predict_machine_type: The prediction server machine type for
      batch predict component in the model distillation.
    distill_batch_predict_starting_replica_count: The initial number of
      prediction server for batch predict component in the model distillation.
    distill_batch_predict_max_replica_count: The max number of prediction server
      for batch predict component in the model distillation.
    stage_1_tuning_result_artifact_uri: The stage 1 tuning result artifact GCS
      URI.
    quantiles: Quantiles to use for probabilistic inference. Up to 5 quantiles
      are allowed of values between 0 and 1, exclusive. Represents the quantiles
      to use for that objective. Quantiles must be unique. A falsy value is
      replaced with an empty list.
    enable_probabilistic_inference: If probabilistic inference is enabled, the
      model will fit a distribution that captures the uncertainty of a
      prediction. At inference time, the predictive distribution is used to make
      a point prediction that minimizes the optimization objective. For example,
      the mean of a predictive distribution is the point prediction that
      minimizes RMSE loss. If quantiles are specified, then the quantiles of the
      distribution are also returned.
    num_selected_features: Number of selected features for feature selection,
      defaults to None, in which case all features are used. If specified,
      enable_probabilistic_inference and run_distillation cannot be enabled
      (this function does not validate that constraint). Only emitted for V2.
    model_display_name: The display name of the uploaded Vertex model.
    model_description: The description for the uploaded model.
    enable_fte: Whether to enable the Feature Transform Engine. Selects between
      the V1 (False) and V2 (True) parameter names.

  Returns:
    A dictionary mapping pipeline parameter names to values, restricted to the
    entries whose value is not None.
  """
  # Parameters shared by both pipeline versions. Falsy overrides are replaced
  # with an empty list so these keys are always present in the result.
  candidate_values = {
      'project': project,
      'location': location,
      'root_dir': root_dir,
      'target_column': target_column,
      'prediction_type': prediction_type,
      'data_source_csv_filenames': data_source_csv_filenames,
      'data_source_bigquery_table_path': data_source_bigquery_table_path,
      'predefined_split_key': predefined_split_key,
      'timestamp_split_key': timestamp_split_key,
      'stratified_split_key': stratified_split_key,
      'training_fraction': training_fraction,
      'validation_fraction': validation_fraction,
      'test_fraction': test_fraction,
      'optimization_objective': optimization_objective,
      'train_budget_milli_node_hours': train_budget_milli_node_hours,
      'stage_1_num_parallel_trials': stage_1_num_parallel_trials,
      'stage_2_num_parallel_trials': stage_2_num_parallel_trials,
      'stage_2_num_selected_trials': stage_2_num_selected_trials,
      'weight_column': weight_column,
      'optimization_objective_recall_value': (
          optimization_objective_recall_value
      ),
      'optimization_objective_precision_value': (
          optimization_objective_precision_value
      ),
      'study_spec_parameters_override': study_spec_parameters_override or [],
      'stage_1_tuner_worker_pool_specs_override': (
          stage_1_tuner_worker_pool_specs_override or []
      ),
      'cv_trainer_worker_pool_specs_override': (
          cv_trainer_worker_pool_specs_override or []
      ),
      'export_additional_model_without_custom_ops': (
          export_additional_model_without_custom_ops
      ),
      'dataflow_subnetwork': dataflow_subnetwork,
      'dataflow_use_public_ips': dataflow_use_public_ips,
      'dataflow_service_account': dataflow_service_account,
      'encryption_spec_key_name': encryption_spec_key_name,
      'max_selected_features': max_selected_features,
      'stage_1_tuning_result_artifact_uri': stage_1_tuning_result_artifact_uri,
      'quantiles': quantiles or [],
      'enable_probabilistic_inference': enable_probabilistic_inference,
      'model_display_name': model_display_name,
      'model_description': model_description,
  }

  # Evaluation settings are forwarded only when evaluation is requested.
  if run_evaluation:
    candidate_values.update({
        'evaluation_batch_predict_machine_type': (
            evaluation_batch_predict_machine_type
        ),
        'evaluation_batch_predict_starting_replica_count': (
            evaluation_batch_predict_starting_replica_count
        ),
        'evaluation_batch_predict_max_replica_count': (
            evaluation_batch_predict_max_replica_count
        ),
        'evaluation_batch_explain_machine_type': (
            evaluation_batch_explain_machine_type
        ),
        'evaluation_batch_explain_starting_replica_count': (
            evaluation_batch_explain_starting_replica_count
        ),
        'evaluation_batch_explain_max_replica_count': (
            evaluation_batch_explain_max_replica_count
        ),
        'evaluation_dataflow_machine_type': evaluation_dataflow_machine_type,
        'evaluation_dataflow_starting_num_workers': (
            evaluation_dataflow_starting_num_workers
        ),
        'evaluation_dataflow_max_num_workers': (
            evaluation_dataflow_max_num_workers
        ),
        'evaluation_dataflow_disk_size_gb': evaluation_dataflow_disk_size_gb,
        'run_evaluation': run_evaluation,
    })

  # Distillation settings are forwarded only when distillation is requested.
  if run_distillation:
    candidate_values.update({
        'distill_batch_predict_machine_type': (
            distill_batch_predict_machine_type
        ),
        'distill_batch_predict_starting_replica_count': (
            distill_batch_predict_starting_replica_count
        ),
        'distill_batch_predict_max_replica_count': (
            distill_batch_predict_max_replica_count
        ),
        'run_distillation': run_distillation,
    })

  if not enable_fte:
    # V1 pipeline
    candidate_values.update({
        'transformations': transformations,
        'stats_and_example_gen_dataflow_machine_type': (
            stats_and_example_gen_dataflow_machine_type
        ),
        'stats_and_example_gen_dataflow_max_num_workers': (
            stats_and_example_gen_dataflow_max_num_workers
        ),
        'stats_and_example_gen_dataflow_disk_size_gb': (
            stats_and_example_gen_dataflow_disk_size_gb
        ),
        'transform_dataflow_machine_type': transform_dataflow_machine_type,
        'transform_dataflow_max_num_workers': (
            transform_dataflow_max_num_workers
        ),
        'transform_dataflow_disk_size_gb': transform_dataflow_disk_size_gb,
        'additional_experiments': additional_experiments or {},
    })
    # The flag is only emitted when set, so a False value never reaches the
    # pipeline parameters.
    if apply_feature_selection_tuning:
      candidate_values['apply_feature_selection_tuning'] = (
          apply_feature_selection_tuning
      )
  else:
    # V2 pipeline (with FTE)
    candidate_values.update({
        'num_selected_features': num_selected_features,
        'dataset_level_custom_transformation_definitions': [],
        'dataset_level_transformations': [],
        'tf_auto_transform_features': {},
        'tf_custom_transformation_definitions': [],
        'legacy_transformations_path': transformations,
        'feature_transform_engine_dataflow_machine_type': (
            transform_dataflow_machine_type
        ),
        'feature_transform_engine_dataflow_max_num_workers': (
            transform_dataflow_max_num_workers
        ),
        'feature_transform_engine_dataflow_disk_size_gb': (
            transform_dataflow_disk_size_gb
        ),
    })

  # Unset (None) values are dropped; other falsy values such as False, '' and
  # empty containers are kept.
  return {
      name: value
      for name, value in candidate_values.items()
      if value is not None
  }
def get_automl_tabular_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column: str,
    prediction_type: str,
    optimization_objective: str,
    transformations: str,
    train_budget_milli_node_hours: float,
    stage_1_num_parallel_trials: Optional[int] = None,
    stage_2_num_parallel_trials: Optional[int] = None,
    stage_2_num_selected_trials: Optional[int] = None,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    predefined_split_key: Optional[str] = None,
    timestamp_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    weight_column: Optional[str] = None,
    study_spec_parameters_override: Optional[List[Dict[str, Any]]] = None,
    optimization_objective_recall_value: Optional[float] = None,
    optimization_objective_precision_value: Optional[float] = None,
    stage_1_tuner_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    cv_trainer_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    export_additional_model_without_custom_ops: bool = False,
    stats_and_example_gen_dataflow_machine_type: Optional[str] = None,
    stats_and_example_gen_dataflow_max_num_workers: Optional[int] = None,
    stats_and_example_gen_dataflow_disk_size_gb: Optional[int] = None,
    transform_dataflow_machine_type: Optional[str] = None,
    transform_dataflow_max_num_workers: Optional[int] = None,
    transform_dataflow_disk_size_gb: Optional[int] = None,
    dataflow_subnetwork: Optional[str] = None,
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: Optional[str] = None,
    additional_experiments: Optional[Dict[str, Any]] = None,
    dataflow_service_account: Optional[str] = None,
    run_evaluation: bool = True,
    evaluation_batch_predict_machine_type: Optional[str] = None,
    evaluation_batch_predict_starting_replica_count: Optional[int] = None,
    evaluation_batch_predict_max_replica_count: Optional[int] = None,
    evaluation_batch_explain_machine_type: Optional[str] = None,
    evaluation_batch_explain_starting_replica_count: Optional[int] = None,
    evaluation_batch_explain_max_replica_count: Optional[int] = None,
    evaluation_dataflow_machine_type: Optional[str] = None,
    evaluation_dataflow_starting_num_workers: Optional[int] = None,
    evaluation_dataflow_max_num_workers: Optional[int] = None,
    evaluation_dataflow_disk_size_gb: Optional[int] = None,
    run_distillation: bool = False,
    distill_batch_predict_machine_type: Optional[str] = None,
    distill_batch_predict_starting_replica_count: Optional[int] = None,
    distill_batch_predict_max_replica_count: Optional[int] = None,
    stage_1_tuning_result_artifact_uri: Optional[str] = None,
    quantiles: Optional[List[float]] = None,
    enable_probabilistic_inference: bool = False,
    num_selected_features: Optional[int] = None,
    model_display_name: str = '',
    model_description: str = '',
    enable_fte: bool = False,
) -> Tuple[str, Dict[str, Any]]:
  """Returns the AutoML Tabular training pipeline path and its parameters.

  The V1 pipeline definition is selected when enable_fte is False and the V2
  (Feature Transform Engine) definition when it is True. All other arguments
  are handed to _get_default_pipeline_params to build the parameter values.

  Args:
    project: The GCP project that runs the pipeline components.
    location: The GCP region that runs the pipeline components.
    root_dir: The root GCS directory for the pipeline components.
    target_column: The target column name.
    prediction_type: The type of prediction the model is to produce.
      "classification" or "regression".
    optimization_objective: For binary classification, "maximize-au-roc",
      "minimize-log-loss", "maximize-au-prc", "maximize-precision-at-recall", or
      "maximize-recall-at-precision". For multi class classification,
      "minimize-log-loss". For regression, "minimize-rmse", "minimize-mae", or
      "minimize-rmsle".
    transformations: The path to a GCS file containing the transformations to
      apply.
    train_budget_milli_node_hours: The train budget of creating this model,
      expressed in milli node hours i.e. 1,000 value in this field means 1 node
      hour.
    stage_1_num_parallel_trials: Number of parallel trials for stage 1.
    stage_2_num_parallel_trials: Number of parallel trials for stage 2.
    stage_2_num_selected_trials: Number of selected trials for stage 2.
    data_source_csv_filenames: The CSV data source.
    data_source_bigquery_table_path: The BigQuery data source.
    predefined_split_key: The predefined_split column name.
    timestamp_split_key: The timestamp_split column name.
    stratified_split_key: The stratified_split column name.
    training_fraction: The training fraction.
    validation_fraction: The validation fraction.
    test_fraction: The test fraction.
    weight_column: The weight column name.
    study_spec_parameters_override: The list for overriding study spec. The list
      should be of format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/study.proto#L181.
    optimization_objective_recall_value: Required when optimization_objective is
      "maximize-precision-at-recall". Must be between 0 and 1, inclusive.
    optimization_objective_precision_value: Required when optimization_objective
      is "maximize-recall-at-precision". Must be between 0 and 1, inclusive.
    stage_1_tuner_worker_pool_specs_override: The dictionary for overriding
      stage 1 tuner worker pool spec. The dictionary should be of format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    cv_trainer_worker_pool_specs_override: The dictionary for overriding stage
      cv trainer worker pool spec. The dictionary should be of format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    export_additional_model_without_custom_ops: Whether to export additional
      model without custom TensorFlow operators.
    stats_and_example_gen_dataflow_machine_type: The dataflow machine type for
      stats_and_example_gen component.
    stats_and_example_gen_dataflow_max_num_workers: The max number of Dataflow
      workers for stats_and_example_gen component.
    stats_and_example_gen_dataflow_disk_size_gb: Dataflow worker's disk size in
      GB for stats_and_example_gen component.
    transform_dataflow_machine_type: The dataflow machine type for transform
      component.
    transform_dataflow_max_num_workers: The max number of Dataflow workers for
      transform component.
    transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for
      transform component.
    dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty
      the default subnetwork will be used. Example:
      https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Specifies whether Dataflow workers use public IP
      addresses.
    encryption_spec_key_name: The KMS key name.
    additional_experiments: Use this field to config private preview features.
    dataflow_service_account: Custom service account to run dataflow jobs.
    run_evaluation: Whether to run evaluation in the training pipeline.
    evaluation_batch_predict_machine_type: The prediction server machine type
      for batch predict components during evaluation.
    evaluation_batch_predict_starting_replica_count: The initial number of
      prediction server for batch predict components during evaluation.
    evaluation_batch_predict_max_replica_count: The max number of prediction
      server for batch predict components during evaluation.
    evaluation_batch_explain_machine_type: The prediction server machine type
      for batch explain components during evaluation.
    evaluation_batch_explain_starting_replica_count: The initial number of
      prediction server for batch explain components during evaluation.
    evaluation_batch_explain_max_replica_count: The max number of prediction
      server for batch explain components during evaluation.
    evaluation_dataflow_machine_type: The dataflow machine type for evaluation
      components.
    evaluation_dataflow_starting_num_workers: The initial number of Dataflow
      workers for evaluation components.
    evaluation_dataflow_max_num_workers: The max number of Dataflow workers for
      evaluation components.
    evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for
      evaluation components.
    run_distillation: Whether to run distill in the training pipeline.
    distill_batch_predict_machine_type: The prediction server machine type for
      batch predict component in the model distillation.
    distill_batch_predict_starting_replica_count: The initial number of
      prediction server for batch predict component in the model distillation.
    distill_batch_predict_max_replica_count: The max number of prediction server
      for batch predict component in the model distillation.
    stage_1_tuning_result_artifact_uri: The stage 1 tuning result artifact GCS
      URI.
    quantiles: Quantiles to use for probabilistic inference. Up to 5 quantiles
      are allowed of values between 0 and 1, exclusive. Represents the quantiles
      to use for that objective. Quantiles must be unique.
    enable_probabilistic_inference: If probabilistic inference is enabled, the
      model will fit a distribution that captures the uncertainty of a
      prediction. At inference time, the predictive distribution is used to make
      a point prediction that minimizes the optimization objective. For example,
      the mean of a predictive distribution is the point prediction that
      minimizes RMSE loss. If quantiles are specified, then the quantiles of the
      distribution are also returned.
    num_selected_features: Number of selected features for feature selection,
      defaults to None, in which case all features are used.
    model_display_name: The display name of the uploaded Vertex model.
    model_description: The description for the uploaded model.
    enable_fte: Whether to enable the Feature Transform Engine.

  Returns:
    Tuple of pipeline_definition_path and parameter_values.
  """
  # Locate the pipeline definition: the V2 template sits next to this module,
  # the V1 template lives in the GA tabular directory.
  if enable_fte:
    # V2 pipeline with FTE
    template_dir = pathlib.Path(__file__).parent.resolve()
    template_name = 'automl_tabular_v2_pipeline.yaml'
  else:
    # V1 pipeline without FTE
    template_dir = _GCPC_GA_TABULAR_PATH
    template_name = 'automl_tabular_pipeline.yaml'
  pipeline_definition_path = os.path.join(template_dir, template_name)

  # Every argument is forwarded unchanged under its own name.
  forwarded_arguments = dict(
      project=project,
      location=location,
      root_dir=root_dir,
      target_column=target_column,
      prediction_type=prediction_type,
      optimization_objective=optimization_objective,
      transformations=transformations,
      train_budget_milli_node_hours=train_budget_milli_node_hours,
      stage_1_num_parallel_trials=stage_1_num_parallel_trials,
      stage_2_num_parallel_trials=stage_2_num_parallel_trials,
      stage_2_num_selected_trials=stage_2_num_selected_trials,
      data_source_csv_filenames=data_source_csv_filenames,
      data_source_bigquery_table_path=data_source_bigquery_table_path,
      predefined_split_key=predefined_split_key,
      timestamp_split_key=timestamp_split_key,
      stratified_split_key=stratified_split_key,
      training_fraction=training_fraction,
      validation_fraction=validation_fraction,
      test_fraction=test_fraction,
      weight_column=weight_column,
      study_spec_parameters_override=study_spec_parameters_override,
      optimization_objective_recall_value=optimization_objective_recall_value,
      optimization_objective_precision_value=optimization_objective_precision_value,
      stage_1_tuner_worker_pool_specs_override=stage_1_tuner_worker_pool_specs_override,
      cv_trainer_worker_pool_specs_override=cv_trainer_worker_pool_specs_override,
      export_additional_model_without_custom_ops=export_additional_model_without_custom_ops,
      stats_and_example_gen_dataflow_machine_type=stats_and_example_gen_dataflow_machine_type,
      stats_and_example_gen_dataflow_max_num_workers=stats_and_example_gen_dataflow_max_num_workers,
      stats_and_example_gen_dataflow_disk_size_gb=stats_and_example_gen_dataflow_disk_size_gb,
      transform_dataflow_machine_type=transform_dataflow_machine_type,
      transform_dataflow_max_num_workers=transform_dataflow_max_num_workers,
      transform_dataflow_disk_size_gb=transform_dataflow_disk_size_gb,
      dataflow_subnetwork=dataflow_subnetwork,
      dataflow_use_public_ips=dataflow_use_public_ips,
      encryption_spec_key_name=encryption_spec_key_name,
      additional_experiments=additional_experiments,
      dataflow_service_account=dataflow_service_account,
      run_evaluation=run_evaluation,
      evaluation_batch_predict_machine_type=evaluation_batch_predict_machine_type,
      evaluation_batch_predict_starting_replica_count=evaluation_batch_predict_starting_replica_count,
      evaluation_batch_predict_max_replica_count=evaluation_batch_predict_max_replica_count,
      evaluation_batch_explain_machine_type=evaluation_batch_explain_machine_type,
      evaluation_batch_explain_starting_replica_count=evaluation_batch_explain_starting_replica_count,
      evaluation_batch_explain_max_replica_count=evaluation_batch_explain_max_replica_count,
      evaluation_dataflow_machine_type=evaluation_dataflow_machine_type,
      evaluation_dataflow_starting_num_workers=evaluation_dataflow_starting_num_workers,
      evaluation_dataflow_max_num_workers=evaluation_dataflow_max_num_workers,
      evaluation_dataflow_disk_size_gb=evaluation_dataflow_disk_size_gb,
      run_distillation=run_distillation,
      distill_batch_predict_machine_type=distill_batch_predict_machine_type,
      distill_batch_predict_starting_replica_count=distill_batch_predict_starting_replica_count,
      distill_batch_predict_max_replica_count=distill_batch_predict_max_replica_count,
      stage_1_tuning_result_artifact_uri=stage_1_tuning_result_artifact_uri,
      quantiles=quantiles,
      enable_probabilistic_inference=enable_probabilistic_inference,
      num_selected_features=num_selected_features,
      model_display_name=model_display_name,
      model_description=model_description,
      enable_fte=enable_fte,
  )
  parameter_values = _get_default_pipeline_params(**forwarded_arguments)
  return pipeline_definition_path, parameter_values
def get_automl_tabular_feature_selection_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column: str,
    prediction_type: str,
    optimization_objective: str,
    transformations: str,
    train_budget_milli_node_hours: float,
    stage_1_num_parallel_trials: Optional[int] = None,
    stage_2_num_parallel_trials: Optional[int] = None,
    stage_2_num_selected_trials: Optional[int] = None,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    predefined_split_key: Optional[str] = None,
    timestamp_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    weight_column: Optional[str] = None,
    study_spec_parameters_override: Optional[List[Dict[str, Any]]] = None,
    optimization_objective_recall_value: Optional[float] = None,
    optimization_objective_precision_value: Optional[float] = None,
    stage_1_tuner_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    cv_trainer_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    export_additional_model_without_custom_ops: bool = False,
    stats_and_example_gen_dataflow_machine_type: Optional[str] = None,
    stats_and_example_gen_dataflow_max_num_workers: Optional[int] = None,
    stats_and_example_gen_dataflow_disk_size_gb: Optional[int] = None,
    transform_dataflow_machine_type: Optional[str] = None,
    transform_dataflow_max_num_workers: Optional[int] = None,
    transform_dataflow_disk_size_gb: Optional[int] = None,
    dataflow_subnetwork: Optional[str] = None,
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: Optional[str] = None,
    additional_experiments: Optional[Dict[str, Any]] = None,
    dataflow_service_account: Optional[str] = None,
    run_evaluation: bool = True,
    evaluation_batch_predict_machine_type: Optional[str] = None,
    evaluation_batch_predict_starting_replica_count: Optional[int] = None,
    evaluation_batch_predict_max_replica_count: Optional[int] = None,
    evaluation_batch_explain_machine_type: Optional[str] = None,
    evaluation_batch_explain_starting_replica_count: Optional[int] = None,
    evaluation_batch_explain_max_replica_count: Optional[int] = None,
    evaluation_dataflow_machine_type: Optional[str] = None,
    evaluation_dataflow_starting_num_workers: Optional[int] = None,
    evaluation_dataflow_max_num_workers: Optional[int] = None,
    evaluation_dataflow_disk_size_gb: Optional[int] = None,
    max_selected_features: int = 1000,
    apply_feature_selection_tuning: bool = False,
    run_distillation: bool = False,
    distill_batch_predict_machine_type: Optional[str] = None,
    distill_batch_predict_starting_replica_count: Optional[int] = None,
    distill_batch_predict_max_replica_count: Optional[int] = None,
    model_display_name: str = '',
    model_description: str = '',
) -> Tuple[str, Dict[str, Any]]:
    """Get the AutoML Tabular feature selection training pipeline.

    Builds the parameter values through `_get_default_pipeline_params` and
    pairs them with the path of the bundled
    `automl_tabular_feature_selection_pipeline.yaml` definition, which lives in
    the same directory as this module.

    Args:
      project: The GCP project that runs the pipeline components.
      location: The GCP region that runs the pipeline components.
      root_dir: The root GCS directory for the pipeline components.
      target_column: The target column name.
      prediction_type: The type of prediction the model is to produce.
        "classification" or "regression".
      optimization_objective: For binary classification, "maximize-au-roc",
        "minimize-log-loss", "maximize-au-prc", "maximize-precision-at-recall",
        or "maximize-recall-at-precision". For multi class classification,
        "minimize-log-loss". For regression, "minimize-rmse", "minimize-mae",
        or "minimize-rmsle".
      transformations: The path to a GCS file containing the transformations
        to apply.
      train_budget_milli_node_hours: The train budget of creating this model,
        expressed in milli node hours, i.e. a value of 1,000 in this field
        means 1 node hour.
      stage_1_num_parallel_trials: Number of parallel trials for stage 1.
      stage_2_num_parallel_trials: Number of parallel trials for stage 2.
      stage_2_num_selected_trials: Number of selected trials for stage 2.
      data_source_csv_filenames: The CSV data source.
      data_source_bigquery_table_path: The BigQuery data source.
      predefined_split_key: The predefined_split column name.
      timestamp_split_key: The timestamp_split column name.
      stratified_split_key: The stratified_split column name.
      training_fraction: The training fraction.
      validation_fraction: The validation fraction.
      test_fraction: The test fraction.
      weight_column: The weight column name.
      study_spec_parameters_override: The list for overriding study spec. The
        list should be of format
        https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/study.proto#L181.
      optimization_objective_recall_value: Required when
        optimization_objective is "maximize-precision-at-recall". Must be
        between 0 and 1, inclusive.
      optimization_objective_precision_value: Required when
        optimization_objective is "maximize-recall-at-precision". Must be
        between 0 and 1, inclusive.
      stage_1_tuner_worker_pool_specs_override: The dictionary for overriding
        the stage 1 tuner worker pool spec. The dictionary should be of format
        https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
      cv_trainer_worker_pool_specs_override: The dictionary for overriding the
        stage cv trainer worker pool spec. The dictionary should be of format
        https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
      export_additional_model_without_custom_ops: Whether to export an
        additional model without custom TensorFlow operators.
      stats_and_example_gen_dataflow_machine_type: The dataflow machine type
        for the stats_and_example_gen component.
      stats_and_example_gen_dataflow_max_num_workers: The max number of
        Dataflow workers for the stats_and_example_gen component.
      stats_and_example_gen_dataflow_disk_size_gb: Dataflow worker's disk size
        in GB for the stats_and_example_gen component.
      transform_dataflow_machine_type: The dataflow machine type for the
        transform component.
      transform_dataflow_max_num_workers: The max number of Dataflow workers
        for the transform component.
      transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for
        the transform component.
      dataflow_subnetwork: Dataflow's fully qualified subnetwork name; when
        empty the default subnetwork will be used. Example:
        https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
      dataflow_use_public_ips: Specifies whether Dataflow workers use public
        IP addresses.
      encryption_spec_key_name: The KMS key name.
      additional_experiments: Use this field to config private preview
        features.
      dataflow_service_account: Custom service account to run dataflow jobs.
      run_evaluation: Whether to run evaluation in the training pipeline.
      evaluation_batch_predict_machine_type: The prediction server machine
        type for batch predict components during evaluation.
      evaluation_batch_predict_starting_replica_count: The initial number of
        prediction servers for batch predict components during evaluation.
      evaluation_batch_predict_max_replica_count: The max number of prediction
        servers for batch predict components during evaluation.
      evaluation_batch_explain_machine_type: The prediction server machine
        type for batch explain components during evaluation.
      evaluation_batch_explain_starting_replica_count: The initial number of
        prediction servers for batch explain components during evaluation.
      evaluation_batch_explain_max_replica_count: The max number of prediction
        servers for batch explain components during evaluation.
      evaluation_dataflow_machine_type: The dataflow machine type for
        evaluation components.
      evaluation_dataflow_starting_num_workers: The initial number of Dataflow
        workers for evaluation components.
      evaluation_dataflow_max_num_workers: The max number of Dataflow workers
        for evaluation components.
      evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for
        evaluation components.
      max_selected_features: Number of features to select for training.
      apply_feature_selection_tuning: Whether to tune the feature selection
        rate.
      run_distillation: Whether to run distill in the training pipeline.
      distill_batch_predict_machine_type: The prediction server machine type
        for the batch predict component in the model distillation.
      distill_batch_predict_starting_replica_count: The initial number of
        prediction servers for the batch predict component in the model
        distillation.
      distill_batch_predict_max_replica_count: The max number of prediction
        servers for the batch predict component in the model distillation.
      model_display_name: The display name of the uploaded Vertex model. When
        empty, a name is generated with `_generate_model_display_name`.
      model_description: The description for the uploaded model.

    Returns:
      Tuple of pipeline_definition_path and parameter_values.
    """
    # An empty display name means "pick one for me".
    if not model_display_name:
        model_display_name = _generate_model_display_name()

    parameter_values = _get_default_pipeline_params(
        # Core job identity and objective.
        project=project,
        location=location,
        root_dir=root_dir,
        target_column=target_column,
        prediction_type=prediction_type,
        optimization_objective=optimization_objective,
        optimization_objective_recall_value=optimization_objective_recall_value,
        optimization_objective_precision_value=optimization_objective_precision_value,
        transformations=transformations,
        train_budget_milli_node_hours=train_budget_milli_node_hours,
        weight_column=weight_column,
        # Trial counts and tuner/trainer overrides.
        stage_1_num_parallel_trials=stage_1_num_parallel_trials,
        stage_2_num_parallel_trials=stage_2_num_parallel_trials,
        stage_2_num_selected_trials=stage_2_num_selected_trials,
        study_spec_parameters_override=study_spec_parameters_override,
        stage_1_tuner_worker_pool_specs_override=stage_1_tuner_worker_pool_specs_override,
        cv_trainer_worker_pool_specs_override=cv_trainer_worker_pool_specs_override,
        export_additional_model_without_custom_ops=export_additional_model_without_custom_ops,
        # Data sources and splits.
        data_source_csv_filenames=data_source_csv_filenames,
        data_source_bigquery_table_path=data_source_bigquery_table_path,
        predefined_split_key=predefined_split_key,
        timestamp_split_key=timestamp_split_key,
        stratified_split_key=stratified_split_key,
        training_fraction=training_fraction,
        validation_fraction=validation_fraction,
        test_fraction=test_fraction,
        # Dataflow settings for stats/example generation and transform.
        stats_and_example_gen_dataflow_machine_type=stats_and_example_gen_dataflow_machine_type,
        stats_and_example_gen_dataflow_max_num_workers=stats_and_example_gen_dataflow_max_num_workers,
        stats_and_example_gen_dataflow_disk_size_gb=stats_and_example_gen_dataflow_disk_size_gb,
        transform_dataflow_machine_type=transform_dataflow_machine_type,
        transform_dataflow_max_num_workers=transform_dataflow_max_num_workers,
        transform_dataflow_disk_size_gb=transform_dataflow_disk_size_gb,
        dataflow_subnetwork=dataflow_subnetwork,
        dataflow_use_public_ips=dataflow_use_public_ips,
        dataflow_service_account=dataflow_service_account,
        encryption_spec_key_name=encryption_spec_key_name,
        additional_experiments=additional_experiments,
        # Feature selection.
        max_selected_features=max_selected_features,
        apply_feature_selection_tuning=apply_feature_selection_tuning,
        # Evaluation.
        run_evaluation=run_evaluation,
        evaluation_batch_predict_machine_type=evaluation_batch_predict_machine_type,
        evaluation_batch_predict_starting_replica_count=evaluation_batch_predict_starting_replica_count,
        evaluation_batch_predict_max_replica_count=evaluation_batch_predict_max_replica_count,
        evaluation_batch_explain_machine_type=evaluation_batch_explain_machine_type,
        evaluation_batch_explain_starting_replica_count=evaluation_batch_explain_starting_replica_count,
        evaluation_batch_explain_max_replica_count=evaluation_batch_explain_max_replica_count,
        evaluation_dataflow_machine_type=evaluation_dataflow_machine_type,
        evaluation_dataflow_starting_num_workers=evaluation_dataflow_starting_num_workers,
        evaluation_dataflow_max_num_workers=evaluation_dataflow_max_num_workers,
        evaluation_dataflow_disk_size_gb=evaluation_dataflow_disk_size_gb,
        # Distillation.
        run_distillation=run_distillation,
        distill_batch_predict_machine_type=distill_batch_predict_machine_type,
        distill_batch_predict_starting_replica_count=distill_batch_predict_starting_replica_count,
        distill_batch_predict_max_replica_count=distill_batch_predict_max_replica_count,
        # Uploaded model metadata.
        model_display_name=model_display_name,
        model_description=model_description,
    )

    # The pipeline definition ships next to this module.
    module_dir = pathlib.Path(__file__).parent.resolve()
    yaml_name = 'automl_tabular_feature_selection_pipeline.yaml'
    return os.path.join(module_dir, yaml_name), parameter_values
def input_dictionary_to_parameter(input_dict: Optional[Dict[str, Any]]) -> str:
    """Encode a JSON-style dictionary as a quote-escaped parameter string.

    YAML component definitions have no keyword for applying quote escaping, so
    the quotes inside a JSON argument have to be escaped by hand before the
    value is used as a parameter; this helper does that.

    Args:
      input_dict: The input json dictionary. `None` or an empty dictionary
        yields an empty string.

    Returns:
      The encoded string used for parameter.
    """
    if input_dict:
        # Serializing twice escapes the inner quotes; the second pass also wraps
        # the result in one pair of outer quotes, which is sliced off again,
        # e.g. "foo" -> foo.
        doubly_encoded = json.dumps(json.dumps(input_dict))
        return doubly_encoded[1:-1]
    return ''
def get_skip_architecture_search_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column: str,
    prediction_type: str,
    optimization_objective: str,
    transformations: str,
    train_budget_milli_node_hours: float,
    stage_1_tuning_result_artifact_uri: str,
    stage_2_num_parallel_trials: Optional[int] = None,
    stage_2_num_selected_trials: Optional[int] = None,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    predefined_split_key: Optional[str] = None,
    timestamp_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    weight_column: Optional[str] = None,
    optimization_objective_recall_value: Optional[float] = None,
    optimization_objective_precision_value: Optional[float] = None,
    cv_trainer_worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    export_additional_model_without_custom_ops: bool = False,
    stats_and_example_gen_dataflow_machine_type: Optional[str] = None,
    stats_and_example_gen_dataflow_max_num_workers: Optional[int] = None,
    stats_and_example_gen_dataflow_disk_size_gb: Optional[int] = None,
    transform_dataflow_machine_type: Optional[str] = None,
    transform_dataflow_max_num_workers: Optional[int] = None,
    transform_dataflow_disk_size_gb: Optional[int] = None,
    dataflow_subnetwork: Optional[str] = None,
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: Optional[str] = None,
    additional_experiments: Optional[Dict[str, Any]] = None,
    dataflow_service_account: Optional[str] = None,
    run_evaluation: bool = True,
    evaluation_batch_predict_machine_type: Optional[str] = None,
    evaluation_batch_predict_starting_replica_count: Optional[int] = None,
    evaluation_batch_predict_max_replica_count: Optional[int] = None,
    evaluation_batch_explain_machine_type: Optional[str] = None,
    evaluation_batch_explain_starting_replica_count: Optional[int] = None,
    evaluation_batch_explain_max_replica_count: Optional[int] = None,
    evaluation_dataflow_machine_type: Optional[str] = None,
    evaluation_dataflow_starting_num_workers: Optional[int] = None,
    evaluation_dataflow_max_num_workers: Optional[int] = None,
    evaluation_dataflow_disk_size_gb: Optional[int] = None,
) -> Tuple[str, Dict[str, Any]]:
    """Get the AutoML Tabular training pipeline that skips architecture search.

    This is a thin wrapper around `get_automl_tabular_pipeline_and_parameters`:
    the caller supplies an existing stage 1 tuning result, and the stage 1,
    distillation, quantile and probabilistic-inference arguments that this
    wrapper does not expose are passed with fixed values.

    Args:
      project: The GCP project that runs the pipeline components.
      location: The GCP region that runs the pipeline components.
      root_dir: The root GCS directory for the pipeline components.
      target_column: The target column name.
      prediction_type: The type of prediction the model is to produce.
        "classification" or "regression".
      optimization_objective: For binary classification, "maximize-au-roc",
        "minimize-log-loss", "maximize-au-prc", "maximize-precision-at-recall",
        or "maximize-recall-at-precision". For multi class classification,
        "minimize-log-loss". For regression, "minimize-rmse", "minimize-mae",
        or "minimize-rmsle".
      transformations: The transformations to apply.
      train_budget_milli_node_hours: The train budget of creating this model,
        expressed in milli node hours, i.e. a value of 1,000 in this field
        means 1 node hour.
      stage_1_tuning_result_artifact_uri: The stage 1 tuning result artifact
        GCS URI.
      stage_2_num_parallel_trials: Number of parallel trials for stage 2.
      stage_2_num_selected_trials: Number of selected trials for stage 2.
      data_source_csv_filenames: The CSV data source.
      data_source_bigquery_table_path: The BigQuery data source.
      predefined_split_key: The predefined_split column name.
      timestamp_split_key: The timestamp_split column name.
      stratified_split_key: The stratified_split column name.
      training_fraction: The training fraction.
      validation_fraction: The validation fraction.
      test_fraction: The test fraction.
      weight_column: The weight column name.
      optimization_objective_recall_value: Required when
        optimization_objective is "maximize-precision-at-recall". Must be
        between 0 and 1, inclusive.
      optimization_objective_precision_value: Required when
        optimization_objective is "maximize-recall-at-precision". Must be
        between 0 and 1, inclusive.
      cv_trainer_worker_pool_specs_override: The dictionary for overriding the
        stage cv trainer worker pool spec. The dictionary should be of format
        https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
      export_additional_model_without_custom_ops: Whether to export an
        additional model without custom TensorFlow operators.
      stats_and_example_gen_dataflow_machine_type: The dataflow machine type
        for the stats_and_example_gen component.
      stats_and_example_gen_dataflow_max_num_workers: The max number of
        Dataflow workers for the stats_and_example_gen component.
      stats_and_example_gen_dataflow_disk_size_gb: Dataflow worker's disk size
        in GB for the stats_and_example_gen component.
      transform_dataflow_machine_type: The dataflow machine type for the
        transform component.
      transform_dataflow_max_num_workers: The max number of Dataflow workers
        for the transform component.
      transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for
        the transform component.
      dataflow_subnetwork: Dataflow's fully qualified subnetwork name; when
        empty the default subnetwork will be used. Example:
        https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
      dataflow_use_public_ips: Specifies whether Dataflow workers use public
        IP addresses.
      encryption_spec_key_name: The KMS key name.
      additional_experiments: Use this field to config private preview
        features.
      dataflow_service_account: Custom service account to run dataflow jobs.
      run_evaluation: Whether to run evaluation in the training pipeline.
      evaluation_batch_predict_machine_type: The prediction server machine
        type for batch predict components during evaluation.
      evaluation_batch_predict_starting_replica_count: The initial number of
        prediction servers for batch predict components during evaluation.
      evaluation_batch_predict_max_replica_count: The max number of prediction
        servers for batch predict components during evaluation.
      evaluation_batch_explain_machine_type: The prediction server machine
        type for batch explain components during evaluation.
      evaluation_batch_explain_starting_replica_count: The initial number of
        prediction servers for batch explain components during evaluation.
      evaluation_batch_explain_max_replica_count: The max number of prediction
        servers for batch explain components during evaluation.
      evaluation_dataflow_machine_type: The dataflow machine type for
        evaluation components.
      evaluation_dataflow_starting_num_workers: The initial number of Dataflow
        workers for evaluation components.
      evaluation_dataflow_max_num_workers: The max number of Dataflow workers
        for evaluation components.
      evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for
        evaluation components.

    Returns:
      Tuple of pipeline_definition_path and parameter_values.
    """
    return get_automl_tabular_pipeline_and_parameters(
        # Core job identity and objective, forwarded unchanged.
        project=project,
        location=location,
        root_dir=root_dir,
        target_column=target_column,
        prediction_type=prediction_type,
        optimization_objective=optimization_objective,
        optimization_objective_recall_value=optimization_objective_recall_value,
        optimization_objective_precision_value=optimization_objective_precision_value,
        transformations=transformations,
        train_budget_milli_node_hours=train_budget_milli_node_hours,
        weight_column=weight_column,
        # Previously produced stage 1 result plus the stage 2 knobs.
        stage_1_tuning_result_artifact_uri=stage_1_tuning_result_artifact_uri,
        stage_2_num_parallel_trials=stage_2_num_parallel_trials,
        stage_2_num_selected_trials=stage_2_num_selected_trials,
        cv_trainer_worker_pool_specs_override=cv_trainer_worker_pool_specs_override,
        export_additional_model_without_custom_ops=export_additional_model_without_custom_ops,
        # Data sources and splits.
        data_source_csv_filenames=data_source_csv_filenames,
        data_source_bigquery_table_path=data_source_bigquery_table_path,
        predefined_split_key=predefined_split_key,
        timestamp_split_key=timestamp_split_key,
        stratified_split_key=stratified_split_key,
        training_fraction=training_fraction,
        validation_fraction=validation_fraction,
        test_fraction=test_fraction,
        # Dataflow settings for stats/example generation and transform.
        stats_and_example_gen_dataflow_machine_type=stats_and_example_gen_dataflow_machine_type,
        stats_and_example_gen_dataflow_max_num_workers=stats_and_example_gen_dataflow_max_num_workers,
        stats_and_example_gen_dataflow_disk_size_gb=stats_and_example_gen_dataflow_disk_size_gb,
        transform_dataflow_machine_type=transform_dataflow_machine_type,
        transform_dataflow_max_num_workers=transform_dataflow_max_num_workers,
        transform_dataflow_disk_size_gb=transform_dataflow_disk_size_gb,
        dataflow_subnetwork=dataflow_subnetwork,
        dataflow_use_public_ips=dataflow_use_public_ips,
        dataflow_service_account=dataflow_service_account,
        encryption_spec_key_name=encryption_spec_key_name,
        additional_experiments=additional_experiments,
        # Evaluation.
        run_evaluation=run_evaluation,
        evaluation_batch_predict_machine_type=evaluation_batch_predict_machine_type,
        evaluation_batch_predict_starting_replica_count=evaluation_batch_predict_starting_replica_count,
        evaluation_batch_predict_max_replica_count=evaluation_batch_predict_max_replica_count,
        evaluation_batch_explain_machine_type=evaluation_batch_explain_machine_type,
        evaluation_batch_explain_starting_replica_count=evaluation_batch_explain_starting_replica_count,
        evaluation_batch_explain_max_replica_count=evaluation_batch_explain_max_replica_count,
        evaluation_dataflow_machine_type=evaluation_dataflow_machine_type,
        evaluation_dataflow_starting_num_workers=evaluation_dataflow_starting_num_workers,
        evaluation_dataflow_max_num_workers=evaluation_dataflow_max_num_workers,
        evaluation_dataflow_disk_size_gb=evaluation_dataflow_disk_size_gb,
        # Not exposed by this wrapper: stage 1 search inputs are left unset or
        # empty.
        stage_1_num_parallel_trials=None,
        study_spec_parameters_override=[],
        stage_1_tuner_worker_pool_specs_override={},
        # Not exposed by this wrapper: distillation arguments are passed as
        # None.
        run_distillation=None,
        distill_batch_predict_machine_type=None,
        distill_batch_predict_starting_replica_count=None,
        distill_batch_predict_max_replica_count=None,
        # Not exposed by this wrapper: no quantiles, probabilistic inference
        # off.
        quantiles=[],
        enable_probabilistic_inference=False,
    )
[docs]def get_wide_and_deep_trainer_pipeline_and_parameters(
project: str,
location: str,
root_dir: str,
target_column: str,
prediction_type: str,
learning_rate: float,
dnn_learning_rate: float,
transform_config: Optional[str] = None,
dataset_level_custom_transformation_definitions: Optional[
List[Dict[str, Any]]
] = None,
dataset_level_transformations: Optional[List[Dict[str, Any]]] = None,
run_feature_selection: bool = False,
feature_selection_algorithm: Optional[str] = None,
materialized_examples_format: Optional[str] = None,
max_selected_features: Optional[int] = None,
predefined_split_key: Optional[str] = None,
stratified_split_key: Optional[str] = None,
training_fraction: Optional[float] = None,
validation_fraction: Optional[float] = None,
test_fraction: Optional[float] = None,
tf_transform_execution_engine: Optional[str] = None,
tf_auto_transform_features: Optional[
Union[List[str], Dict[str, List[str]]]
] = None,
tf_custom_transformation_definitions: Optional[List[Dict[str, Any]]] = None,
tf_transformations_path: Optional[str] = None,
optimizer_type: str = 'adam',
max_steps: int = -1,
max_train_secs: int = -1,
l1_regularization_strength: float = 0,
l2_regularization_strength: float = 0,
l2_shrinkage_regularization_strength: float = 0,
beta_1: float = 0.9,
beta_2: float = 0.999,
hidden_units: str = '30,30,30',
use_wide: bool = True,
embed_categories: bool = True,
dnn_dropout: float = 0,
dnn_optimizer_type: str = 'adam',
dnn_l1_regularization_strength: float = 0,
dnn_l2_regularization_strength: float = 0,
dnn_l2_shrinkage_regularization_strength: float = 0,
dnn_beta_1: float = 0.9,
dnn_beta_2: float = 0.999,
enable_profiler: bool = False,
cache_data: str = 'auto',
seed: int = 1,
eval_steps: int = 0,
batch_size: int = 100,
measurement_selection_type: Optional[str] = None,
optimization_metric: Optional[str] = None,
eval_frequency_secs: int = 600,
data_source_csv_filenames: Optional[str] = None,
data_source_bigquery_table_path: Optional[str] = None,
bigquery_staging_full_dataset_id: Optional[str] = None,
weight_column: str = '',
transform_dataflow_machine_type: str = 'n1-standard-16',
transform_dataflow_max_num_workers: int = 25,
transform_dataflow_disk_size_gb: int = 40,
worker_pool_specs_override: Optional[Dict[str, Any]] = None,
run_evaluation: bool = True,
evaluation_batch_predict_machine_type: str = _EVALUATION_BATCH_PREDICT_MACHINE_TYPE,
evaluation_batch_predict_starting_replica_count: int = _EVALUATION_BATCH_PREDICT_STARTING_REPLICA_COUNT,
evaluation_batch_predict_max_replica_count: int = _EVALUATION_BATCH_PREDICT_MAX_REPLICA_COUNT,
evaluation_dataflow_machine_type: str = _EVALUATION_DATAFLOW_MACHINE_TYPE,
evaluation_dataflow_starting_num_workers: int = _EVALUATION_DATAFLOW_STARTING_NUM_WORKERS,
evaluation_dataflow_max_num_workers: int = _EVALUATION_DATAFLOW_MAX_NUM_WORKERS,
evaluation_dataflow_disk_size_gb: int = _EVALUATION_DATAFLOW_DISK_SIZE_GB,
dataflow_service_account: str = '',
dataflow_subnetwork: str = '',
dataflow_use_public_ips: bool = True,
encryption_spec_key_name: str = '',
) -> Tuple[str, Dict[str, Any]]:
# fmt: off
"""Get the Wide & Deep training pipeline.
Args:
project: The GCP project that runs the pipeline components.
location: The GCP region that runs the pipeline components.
root_dir: The root GCS directory for the pipeline components.
target_column: The target column name.
prediction_type: The type of prediction the model is to produce. 'classification' or 'regression'.
learning_rate: The learning rate used by the linear optimizer.
dnn_learning_rate: The learning rate for training the deep part of the model.
transform_config: Path to v1 TF transformation configuration.
dataset_level_custom_transformation_definitions: Dataset-level custom transformation definitions in string format.
dataset_level_transformations: Dataset-level transformation configuration in string format.
run_feature_selection: Whether to enable feature selection.
feature_selection_algorithm: Feature selection algorithm.
materialized_examples_format: The format for the materialized examples.
max_selected_features: Maximum number of features to select.
predefined_split_key: Predefined split key.
stratified_split_key: Stratified split key.
training_fraction: Training fraction.
validation_fraction: Validation fraction.
test_fraction: Test fraction.
tf_transform_execution_engine: The execution engine used to execute TF-based transformations.
tf_auto_transform_features: List of auto transform features in the comma-separated string format.
tf_custom_transformation_definitions: TF custom transformation definitions in string format.
tf_transformations_path: Path to TF transformation configuration.
optimizer_type: The type of optimizer to use. Choices are "adam", "ftrl" and "sgd" for the Adam, FTRL, and Gradient Descent Optimizers, respectively.
max_steps: Number of steps to run the trainer for.
max_train_secs: Amount of time in seconds to run the trainer for.
l1_regularization_strength: L1 regularization strength for optimizer_type="ftrl".
l2_regularization_strength: L2 regularization strength for optimizer_type="ftrl".
l2_shrinkage_regularization_strength: L2 shrinkage regularization strength for optimizer_type="ftrl".
beta_1: Beta 1 value for optimizer_type="adam".
beta_2: Beta 2 value for optimizer_type="adam".
hidden_units: Hidden layer sizes to use for DNN feature columns, provided in comma-separated layers.
use_wide: If set to true, the categorical columns will be used in the wide part of the DNN model.
embed_categories: If set to true, the categorical columns will be used embedded and used in the deep part of the model. Embedding size is the square root of the column cardinality.
dnn_dropout: The probability we will drop out a given coordinate.
dnn_optimizer_type: The type of optimizer to use for the deep part of the model. Choices are "adam", "ftrl" and "sgd". for the Adam, FTRL, and Gradient Descent Optimizers, respectively.
dnn_l1_regularization_strength: L1 regularization strength for dnn_optimizer_type="ftrl".
dnn_l2_regularization_strength: L2 regularization strength for dnn_optimizer_type="ftrl".
dnn_l2_shrinkage_regularization_strength: L2 shrinkage regularization strength for dnn_optimizer_type="ftrl".
dnn_beta_1: Beta 1 value for dnn_optimizer_type="adam".
dnn_beta_2: Beta 2 value for dnn_optimizer_type="adam".
enable_profiler: Enables profiling and saves a trace during evaluation.
cache_data: Whether to cache data or not. If set to 'auto', caching is determined based on the dataset size.
seed: Seed to be used for this run.
eval_steps: Number of steps to run evaluation for. If not specified or negative, it means run evaluation on the whole validation dataset. If set to 0, it means run evaluation for a fixed number of samples.
batch_size: Batch size for training.
measurement_selection_type: Which measurement to use if/when the service automatically selects the final measurement from previously reported intermediate measurements. One of "BEST_MEASUREMENT" or "LAST_MEASUREMENT".
optimization_metric: Optimization metric used for `measurement_selection_type`. Default is "rmse" for regression and "auc" for classification.
eval_frequency_secs: Frequency at which evaluation and checkpointing will take place.
data_source_csv_filenames: The CSV data source.
data_source_bigquery_table_path: The BigQuery data source.
bigquery_staging_full_dataset_id: The BigQuery staging full dataset id for storing intermediate tables.
weight_column: The weight column name.
transform_dataflow_machine_type: The dataflow machine type for transform component.
transform_dataflow_max_num_workers: The max number of Dataflow workers for transform component.
transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for transform component.
worker_pool_specs_override: The dictionary for overriding training and evaluation worker pool specs. The dictionary should be of format https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
run_evaluation: Whether to run evaluation steps during training.
evaluation_batch_predict_machine_type: The prediction server machine type for batch predict components during evaluation.
evaluation_batch_predict_starting_replica_count: The initial number of prediction server for batch predict components during evaluation.
evaluation_batch_predict_max_replica_count: The max number of prediction server for batch predict components during evaluation.
evaluation_dataflow_machine_type: The dataflow machine type for evaluation components.
evaluation_dataflow_starting_num_workers: The initial number of Dataflow workers for evaluation components.
evaluation_dataflow_max_num_workers: The max number of Dataflow workers for evaluation components.
evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for evaluation components.
dataflow_service_account: Custom service account to run dataflow jobs.
dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used. Example: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
dataflow_use_public_ips: Specifies whether Dataflow workers use public IP addresses.
encryption_spec_key_name: The KMS key name.
Returns:
Tuple of pipeline_definition_path and parameter_values.
"""
# fmt: on
if isinstance(tf_auto_transform_features, list):
tf_auto_transform_features = {'auto': tf_auto_transform_features}
if transform_config and tf_transformations_path:
raise ValueError(
'Only one of transform_config and tf_transformations_path can '
'be specified.'
)
elif transform_config:
warnings.warn(
'transform_config parameter is deprecated. '
'Please use the flattened transform config arguments instead.'
)
tf_transformations_path = transform_config
if not worker_pool_specs_override:
worker_pool_specs_override = []
parameter_values = {}
training_and_eval_parameters = {
'project': project,
'location': location,
'root_dir': root_dir,
'target_column': target_column,
'prediction_type': prediction_type,
'learning_rate': learning_rate,
'dnn_learning_rate': dnn_learning_rate,
'optimizer_type': optimizer_type,
'max_steps': max_steps,
'max_train_secs': max_train_secs,
'l1_regularization_strength': l1_regularization_strength,
'l2_regularization_strength': l2_regularization_strength,
'l2_shrinkage_regularization_strength': (
l2_shrinkage_regularization_strength
),
'beta_1': beta_1,
'beta_2': beta_2,
'hidden_units': hidden_units,
'use_wide': use_wide,
'embed_categories': embed_categories,
'dnn_dropout': dnn_dropout,
'dnn_optimizer_type': dnn_optimizer_type,
'dnn_l1_regularization_strength': dnn_l1_regularization_strength,
'dnn_l2_regularization_strength': dnn_l2_regularization_strength,
'dnn_l2_shrinkage_regularization_strength': (
dnn_l2_shrinkage_regularization_strength
),
'dnn_beta_1': dnn_beta_1,
'dnn_beta_2': dnn_beta_2,
'enable_profiler': enable_profiler,
'cache_data': cache_data,
'seed': seed,
'eval_steps': eval_steps,
'batch_size': batch_size,
'measurement_selection_type': measurement_selection_type,
'optimization_metric': optimization_metric,
'eval_frequency_secs': eval_frequency_secs,
'weight_column': weight_column,
'transform_dataflow_machine_type': transform_dataflow_machine_type,
'transform_dataflow_max_num_workers': transform_dataflow_max_num_workers,
'transform_dataflow_disk_size_gb': transform_dataflow_disk_size_gb,
'worker_pool_specs_override': worker_pool_specs_override,
'run_evaluation': run_evaluation,
'evaluation_batch_predict_machine_type': (
evaluation_batch_predict_machine_type
),
'evaluation_batch_predict_starting_replica_count': (
evaluation_batch_predict_starting_replica_count
),
'evaluation_batch_predict_max_replica_count': (
evaluation_batch_predict_max_replica_count
),
'evaluation_dataflow_machine_type': evaluation_dataflow_machine_type,
'evaluation_dataflow_starting_num_workers': (
evaluation_dataflow_starting_num_workers
),
'evaluation_dataflow_max_num_workers': (
evaluation_dataflow_max_num_workers
),
'evaluation_dataflow_disk_size_gb': evaluation_dataflow_disk_size_gb,
'dataflow_service_account': dataflow_service_account,
'dataflow_subnetwork': dataflow_subnetwork,
'dataflow_use_public_ips': dataflow_use_public_ips,
'encryption_spec_key_name': encryption_spec_key_name,
}
_update_parameters(parameter_values, training_and_eval_parameters)
fte_params = {
'dataset_level_custom_transformation_definitions': (
dataset_level_custom_transformation_definitions
if dataset_level_custom_transformation_definitions
else []
),
'dataset_level_transformations': (
dataset_level_transformations if dataset_level_transformations else []
),
'run_feature_selection': run_feature_selection,
'feature_selection_algorithm': feature_selection_algorithm,
'max_selected_features': max_selected_features,
'predefined_split_key': predefined_split_key,
'stratified_split_key': stratified_split_key,
'training_fraction': training_fraction,
'validation_fraction': validation_fraction,
'test_fraction': test_fraction,
'tf_auto_transform_features': (
tf_auto_transform_features if tf_auto_transform_features else {}
),
'tf_custom_transformation_definitions': (
tf_custom_transformation_definitions
if tf_custom_transformation_definitions
else []
),
'tf_transformations_path': tf_transformations_path,
'materialized_examples_format': (
materialized_examples_format
if materialized_examples_format
else 'tfrecords_gzip'
),
'tf_transform_execution_engine': (
tf_transform_execution_engine
if tf_transform_execution_engine
else 'dataflow'
),
}
_update_parameters(parameter_values, fte_params)
data_source_and_split_parameters = {
'data_source_csv_filenames': data_source_csv_filenames,
'data_source_bigquery_table_path': data_source_bigquery_table_path,
'bigquery_staging_full_dataset_id': bigquery_staging_full_dataset_id,
}
_update_parameters(parameter_values, data_source_and_split_parameters)
pipeline_definition_path = os.path.join(
pathlib.Path(__file__).parent.resolve(),
'wide_and_deep_trainer_pipeline.yaml',
)
return pipeline_definition_path, parameter_values
def get_builtin_algorithm_hyperparameter_tuning_job_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column: str,
    prediction_type: str,
    study_spec_metric_id: str,
    study_spec_metric_goal: str,
    study_spec_parameters_override: List[Dict[str, Any]],
    max_trial_count: int,
    parallel_trial_count: int,
    algorithm: str,
    enable_profiler: bool = False,
    seed: int = 1,
    eval_steps: int = 0,
    eval_frequency_secs: int = 600,
    transform_config: Optional[str] = None,
    dataset_level_custom_transformation_definitions: Optional[
        List[Dict[str, Any]]
    ] = None,
    dataset_level_transformations: Optional[List[Dict[str, Any]]] = None,
    predefined_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    tf_transform_execution_engine: Optional[str] = None,
    tf_auto_transform_features: Optional[
        Union[List[str], Dict[str, List[str]]]
    ] = None,
    tf_custom_transformation_definitions: Optional[List[Dict[str, Any]]] = None,
    tf_transformations_path: Optional[str] = None,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    bigquery_staging_full_dataset_id: Optional[str] = None,
    weight_column: str = '',
    max_failed_trial_count: int = 0,
    study_spec_algorithm: str = 'ALGORITHM_UNSPECIFIED',
    study_spec_measurement_selection_type: str = 'BEST_MEASUREMENT',
    transform_dataflow_machine_type: str = 'n1-standard-16',
    transform_dataflow_max_num_workers: int = 25,
    transform_dataflow_disk_size_gb: int = 40,
    worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    run_evaluation: bool = True,
    evaluation_batch_predict_machine_type: str = _EVALUATION_BATCH_PREDICT_MACHINE_TYPE,
    evaluation_batch_predict_starting_replica_count: int = _EVALUATION_BATCH_PREDICT_STARTING_REPLICA_COUNT,
    evaluation_batch_predict_max_replica_count: int = _EVALUATION_BATCH_PREDICT_MAX_REPLICA_COUNT,
    evaluation_dataflow_machine_type: str = _EVALUATION_DATAFLOW_MACHINE_TYPE,
    evaluation_dataflow_starting_num_workers: int = _EVALUATION_DATAFLOW_STARTING_NUM_WORKERS,
    evaluation_dataflow_max_num_workers: int = _EVALUATION_DATAFLOW_MAX_NUM_WORKERS,
    evaluation_dataflow_disk_size_gb: int = _EVALUATION_DATAFLOW_DISK_SIZE_GB,
    dataflow_service_account: str = '',
    dataflow_subnetwork: str = '',
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: str = '',
) -> Tuple[str, Dict[str, Any]]:
  """Builds the HyperparameterTuningJob pipeline for a built-in algorithm.

  Deprecated shim: a warning is always emitted, then every argument except
  `algorithm` is forwarded unchanged to
  get_tabnet_hyperparameter_tuning_job_pipeline_and_parameters or
  get_wide_and_deep_hyperparameter_tuning_job_pipeline_and_parameters,
  depending on the value of `algorithm`.

  Args:
    project: The GCP project that runs the pipeline components.
    location: The GCP region that runs the pipeline components.
    root_dir: The root GCS directory for the pipeline components.
    target_column: Name of the target column.
    prediction_type: Kind of prediction the model produces, "classification"
      or "regression".
    study_spec_metric_id: Metric to optimize. Possible values: [ 'loss',
      'average_loss', 'rmse', 'mae', 'mql', 'accuracy', 'auc', 'precision',
      'recall'].
    study_spec_metric_goal: Optimization goal for the metric, "MAXIMIZE" or
      "MINIMIZE".
    study_spec_parameters_override: List of dictionaries describing the
      parameters to optimize. The dictionary key is the parameter_id, which
      is passed to the training job as a command line argument, and the
      dictionary value is the parameter specification of the metric.
    max_trial_count: Desired total number of trials.
    parallel_trial_count: Desired number of trials to run in parallel.
    algorithm: Algorithm to train, either "tabnet" or "wide_and_deep".
    enable_profiler: Enables profiling and saves a trace during evaluation.
    seed: Seed to be used for this run.
    eval_steps: Number of steps to run evaluation for. If not specified or
      negative, evaluation runs on the whole validation dataset. If set to 0,
      evaluation runs for a fixed number of samples.
    eval_frequency_secs: Frequency at which evaluation and checkpointing
      take place.
    transform_config: Path to v1 TF transformation configuration.
    dataset_level_custom_transformation_definitions: Dataset-level custom
      transformation definitions.
    dataset_level_transformations: Dataset-level transformation
      configuration.
    predefined_split_key: Predefined split key.
    stratified_split_key: Stratified split key.
    training_fraction: Training fraction.
    validation_fraction: Validation fraction.
    test_fraction: Test fraction.
    tf_transform_execution_engine: Execution engine used to run TF-based
      transformations.
    tf_auto_transform_features: Auto transform features, given either as a
      list or as a dict of lists.
    tf_custom_transformation_definitions: TF custom transformation
      definitions.
    tf_transformations_path: Path to TF transformation configuration.
    data_source_csv_filenames: The CSV data source.
    data_source_bigquery_table_path: The BigQuery data source.
    bigquery_staging_full_dataset_id: The BigQuery staging full dataset id
      for storing intermediate tables.
    weight_column: Name of the weight column.
    max_failed_trial_count: Number of failed trials that need to be seen
      before failing the HyperparameterTuningJob. If set to 0, Vertex AI
      decides how many trials must fail before the whole job fails.
    study_spec_algorithm: Search algorithm for the study. One of
      "ALGORITHM_UNSPECIFIED", "GRID_SEARCH", or "RANDOM_SEARCH".
    study_spec_measurement_selection_type: Which measurement to use if/when
      the service automatically selects the final measurement from previously
      reported intermediate measurements. One of "BEST_MEASUREMENT" or
      "LAST_MEASUREMENT".
    transform_dataflow_machine_type: Dataflow machine type for the transform
      component.
    transform_dataflow_max_num_workers: Max number of Dataflow workers for
      the transform component.
    transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for
      the transform component.
    worker_pool_specs_override: The dictionary for overriding training and
      evaluation worker pool specs. The dictionary should be of format
      https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172.
    run_evaluation: Whether to run evaluation steps during training.
    evaluation_batch_predict_machine_type: Prediction server machine type
      for batch predict components during evaluation.
    evaluation_batch_predict_starting_replica_count: Initial number of
      prediction servers for batch predict components during evaluation.
    evaluation_batch_predict_max_replica_count: Max number of prediction
      servers for batch predict components during evaluation.
    evaluation_dataflow_machine_type: Dataflow machine type for evaluation
      components.
    evaluation_dataflow_starting_num_workers: Initial number of Dataflow
      workers for evaluation components.
    evaluation_dataflow_max_num_workers: Max number of Dataflow workers for
      evaluation components.
    evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for
      evaluation components.
    dataflow_service_account: Custom service account to run dataflow jobs.
    dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when
      empty the default subnetwork will be used. Example:
      https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Specifies whether Dataflow workers use public
      IP addresses.
    encryption_spec_key_name: The KMS key name.

  Returns:
    Tuple of pipeline_definition_path and parameter_values, exactly as
    returned by the algorithm-specific builder.

  Raises:
    ValueError: If `algorithm` is neither "tabnet" nor "wide_and_deep".
  """
  deprecation_notice = (
      'This method is deprecated. Please use '
      'get_tabnet_hyperparameter_tuning_job_pipeline_and_parameters or '
      'get_wide_and_deep_hyperparameter_tuning_job_pipeline_and_parameters '
      'instead.'
  )
  warnings.warn(deprecation_notice)

  # Pick the algorithm-specific builder first; both accept the same keyword
  # arguments, so the forwarding call below is written only once.
  if algorithm == 'tabnet':
    build_pipeline = (
        get_tabnet_hyperparameter_tuning_job_pipeline_and_parameters
    )
  elif algorithm == 'wide_and_deep':
    build_pipeline = (
        get_wide_and_deep_hyperparameter_tuning_job_pipeline_and_parameters
    )
  else:
    raise ValueError(
        'Invalid algorithm provided. Supported values are "tabnet" and '
        '"wide_and_deep".'
    )

  # Everything except `algorithm` is passed through by keyword.
  forwarded_kwargs = dict(
      # Core job description.
      project=project,
      location=location,
      root_dir=root_dir,
      target_column=target_column,
      prediction_type=prediction_type,
      # Study configuration.
      study_spec_metric_id=study_spec_metric_id,
      study_spec_metric_goal=study_spec_metric_goal,
      study_spec_parameters_override=study_spec_parameters_override,
      max_trial_count=max_trial_count,
      parallel_trial_count=parallel_trial_count,
      max_failed_trial_count=max_failed_trial_count,
      study_spec_algorithm=study_spec_algorithm,
      study_spec_measurement_selection_type=study_spec_measurement_selection_type,
      # Feature transformation and data split.
      transform_config=transform_config,
      dataset_level_custom_transformation_definitions=dataset_level_custom_transformation_definitions,
      dataset_level_transformations=dataset_level_transformations,
      predefined_split_key=predefined_split_key,
      stratified_split_key=stratified_split_key,
      training_fraction=training_fraction,
      validation_fraction=validation_fraction,
      test_fraction=test_fraction,
      tf_transform_execution_engine=tf_transform_execution_engine,
      tf_auto_transform_features=tf_auto_transform_features,
      tf_custom_transformation_definitions=tf_custom_transformation_definitions,
      tf_transformations_path=tf_transformations_path,
      # Training behaviour.
      enable_profiler=enable_profiler,
      seed=seed,
      eval_steps=eval_steps,
      eval_frequency_secs=eval_frequency_secs,
      weight_column=weight_column,
      worker_pool_specs_override=worker_pool_specs_override,
      # Data sources.
      data_source_csv_filenames=data_source_csv_filenames,
      data_source_bigquery_table_path=data_source_bigquery_table_path,
      bigquery_staging_full_dataset_id=bigquery_staging_full_dataset_id,
      # Transform Dataflow resources.
      transform_dataflow_machine_type=transform_dataflow_machine_type,
      transform_dataflow_max_num_workers=transform_dataflow_max_num_workers,
      transform_dataflow_disk_size_gb=transform_dataflow_disk_size_gb,
      # Evaluation resources.
      run_evaluation=run_evaluation,
      evaluation_batch_predict_machine_type=evaluation_batch_predict_machine_type,
      evaluation_batch_predict_starting_replica_count=evaluation_batch_predict_starting_replica_count,
      evaluation_batch_predict_max_replica_count=evaluation_batch_predict_max_replica_count,
      evaluation_dataflow_machine_type=evaluation_dataflow_machine_type,
      evaluation_dataflow_disk_size_gb=evaluation_dataflow_disk_size_gb,
      evaluation_dataflow_starting_num_workers=evaluation_dataflow_starting_num_workers,
      evaluation_dataflow_max_num_workers=evaluation_dataflow_max_num_workers,
      # Dataflow networking, identity and encryption.
      dataflow_service_account=dataflow_service_account,
      dataflow_subnetwork=dataflow_subnetwork,
      dataflow_use_public_ips=dataflow_use_public_ips,
      encryption_spec_key_name=encryption_spec_key_name,
  )
  return build_pipeline(**forwarded_kwargs)
def get_tabnet_hyperparameter_tuning_job_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column: str,
    prediction_type: str,
    study_spec_metric_id: str,
    study_spec_metric_goal: str,
    study_spec_parameters_override: List[Dict[str, Any]],
    max_trial_count: int,
    parallel_trial_count: int,
    transform_config: Optional[str] = None,
    dataset_level_custom_transformation_definitions: Optional[
        List[Dict[str, Any]]
    ] = None,
    dataset_level_transformations: Optional[List[Dict[str, Any]]] = None,
    run_feature_selection: bool = False,
    feature_selection_algorithm: Optional[str] = None,
    materialized_examples_format: Optional[str] = None,
    max_selected_features: Optional[int] = None,
    predefined_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    tf_transform_execution_engine: Optional[str] = None,
    tf_auto_transform_features: Optional[
        Union[List[str], Dict[str, List[str]]]
    ] = None,
    tf_custom_transformation_definitions: Optional[List[Dict[str, Any]]] = None,
    tf_transformations_path: Optional[str] = None,
    enable_profiler: bool = False,
    cache_data: str = 'auto',
    seed: int = 1,
    eval_steps: int = 0,
    eval_frequency_secs: int = 600,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    bigquery_staging_full_dataset_id: Optional[str] = None,
    weight_column: str = '',
    max_failed_trial_count: int = 0,
    study_spec_algorithm: str = 'ALGORITHM_UNSPECIFIED',
    study_spec_measurement_selection_type: str = 'BEST_MEASUREMENT',
    transform_dataflow_machine_type: str = 'n1-standard-16',
    transform_dataflow_max_num_workers: int = 25,
    transform_dataflow_disk_size_gb: int = 40,
    worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    run_evaluation: bool = True,
    evaluation_batch_predict_machine_type: str = _EVALUATION_BATCH_PREDICT_MACHINE_TYPE,
    evaluation_batch_predict_starting_replica_count: int = _EVALUATION_BATCH_PREDICT_STARTING_REPLICA_COUNT,
    evaluation_batch_predict_max_replica_count: int = _EVALUATION_BATCH_PREDICT_MAX_REPLICA_COUNT,
    evaluation_dataflow_machine_type: str = _EVALUATION_DATAFLOW_MACHINE_TYPE,
    evaluation_dataflow_starting_num_workers: int = _EVALUATION_DATAFLOW_STARTING_NUM_WORKERS,
    evaluation_dataflow_max_num_workers: int = _EVALUATION_DATAFLOW_MAX_NUM_WORKERS,
    evaluation_dataflow_disk_size_gb: int = _EVALUATION_DATAFLOW_DISK_SIZE_GB,
    dataflow_service_account: str = '',
    dataflow_subnetwork: str = '',
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: str = '',
) -> Tuple[str, Dict[str, Any]]:
  # fmt: off
  """Get the TabNet HyperparameterTuningJob pipeline.

  Normalizes the transformation arguments, assembles the parameter dictionary
  and returns it together with the path of the pipeline definition file
  'tabnet_hyperparameter_tuning_job_pipeline.yaml' located next to this module.
  Feature-transform and data-source entries whose value is None are left out of
  the returned dictionary.

  Args:
    project: The GCP project that runs the pipeline components.
    location: The GCP region that runs the pipeline components.
    root_dir: The root GCS directory for the pipeline components.
    target_column: The target column name.
    prediction_type: The type of prediction the model is to produce. "classification" or "regression".
    study_spec_metric_id: Metric to optimize, possible values: [ 'loss', 'average_loss', 'rmse', 'mae', 'mql', 'accuracy', 'auc', 'precision', 'recall'].
    study_spec_metric_goal: Optimization goal of the metric, possible values: "MAXIMIZE", "MINIMIZE".
    study_spec_parameters_override: List of dictionaries representing parameters to optimize. The dictionary key is the parameter_id, which is passed to training job as a command line argument, and the dictionary value is the parameter specification of the metric.
    max_trial_count: The desired total number of trials.
    parallel_trial_count: The desired number of trials to run in parallel.
    transform_config: Deprecated. Path to v1 TF transformation configuration. When set, a warning is emitted and the value is used as tf_transformations_path; it cannot be combined with tf_transformations_path.
    dataset_level_custom_transformation_definitions: Dataset-level custom transformation definitions. An empty list is used when unset.
    dataset_level_transformations: Dataset-level transformation configuration. An empty list is used when unset.
    run_feature_selection: Whether to enable feature selection.
    feature_selection_algorithm: Feature selection algorithm.
    materialized_examples_format: The format for the materialized examples. Falls back to 'tfrecords_gzip' when unset.
    max_selected_features: Maximum number of features to select.
    predefined_split_key: Predefined split key.
    stratified_split_key: Stratified split key.
    training_fraction: Training fraction.
    validation_fraction: Validation fraction.
    test_fraction: Test fraction.
    tf_transform_execution_engine: The execution engine used to execute TF-based transformations. Falls back to 'dataflow' when unset.
    tf_auto_transform_features: Auto transform features, given either as a list or as a dict of lists. A list is wrapped as {'auto': <list>}; an empty dict is used when unset.
    tf_custom_transformation_definitions: TF custom transformation definitions. An empty list is used when unset.
    tf_transformations_path: Path to TF transformation configuration.
    enable_profiler: Enables profiling and saves a trace during evaluation.
    cache_data: Whether to cache data or not. If set to 'auto', caching is determined based on the dataset size.
    seed: Seed to be used for this run.
    eval_steps: Number of steps to run evaluation for. If not specified or negative, it means run evaluation on the whole validation dataset. If set to 0, it means run evaluation for a fixed number of samples.
    eval_frequency_secs: Frequency at which evaluation and checkpointing will take place.
    data_source_csv_filenames: The CSV data source.
    data_source_bigquery_table_path: The BigQuery data source.
    bigquery_staging_full_dataset_id: The BigQuery staging full dataset id for storing intermediate tables.
    weight_column: The weight column name.
    max_failed_trial_count: The number of failed trials that need to be seen before failing the HyperparameterTuningJob. If set to 0, Vertex AI decides how many trials must fail before the whole job fails.
    study_spec_algorithm: The search algorithm specified for the study. One of "ALGORITHM_UNSPECIFIED", "GRID_SEARCH", or "RANDOM_SEARCH".
    study_spec_measurement_selection_type: Which measurement to use if/when the service automatically selects the final measurement from previously reported intermediate measurements. One of "BEST_MEASUREMENT" or "LAST_MEASUREMENT".
    transform_dataflow_machine_type: The dataflow machine type for transform component.
    transform_dataflow_max_num_workers: The max number of Dataflow workers for transform component.
    transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for transform component.
    worker_pool_specs_override: The dictionary for overriding training and evaluation worker pool specs. The dictionary should be of format https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172. NOTE(review): annotated as a dict, yet an empty list is substituted when the value is unset or falsy, so a list of spec dicts is presumably expected -- confirm against the pipeline definition.
    run_evaluation: Whether to run evaluation steps during training.
    evaluation_batch_predict_machine_type: The prediction server machine type for batch predict components during evaluation.
    evaluation_batch_predict_starting_replica_count: The initial number of prediction server for batch predict components during evaluation.
    evaluation_batch_predict_max_replica_count: The max number of prediction server for batch predict components during evaluation.
    evaluation_dataflow_machine_type: The dataflow machine type for evaluation components.
    evaluation_dataflow_starting_num_workers: The initial number of Dataflow workers for evaluation components.
    evaluation_dataflow_max_num_workers: The max number of Dataflow workers for evaluation components.
    evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for evaluation components.
    dataflow_service_account: Custom service account to run dataflow jobs.
    dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used. Example: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Specifies whether Dataflow workers use public IP addresses.
    encryption_spec_key_name: The KMS key name.

  Returns:
    Tuple of pipeline_definition_path and parameter_values.

  Raises:
    ValueError: If both transform_config and tf_transformations_path are given.
  """
  # fmt: on
  # A bare list of features is shorthand for the 'auto' transformation.
  if isinstance(tf_auto_transform_features, list):
    tf_auto_transform_features = {'auto': tf_auto_transform_features}

  # transform_config is the legacy spelling of tf_transformations_path; accept
  # it with a warning, but refuse ambiguous input where both are supplied.
  if transform_config and tf_transformations_path:
    raise ValueError(
        'Only one of transform_config and tf_transformations_path can '
        'be specified.'
    )
  elif transform_config:
    warnings.warn(
        'transform_config parameter is deprecated. '
        'Please use the flattened transform config arguments instead.'
    )
    tf_transformations_path = transform_config

  if not worker_pool_specs_override:
    worker_pool_specs_override = []

  # Training, study and evaluation settings are always passed through as given.
  parameter_values = {
      'project': project,
      'location': location,
      'root_dir': root_dir,
      'target_column': target_column,
      'prediction_type': prediction_type,
      'study_spec_metric_id': study_spec_metric_id,
      'study_spec_metric_goal': study_spec_metric_goal,
      'study_spec_parameters_override': study_spec_parameters_override,
      'max_trial_count': max_trial_count,
      'parallel_trial_count': parallel_trial_count,
      'enable_profiler': enable_profiler,
      'cache_data': cache_data,
      'seed': seed,
      'eval_steps': eval_steps,
      'eval_frequency_secs': eval_frequency_secs,
      'weight_column': weight_column,
      'max_failed_trial_count': max_failed_trial_count,
      'study_spec_algorithm': study_spec_algorithm,
      'study_spec_measurement_selection_type': (
          study_spec_measurement_selection_type
      ),
      'transform_dataflow_machine_type': transform_dataflow_machine_type,
      'transform_dataflow_max_num_workers': transform_dataflow_max_num_workers,
      'transform_dataflow_disk_size_gb': transform_dataflow_disk_size_gb,
      'worker_pool_specs_override': worker_pool_specs_override,
      'run_evaluation': run_evaluation,
      'evaluation_batch_predict_machine_type': (
          evaluation_batch_predict_machine_type
      ),
      'evaluation_batch_predict_starting_replica_count': (
          evaluation_batch_predict_starting_replica_count
      ),
      'evaluation_batch_predict_max_replica_count': (
          evaluation_batch_predict_max_replica_count
      ),
      'evaluation_dataflow_machine_type': evaluation_dataflow_machine_type,
      'evaluation_dataflow_starting_num_workers': (
          evaluation_dataflow_starting_num_workers
      ),
      'evaluation_dataflow_max_num_workers': (
          evaluation_dataflow_max_num_workers
      ),
      'evaluation_dataflow_disk_size_gb': evaluation_dataflow_disk_size_gb,
      'dataflow_service_account': dataflow_service_account,
      'dataflow_subnetwork': dataflow_subnetwork,
      'dataflow_use_public_ips': dataflow_use_public_ips,
      'encryption_spec_key_name': encryption_spec_key_name,
  }

  # Feature-transform settings: falsy collection/format arguments fall back to
  # concrete defaults; entries still None afterwards are dropped by
  # _update_parameters.
  fte_params = {
      'dataset_level_custom_transformation_definitions': (
          dataset_level_custom_transformation_definitions or []
      ),
      'dataset_level_transformations': dataset_level_transformations or [],
      'run_feature_selection': run_feature_selection,
      'feature_selection_algorithm': feature_selection_algorithm,
      'max_selected_features': max_selected_features,
      'predefined_split_key': predefined_split_key,
      'stratified_split_key': stratified_split_key,
      'training_fraction': training_fraction,
      'validation_fraction': validation_fraction,
      'test_fraction': test_fraction,
      'tf_auto_transform_features': tf_auto_transform_features or {},
      'tf_custom_transformation_definitions': (
          tf_custom_transformation_definitions or []
      ),
      'tf_transformations_path': tf_transformations_path,
      'materialized_examples_format': (
          materialized_examples_format or 'tfrecords_gzip'
      ),
      'tf_transform_execution_engine': (
          tf_transform_execution_engine or 'dataflow'
      ),
  }
  _update_parameters(parameter_values, fte_params)

  # Data-source settings are only included when actually provided.
  data_source_and_split_parameters = {
      'data_source_csv_filenames': data_source_csv_filenames,
      'data_source_bigquery_table_path': data_source_bigquery_table_path,
      'bigquery_staging_full_dataset_id': bigquery_staging_full_dataset_id,
  }
  _update_parameters(parameter_values, data_source_and_split_parameters)

  # The compiled pipeline definition is looked up next to this module.
  pipeline_definition_path = os.path.join(
      pathlib.Path(__file__).parent.resolve(),
      'tabnet_hyperparameter_tuning_job_pipeline.yaml',
  )
  return pipeline_definition_path, parameter_values
def get_wide_and_deep_hyperparameter_tuning_job_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column: str,
    prediction_type: str,
    study_spec_metric_id: str,
    study_spec_metric_goal: str,
    study_spec_parameters_override: List[Dict[str, Any]],
    max_trial_count: int,
    parallel_trial_count: int,
    transform_config: Optional[str] = None,
    dataset_level_custom_transformation_definitions: Optional[
        List[Dict[str, Any]]
    ] = None,
    dataset_level_transformations: Optional[List[Dict[str, Any]]] = None,
    run_feature_selection: bool = False,
    feature_selection_algorithm: Optional[str] = None,
    materialized_examples_format: Optional[str] = None,
    max_selected_features: Optional[int] = None,
    predefined_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    tf_transform_execution_engine: Optional[str] = None,
    tf_auto_transform_features: Optional[
        Union[List[str], Dict[str, List[str]]]
    ] = None,
    tf_custom_transformation_definitions: Optional[List[Dict[str, Any]]] = None,
    tf_transformations_path: Optional[str] = None,
    enable_profiler: bool = False,
    cache_data: str = 'auto',
    seed: int = 1,
    eval_steps: int = 0,
    eval_frequency_secs: int = 600,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    bigquery_staging_full_dataset_id: Optional[str] = None,
    weight_column: str = '',
    max_failed_trial_count: int = 0,
    study_spec_algorithm: str = 'ALGORITHM_UNSPECIFIED',
    study_spec_measurement_selection_type: str = 'BEST_MEASUREMENT',
    transform_dataflow_machine_type: str = 'n1-standard-16',
    transform_dataflow_max_num_workers: int = 25,
    transform_dataflow_disk_size_gb: int = 40,
    worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    run_evaluation: bool = True,
    evaluation_batch_predict_machine_type: str = _EVALUATION_BATCH_PREDICT_MACHINE_TYPE,
    evaluation_batch_predict_starting_replica_count: int = _EVALUATION_BATCH_PREDICT_STARTING_REPLICA_COUNT,
    evaluation_batch_predict_max_replica_count: int = _EVALUATION_BATCH_PREDICT_MAX_REPLICA_COUNT,
    evaluation_dataflow_machine_type: str = _EVALUATION_DATAFLOW_MACHINE_TYPE,
    evaluation_dataflow_starting_num_workers: int = _EVALUATION_DATAFLOW_STARTING_NUM_WORKERS,
    evaluation_dataflow_max_num_workers: int = _EVALUATION_DATAFLOW_MAX_NUM_WORKERS,
    evaluation_dataflow_disk_size_gb: int = _EVALUATION_DATAFLOW_DISK_SIZE_GB,
    dataflow_service_account: str = '',
    dataflow_subnetwork: str = '',
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: str = '',
) -> Tuple[str, Dict[str, Any]]:
  # fmt: off
  """Get the Wide & Deep algorithm HyperparameterTuningJob pipeline.

  Args:
    project: The GCP project that runs the pipeline components.
    location: The GCP region that runs the pipeline components.
    root_dir: The root GCS directory for the pipeline components.
    target_column: The target column name.
    prediction_type: The type of prediction the model is to produce. "classification" or "regression".
    study_spec_metric_id: Metric to optimize, possible values: [ 'loss', 'average_loss', 'rmse', 'mae', 'mql', 'accuracy', 'auc', 'precision', 'recall'].
    study_spec_metric_goal: Optimization goal of the metric, possible values: "MAXIMIZE", "MINIMIZE".
    study_spec_parameters_override: List of dictionaries representing parameters to optimize. The dictionary key is the parameter_id, which is passed to training job as a command line argument, and the dictionary value is the parameter specification of the metric.
    max_trial_count: The desired total number of trials.
    parallel_trial_count: The desired number of trials to run in parallel.
    transform_config: Deprecated. Path to v1 TF transformation configuration. When set, it is used as `tf_transformations_path`; it cannot be combined with `tf_transformations_path`.
    dataset_level_custom_transformation_definitions: Dataset-level custom transformation definitions. An empty list is used when not provided.
    dataset_level_transformations: Dataset-level transformation configuration. An empty list is used when not provided.
    run_feature_selection: Whether to enable feature selection.
    feature_selection_algorithm: Feature selection algorithm.
    materialized_examples_format: The format for the materialized examples. 'tfrecords_gzip' is used when not provided.
    max_selected_features: Maximum number of features to select.
    predefined_split_key: Predefined split key.
    stratified_split_key: Stratified split key.
    training_fraction: Training fraction.
    validation_fraction: Validation fraction.
    test_fraction: Test fraction.
    tf_transform_execution_engine: The execution engine used to execute TF-based transformations. 'dataflow' is used when not provided.
    tf_auto_transform_features: Auto transform features, given either as a list (which is wrapped as `{'auto': <list>}`) or as a dictionary of lists that is passed through unchanged. An empty dictionary is used when not provided.
    tf_custom_transformation_definitions: TF custom transformation definitions. An empty list is used when not provided.
    tf_transformations_path: Path to TF transformation configuration.
    enable_profiler: Enables profiling and saves a trace during evaluation.
    cache_data: Whether to cache data or not. If set to 'auto', caching is determined based on the dataset size.
    seed: Seed to be used for this run.
    eval_steps: Number of steps to run evaluation for. If not specified or negative, it means run evaluation on the whole validation dataset. If set to 0, it means run evaluation for a fixed number of samples.
    eval_frequency_secs: Frequency at which evaluation and checkpointing will take place.
    data_source_csv_filenames: The CSV data source.
    data_source_bigquery_table_path: The BigQuery data source.
    bigquery_staging_full_dataset_id: The BigQuery staging full dataset id for storing intermediate tables.
    weight_column: The weight column name.
    max_failed_trial_count: The number of failed trials that need to be seen before failing the HyperparameterTuningJob. If set to 0, Vertex AI decides how many trials must fail before the whole job fails.
    study_spec_algorithm: The search algorithm specified for the study. One of "ALGORITHM_UNSPECIFIED", "GRID_SEARCH", or "RANDOM_SEARCH".
    study_spec_measurement_selection_type: Which measurement to use if/when the service automatically selects the final measurement from previously reported intermediate measurements. One of "BEST_MEASUREMENT" or "LAST_MEASUREMENT".
    transform_dataflow_machine_type: The dataflow machine type for transform component.
    transform_dataflow_max_num_workers: The max number of Dataflow workers for transform component.
    transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for transform component.
    worker_pool_specs_override: The dictionary for overriding training and evaluation worker pool specs. The dictionary should be of format https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172. An empty list is used when not provided.
    run_evaluation: Whether to run evaluation steps during training.
    evaluation_batch_predict_machine_type: The prediction server machine type for batch predict components during evaluation.
    evaluation_batch_predict_starting_replica_count: The initial number of prediction server for batch predict components during evaluation.
    evaluation_batch_predict_max_replica_count: The max number of prediction server for batch predict components during evaluation.
    evaluation_dataflow_machine_type: The dataflow machine type for evaluation components.
    evaluation_dataflow_starting_num_workers: The initial number of Dataflow workers for evaluation components.
    evaluation_dataflow_max_num_workers: The max number of Dataflow workers for evaluation components.
    evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for evaluation components.
    dataflow_service_account: Custom service account to run dataflow jobs.
    dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used. Example: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Specifies whether Dataflow workers use public IP addresses.
    encryption_spec_key_name: The KMS key name.

  Returns:
    Tuple of pipeline_definition_path and parameter_values. Feature transform, data source and split parameters whose value is None are left out of parameter_values.

  Raises:
    ValueError: If both transform_config and tf_transformations_path are specified.
  """
  # fmt: on
  # A bare list is wrapped under the 'auto' key so that downstream always
  # receives the dictionary form.
  if isinstance(tf_auto_transform_features, list):
    tf_auto_transform_features = {'auto': tf_auto_transform_features}

  if transform_config and tf_transformations_path:
    raise ValueError(
        'Only one of transform_config and tf_transformations_path can '
        'be specified.'
    )
  elif transform_config:
    # stacklevel=2 attributes the warning to the caller's line rather than to
    # this module, so users can locate the deprecated argument.
    warnings.warn(
        'transform_config parameter is deprecated. '
        'Please use the flattened transform config arguments instead.',
        stacklevel=2,
    )
    tf_transformations_path = transform_config

  if not worker_pool_specs_override:
    worker_pool_specs_override = []

  # Training, tuning and evaluation parameters are always forwarded as given.
  parameter_values = {
      'project': project,
      'location': location,
      'root_dir': root_dir,
      'target_column': target_column,
      'prediction_type': prediction_type,
      'study_spec_metric_id': study_spec_metric_id,
      'study_spec_metric_goal': study_spec_metric_goal,
      'study_spec_parameters_override': study_spec_parameters_override,
      'max_trial_count': max_trial_count,
      'parallel_trial_count': parallel_trial_count,
      'enable_profiler': enable_profiler,
      'cache_data': cache_data,
      'seed': seed,
      'eval_steps': eval_steps,
      'eval_frequency_secs': eval_frequency_secs,
      'weight_column': weight_column,
      'max_failed_trial_count': max_failed_trial_count,
      'study_spec_algorithm': study_spec_algorithm,
      'study_spec_measurement_selection_type': (
          study_spec_measurement_selection_type
      ),
      'transform_dataflow_machine_type': transform_dataflow_machine_type,
      'transform_dataflow_max_num_workers': transform_dataflow_max_num_workers,
      'transform_dataflow_disk_size_gb': transform_dataflow_disk_size_gb,
      'worker_pool_specs_override': worker_pool_specs_override,
      'run_evaluation': run_evaluation,
      'evaluation_batch_predict_machine_type': (
          evaluation_batch_predict_machine_type
      ),
      'evaluation_batch_predict_starting_replica_count': (
          evaluation_batch_predict_starting_replica_count
      ),
      'evaluation_batch_predict_max_replica_count': (
          evaluation_batch_predict_max_replica_count
      ),
      'evaluation_dataflow_machine_type': evaluation_dataflow_machine_type,
      'evaluation_dataflow_starting_num_workers': (
          evaluation_dataflow_starting_num_workers
      ),
      'evaluation_dataflow_max_num_workers': (
          evaluation_dataflow_max_num_workers
      ),
      'evaluation_dataflow_disk_size_gb': evaluation_dataflow_disk_size_gb,
      'dataflow_service_account': dataflow_service_account,
      'dataflow_subnetwork': dataflow_subnetwork,
      'dataflow_use_public_ips': dataflow_use_public_ips,
      'encryption_spec_key_name': encryption_spec_key_name,
  }

  # Feature transform parameters: falsy collection/format arguments fall back
  # to explicit defaults; remaining None values are dropped by
  # _update_parameters.
  fte_params = {
      'dataset_level_custom_transformation_definitions': (
          dataset_level_custom_transformation_definitions or []
      ),
      'dataset_level_transformations': dataset_level_transformations or [],
      'run_feature_selection': run_feature_selection,
      'feature_selection_algorithm': feature_selection_algorithm,
      'max_selected_features': max_selected_features,
      'predefined_split_key': predefined_split_key,
      'stratified_split_key': stratified_split_key,
      'training_fraction': training_fraction,
      'validation_fraction': validation_fraction,
      'test_fraction': test_fraction,
      'tf_auto_transform_features': tf_auto_transform_features or {},
      'tf_custom_transformation_definitions': (
          tf_custom_transformation_definitions or []
      ),
      'tf_transformations_path': tf_transformations_path,
      'materialized_examples_format': (
          materialized_examples_format or 'tfrecords_gzip'
      ),
      'tf_transform_execution_engine': (
          tf_transform_execution_engine or 'dataflow'
      ),
  }
  _update_parameters(parameter_values, fte_params)

  data_source_and_split_parameters = {
      'data_source_csv_filenames': data_source_csv_filenames,
      'data_source_bigquery_table_path': data_source_bigquery_table_path,
      'bigquery_staging_full_dataset_id': bigquery_staging_full_dataset_id,
  }
  _update_parameters(parameter_values, data_source_and_split_parameters)

  # The compiled pipeline definition ships next to this module.
  pipeline_definition_path = os.path.join(
      pathlib.Path(__file__).parent.resolve(),
      'wide_and_deep_hyperparameter_tuning_job_pipeline.yaml',
  )
  return pipeline_definition_path, parameter_values
def get_tabnet_trainer_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column: str,
    prediction_type: str,
    learning_rate: float,
    transform_config: Optional[str] = None,
    dataset_level_custom_transformation_definitions: Optional[
        List[Dict[str, Any]]
    ] = None,
    dataset_level_transformations: Optional[List[Dict[str, Any]]] = None,
    run_feature_selection: bool = False,
    feature_selection_algorithm: Optional[str] = None,
    materialized_examples_format: Optional[str] = None,
    max_selected_features: Optional[int] = None,
    predefined_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    tf_transform_execution_engine: Optional[str] = None,
    tf_auto_transform_features: Optional[
        Union[List[str], Dict[str, List[str]]]
    ] = None,
    tf_custom_transformation_definitions: Optional[List[Dict[str, Any]]] = None,
    tf_transformations_path: Optional[str] = None,
    max_steps: int = -1,
    max_train_secs: int = -1,
    large_category_dim: int = 1,
    large_category_thresh: int = 300,
    yeo_johnson_transform: bool = True,
    feature_dim: int = 64,
    feature_dim_ratio: float = 0.5,
    num_decision_steps: int = 6,
    relaxation_factor: float = 1.5,
    decay_every: float = 100,
    decay_rate: float = 0.95,
    gradient_thresh: float = 2000,
    sparsity_loss_weight: float = 0.00001,
    batch_momentum: float = 0.95,
    batch_size_ratio: float = 0.25,
    num_transformer_layers: int = 4,
    num_transformer_layers_ratio: float = 0.25,
    class_weight: float = 1.0,
    loss_function_type: str = 'default',
    alpha_focal_loss: float = 0.25,
    gamma_focal_loss: float = 2.0,
    enable_profiler: bool = False,
    cache_data: str = 'auto',
    seed: int = 1,
    eval_steps: int = 0,
    batch_size: int = 100,
    measurement_selection_type: Optional[str] = None,
    optimization_metric: Optional[str] = None,
    eval_frequency_secs: int = 600,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    bigquery_staging_full_dataset_id: Optional[str] = None,
    weight_column: str = '',
    transform_dataflow_machine_type: str = 'n1-standard-16',
    transform_dataflow_max_num_workers: int = 25,
    transform_dataflow_disk_size_gb: int = 40,
    worker_pool_specs_override: Optional[Dict[str, Any]] = None,
    run_evaluation: bool = True,
    evaluation_batch_predict_machine_type: str = _EVALUATION_BATCH_PREDICT_MACHINE_TYPE,
    evaluation_batch_predict_starting_replica_count: int = _EVALUATION_BATCH_PREDICT_STARTING_REPLICA_COUNT,
    evaluation_batch_predict_max_replica_count: int = _EVALUATION_BATCH_PREDICT_MAX_REPLICA_COUNT,
    evaluation_dataflow_machine_type: str = _EVALUATION_DATAFLOW_MACHINE_TYPE,
    evaluation_dataflow_starting_num_workers: int = _EVALUATION_DATAFLOW_STARTING_NUM_WORKERS,
    evaluation_dataflow_max_num_workers: int = _EVALUATION_DATAFLOW_MAX_NUM_WORKERS,
    evaluation_dataflow_disk_size_gb: int = _EVALUATION_DATAFLOW_DISK_SIZE_GB,
    dataflow_service_account: str = '',
    dataflow_subnetwork: str = '',
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: str = '',
) -> Tuple[str, Dict[str, Any]]:
  # fmt: off
  """Get the TabNet training pipeline.

  Args:
    project: The GCP project that runs the pipeline components.
    location: The GCP region that runs the pipeline components.
    root_dir: The root GCS directory for the pipeline components.
    target_column: The target column name.
    prediction_type: The type of prediction the model is to produce. "classification" or "regression".
    learning_rate: The learning rate used by the linear optimizer.
    transform_config: Deprecated. Path to v1 TF transformation configuration. When set, it is used as `tf_transformations_path`; it cannot be combined with `tf_transformations_path`.
    dataset_level_custom_transformation_definitions: Dataset-level custom transformation definitions. An empty list is used when not provided.
    dataset_level_transformations: Dataset-level transformation configuration. An empty list is used when not provided.
    run_feature_selection: Whether to enable feature selection.
    feature_selection_algorithm: Feature selection algorithm.
    materialized_examples_format: The format for the materialized examples. 'tfrecords_gzip' is used when not provided.
    max_selected_features: Maximum number of features to select.
    predefined_split_key: Predefined split key.
    stratified_split_key: Stratified split key.
    training_fraction: Training fraction.
    validation_fraction: Validation fraction.
    test_fraction: Test fraction.
    tf_transform_execution_engine: The execution engine used to execute TF-based transformations. 'dataflow' is used when not provided.
    tf_auto_transform_features: Auto transform features, given either as a list (which is wrapped as `{'auto': <list>}`) or as a dictionary of lists that is passed through unchanged. An empty dictionary is used when not provided.
    tf_custom_transformation_definitions: TF custom transformation definitions. An empty list is used when not provided.
    tf_transformations_path: Path to TF transformation configuration.
    max_steps: Number of steps to run the trainer for.
    max_train_secs: Amount of time in seconds to run the trainer for.
    large_category_dim: Embedding dimension for categorical feature with large number of categories.
    large_category_thresh: Threshold for number of categories to apply large_category_dim embedding dimension to.
    yeo_johnson_transform: Enables trainable Yeo-Johnson power transform.
    feature_dim: Dimensionality of the hidden representation in feature transformation block.
    feature_dim_ratio: The ratio of output dimension (dimensionality of the outputs of each decision step) to feature dimension.
    num_decision_steps: Number of sequential decision steps.
    relaxation_factor: Relaxation factor that promotes the reuse of each feature at different decision steps. When it is 1, a feature is enforced to be used only at one decision step and as it increases, more flexibility is provided to use a feature at multiple decision steps.
    decay_every: Number of iterations for periodically applying learning rate decaying.
    decay_rate: Learning rate decaying.
    gradient_thresh: Threshold for the norm of gradients for clipping.
    sparsity_loss_weight: Weight of the loss for sparsity regularization (increasing it will yield more sparse feature selection).
    batch_momentum: Momentum in ghost batch normalization.
    batch_size_ratio: The ratio of virtual batch size (size of the ghost batch normalization) to batch size.
    num_transformer_layers: The number of transformer layers for each decision step.
    num_transformer_layers_ratio: The ratio of shared transformer layer to transformer layers.
    class_weight: The class weight is used to compute a weighted cross entropy, which is helpful when classifying an imbalanced dataset. Only used for classification.
    loss_function_type: Loss function type. Loss function in classification [cross_entropy, weighted_cross_entropy, focal_loss], default is cross_entropy. Loss function in regression: [rmse, mae, mse], default is mse.
    alpha_focal_loss: Alpha value (balancing factor) in focal_loss function. Only used for classification.
    gamma_focal_loss: Gamma value (modulating factor) for focal loss. Only used for classification.
    enable_profiler: Enables profiling and saves a trace during evaluation.
    cache_data: Whether to cache data or not. If set to 'auto', caching is determined based on the dataset size.
    seed: Seed to be used for this run.
    eval_steps: Number of steps to run evaluation for. If not specified or negative, it means run evaluation on the whole validation dataset. If set to 0, it means run evaluation for a fixed number of samples.
    batch_size: Batch size for training.
    measurement_selection_type: Which measurement to use if/when the service automatically selects the final measurement from previously reported intermediate measurements. One of "BEST_MEASUREMENT" or "LAST_MEASUREMENT".
    optimization_metric: Optimization metric used for `measurement_selection_type`. Default is "rmse" for regression and "auc" for classification.
    eval_frequency_secs: Frequency at which evaluation and checkpointing will take place.
    data_source_csv_filenames: The CSV data source.
    data_source_bigquery_table_path: The BigQuery data source.
    bigquery_staging_full_dataset_id: The BigQuery staging full dataset id for storing intermediate tables.
    weight_column: The weight column name.
    transform_dataflow_machine_type: The dataflow machine type for transform component.
    transform_dataflow_max_num_workers: The max number of Dataflow workers for transform component.
    transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for transform component.
    worker_pool_specs_override: The dictionary for overriding training and evaluation worker pool specs. The dictionary should be of format https://github.com/googleapis/googleapis/blob/4e836c7c257e3e20b1de14d470993a2b1f4736a8/google/cloud/aiplatform/v1beta1/custom_job.proto#L172. An empty list is used when not provided.
    run_evaluation: Whether to run evaluation steps during training.
    evaluation_batch_predict_machine_type: The prediction server machine type for batch predict components during evaluation.
    evaluation_batch_predict_starting_replica_count: The initial number of prediction server for batch predict components during evaluation.
    evaluation_batch_predict_max_replica_count: The max number of prediction server for batch predict components during evaluation.
    evaluation_dataflow_machine_type: The dataflow machine type for evaluation components.
    evaluation_dataflow_starting_num_workers: The initial number of Dataflow workers for evaluation components.
    evaluation_dataflow_max_num_workers: The max number of Dataflow workers for evaluation components.
    evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for evaluation components.
    dataflow_service_account: Custom service account to run dataflow jobs.
    dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used. Example: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Specifies whether Dataflow workers use public IP addresses.
    encryption_spec_key_name: The KMS key name.

  Returns:
    Tuple of pipeline_definition_path and parameter_values. Parameters whose value is None are left out of parameter_values.

  Raises:
    ValueError: If both transform_config and tf_transformations_path are specified.
  """
  # fmt: on
  # A bare list is wrapped under the 'auto' key so that downstream always
  # receives the dictionary form.
  if isinstance(tf_auto_transform_features, list):
    tf_auto_transform_features = {'auto': tf_auto_transform_features}

  if transform_config and tf_transformations_path:
    raise ValueError(
        'Only one of transform_config and tf_transformations_path can '
        'be specified.'
    )
  elif transform_config:
    # stacklevel=2 attributes the warning to the caller's line rather than to
    # this module, so users can locate the deprecated argument.
    warnings.warn(
        'transform_config parameter is deprecated. '
        'Please use the flattened transform config arguments instead.',
        stacklevel=2,
    )
    tf_transformations_path = transform_config

  if not worker_pool_specs_override:
    worker_pool_specs_override = []

  parameter_values = {}

  # Every group below goes through _update_parameters, which skips entries
  # whose value is None (e.g. an unset measurement_selection_type).
  training_and_eval_parameters = {
      'project': project,
      'location': location,
      'root_dir': root_dir,
      'target_column': target_column,
      'prediction_type': prediction_type,
      'learning_rate': learning_rate,
      'max_steps': max_steps,
      'max_train_secs': max_train_secs,
      'large_category_dim': large_category_dim,
      'large_category_thresh': large_category_thresh,
      'yeo_johnson_transform': yeo_johnson_transform,
      'feature_dim': feature_dim,
      'feature_dim_ratio': feature_dim_ratio,
      'num_decision_steps': num_decision_steps,
      'relaxation_factor': relaxation_factor,
      'decay_every': decay_every,
      'decay_rate': decay_rate,
      'gradient_thresh': gradient_thresh,
      'sparsity_loss_weight': sparsity_loss_weight,
      'batch_momentum': batch_momentum,
      'batch_size_ratio': batch_size_ratio,
      'num_transformer_layers': num_transformer_layers,
      'num_transformer_layers_ratio': num_transformer_layers_ratio,
      'class_weight': class_weight,
      'loss_function_type': loss_function_type,
      'alpha_focal_loss': alpha_focal_loss,
      'gamma_focal_loss': gamma_focal_loss,
      'enable_profiler': enable_profiler,
      'cache_data': cache_data,
      'seed': seed,
      'eval_steps': eval_steps,
      'batch_size': batch_size,
      'measurement_selection_type': measurement_selection_type,
      'optimization_metric': optimization_metric,
      'eval_frequency_secs': eval_frequency_secs,
      'weight_column': weight_column,
      'transform_dataflow_machine_type': transform_dataflow_machine_type,
      'transform_dataflow_max_num_workers': transform_dataflow_max_num_workers,
      'transform_dataflow_disk_size_gb': transform_dataflow_disk_size_gb,
      'worker_pool_specs_override': worker_pool_specs_override,
      'run_evaluation': run_evaluation,
      'evaluation_batch_predict_machine_type': (
          evaluation_batch_predict_machine_type
      ),
      'evaluation_batch_predict_starting_replica_count': (
          evaluation_batch_predict_starting_replica_count
      ),
      'evaluation_batch_predict_max_replica_count': (
          evaluation_batch_predict_max_replica_count
      ),
      'evaluation_dataflow_machine_type': evaluation_dataflow_machine_type,
      'evaluation_dataflow_starting_num_workers': (
          evaluation_dataflow_starting_num_workers
      ),
      'evaluation_dataflow_max_num_workers': (
          evaluation_dataflow_max_num_workers
      ),
      'evaluation_dataflow_disk_size_gb': evaluation_dataflow_disk_size_gb,
      'dataflow_service_account': dataflow_service_account,
      'dataflow_subnetwork': dataflow_subnetwork,
      'dataflow_use_public_ips': dataflow_use_public_ips,
      'encryption_spec_key_name': encryption_spec_key_name,
  }
  _update_parameters(parameter_values, training_and_eval_parameters)

  # Feature transform parameters: falsy collection/format arguments fall back
  # to explicit defaults before the None filtering is applied.
  fte_params = {
      'dataset_level_custom_transformation_definitions': (
          dataset_level_custom_transformation_definitions or []
      ),
      'dataset_level_transformations': dataset_level_transformations or [],
      'run_feature_selection': run_feature_selection,
      'feature_selection_algorithm': feature_selection_algorithm,
      'max_selected_features': max_selected_features,
      'predefined_split_key': predefined_split_key,
      'stratified_split_key': stratified_split_key,
      'training_fraction': training_fraction,
      'validation_fraction': validation_fraction,
      'test_fraction': test_fraction,
      'tf_auto_transform_features': tf_auto_transform_features or {},
      'tf_custom_transformation_definitions': (
          tf_custom_transformation_definitions or []
      ),
      'tf_transformations_path': tf_transformations_path,
      'materialized_examples_format': (
          materialized_examples_format or 'tfrecords_gzip'
      ),
      'tf_transform_execution_engine': (
          tf_transform_execution_engine or 'dataflow'
      ),
  }
  _update_parameters(parameter_values, fte_params)

  data_source_and_split_parameters = {
      'data_source_csv_filenames': data_source_csv_filenames,
      'data_source_bigquery_table_path': data_source_bigquery_table_path,
      'bigquery_staging_full_dataset_id': bigquery_staging_full_dataset_id,
  }
  _update_parameters(parameter_values, data_source_and_split_parameters)

  # The compiled pipeline definition ships next to this module.
  pipeline_definition_path = os.path.join(
      pathlib.Path(__file__).parent.resolve(), 'tabnet_trainer_pipeline.yaml'
  )
  return pipeline_definition_path, parameter_values
def get_tabnet_study_spec_parameters_override(
    dataset_size_bucket: str, prediction_type: str, training_budget_bucket: str
) -> List[Dict[str, Any]]:
  """Get study_spec_parameters_override for a TabNet hyperparameter tuning job.

  The search space is read from the JSON file
  `configs/tabnet_params_{dataset_size_bucket}_data_{training_budget_bucket}_search_space.json`
  located next to this module. For "regression" the loaded parameters are
  additionally passed through
  `_format_tabnet_regression_study_spec_parameters_override`; for any other
  prediction_type they are returned as loaded.

  Args:
    dataset_size_bucket: Size of the dataset. One of "small" (< 1M rows),
      "medium" (1M - 100M rows), or "large" (> 100M rows).
    prediction_type: The type of prediction the model is to produce.
      "classification" or "regression".
    training_budget_bucket: Bucket of the estimated training budget. One of
      "small" (< $600), "medium" ($600 - $2400), or "large" (> $2400). This
      parameter is only used as a hint for the hyperparameter search space,
      unrelated to the real cost.

  Returns:
    List of study_spec_parameters_override.

  Raises:
    ValueError: If dataset_size_bucket or training_budget_bucket is not one of
      "small", "medium" or "large".
  """
  supported_buckets = ('small', 'medium', 'large')
  if dataset_size_bucket not in supported_buckets:
    raise ValueError(
        'Invalid dataset_size_bucket provided. Supported values '
        'are "small", "medium" or "large".'
    )
  if training_budget_bucket not in supported_buckets:
    raise ValueError(
        'Invalid training_budget_bucket provided. Supported values '
        'are "small", "medium" or "large".'
    )

  param_path = os.path.join(
      pathlib.Path(__file__).parent.resolve(),
      f'configs/tabnet_params_{dataset_size_bucket}_data_{training_budget_bucket}_search_space.json',
  )
  # Read with an explicit encoding so parsing does not depend on the
  # platform's default text encoding.
  with open(param_path, 'r', encoding='utf-8') as f:
    params = json.load(f)

  if prediction_type == 'regression':
    return _format_tabnet_regression_study_spec_parameters_override(
        params, training_budget_bucket
    )
  return params
def _format_tabnet_regression_study_spec_parameters_override(
    params: List[Dict[str, Any]], training_budget_bucket: str
) -> List[Dict[str, Any]]:
  """Get regression study_spec_parameters_override for a TabNet hyperparameter tuning job.

  The input list and its dictionaries are not modified: entries that need to
  change are rebuilt as copies, all other entries are placed in the result
  as-is.

  Args:
    params: List of dictionaries representing parameters to optimize. The
      dictionary key is the parameter_id, which is passed to training job as a
      command line argument, and the dictionary value is the parameter
      specification of the metric.
    training_budget_bucket: Bucket of the estimated training budget. One of
      "small" (< $600), "medium" ($600 - $2400), or "large" (> $2400). This
      parameter is only used as a hint for the hyperparameter search space,
      unrelated to the real cost.

  Returns:
    List of study_spec_parameters_override for regression.
  """
  # To get regression study_spec_parameters, we need to set
  # `loss_function_type` to 'mae' ('mae' and 'mse' for "large" search space),
  # remove the `alpha_focal_loss`, `gamma_focal_loss`
  # and `class_weight` parameters and increase the max for
  # `sparsity_loss_weight` to 100.
  dropped_parameter_ids = ('alpha_focal_loss', 'gamma_focal_loss', 'class_weight')
  if training_budget_bucket == 'large':
    loss_function_types = ['mae', 'mse']
  else:
    loss_function_types = ['mae']

  formatted_params = []
  for param in params:
    parameter_id = param['parameter_id']
    if parameter_id in dropped_parameter_ids:
      continue
    if parameter_id == 'sparsity_loss_weight':
      # Rebuild the entry and its nested spec so the caller's dictionaries are
      # left untouched.
      param = {
          **param,
          'double_value_spec': {**param['double_value_spec'], 'max_value': 100},
      }
    elif parameter_id == 'loss_function_type':
      param = {
          **param,
          'categorical_value_spec': {
              **param['categorical_value_spec'],
              # Fresh list per entry so results never share mutable state.
              'values': list(loss_function_types),
          },
      }
    formatted_params.append(param)
  return formatted_params
def get_wide_and_deep_study_spec_parameters_override() -> List[Dict[str, Any]]:
  """Get study_spec_parameters_override for a Wide & Deep hyperparameter tuning job.

  The parameters are read from `configs/wide_and_deep_params.json`, located
  next to this module.

  Returns:
    List of study_spec_parameters_override.
  """
  param_path = os.path.join(
      pathlib.Path(__file__).parent.resolve(),
      'configs/wide_and_deep_params.json',
  )
  # Read with an explicit encoding so parsing does not depend on the
  # platform's default text encoding.
  with open(param_path, 'r', encoding='utf-8') as f:
    return json.load(f)
def get_xgboost_study_spec_parameters_override() -> List[Dict[str, Any]]:
  """Get study_spec_parameters_override for an XGBoost hyperparameter tuning job.

  The parameters are read from `configs/xgboost_params.json`, located relative
  to the directory that contains this module.

  Returns:
    List of study_spec_parameters_override.

  Raises:
    OSError: If the configuration file cannot be opened.
    json.JSONDecodeError: If the configuration file is not valid JSON.
  """
  param_path = os.path.join(
      pathlib.Path(__file__).parent.resolve(), 'configs/xgboost_params.json'
  )
  # JSON text is UTF-8; decode it explicitly instead of relying on the
  # platform's locale-dependent default encoding.
  with open(param_path, 'r', encoding='utf-8') as f:
    return json.load(f)
def get_xgboost_trainer_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column: str,
    objective: str,
    eval_metric: Optional[str] = None,
    num_boost_round: Optional[int] = None,
    early_stopping_rounds: Optional[int] = None,
    base_score: Optional[float] = None,
    disable_default_eval_metric: Optional[int] = None,
    seed: Optional[int] = None,
    seed_per_iteration: Optional[bool] = None,
    booster: Optional[str] = None,
    eta: Optional[float] = None,
    gamma: Optional[float] = None,
    max_depth: Optional[int] = None,
    min_child_weight: Optional[float] = None,
    max_delta_step: Optional[float] = None,
    subsample: Optional[float] = None,
    colsample_bytree: Optional[float] = None,
    colsample_bylevel: Optional[float] = None,
    colsample_bynode: Optional[float] = None,
    reg_lambda: Optional[float] = None,
    reg_alpha: Optional[float] = None,
    tree_method: Optional[str] = None,
    scale_pos_weight: Optional[float] = None,
    updater: Optional[str] = None,
    refresh_leaf: Optional[int] = None,
    process_type: Optional[str] = None,
    grow_policy: Optional[str] = None,
    sampling_method: Optional[str] = None,
    monotone_constraints: Optional[str] = None,
    interaction_constraints: Optional[str] = None,
    sample_type: Optional[str] = None,
    normalize_type: Optional[str] = None,
    rate_drop: Optional[float] = None,
    one_drop: Optional[int] = None,
    skip_drop: Optional[float] = None,
    num_parallel_tree: Optional[int] = None,
    feature_selector: Optional[str] = None,
    top_k: Optional[int] = None,
    max_cat_to_onehot: Optional[int] = None,
    max_leaves: Optional[int] = None,
    max_bin: Optional[int] = None,
    tweedie_variance_power: Optional[float] = None,
    huber_slope: Optional[float] = None,
    dataset_level_custom_transformation_definitions: Optional[
        List[Dict[str, Any]]
    ] = None,
    dataset_level_transformations: Optional[List[Dict[str, Any]]] = None,
    run_feature_selection: Optional[bool] = None,
    feature_selection_algorithm: Optional[str] = None,
    max_selected_features: Optional[int] = None,
    predefined_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    tf_auto_transform_features: Optional[
        Union[List[str], Dict[str, List[str]]]
    ] = None,
    tf_custom_transformation_definitions: Optional[List[Dict[str, Any]]] = None,
    tf_transformations_path: Optional[str] = None,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    bigquery_staging_full_dataset_id: Optional[str] = None,
    weight_column: Optional[str] = None,
    training_machine_type: Optional[str] = None,
    training_total_replica_count: Optional[int] = None,
    training_accelerator_type: Optional[str] = None,
    training_accelerator_count: Optional[int] = None,
    transform_dataflow_machine_type: Optional[str] = None,
    transform_dataflow_max_num_workers: Optional[int] = None,
    transform_dataflow_disk_size_gb: Optional[int] = None,
    run_evaluation: Optional[bool] = None,
    evaluation_batch_predict_machine_type: Optional[str] = None,
    evaluation_batch_predict_starting_replica_count: Optional[int] = None,
    evaluation_batch_predict_max_replica_count: Optional[int] = None,
    evaluation_dataflow_machine_type: Optional[str] = None,
    evaluation_dataflow_starting_num_workers: Optional[int] = None,
    evaluation_dataflow_max_num_workers: Optional[int] = None,
    evaluation_dataflow_disk_size_gb: Optional[int] = None,
    dataflow_service_account: Optional[str] = None,
    dataflow_subnetwork: Optional[str] = None,
    dataflow_use_public_ips: Optional[bool] = None,
    encryption_spec_key_name: Optional[str] = None,
) -> Tuple[str, Dict[str, Any]]:
  # fmt: off
  """Get the XGBoost training pipeline.

  Arguments left as None are omitted from the returned parameter values. The
  list/dict-valued feature transformation arguments
  (dataset_level_custom_transformation_definitions,
  dataset_level_transformations, tf_auto_transform_features and
  tf_custom_transformation_definitions) are the exception: when unset or empty
  they are always included as an empty list / dict.

  Args:
    project: The GCP project that runs the pipeline components.
    location: The GCP region that runs the pipeline components.
    root_dir: The root GCS directory for the pipeline components.
    target_column: The target column name.
    objective: Specifies the learning task and the learning objective. Must be one of [reg:squarederror, reg:squaredlogerror, reg:logistic, reg:gamma, reg:tweedie, reg:pseudohubererror, binary:logistic, multi:softprob].
    eval_metric: Evaluation metrics for validation data represented as a comma-separated string.
    num_boost_round: Number of boosting iterations.
    early_stopping_rounds: Activates early stopping. Validation error needs to decrease at least every early_stopping_rounds round(s) to continue training.
    base_score: The initial prediction score of all instances, global bias.
    disable_default_eval_metric: Flag to disable default metric. Set to >0 to disable. Default to 0.
    seed: Random seed.
    seed_per_iteration: Seed PRNG deterministically via iterator number.
    booster: Which booster to use, can be gbtree, gblinear or dart. gbtree and dart use tree based model while gblinear uses linear function.
    eta: Learning rate.
    gamma: Minimum loss reduction required to make a further partition on a leaf node of the tree.
    max_depth: Maximum depth of a tree.
    min_child_weight: Minimum sum of instance weight (hessian) needed in a child.
    max_delta_step: Maximum delta step we allow each tree's weight estimation to be.
    subsample: Subsample ratio of the training instance.
    colsample_bytree: Subsample ratio of columns when constructing each tree.
    colsample_bylevel: Subsample ratio of columns for each split, in each level.
    colsample_bynode: Subsample ratio of columns for each node (split).
    reg_lambda: L2 regularization term on weights.
    reg_alpha: L1 regularization term on weights.
    tree_method: The tree construction algorithm used in XGBoost. Choices: ["auto", "exact", "approx", "hist", "gpu_exact", "gpu_hist"].
    scale_pos_weight: Control the balance of positive and negative weights.
    updater: A comma separated string defining the sequence of tree updaters to run.
    refresh_leaf: Refresh updater plugin. Update tree leaf and nodes' stats if True. When it is False, only node stats are updated.
    process_type: A type of boosting process to run. Choices: ["default", "update"].
    grow_policy: Controls a way new nodes are added to the tree. Only supported if tree_method is hist. Choices: ["depthwise", "lossguide"].
    sampling_method: The method to use to sample the training instances.
    monotone_constraints: Constraint of variable monotonicity.
    interaction_constraints: Constraints for interaction representing permitted interactions.
    sample_type: [dart booster only] Type of sampling algorithm. Choices: ["uniform", "weighted"].
    normalize_type: [dart booster only] Type of normalization algorithm. Choices: ["tree", "forest"].
    rate_drop: [dart booster only] Dropout rate.
    one_drop: [dart booster only] When this flag is enabled, at least one tree is always dropped during the dropout (allows Binomial-plus-one or epsilon-dropout from the original DART paper).
    skip_drop: [dart booster only] Probability of skipping the dropout procedure during a boosting iteration.
    num_parallel_tree: Number of parallel trees constructed during each iteration. This option is used to support boosted random forest.
    feature_selector: [linear booster only] Feature selection and ordering method.
    top_k: The number of top features to select in greedy and thrifty feature selector. The value of 0 means using all the features.
    max_cat_to_onehot: A threshold for deciding whether XGBoost should use one-hot encoding based split for categorical data.
    max_leaves: Maximum number of nodes to be added.
    max_bin: Maximum number of discrete bins to bucket continuous features.
    tweedie_variance_power: Parameter that controls the variance of the Tweedie distribution.
    huber_slope: A parameter used for Pseudo-Huber loss to define the delta term.
    dataset_level_custom_transformation_definitions: Dataset-level custom transformation definitions, as a list of dictionaries. An empty list is used when unset.
    dataset_level_transformations: Dataset-level transformation configuration, as a list of dictionaries. An empty list is used when unset.
    run_feature_selection: Whether to enable feature selection.
    feature_selection_algorithm: Feature selection algorithm.
    max_selected_features: Maximum number of features to select.
    predefined_split_key: Predefined split key.
    stratified_split_key: Stratified split key.
    training_fraction: Training fraction.
    validation_fraction: Validation fraction.
    test_fraction: Test fraction.
    tf_auto_transform_features: Features to auto transform, either as a list of feature names or as a dictionary of lists. A list is wrapped as {'auto': <list>}; an empty dictionary is used when unset.
    tf_custom_transformation_definitions: TF custom transformation definitions, as a list of dictionaries. An empty list is used when unset.
    tf_transformations_path: Path to TF transformation configuration.
    data_source_csv_filenames: The CSV data source.
    data_source_bigquery_table_path: The BigQuery data source.
    bigquery_staging_full_dataset_id: The BigQuery staging full dataset id for storing intermediate tables.
    weight_column: The weight column name.
    training_machine_type: Machine type.
    training_total_replica_count: Number of workers.
    training_accelerator_type: Accelerator type.
    training_accelerator_count: Accelerator count.
    transform_dataflow_machine_type: The dataflow machine type for transform component.
    transform_dataflow_max_num_workers: The max number of Dataflow workers for transform component.
    transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for transform component.
    run_evaluation: Whether to run evaluation steps during training.
    evaluation_batch_predict_machine_type: The prediction server machine type for batch predict components during evaluation.
    evaluation_batch_predict_starting_replica_count: The initial number of prediction server for batch predict components during evaluation.
    evaluation_batch_predict_max_replica_count: The max number of prediction server for batch predict components during evaluation.
    evaluation_dataflow_machine_type: The dataflow machine type for evaluation components.
    evaluation_dataflow_starting_num_workers: The initial number of Dataflow workers for evaluation components.
    evaluation_dataflow_max_num_workers: The max number of Dataflow workers for evaluation components.
    evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for evaluation components.
    dataflow_service_account: Custom service account to run dataflow jobs.
    dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used. Example: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Specifies whether Dataflow workers use public IP addresses.
    encryption_spec_key_name: The KMS key name.

  Returns:
    Tuple of pipeline_definition_path and parameter_values.
  """
  # fmt: on
  parameter_values = {}

  # A plain list of feature names is wrapped under the 'auto' key so the
  # pipeline always receives a dictionary.
  if isinstance(tf_auto_transform_features, list):
    tf_auto_transform_features = {'auto': tf_auto_transform_features}

  # Training and evaluation settings; None values are dropped by
  # _update_parameters so the pipeline's own defaults apply.
  training_and_eval_parameters = {
      'project': project,
      'location': location,
      'root_dir': root_dir,
      'target_column': target_column,
      'objective': objective,
      'eval_metric': eval_metric,
      'num_boost_round': num_boost_round,
      'early_stopping_rounds': early_stopping_rounds,
      'base_score': base_score,
      'disable_default_eval_metric': disable_default_eval_metric,
      'seed': seed,
      'seed_per_iteration': seed_per_iteration,
      'booster': booster,
      'eta': eta,
      'gamma': gamma,
      'max_depth': max_depth,
      'min_child_weight': min_child_weight,
      'max_delta_step': max_delta_step,
      'subsample': subsample,
      'colsample_bytree': colsample_bytree,
      'colsample_bylevel': colsample_bylevel,
      'colsample_bynode': colsample_bynode,
      'reg_lambda': reg_lambda,
      'reg_alpha': reg_alpha,
      'tree_method': tree_method,
      'scale_pos_weight': scale_pos_weight,
      'updater': updater,
      'refresh_leaf': refresh_leaf,
      'process_type': process_type,
      'grow_policy': grow_policy,
      'sampling_method': sampling_method,
      'monotone_constraints': monotone_constraints,
      'interaction_constraints': interaction_constraints,
      'sample_type': sample_type,
      'normalize_type': normalize_type,
      'rate_drop': rate_drop,
      'one_drop': one_drop,
      'skip_drop': skip_drop,
      'num_parallel_tree': num_parallel_tree,
      'feature_selector': feature_selector,
      'top_k': top_k,
      'max_cat_to_onehot': max_cat_to_onehot,
      'max_leaves': max_leaves,
      'max_bin': max_bin,
      'tweedie_variance_power': tweedie_variance_power,
      'huber_slope': huber_slope,
      'weight_column': weight_column,
      'training_machine_type': training_machine_type,
      'training_total_replica_count': training_total_replica_count,
      'training_accelerator_type': training_accelerator_type,
      'training_accelerator_count': training_accelerator_count,
      'transform_dataflow_machine_type': transform_dataflow_machine_type,
      'transform_dataflow_max_num_workers': transform_dataflow_max_num_workers,
      'transform_dataflow_disk_size_gb': transform_dataflow_disk_size_gb,
      'run_evaluation': run_evaluation,
      'evaluation_batch_predict_machine_type': (
          evaluation_batch_predict_machine_type
      ),
      'evaluation_batch_predict_starting_replica_count': (
          evaluation_batch_predict_starting_replica_count
      ),
      'evaluation_batch_predict_max_replica_count': (
          evaluation_batch_predict_max_replica_count
      ),
      'evaluation_dataflow_machine_type': evaluation_dataflow_machine_type,
      'evaluation_dataflow_starting_num_workers': (
          evaluation_dataflow_starting_num_workers
      ),
      'evaluation_dataflow_max_num_workers': (
          evaluation_dataflow_max_num_workers
      ),
      'evaluation_dataflow_disk_size_gb': evaluation_dataflow_disk_size_gb,
      'dataflow_service_account': dataflow_service_account,
      'dataflow_subnetwork': dataflow_subnetwork,
      'dataflow_use_public_ips': dataflow_use_public_ips,
      'encryption_spec_key_name': encryption_spec_key_name,
  }
  _update_parameters(parameter_values, training_and_eval_parameters)

  # Feature transformation settings. The collection-valued entries fall back
  # to an empty container, so they are always present in the result.
  fte_params = {
      'dataset_level_custom_transformation_definitions': (
          dataset_level_custom_transformation_definitions
          if dataset_level_custom_transformation_definitions
          else []
      ),
      'dataset_level_transformations': (
          dataset_level_transformations if dataset_level_transformations else []
      ),
      'run_feature_selection': run_feature_selection,
      'feature_selection_algorithm': feature_selection_algorithm,
      'max_selected_features': max_selected_features,
      'predefined_split_key': predefined_split_key,
      'stratified_split_key': stratified_split_key,
      'training_fraction': training_fraction,
      'validation_fraction': validation_fraction,
      'test_fraction': test_fraction,
      'tf_auto_transform_features': (
          tf_auto_transform_features if tf_auto_transform_features else {}
      ),
      'tf_custom_transformation_definitions': (
          tf_custom_transformation_definitions
          if tf_custom_transformation_definitions
          else []
      ),
      'tf_transformations_path': tf_transformations_path,
  }
  _update_parameters(parameter_values, fte_params)

  data_source_and_split_parameters = {
      'data_source_csv_filenames': data_source_csv_filenames,
      'data_source_bigquery_table_path': data_source_bigquery_table_path,
      'bigquery_staging_full_dataset_id': bigquery_staging_full_dataset_id,
  }
  _update_parameters(parameter_values, data_source_and_split_parameters)

  # The compiled pipeline definition sits next to this module.
  pipeline_definition_path = os.path.join(
      pathlib.Path(__file__).parent.resolve(), 'xgboost_trainer_pipeline.yaml'
  )
  return pipeline_definition_path, parameter_values
def get_xgboost_hyperparameter_tuning_job_pipeline_and_parameters(
    project: str,
    location: str,
    root_dir: str,
    target_column: str,
    objective: str,
    study_spec_metric_id: str,
    study_spec_metric_goal: str,
    max_trial_count: int,
    parallel_trial_count: int,
    study_spec_parameters_override: Optional[List[Dict[str, Any]]] = None,
    eval_metric: Optional[str] = None,
    disable_default_eval_metric: Optional[int] = None,
    seed: Optional[int] = None,
    seed_per_iteration: Optional[bool] = None,
    dataset_level_custom_transformation_definitions: Optional[
        List[Dict[str, Any]]
    ] = None,
    dataset_level_transformations: Optional[List[Dict[str, Any]]] = None,
    run_feature_selection: Optional[bool] = None,
    feature_selection_algorithm: Optional[str] = None,
    max_selected_features: Optional[int] = None,
    predefined_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    tf_auto_transform_features: Optional[
        Union[List[str], Dict[str, List[str]]]
    ] = None,
    tf_custom_transformation_definitions: Optional[List[Dict[str, Any]]] = None,
    tf_transformations_path: Optional[str] = None,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    bigquery_staging_full_dataset_id: Optional[str] = None,
    weight_column: Optional[str] = None,
    max_failed_trial_count: Optional[int] = None,
    training_machine_type: Optional[str] = None,
    training_total_replica_count: Optional[int] = None,
    training_accelerator_type: Optional[str] = None,
    training_accelerator_count: Optional[int] = None,
    study_spec_algorithm: Optional[str] = None,
    study_spec_measurement_selection_type: Optional[str] = None,
    transform_dataflow_machine_type: Optional[str] = None,
    transform_dataflow_max_num_workers: Optional[int] = None,
    transform_dataflow_disk_size_gb: Optional[int] = None,
    run_evaluation: Optional[bool] = None,
    evaluation_batch_predict_machine_type: Optional[str] = None,
    evaluation_batch_predict_starting_replica_count: Optional[int] = None,
    evaluation_batch_predict_max_replica_count: Optional[int] = None,
    evaluation_dataflow_machine_type: Optional[str] = None,
    evaluation_dataflow_starting_num_workers: Optional[int] = None,
    evaluation_dataflow_max_num_workers: Optional[int] = None,
    evaluation_dataflow_disk_size_gb: Optional[int] = None,
    dataflow_service_account: Optional[str] = None,
    dataflow_subnetwork: Optional[str] = None,
    dataflow_use_public_ips: Optional[bool] = None,
    encryption_spec_key_name: Optional[str] = None,
) -> Tuple[str, Dict[str, Any]]:
  # fmt: off
  """Get the XGBoost HyperparameterTuningJob pipeline.

  Arguments left as None are omitted from the returned parameter values. The
  list/dict-valued arguments (study_spec_parameters_override,
  dataset_level_custom_transformation_definitions,
  dataset_level_transformations, tf_auto_transform_features and
  tf_custom_transformation_definitions) are the exception: when unset or empty
  they are always included as an empty list / dict.

  Args:
    project: The GCP project that runs the pipeline components.
    location: The GCP region that runs the pipeline components.
    root_dir: The root GCS directory for the pipeline components.
    target_column: The target column name.
    objective: Specifies the learning task and the learning objective. Must be one of [reg:squarederror, reg:squaredlogerror, reg:logistic, reg:gamma, reg:tweedie, reg:pseudohubererror, binary:logistic, multi:softprob].
    study_spec_metric_id: Metric to optimize. For options, please look under 'eval_metric' at https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters.
    study_spec_metric_goal: Optimization goal of the metric, possible values: "MAXIMIZE", "MINIMIZE".
    max_trial_count: The desired total number of trials.
    parallel_trial_count: The desired number of trials to run in parallel.
    study_spec_parameters_override: List of dictionaries representing parameters to optimize. The dictionary key is the parameter_id, which is passed to training job as a command line argument, and the dictionary value is the parameter specification of the metric. An empty list is used when unset.
    eval_metric: Evaluation metrics for validation data represented as a comma-separated string.
    disable_default_eval_metric: Flag to disable default metric. Set to >0 to disable. Default to 0.
    seed: Random seed.
    seed_per_iteration: Seed PRNG deterministically via iterator number.
    dataset_level_custom_transformation_definitions: Dataset-level custom transformation definitions, as a list of dictionaries. An empty list is used when unset.
    dataset_level_transformations: Dataset-level transformation configuration, as a list of dictionaries. An empty list is used when unset.
    run_feature_selection: Whether to enable feature selection.
    feature_selection_algorithm: Feature selection algorithm.
    max_selected_features: Maximum number of features to select.
    predefined_split_key: Predefined split key.
    stratified_split_key: Stratified split key.
    training_fraction: Training fraction.
    validation_fraction: Validation fraction.
    test_fraction: Test fraction.
    tf_auto_transform_features: Features to auto transform, either as a list of feature names or as a dictionary of lists. A list is wrapped as {'auto': <list>}; an empty dictionary is used when unset.
    tf_custom_transformation_definitions: TF custom transformation definitions, as a list of dictionaries. An empty list is used when unset.
    tf_transformations_path: Path to TF transformation configuration.
    data_source_csv_filenames: The CSV data source.
    data_source_bigquery_table_path: The BigQuery data source.
    bigquery_staging_full_dataset_id: The BigQuery staging full dataset id for storing intermediate tables.
    weight_column: The weight column name.
    max_failed_trial_count: The number of failed trials that need to be seen before failing the HyperparameterTuningJob. If set to 0, Vertex AI decides how many trials must fail before the whole job fails.
    training_machine_type: Machine type.
    training_total_replica_count: Number of workers.
    training_accelerator_type: Accelerator type.
    training_accelerator_count: Accelerator count.
    study_spec_algorithm: The search algorithm specified for the study. One of 'ALGORITHM_UNSPECIFIED', 'GRID_SEARCH', or 'RANDOM_SEARCH'.
    study_spec_measurement_selection_type: Which measurement to use if/when the service automatically selects the final measurement from previously reported intermediate measurements. One of "BEST_MEASUREMENT" or "LAST_MEASUREMENT".
    transform_dataflow_machine_type: The dataflow machine type for transform component.
    transform_dataflow_max_num_workers: The max number of Dataflow workers for transform component.
    transform_dataflow_disk_size_gb: Dataflow worker's disk size in GB for transform component.
    run_evaluation: Whether to run evaluation steps during training.
    evaluation_batch_predict_machine_type: The prediction server machine type for batch predict components during evaluation.
    evaluation_batch_predict_starting_replica_count: The initial number of prediction server for batch predict components during evaluation.
    evaluation_batch_predict_max_replica_count: The max number of prediction server for batch predict components during evaluation.
    evaluation_dataflow_machine_type: The dataflow machine type for evaluation components.
    evaluation_dataflow_starting_num_workers: The initial number of Dataflow workers for evaluation components.
    evaluation_dataflow_max_num_workers: The max number of Dataflow workers for evaluation components.
    evaluation_dataflow_disk_size_gb: Dataflow worker's disk size in GB for evaluation components.
    dataflow_service_account: Custom service account to run dataflow jobs.
    dataflow_subnetwork: Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used. Example: https://cloud.google.com/dataflow/docs/guides/specifying-networks#example_network_and_subnetwork_specifications
    dataflow_use_public_ips: Specifies whether Dataflow workers use public IP addresses.
    encryption_spec_key_name: The KMS key name.

  Returns:
    Tuple of pipeline_definition_path and parameter_values.
  """
  # fmt: on
  parameter_values = {}

  # A plain list of feature names is wrapped under the 'auto' key so the
  # pipeline always receives a dictionary.
  if isinstance(tf_auto_transform_features, list):
    tf_auto_transform_features = {'auto': tf_auto_transform_features}

  # Tuning, training and evaluation settings; None values are dropped by
  # _update_parameters so the pipeline's own defaults apply.
  training_and_eval_parameters = {
      'project': project,
      'location': location,
      'root_dir': root_dir,
      'target_column': target_column,
      'objective': objective,
      'eval_metric': eval_metric,
      'study_spec_metric_id': study_spec_metric_id,
      'study_spec_metric_goal': study_spec_metric_goal,
      'max_trial_count': max_trial_count,
      'parallel_trial_count': parallel_trial_count,
      'study_spec_parameters_override': (
          study_spec_parameters_override
          if study_spec_parameters_override
          else []
      ),
      'disable_default_eval_metric': disable_default_eval_metric,
      'seed': seed,
      'seed_per_iteration': seed_per_iteration,
      'weight_column': weight_column,
      'max_failed_trial_count': max_failed_trial_count,
      'training_machine_type': training_machine_type,
      'training_total_replica_count': training_total_replica_count,
      'training_accelerator_type': training_accelerator_type,
      'training_accelerator_count': training_accelerator_count,
      'study_spec_algorithm': study_spec_algorithm,
      'study_spec_measurement_selection_type': (
          study_spec_measurement_selection_type
      ),
      'transform_dataflow_machine_type': transform_dataflow_machine_type,
      'transform_dataflow_max_num_workers': transform_dataflow_max_num_workers,
      'transform_dataflow_disk_size_gb': transform_dataflow_disk_size_gb,
      'run_evaluation': run_evaluation,
      'evaluation_batch_predict_machine_type': (
          evaluation_batch_predict_machine_type
      ),
      'evaluation_batch_predict_starting_replica_count': (
          evaluation_batch_predict_starting_replica_count
      ),
      'evaluation_batch_predict_max_replica_count': (
          evaluation_batch_predict_max_replica_count
      ),
      'evaluation_dataflow_machine_type': evaluation_dataflow_machine_type,
      'evaluation_dataflow_starting_num_workers': (
          evaluation_dataflow_starting_num_workers
      ),
      'evaluation_dataflow_max_num_workers': (
          evaluation_dataflow_max_num_workers
      ),
      'evaluation_dataflow_disk_size_gb': evaluation_dataflow_disk_size_gb,
      'dataflow_service_account': dataflow_service_account,
      'dataflow_subnetwork': dataflow_subnetwork,
      'dataflow_use_public_ips': dataflow_use_public_ips,
      'encryption_spec_key_name': encryption_spec_key_name,
  }
  _update_parameters(parameter_values, training_and_eval_parameters)

  # Feature transformation settings. The collection-valued entries fall back
  # to an empty container, so they are always present in the result.
  fte_params = {
      'dataset_level_custom_transformation_definitions': (
          dataset_level_custom_transformation_definitions
          if dataset_level_custom_transformation_definitions
          else []
      ),
      'dataset_level_transformations': (
          dataset_level_transformations if dataset_level_transformations else []
      ),
      'run_feature_selection': run_feature_selection,
      'feature_selection_algorithm': feature_selection_algorithm,
      'max_selected_features': max_selected_features,
      'predefined_split_key': predefined_split_key,
      'stratified_split_key': stratified_split_key,
      'training_fraction': training_fraction,
      'validation_fraction': validation_fraction,
      'test_fraction': test_fraction,
      'tf_auto_transform_features': (
          tf_auto_transform_features if tf_auto_transform_features else {}
      ),
      'tf_custom_transformation_definitions': (
          tf_custom_transformation_definitions
          if tf_custom_transformation_definitions
          else []
      ),
      'tf_transformations_path': tf_transformations_path,
  }
  _update_parameters(parameter_values, fte_params)

  data_source_and_split_parameters = {
      'data_source_csv_filenames': data_source_csv_filenames,
      'data_source_bigquery_table_path': data_source_bigquery_table_path,
      'bigquery_staging_full_dataset_id': bigquery_staging_full_dataset_id,
  }
  _update_parameters(parameter_values, data_source_and_split_parameters)

  # The compiled pipeline definition sits next to this module.
  pipeline_definition_path = os.path.join(
      pathlib.Path(__file__).parent.resolve(),
      'xgboost_hyperparameter_tuning_job_pipeline.yaml',
  )
  return pipeline_definition_path, parameter_values
def get_feature_selection_pipeline_and_parameters(
    root_dir: str,
    project: str,
    location: str,
    target_column: str,
    prediction_type: str,
    optimization_objective: str,
    dataset_level_custom_transformation_definitions: Optional[
        List[Dict[str, Any]]
    ] = None,
    dataset_level_transformations: Optional[List[Dict[str, Any]]] = None,
    run_feature_selection: Optional[bool] = None,
    feature_selection_algorithm: Optional[str] = None,
    feature_selection_execution_engine: Optional[
        str
    ] = _FEATURE_SELECTION_EXECUTION_ENGINE_BIGQUERY,
    max_selected_features: Optional[int] = None,
    predefined_split_key: Optional[str] = None,
    stratified_split_key: Optional[str] = None,
    training_fraction: Optional[float] = None,
    validation_fraction: Optional[float] = None,
    test_fraction: Optional[float] = None,
    tf_auto_transform_features: Optional[
        Union[List[str], Dict[str, List[str]]]
    ] = None,
    weight_column: Optional[str] = None,
    data_source_csv_filenames: Optional[str] = None,
    data_source_bigquery_table_path: Optional[str] = None,
    bigquery_staging_full_dataset_id: Optional[str] = None,
    dataflow_machine_type: Optional[str] = None,
    dataflow_max_num_workers: Optional[int] = None,
    dataflow_disk_size_gb: Optional[int] = None,
    dataflow_subnetwork: Optional[str] = None,
    dataflow_use_public_ips: Optional[bool] = None,
    encryption_spec_key_name: Optional[str] = None,
    stage_1_deadline_hours: Optional[float] = None,
    stage_2_deadline_hours: Optional[float] = None,
):
  """Get the feature selection pipeline and its parameter values.

  Every argument is forwarded to the pipeline under its own name, with these
  adjustments:
    * Arguments whose value is None are left out of the parameter values.
    * dataset_level_custom_transformation_definitions and
      dataset_level_transformations are replaced by an empty list when unset
      or empty, so they are always present.
    * tf_auto_transform_features given as a list is wrapped as
      {'auto': <list>}.

  Returns:
    Tuple of pipeline_definition_path (the 'feature_selection_pipeline.yaml'
    file located next to this module) and parameter_values.
  """
  module_dir = pathlib.Path(__file__).parent.resolve()
  pipeline_definition_path = os.path.join(
      module_dir, 'feature_selection_pipeline.yaml'
  )

  # Normalize a bare list of feature names into the dictionary form.
  if isinstance(tf_auto_transform_features, list):
    tf_auto_transform_features = {'auto': tf_auto_transform_features}

  # Transformation lists default to empty so they survive the None filter.
  if not dataset_level_custom_transformation_definitions:
    dataset_level_custom_transformation_definitions = []
  if not dataset_level_transformations:
    dataset_level_transformations = []

  candidate_values = {
      'root_dir': root_dir,
      'project': project,
      'location': location,
      'target_column': target_column,
      'weight_column': weight_column,
      'prediction_type': prediction_type,
      'dataset_level_custom_transformation_definitions': (
          dataset_level_custom_transformation_definitions
      ),
      'dataset_level_transformations': dataset_level_transformations,
      'run_feature_selection': run_feature_selection,
      'feature_selection_algorithm': feature_selection_algorithm,
      'feature_selection_execution_engine': feature_selection_execution_engine,
      'max_selected_features': max_selected_features,
      'predefined_split_key': predefined_split_key,
      'stratified_split_key': stratified_split_key,
      'training_fraction': training_fraction,
      'validation_fraction': validation_fraction,
      'test_fraction': test_fraction,
      'tf_auto_transform_features': tf_auto_transform_features,
      'optimization_objective': optimization_objective,
      'data_source_csv_filenames': data_source_csv_filenames,
      'data_source_bigquery_table_path': data_source_bigquery_table_path,
      'bigquery_staging_full_dataset_id': bigquery_staging_full_dataset_id,
      'dataflow_machine_type': dataflow_machine_type,
      'dataflow_max_num_workers': dataflow_max_num_workers,
      'dataflow_disk_size_gb': dataflow_disk_size_gb,
      'dataflow_subnetwork': dataflow_subnetwork,
      'dataflow_use_public_ips': dataflow_use_public_ips,
      'encryption_spec_key_name': encryption_spec_key_name,
      'stage_1_deadline_hours': stage_1_deadline_hours,
      'stage_2_deadline_hours': stage_2_deadline_hours,
  }

  # Keep only the parameters that were actually provided.
  parameter_values = {}
  for name, value in candidate_values.items():
    if value is None:
      continue
    parameter_values[name] = value
  return pipeline_definition_path, parameter_values