Source code for google_cloud_pipeline_components.preview.model_evaluation.model_based_llm_evaluation.autosxs.autosxs_pipeline

# Copyright 2023 The Kubeflow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization AI Inference and AutoSxS pipeline function."""

from typing import Any, Dict, List, NamedTuple

from google_cloud_pipeline_components import _placeholders
from google_cloud_pipeline_components._implementation.llm import batch_prediction_pairwise
from google_cloud_pipeline_components._implementation.llm import model_evaluation_text_generation_pairwise
from google_cloud_pipeline_components._implementation.llm import online_evaluation_pairwise
from kfp import dsl

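# Structured outputs returned by the pipeline (see the Returns section of the
# pipeline docstring below): the ModelEvaluation resource names for Models A
# and B (populated when the models are Model Registry models), the number of
# evaluated examples, and the path to the evaluation dataset with judgments.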
PipelineOutput = NamedTuple(
    'Outputs',
    model_a_evaluation_resource_name=str,
    model_b_evaluation_resource_name=str,
    evaluation_count=int,
    evaluation_dataset_path=str,
)


# pylint: disable=dangerous-default-value,g-bare-generic,unused-argument
@dsl.pipeline(
    name='autosxs-template',
    description='Determines the SxS winrate between two models.',
)
def autosxs_pipeline(
    evaluation_dataset: str,
    task: str,
    id_columns: List[str],
    model_a: str = '',
    model_b: str = '',
    autorater_prompt_parameters: Dict[str, Dict[str, str]] = {},
    model_a_prompt_parameters: Dict[str, Dict[str, str]] = {},
    model_b_prompt_parameters: Dict[str, Dict[str, str]] = {},
    response_column_a: str = '',
    response_column_b: str = '',
    model_a_parameters: Dict[str, str] = {},
    model_b_parameters: Dict[str, str] = {},
    human_preference_column: str = '',
    project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
    location: str = _placeholders.LOCATION_PLACEHOLDER,
    judgments_format: str = 'jsonl',
    bigquery_destination_prefix: str = '',
    experimental_args: Dict[str, Any] = {},
    encryption_spec_key_name: str = '',
) -> PipelineOutput:
  # fmt: off
  """Evaluates two models side-by-side using an arbiter model.

  Args:
    evaluation_dataset: A BigQuery table or comma-separated list of GCS paths
      to a JSONL dataset containing evaluation examples.
    task: Evaluation task in the form `{task}@{version}`. task can be one of
      `[summarization, question_answering]`. Version is an integer with 3
      digits or "latest". Ex: `summarization@001` or
      `question_answering@latest`.
    id_columns: The columns which distinguish unique evaluation examples.
    model_a: A fully-qualified model resource name
      (`projects/{project}/locations/{location}/models/{model}@{version}`) or
      publisher model resource name (`publishers/{publisher}/models/{model}`).
      This parameter is optional if Model A responses are specified.
    model_b: A fully-qualified model resource name
      (`projects/{project}/locations/{location}/models/{model}@{version}`) or
      publisher model resource name (`publishers/{publisher}/models/{model}`).
      This parameter is optional if Model B responses are specified.
    autorater_prompt_parameters: Map of autorater prompt parameters to columns
      or templates. The expected parameters are: `inference_instruction`
      (details on how to perform a task) and `inference_context` (content to
      reference to perform the task). As an example,
      `{'inference_context': {'column': 'my_prompt'}}` uses the evaluation
      dataset's `my_prompt` column for the AutoRater's context.
    model_a_prompt_parameters: Map of Model A prompt template parameters to
      columns or templates. This parameter is optional if Model A predictions
      are predefined. Example - `{'prompt': {'column': 'my_prompt'}}` uses the
      evaluation dataset's `my_prompt` column for the prompt parameter named
      `prompt`.
    model_b_prompt_parameters: Map of Model B prompt template parameters to
      columns or templates. This parameter is optional if Model B predictions
      are predefined. Example - `{'prompt': {'column': 'my_prompt'}}` uses the
      evaluation dataset's `my_prompt` column for the prompt parameter named
      `prompt`.
    response_column_a: Either the name of a column in the evaluation dataset
      containing predefined predictions, or the name of the column in the
      Model A output containing predictions. If no value is provided, the
      pipeline will attempt to infer the correct model output column name.
    response_column_b: Either the name of a column in the evaluation dataset
      containing predefined predictions, or the name of the column in the
      Model B output containing predictions. If no value is provided, the
      pipeline will attempt to infer the correct model output column name.
    model_a_parameters: The parameters that govern the predictions from model
      A, such as temperature or maximum output tokens.
    model_b_parameters: The parameters that govern the predictions from model
      B, such as temperature or maximum output tokens.
    human_preference_column: The column containing ground truth winners for
      each example. Providing this parameter adds additional metrics for
      checking the AutoRater alignment with human preferences.
    project: Project used to run custom jobs. This should be the same project
      used to run the pipeline.
    location: Location used to run custom jobs. This should be the same
      location used to run the pipeline.
    judgments_format: The format to write judgments to. Can be either
      `[json, bigquery]`.
    bigquery_destination_prefix: BigQuery table to write judgments to if the
      specified format is 'bigquery'.
    experimental_args: Experimentally released arguments. Subject to change.
    encryption_spec_key_name: Customer-managed encryption key options. If this
      is set, then all resources created by the pipeline will be encrypted
      with the provided encryption key.

  Returns:
    model_a_evaluation_resource_name: The path to write the ModelEvaluation
      for Model A to if Model A is a ModelRegistry Model.
    model_b_evaluation_resource_name: The path to write the ModelEvaluation
      for Model B to if Model B is a ModelRegistry Model.
    evaluation_count: The count of how many evaluations were included for this
      AutoSxS run.
    evaluation_dataset_path: The path to the overall evaluation dataset
      including judgments.
  """
  # fmt: on
  # Step 1: Generate (or collect predefined) responses from both models for
  # every example in the evaluation dataset.
  responses = batch_prediction_pairwise.batch_prediction_pairwise(
      display_name='autosxs-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}}',
      evaluation_dataset=evaluation_dataset,
      id_columns=id_columns,
      task=task,
      autorater_prompt_parameters=autorater_prompt_parameters,
      response_column_a=response_column_a,
      response_column_b=response_column_b,
      model_a=model_a,
      model_b=model_b,
      model_a_prompt_parameters=model_a_prompt_parameters,
      model_b_prompt_parameters=model_b_prompt_parameters,
      model_a_parameters=model_a_parameters,
      model_b_parameters=model_b_parameters,
      human_preference_column=human_preference_column,
      experimental_args=experimental_args,
      project=project,
      location=location,
      encryption_spec_key_name=encryption_spec_key_name,
  ).set_display_name('AutoSxS Batch Prediction')

  # Step 2: Have the AutoRater judge each pair of responses and emit
  # per-example judgments.
  winners = online_evaluation_pairwise.online_evaluation_pairwise(
      inference_output_uri=responses.outputs[
          'preprocessed_evaluation_dataset_uri'
      ],
      id_columns=id_columns,
      human_preference_column=human_preference_column,
      task=task,
      judgments_format=judgments_format,
      bigquery_destination_prefix=bigquery_destination_prefix,
      experimental_args=experimental_args,
      project=project,
      location=location,
      encryption_spec_key_name=encryption_spec_key_name,
  ).set_display_name('AutoSxS Autorater')

  # Step 3: Aggregate the judgments into pairwise metrics and, for Model
  # Registry models, upload ModelEvaluations.
  metrics = model_evaluation_text_generation_pairwise.model_evaluation_text_generation_pairwise(
      judgments_dir=winners.outputs['judgments_uri'],
      human_preference_column=human_preference_column,
      project=project,
      location=location,
      encryption_spec_key_name=encryption_spec_key_name,
      model_a=model_a,
      model_b=model_b,
      evaluation_dataset=evaluation_dataset,
      evaluation_dataset_metadata=winners.outputs['metadata'],
      task=task,
  ).set_display_name('AutoSxS Metrics')

  return PipelineOutput(
      model_a_evaluation_resource_name=metrics.outputs[
          'model_a_evaluation_path'
      ],
      model_b_evaluation_resource_name=metrics.outputs[
          'model_b_evaluation_path'
      ],
      evaluation_count=metrics.outputs['evaluation_count_path'],  # Needs to be a component output
      evaluation_dataset_path=metrics.outputs['evaluation_dataset_path'],
  )
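The snippet below is a minimal usage sketch, not part of the module above. It assumes the KFP SDK and google-cloud-aiplatform are installed; the project, bucket, dataset path, column names, and model names are hypothetical placeholders to be replaced for a real run.

# --- Example usage (illustrative sketch, not part of this module) ---
from google.cloud import aiplatform
from kfp import compiler

# Compile the pipeline defined above into a local YAML package.
compiler.Compiler().compile(
    pipeline_func=autosxs_pipeline,
    package_path='autosxs_pipeline.yaml',
)

# Hypothetical project and location; replace with your own.
aiplatform.init(project='my-project', location='us-central1')

# Each line of the hypothetical eval_examples.jsonl might look like:
# {"example_id": "1", "my_prompt": "...", "my_instruction": "...", "my_context": "..."}
job = aiplatform.PipelineJob(
    display_name='autosxs-example',
    template_path='autosxs_pipeline.yaml',
    pipeline_root='gs://my-bucket/pipeline_root',  # hypothetical bucket
    parameter_values={
        'evaluation_dataset': 'gs://my-bucket/eval_examples.jsonl',
        'task': 'summarization@latest',
        'id_columns': ['example_id'],
        'model_a': 'publishers/google/models/text-bison@002',  # hypothetical
        'model_b': 'publishers/google/models/text-bison@001',  # hypothetical
        'model_a_prompt_parameters': {'prompt': {'column': 'my_prompt'}},
        'model_b_prompt_parameters': {'prompt': {'column': 'my_prompt'}},
        'autorater_prompt_parameters': {
            'inference_instruction': {'column': 'my_instruction'},
            'inference_context': {'column': 'my_context'},
        },
    },
)
job.run()  # Blocks until the pipeline finishes; use job.submit() to return immediately.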