Add support for NM models #1

Merged · merged 19 commits · May 8, 2024
4 changes: 4 additions & 0 deletions src/alpaca_eval/completion_parsers.py
@@ -87,6 +87,10 @@ def regex_parser(completion: str, outputs_to_match: dict[Any, Any]) -> list[Any]
        responses.append(key)
        # avoid matching the same output twice
        completion = completion[match.end() :]

    if len(responses) > 1:
        responses = [responses[0]]

    return responses


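Note on the effect of the guard above: a judge completion that mentions both outputs now parses to a single choice (the first match) instead of two. A minimal sketch of how this could be exercised, assuming alpaca_eval is installed and that string regex patterns are accepted as the evaluator configs pass them:

from alpaca_eval.completion_parsers import regex_parser

# Patterns mirror the evaluator config added later in this PR.
outputs_to_match = {1: r"(?i)output \(a\)", 2: r"(?i)output \(b\)"}

# An ambiguous judgement that names both labels.
ambiguous = "Output (a) is better than Output (b)."

# Previously both keys could be returned; with the guard only the first match is kept.
print(regex_parser(ambiguous, outputs_to_match))  # expected: [1]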
10 changes: 10 additions & 0 deletions src/alpaca_eval/decoders/__init__.py
@@ -56,6 +56,16 @@ def get_fn_completions(name: Union[str, Callable]) -> Callable:
            raise e

        return huggingface_local_completions

    elif name == "sparseml_local_completions":
        try:
            from .sparseml_local import sparseml_local_completions
        except ImportError as e:
            packages = ["accelerate", "nm-transformers"]
            logging.exception(f"You need {packages} to use sparseml_local_completions. Error:")
            raise e

        return sparseml_local_completions

    elif name == "cohere_completions":
        try:
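For orientation, the new branch is reached whenever a config sets fn_completions: "sparseml_local_completions". A small sketch (assumes sparseml, accelerate and nm-transformers are installed):

from alpaca_eval.decoders import get_fn_completions

# Resolve the new decoder by name, exactly as annotator/model configs do.
completions_fn = get_fn_completions("sparseml_local_completions")
# Missing dependencies surface here as an ImportError with the message logged above.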
175 changes: 175 additions & 0 deletions src/alpaca_eval/decoders/sparseml_local.py
@@ -0,0 +1,175 @@
import logging
from typing import Optional, Sequence

import numpy as np
import torch
import transformers
import os
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoConfig
from sparseml.transformers.utils.sparse_model import SparseAutoModel
import sparseml.core.session as session_manager

from .. import constants, utils

__all__ = ["sparseml_local_completions"]


class ListDataset(Dataset):
    def __init__(self, original_list):
        self.original_list = original_list

    def __len__(self):
        return len(self.original_list)

    def __getitem__(self, i):
        return self.original_list[i]


def sparseml_local_completions(
    prompts: Sequence[str],
    model_name: str,
    do_sample: bool = False,
    batch_size: int = 1,
    model_kwargs=None,
    cache_dir: Optional[str] = constants.DEFAULT_CACHE_DIR,
    remove_ending: Optional[str] = None,
    is_fast_tokenizer: bool = True,
    **kwargs,
) -> dict[str, list]:
"""Decode locally using huggingface transformers pipeline.

Parameters
----------
prompts : list of str
Prompts to get completions for.

model_name : str, optional
Name of the model (repo on hugging face hub) to use for decoding.

do_sample : bool, optional
Whether to use sampling for decoding.

batch_size : int, optional
Batch size to use for decoding. This currently does not work well with to_bettertransformer.

model_kwargs : dict, optional
Additional kwargs to pass to from_pretrained.

cache_dir : str, optional
Directory to use for caching the model.

remove_ending : str, optional
The ending string to be removed from completions. Typically eos_token.

kwargs :
Additional kwargs to pass to `InferenceClient.__call__`.
"""
    model_kwargs = model_kwargs or {}
    if "device_map" not in model_kwargs:
        model_kwargs["device_map"] = "auto"
    if "torch_dtype" in model_kwargs and isinstance(model_kwargs["torch_dtype"], str):
        model_kwargs["torch_dtype"] = getattr(torch, model_kwargs["torch_dtype"])

    n_examples = len(prompts)
    if n_examples == 0:
        logging.info("No samples to annotate.")
        return []
    else:
        logging.info(f"Using `sparseml_local_completions` on {n_examples} prompts using {model_name}.")

    if not torch.cuda.is_available():
        model_kwargs["torch_dtype"] = None

    # faster but slightly less accurate matrix multiplications
    torch.backends.cuda.matmul.allow_tf32 = torch.backends.cudnn.allow_tf32 = True

    # use the sparsification recipe stored next to the checkpoint, if there is one
    recipe_file = os.path.join(model_name, "recipe.yaml")
    if not os.path.exists(recipe_file):
        recipe_file = None
    config = AutoConfig.from_pretrained(model_name)

    model = SparseAutoModel.text_generation_from_pretrained(
        model_name_or_path=model_name,
        config=config,
        recipe=recipe_file,
        trust_remote_code=model_kwargs.get("trust_remote_code", False),
        torch_dtype=model_kwargs.get("torch_dtype"),
    )

    model.eval().to("cuda:0")
    del model_kwargs["device_map"]
    kwargs["device"] = 0

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir=cache_dir,
        padding_side="left",
        use_fast=is_fast_tokenizer,
        **model_kwargs,
    )

    logging.info(f"Model memory: {model.get_memory_footprint() / 1e9} GB")

    if batch_size > 1:
        # sort the prompts by length so that we don't necessarily pad them by too much
        # save also index to reorder the completions
        original_order, prompts = zip(*sorted(enumerate(prompts), key=lambda x: len(x[1])))
        prompts = list(prompts)

    if not tokenizer.pad_token_id:
        # set padding token if not set
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.pad_token = tokenizer.eos_token

    default_kwargs = dict(
        do_sample=do_sample,
        model_kwargs={k: v for k, v in model_kwargs.items() if k != "trust_remote_code"},
        batch_size=batch_size,
    )
    default_kwargs.update(kwargs)
    logging.info(f"Kwargs to completion: {default_kwargs}")
    pipeline = transformers.pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        **default_kwargs,
        trust_remote_code=model_kwargs.get("trust_remote_code", False),
    )

    ## compute and log the time for completions
    prompts_dataset = ListDataset(prompts)
    completions = []

    with utils.Timer() as t:
        for out in tqdm(
            pipeline(
                prompts_dataset,
                return_full_text=False,
                pad_token_id=tokenizer.pad_token_id,
            )
        ):
            generated_text = out[0]["generated_text"]
            if remove_ending is not None and generated_text.endswith(remove_ending):
                generated_text = generated_text[: -len(remove_ending)]
            completions.append(generated_text)

    logging.info(f"Time for {n_examples} completions: {t}")

    if batch_size > 1:
        # reorder the completions to match the original order
        completions, _ = zip(*sorted(list(zip(completions, original_order)), key=lambda x: x[1]))
        completions = list(completions)

    # local => price is really your compute
    price = [np.nan] * len(completions)
    avg_time = [t.duration / n_examples] * len(completions)

    if recipe_file is not None and os.path.exists(recipe_file):
        # reset the SparseML session so recipe state does not leak into later runs
        if session_manager.active_session():
            active_session = session_manager.active_session()
            active_session.reset()
    torch.cuda.empty_cache()

    return dict(completions=completions, price_per_example=price, time_per_example=avg_time)
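A hedged usage sketch of the new decoder; the checkpoint path is a placeholder for any local SparseML/transformers checkpoint (optionally with a recipe.yaml next to it), and a CUDA device is required because the model is moved to cuda:0:

from alpaca_eval.decoders.sparseml_local import sparseml_local_completions

outputs = sparseml_local_completions(
    prompts=["List three uses of sparsity in LLM inference."],
    model_name="/path/to/sparse-llama-checkpoint",  # hypothetical local path
    batch_size=1,
    max_new_tokens=64,  # forwarded to the text-generation pipeline via **kwargs
)
print(outputs["completions"][0])
print(outputs["time_per_example"][0])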
@@ -0,0 +1,11 @@
llama-2-70b-chat-hf:
  prompt_template: "llama-2-70b-chat-hf/prompt.txt"
  fn_completions: "huggingface_local_completions"
  completions_kwargs:
    model_name: "meta-llama/Llama-2-70b-chat-hf"
    max_new_tokens: 50
    temperature: 0
  completion_parser_kwargs:
    outputs_to_match:
      1: '(?i)output \(a\)'
      2: '(?i)output \(b\)'
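For orientation, fn_completions/completions_kwargs control how the Llama-2-70B judge is run and outputs_to_match feeds regex_parser. A rough sketch of the equivalent direct calls (the prompt text is a placeholder; the other values are taken from the config above):

from alpaca_eval.decoders import get_fn_completions
from alpaca_eval.completion_parsers import regex_parser

fn_completions = get_fn_completions("huggingface_local_completions")
out = fn_completions(
    prompts=["<prompt built from llama-2-70b-chat-hf/prompt.txt>"],  # placeholder
    model_name="meta-llama/Llama-2-70b-chat-hf",
    max_new_tokens=50,
)
preference = regex_parser(
    out["completions"][0],
    outputs_to_match={1: r"(?i)output \(a\)", 2: r"(?i)output \(b\)"},
)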
37 changes: 37 additions & 0 deletions src/alpaca_eval/evaluators_configs/llama-2-70b-chat-hf/prompt.txt
@@ -0,0 +1,37 @@
<<SYS>>
You are a helpful instruction-following assistant that prints the best model by selecting the best outputs for a given instruction.
<</SYS>>

[INST]
Select the output (a) or (b) that best matches the given instruction. Choose your preferred output, which can be subjective. Your answer should ONLY contain: Output (a) or Output (b). Here's an example:

# Example:
## Instruction:
Give a description of the following job: "ophthalmologist"

## Output (a):
An ophthalmologist is a medical doctor who specializes in the diagnosis and treatment of eye diseases and conditions.

## Output (b):
An ophthalmologist is a medical doctor who pokes and prods at your eyes while asking you to read letters from a chart.

## Which is best, Output (a) or Output (b)?
Output (a)

Here the answer is Output (a) because it provides a comprehensive and accurate description of the job of an ophthalmologist. In contrast, output (b) is more of
a joke.

# Task:
Now is the real task, do not explain your answer, just say Output (a) or Output (b).

## Instruction:
{instruction}

## Output (a):
{output_1}

## Output (b):
{output_2}

## Which is best, Output (a) or Output (b)?
[/INST]
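A small sketch of how the template's placeholders are filled; alpaca_eval builds prompts with its own utilities, so plain str.format is used here only to show what the judge ultimately sees, with a made-up instruction/output pair:

from pathlib import Path

template = Path("src/alpaca_eval/evaluators_configs/llama-2-70b-chat-hf/prompt.txt").read_text()
prompt = template.format(
    instruction="Name one benefit of weight sparsity in LLM inference.",  # made-up example
    output_1="It shrinks the memory footprint and can speed up decoding.",
    output_2="Sparsity means most of the weights are zero.",
)
print(prompt)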
21 changes: 21 additions & 0 deletions src/alpaca_eval/main.py
@@ -31,6 +31,8 @@ def evaluate(
    sort_by: str = "length_controlled_winrate" if constants.IS_ALPACA_EVAL_2 else "win_rate",
    is_cache_leaderboard: Optional[bool] = None,
    max_instances: Optional[int] = None,
    clearml_project: Optional[str] = None,
    clearml_task: Optional[str] = None,
    annotation_kwargs: Optional[dict[str, Any]] = None,
    Annotator=annotators.PairwiseAnnotator,
    **annotator_kwargs,
@@ -209,6 +211,21 @@ def evaluate(
f"path but {type(precomputed_leaderboard)}."
)

    if clearml_project is not None and clearml_task is not None:
        from clearml import Task
        task = Task.get_task(project_name=clearml_project, task_name=clearml_task)
        if task is None:
            task = Task.init(project_name=clearml_project, task_name=clearml_task)
        else:
            task.started()

        task.upload_artifact(name='alpaca-eval output', artifact_object=df_leaderboard)
        for name in df_leaderboard:
            value = df_leaderboard[name].values[0]
            if not isinstance(value, str):
                task.get_logger().report_single_value(name=name, value=value)
        task.mark_completed()

    if is_return_instead_of_print:
        return df_leaderboard, annotations
    else:
@@ -230,6 +247,8 @@ def evaluate_from_model(
    is_strip_output: bool = True,
    is_load_outputs: bool = True,
    chunksize: int = 64,
    clearml_project: Optional[str] = None,
    clearml_task: Optional[str] = None,
    **kwargs,
):
"""Evaluate a model from HuggingFace or an API provider. This is a wrapper around `evaluate` which includes
@@ -371,6 +390,8 @@ def get_completions(configs, df: pd.DataFrame, old_output_path: Optional[Path] =
        annotators_config=annotators_config,
        output_path=output_path,
        max_instances=max_instances,
        clearml_project=clearml_project,
        clearml_task=clearml_task,
        **kwargs,
    )

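Finally, a hedged sketch of exercising the new ClearML reporting end to end. Project/task names and the model config are placeholders, clearml is assumed to be installed and configured, and the non-ClearML arguments follow the existing evaluate_from_model signature:

from alpaca_eval.main import evaluate_from_model

evaluate_from_model(
    model_configs="my-sparse-model",          # hypothetical model config
    annotators_config="llama-2-70b-chat-hf",  # the judge config added in this PR
    clearml_project="alpaca-eval",            # placeholder ClearML project
    clearml_task="my-sparse-model-eval",      # placeholder ClearML task
)
# The leaderboard row is uploaded as an artifact and its numeric columns are
# reported to the task as single values, as added in main.py above.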