Add support for NM models #1

Merged · merged 19 commits · May 8, 2024
4 changes: 4 additions & 0 deletions src/alpaca_eval/completion_parsers.py
@@ -87,6 +87,10 @@ def regex_parser(completion: str, outputs_to_match: dict[Any, Any]) -> list[Any]
        responses.append(key)
        # avoid matching the same output twice
        completion = completion[match.end() :]

    if len(responses) > 1:
        responses = [responses[0]]

    return responses


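Note on the effect of the guard above: a judge completion that mentions both outputs now parses to a single choice (the first match) instead of two. A minimal sketch of how this could be exercised, assuming alpaca_eval is installed and that string regex patterns are accepted as the evaluator configs pass them:

from alpaca_eval.completion_parsers import regex_parser

# Patterns mirror the evaluator config added later in this PR.
outputs_to_match = {1: r"(?i)output \(a\)", 2: r"(?i)output \(b\)"}

# An ambiguous judgement that names both labels.
ambiguous = "Output (a) is better than Output (b)."

# Previously both keys could be returned; with the guard only the first match is kept.
print(regex_parser(ambiguous, outputs_to_match))  # expected: [1]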
10 changes: 10 additions & 0 deletions src/alpaca_eval/decoders/__init__.py
@@ -56,6 +56,16 @@ def get_fn_completions(name: Union[str, Callable]) -> Callable:
            raise e

        return huggingface_local_completions

    elif name == "sparseml_local_completions":
        try:
            from .sparseml_local import sparseml_local_completions
        except ImportError as e:
            packages = ["accelerate", "nm-transformers"]
            logging.exception(f"You need {packages} to use sparseml_local_completions. Error:")
            raise e

        return sparseml_local_completions

    elif name == "cohere_completions":
        try:
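For orientation, the new branch is reached whenever a config sets fn_completions: "sparseml_local_completions". A small sketch (assumes sparseml, accelerate and nm-transformers are installed):

from alpaca_eval.decoders import get_fn_completions

# Resolve the new decoder by name, exactly as annotator/model configs do.
completions_fn = get_fn_completions("sparseml_local_completions")
# Missing dependencies surface here as an ImportError with the message logged above.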
175 changes: 175 additions & 0 deletions src/alpaca_eval/decoders/sparseml_local.py
@@ -0,0 +1,175 @@
import logging
from typing import Optional, Sequence

import numpy as np
import torch
import transformers
import os
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoConfig
from sparseml.transformers.utils.sparse_model import SparseAutoModel
import sparseml.core.session as session_manager

from .. import constants, utils

__all__ = ["sparseml_local_completions"]


class ListDataset(Dataset):
    def __init__(self, original_list):
        self.original_list = original_list

    def __len__(self):
        return len(self.original_list)

    def __getitem__(self, i):
        return self.original_list[i]


def sparseml_local_completions(
    prompts: Sequence[str],
    model_name: str,
    do_sample: bool = False,
    batch_size: int = 1,
    model_kwargs=None,
    cache_dir: Optional[str] = constants.DEFAULT_CACHE_DIR,
    remove_ending: Optional[str] = None,
    is_fast_tokenizer: bool = True,
    **kwargs,
) -> dict[str, list]:
"""Decode locally using huggingface transformers pipeline.

Parameters
----------
prompts : list of str
Prompts to get completions for.

model_name : str, optional
Name of the model (repo on hugging face hub) to use for decoding.

do_sample : bool, optional
Whether to use sampling for decoding.

batch_size : int, optional
Batch size to use for decoding. This currently does not work well with to_bettertransformer.

model_kwargs : dict, optional
Additional kwargs to pass to from_pretrained.

cache_dir : str, optional
Directory to use for caching the model.

remove_ending : str, optional
The ending string to be removed from completions. Typically eos_token.

kwargs :
Additional kwargs to pass to `InferenceClient.__call__`.
"""
    model_kwargs = model_kwargs or {}
    if "device_map" not in model_kwargs:
        model_kwargs["device_map"] = "auto"
    if "torch_dtype" in model_kwargs and isinstance(model_kwargs["torch_dtype"], str):
        model_kwargs["torch_dtype"] = getattr(torch, model_kwargs["torch_dtype"])

    n_examples = len(prompts)
    if n_examples == 0:
        logging.info("No samples to annotate.")
        return []
    else:
        logging.info(f"Using `sparseml_local_completions` on {n_examples} prompts using {model_name}.")

    if not torch.cuda.is_available():
        model_kwargs["torch_dtype"] = None

    # faster but slightly less accurate matrix multiplications
    torch.backends.cuda.matmul.allow_tf32 = torch.backends.cudnn.allow_tf32 = True

    # use the sparsification recipe stored next to the checkpoint, if there is one
    recipe_file = os.path.join(model_name, "recipe.yaml")
    if not os.path.exists(recipe_file):
        recipe_file = None
    config = AutoConfig.from_pretrained(model_name)

    model = SparseAutoModel.text_generation_from_pretrained(
        model_name_or_path=model_name,
        config=config,
        recipe=recipe_file,
        trust_remote_code=model_kwargs.get("trust_remote_code", False),
        torch_dtype=model_kwargs.get("torch_dtype"),
    )

    model.eval().to("cuda:0")
    del model_kwargs["device_map"]
    kwargs["device"] = 0

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir=cache_dir,
        padding_side="left",
        use_fast=is_fast_tokenizer,
        **model_kwargs,
    )

    logging.info(f"Model memory: {model.get_memory_footprint() / 1e9} GB")

    if batch_size > 1:
        # sort the prompts by length so that we don't necessarily pad them by too much
        # save also index to reorder the completions
        original_order, prompts = zip(*sorted(enumerate(prompts), key=lambda x: len(x[1])))
        prompts = list(prompts)

    if not tokenizer.pad_token_id:
        # set padding token if not set
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.pad_token = tokenizer.eos_token

    default_kwargs = dict(
        do_sample=do_sample,
        model_kwargs={k: v for k, v in model_kwargs.items() if k != "trust_remote_code"},
        batch_size=batch_size,
    )
    default_kwargs.update(kwargs)
    logging.info(f"Kwargs to completion: {default_kwargs}")
    pipeline = transformers.pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        **default_kwargs,
        trust_remote_code=model_kwargs.get("trust_remote_code", False),
    )

    ## compute and log the time for completions
    prompts_dataset = ListDataset(prompts)
    completions = []

    with utils.Timer() as t:
        for out in tqdm(
            pipeline(
                prompts_dataset,
                return_full_text=False,
                pad_token_id=tokenizer.pad_token_id,
            )
        ):
            generated_text = out[0]["generated_text"]
            if remove_ending is not None and generated_text.endswith(remove_ending):
                generated_text = generated_text[: -len(remove_ending)]
            completions.append(generated_text)

    logging.info(f"Time for {n_examples} completions: {t}")

    if batch_size > 1:
        # reorder the completions to match the original order
        completions, _ = zip(*sorted(list(zip(completions, original_order)), key=lambda x: x[1]))
        completions = list(completions)

    # local => price is really your compute
    price = [np.nan] * len(completions)
    avg_time = [t.duration / n_examples] * len(completions)

    if recipe_file is not None and os.path.exists(recipe_file):
        # reset the SparseML session so recipe state does not leak into later runs
        if session_manager.active_session():
            active_session = session_manager.active_session()
            active_session.reset()
    torch.cuda.empty_cache()

    return dict(completions=completions, price_per_example=price, time_per_example=avg_time)
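A hedged usage sketch of the new decoder; the checkpoint path is a placeholder for any local SparseML/transformers checkpoint (optionally with a recipe.yaml next to it), and a CUDA device is required because the model is moved to cuda:0:

from alpaca_eval.decoders.sparseml_local import sparseml_local_completions

outputs = sparseml_local_completions(
    prompts=["List three uses of sparsity in LLM inference."],
    model_name="/path/to/sparse-llama-checkpoint",  # hypothetical local path
    batch_size=1,
    max_new_tokens=64,  # forwarded to the text-generation pipeline via **kwargs
)
print(outputs["completions"][0])
print(outputs["time_per_example"][0])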
@@ -0,0 +1,11 @@
llama-2-70b-chat-hf:
  prompt_template: "llama-2-70b-chat-hf/prompt.txt"
  fn_completions: "huggingface_local_completions"
  completions_kwargs:
    model_name: "meta-llama/Llama-2-70b-chat-hf"
    max_new_tokens: 50
    temperature: 0
  completion_parser_kwargs:
    outputs_to_match:
      1: '(?i)output \(a\)'
      2: '(?i)output \(b\)'
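For orientation, fn_completions/completions_kwargs control how the Llama-2-70B judge is run and outputs_to_match feeds regex_parser. A rough sketch of the equivalent direct calls (the prompt text is a placeholder; the other values are taken from the config above):

from alpaca_eval.decoders import get_fn_completions
from alpaca_eval.completion_parsers import regex_parser

fn_completions = get_fn_completions("huggingface_local_completions")
out = fn_completions(
    prompts=["<prompt built from llama-2-70b-chat-hf/prompt.txt>"],  # placeholder
    model_name="meta-llama/Llama-2-70b-chat-hf",
    max_new_tokens=50,
)
preference = regex_parser(
    out["completions"][0],
    outputs_to_match={1: r"(?i)output \(a\)", 2: r"(?i)output \(b\)"},
)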
37 changes: 37 additions & 0 deletions src/alpaca_eval/evaluators_configs/llama-2-70b-chat-hf/prompt.txt
@@ -0,0 +1,37 @@
<<SYS>>
You are a helpful instruction-following assistant that prints the best model by selecting the best outputs for a given instruction.
<</SYS>>

[INST]
Select the output (a) or (b) that best matches the given instruction. Choose your preferred output, which can be subjective. Your answer should ONLY contain: Output (a) or Output (b). Here's an example:

# Example:
## Instruction:
Give a description of the following job: "ophthalmologist"

## Output (a):
An ophthalmologist is a medical doctor who specializes in the diagnosis and treatment of eye diseases and conditions.

## Output (b):
An ophthalmologist is a medical doctor who pokes and prods at your eyes while asking you to read letters from a chart.

## Which is best, Output (a) or Output (b)?
Output (a)

Here the answer is Output (a) because it provides a comprehensive and accurate description of the job of an ophthalmologist. In contrast, output (b) is more of
a joke.

# Task:
Now is the real task, do not explain your answer, just say Output (a) or Output (b).

## Instruction:
{instruction}

## Output (a):
{output_1}

## Output (b):
{output_2}

## Which is best, Output (a) or Output (b)?
[/INST]
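A small sketch of how the template's placeholders are filled; alpaca_eval builds prompts with its own utilities, so plain str.format is used here only to show what the judge ultimately sees, with a made-up instruction/output pair:

from pathlib import Path

template = Path("src/alpaca_eval/evaluators_configs/llama-2-70b-chat-hf/prompt.txt").read_text()
prompt = template.format(
    instruction="Name one benefit of weight sparsity in LLM inference.",  # made-up example
    output_1="It shrinks the memory footprint and can speed up decoding.",
    output_2="Sparsity means most of the weights are zero.",
)
print(prompt)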
21 changes: 21 additions & 0 deletions src/alpaca_eval/main.py
@@ -31,6 +31,8 @@ def evaluate(
    sort_by: str = "length_controlled_winrate" if constants.IS_ALPACA_EVAL_2 else "win_rate",
    is_cache_leaderboard: Optional[bool] = None,
    max_instances: Optional[int] = None,
    clearml_project: Optional[str] = None,
    clearml_task: Optional[str] = None,
    annotation_kwargs: Optional[dict[str, Any]] = None,
    Annotator=annotators.PairwiseAnnotator,
    **annotator_kwargs,
@@ -209,6 +211,21 @@ def evaluate(
f"path but {type(precomputed_leaderboard)}."
)

    if clearml_project is not None and clearml_task is not None:
        from clearml import Task
        task = Task.get_task(project_name=clearml_project, task_name=clearml_task)
        if task is None:
            task = Task.init(project_name=clearml_project, task_name=clearml_task)
        else:
            task.started()

        task.upload_artifact(name='alpaca-eval output', artifact_object=df_leaderboard)
        for name in df_leaderboard:
            value = df_leaderboard[name].values[0]
            if not isinstance(value, str):
                task.get_logger().report_single_value(name=name, value=value)
        task.mark_completed()

    if is_return_instead_of_print:
        return df_leaderboard, annotations
    else:
@@ -230,6 +247,8 @@ def evaluate_from_model(
    is_strip_output: bool = True,
    is_load_outputs: bool = True,
    chunksize: int = 64,
    clearml_project: Optional[str] = None,
    clearml_task: Optional[str] = None,
    **kwargs,
):
"""Evaluate a model from HuggingFace or an API provider. This is a wrapper around `evaluate` which includes
@@ -371,6 +390,8 @@ def get_completions(configs, df: pd.DataFrame, old_output_path: Optional[Path] =
        annotators_config=annotators_config,
        output_path=output_path,
        max_instances=max_instances,
        clearml_project=clearml_project,
        clearml_task=clearml_task,
        **kwargs,
    )

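Finally, a hedged sketch of exercising the new ClearML reporting end to end. Project/task names and the model config are placeholders, clearml is assumed to be installed and configured, and the non-ClearML arguments follow the existing evaluate_from_model signature:

from alpaca_eval.main import evaluate_from_model

evaluate_from_model(
    model_configs="my-sparse-model",          # hypothetical model config
    annotators_config="llama-2-70b-chat-hf",  # the judge config added in this PR
    clearml_project="alpaca-eval",            # placeholder ClearML project
    clearml_task="my-sparse-model-eval",      # placeholder ClearML task
)
# The leaderboard row is uploaded as an artifact and its numeric columns are
# reported to the task as single values, as added in main.py above.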