Remove KoboldSamplingParams

neuralmagic · Apr 12, 2024 · 344699d · github-actions · Apr 12, 2024
1 parent 00e2b56
commit 344699d
Show file tree

Hide file tree

Showing 2 changed files with 1 addition and 19 deletions.
diff --git a/vllm/entrypoints/kobold/api_server.py b/vllm/entrypoints/kobold/api_server.py
@@ -1,3 +1,4 @@
+# UPSTREAM SYNC: keep up-to-date with ../openai/api_server.py
 import asyncio
 import importlib
 import inspect

diff --git a/vllm/entrypoints/kobold/protocol.py b/vllm/entrypoints/kobold/protocol.py
@@ -5,25 +5,6 @@
 # ========== KoboldAI ========== #
 
 
-class KoboldSamplingParams(BaseModel):
-    n: int = Field(1, alias="n")
-    best_of: Optional[int] = Field(None, alias="best_of")
-    presence_penalty: float = Field(0.0, alias="presence_penalty")
-    frequency_penalty: float = Field(0.0, alias="rep_pen")
-    temperature: float = Field(1.0, alias="temperature")
-    top_p: float = Field(1.0, alias="top_p")
-    top_k: float = Field(-1, alias="top_k")
-    min_p: float = Field(0.0, alias="min_p")
-    use_beam_search: bool = Field(False, alias="use_beam_search")
-    length_penalty: float = Field(1.0, alias="length_penalty")
-    early_stopping: Union[bool, str] = Field(False, alias="early_stopping")
-    stop: Union[None, str, List[str]] = Field(None, alias="stop_sequence")
-    include_stop_str_in_output: Optional[bool] = False
-    ignore_eos: bool = Field(False, alias="ignore_eos")
-    max_tokens: int = Field(16, alias="max_length")
-    logprobs: Optional[int] = Field(None, alias="logprobs")
-
-
 class KAIGenerationInputSchema(BaseModel):
     genkey: Optional[str] = None
     prompt: str
Benchmark suite	Current: `344699d`	Previous: `788b4e5`	Ratio
`{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.2.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.2.1+cu121"}`	`3.8182870993420073` prompts/s	`3.8194073015836993` prompts/s	`1.00`
`{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.2.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.2.1+cu121"}`	`1466.222246147331` tokens/s	`1466.6524038081407` tokens/s	`1.00`