
Commit dffb789

add example files
1 parent bbe6e12 commit dffb789


4 files changed, +221 -0 lines changed


examples/client.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
import requests


def chat_with_model(user_input, history):
    url = "http://localhost:8000/chat/"
    payload = {
        "user_input": user_input,
        "history": history
    }
    headers = {
        "Content-Type": "application/json"
    }

    response = requests.post(url, json=payload, headers=headers)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to send request: {response.status_code} {response.text}")
        return None


def main():
    history = []
    print("Enter 'q' to quit, 'c' to clear chat history.")
    while True:
        user_input = input("User: ").strip()
        if user_input.lower() in ['q', 'quit']:
            print("Exiting chat.")
            break
        if user_input.lower() == 'c':
            print("Clearing chat history.")
            history.clear()
            continue

        result = chat_with_model(user_input, history)
        if result:
            # Display the response from the model.
            print(f"Assistant: {result['response']}")
            # Update the chat history from the response.
            history = result['history']


if __name__ == "__main__":
    main()
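
For non-interactive use, a single call to chat_with_model is enough; the /chat/ endpoint (see examples/vllm_fastapi.py below) returns both the reply and the updated history. A minimal sketch, assuming that server is running on localhost:8000 (the prompt text is illustrative, not part of the commit):

history = []
result = chat_with_model("Hello, who are you?", history)
if result:
    print(result["response"])
    history = result["history"]  # carry the updated history into the next turn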

examples/stream_client.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
import httpx


def chat_with_model_stream(user_input, history, url="http://localhost:8000/chat/"):
    payload = {
        "user_input": user_input,
        "history": history
    }

    headers = {
        "Content-Type": "application/json"
    }

    # Stream the POST request with httpx; chunks are consumed inside the response context.
    with httpx.Client() as client:
        with client.stream("POST", url, json=payload, headers=headers) as response:
            if response.status_code == 200:
                print("Assistant:", end=" ")
                for chunk in response.iter_text():
                    print(chunk, end="", flush=True)
                print()
            else:
                # Read the body first so .text is available on a streamed response.
                response.read()
                print(f"Failed to send request: {response.status_code} {response.text}")
                return None


def main():
    history = []
    print("Enter 'q' to quit, 'c' to clear chat history.")
    while True:
        user_input = input("User: ").strip()
        if user_input.lower() in ['q', 'quit']:
            print("Exiting chat.")
            break
        if user_input.lower() == 'c':
            print("Clearing chat history.")
            history.clear()
            continue

        chat_with_model_stream(user_input, history)
        # Future improvement: update history based on the API response if needed.


if __name__ == "__main__":
    main()
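
One possible way to address the "future improvement" comment above is to collect the streamed chunks and maintain the history on the client side, since the streaming endpoint returns only plain text. A hedged sketch; the helper name and structure are illustrative and not part of the commit:

import httpx

def chat_with_model_stream_collect(user_input, history, url="http://localhost:8000/chat/"):
    payload = {"user_input": user_input, "history": history}
    collected = []
    with httpx.Client() as client:
        with client.stream("POST", url, json=payload) as response:
            if response.status_code != 200:
                response.read()  # load the body so .text is available
                print(f"Failed to send request: {response.status_code} {response.text}")
                return None
            print("Assistant:", end=" ")
            for chunk in response.iter_text():
                print(chunk, end="", flush=True)
                collected.append(chunk)
            print()
    reply = "".join(collected)
    # Keep conversation state locally, mirroring what the non-streaming server returns.
    history.append({"role": "user", "content": user_input})
    history.append({"role": "assistant", "content": reply})
    return reply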

examples/stream_vllm_fastapi.py

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

app = FastAPI()


class ChatRequest(BaseModel):
    user_input: str
    history: list


tokenizer = None
model = None


@app.on_event("startup")
def load_model_and_tokenizer():
    global tokenizer, model
    path = "AIDC-AI/Marco-o1"
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    model = LLM(model=path, tensor_parallel_size=4)


def generate_response_stream(model, text, max_new_tokens=4096):
    # Generate one token per vLLM call and re-feed the growing output so every
    # token can be yielded to the client as soon as it is produced.
    new_output = ''
    sampling_params = SamplingParams(
        max_tokens=1,
        temperature=0,
        top_p=0.9
    )
    with torch.inference_mode():
        for _ in range(max_new_tokens):
            outputs = model.generate(
                [f'{text}{new_output}'],
                sampling_params=sampling_params,
                use_tqdm=False
            )
            next_token = outputs[0].outputs[0].text
            new_output += next_token
            yield next_token  # Yield each part of the response

            if new_output.endswith('</Output>'):
                break


@app.post("/chat/")
async def chat(request: ChatRequest):
    if not request.user_input:
        raise HTTPException(status_code=400, detail="Input cannot be empty.")

    if request.user_input.lower() in ['q', 'quit']:
        return {"response": "Exiting chat."}

    if request.user_input.lower() == 'c':
        request.history.clear()
        return {"response": "Clearing chat history."}

    request.history.append({"role": "user", "content": request.user_input})
    text = tokenizer.apply_chat_template(request.history, tokenize=False, add_generation_prompt=True)

    response_stream = generate_response_stream(model, text)

    # Stream the generated tokens to the client as plain text.
    return StreamingResponse(response_stream, media_type="text/plain")
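
The endpoint above streams plain text and stops once '</Output>' is generated or max_new_tokens is reached. Besides the httpx client in examples/stream_client.py, the stream can also be consumed with requests; a minimal sketch, assuming the server is running on localhost:8000 (the prompt text is illustrative, not part of the commit):

import requests

payload = {"user_input": "Hello, who are you?", "history": []}
with requests.post("http://localhost:8000/chat/", json=payload, stream=True) as response:
    # Print each decoded chunk as it arrives.
    for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
print()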

examples/vllm_fastapi.py

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

app = FastAPI()


class ChatRequest(BaseModel):
    user_input: str
    history: list


tokenizer = None
model = None


@app.on_event("startup")
def load_model_and_tokenizer():
    global tokenizer, model
    path = "AIDC-AI/Marco-o1"
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    model = LLM(model=path, tensor_parallel_size=4)


def generate_response(model, text, max_new_tokens=4096):
    # Generate one token per vLLM call and re-feed the growing output so the loop
    # can stop as soon as the closing </Output> tag appears.
    new_output = ''
    sampling_params = SamplingParams(
        max_tokens=1,
        temperature=0,
        top_p=0.9
    )
    with torch.inference_mode():
        for _ in range(max_new_tokens):
            outputs = model.generate(
                [f'{text}{new_output}'],
                sampling_params=sampling_params,
                use_tqdm=False
            )
            new_output += outputs[0].outputs[0].text
            if new_output.endswith('</Output>'):
                break
    return new_output


@app.post("/chat/")
async def chat(request: ChatRequest):
    if not request.user_input:
        raise HTTPException(status_code=400, detail="Input cannot be empty.")

    if request.user_input.lower() in ['q', 'quit']:
        return {"response": "Exiting chat."}

    if request.user_input.lower() == 'c':
        request.history.clear()
        return {"response": "Clearing chat history."}

    request.history.append({"role": "user", "content": request.user_input})
    text = tokenizer.apply_chat_template(request.history, tokenize=False, add_generation_prompt=True)
    response = generate_response(model, text)
    request.history.append({"role": "assistant", "content": response})

    return {"response": response, "history": request.history}
