import gradio as gr
import download_model  # side-effect import: fetches the GGUF file if it is missing
from llama_cpp import Llama

MODEL_PATH = "model/Mistral-7b-instruct-v0.3.Q4_K_M.gguf"
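
# download_model.py itself is not shown in this Space; the following is a
# minimal sketch of what it is assumed to do. The repo id placeholder is an
# illustrative guess, not confirmed by the source:
#
#   from huggingface_hub import hf_hub_download
#   hf_hub_download(
#       repo_id="<gguf-repo-id>",                         # hypothetical repo
#       filename="Mistral-7b-instruct-v0.3.Q4_K_M.gguf",  # matches MODEL_PATH
#       local_dir="model",
#   )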
# Load the model once at startup
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,      # context window in tokens
    n_threads=6,     # CPU threads used for inference
    use_mlock=True,  # lock model pages in RAM to avoid swapping
    use_mmap=True,   # memory-map the weights instead of copying them
)
# Format input prompt using Mistral's [INST] ... [/INST] instruction template
def format_prompt(user_input):
    return f"[INST] {user_input.strip()} [/INST]"
# Non-streaming response; max_tokens=32 keeps replies short and latency low
def chat_fn(message, history=None):
    prompt = format_prompt(message)
    output = llm(prompt, max_tokens=32, stop=["</s>"])
    return output["choices"][0]["text"].strip()
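
# A possible streaming variant (a sketch, not part of the original Space):
# llama-cpp-python accepts stream=True and yields completion chunks, and
# gr.ChatInterface can consume a generator that yields the growing reply.
def chat_fn_stream(message, history=None):
    prompt = format_prompt(message)
    partial = ""
    for chunk in llm(prompt, max_tokens=32, stop=["</s>"], stream=True):
        partial += chunk["choices"][0]["text"]
        yield partial  # each yield replaces the currently displayed reply
# To try it, pass fn=chat_fn_stream to gr.ChatInterface below instead of chat_fn.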
# Gradio chat interface
gr.ChatInterface(
    fn=chat_fn,
    title="🦙 Mistral 7B v0.3 (Fast)",
    description="Chatbot using Mistral 7B with reduced token generation for faster responses.",
    theme="default",
).launch()
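
# Likely contents of this Space's requirements.txt (an assumption; the file
# itself is not shown, and huggingface_hub is only needed if download_model
# works as sketched above):
#
#   gradio
#   llama-cpp-python
#   huggingface_hub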