import os
import gradio as gr
import download_model  # Ensures model is downloaded
from llama_cpp import Llama

MODEL_PATH = "model/Mistral-7b-instruct-v0.3.Q4_K_M.gguf"
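
# Fail fast if the download step did not produce the model file
# (this also puts the os import to use).
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(
        f"Model not found at {MODEL_PATH}; re-run the download_model step."
    )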

# Load the quantized model once at startup
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,      # context window, in tokens
    n_threads=6,     # CPU threads used for inference
    use_mlock=True,  # lock model pages in RAM to avoid swapping
    use_mmap=True,   # memory-map the weights instead of copying them
)

# Wrap the user message in Mistral's [INST] ... [/INST] instruction template
def format_prompt(user_input):
    return f"[INST] {user_input.strip()} [/INST]"

# Non-streaming response; max_tokens=32 keeps latency low at the cost of
# short answers. The history argument Gradio passes is ignored, so each
# turn is answered without prior context.
def chat_fn(message, history=None):
    prompt = format_prompt(message)
    output = llm(prompt, max_tokens=32, stop=["</s>"])
    return output["choices"][0]["text"].strip()
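
# Optional streaming variant (a minimal sketch, not wired into the UI below):
# llama-cpp-python yields partial completion chunks when called with
# stream=True, and Gradio's ChatInterface accepts a generator function
# that yields the reply as it grows.
def chat_fn_stream(message, history=None):
    prompt = format_prompt(message)
    reply = ""
    for chunk in llm(prompt, max_tokens=32, stop=["</s>"], stream=True):
        reply += chunk["choices"][0]["text"]
        yield reply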

# Gradio interface
gr.ChatInterface(
    fn=chat_fn,
    title="🦙 Mistral 7B v0.3 (Fast)",
    description="Chatbot using Mistral 7B with reduced token generation for faster responses.",
    theme="default",
).launch()
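
# launch() serves on http://127.0.0.1:7860 by default; pass share=True to
# get a temporary public URL (useful when running on a remote machine).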