import os
import gradio as gr
import download_model  # Ensures model is downloaded
from llama_cpp import Llama

MODEL_PATH = "model/Mistral-7b-instruct-v0.3.Q4_K_M.gguf"
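
# Fail fast if the download step did not produce the model file
# (this also puts the os import to use).
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(
        f"Model not found at {MODEL_PATH}; re-run the download_model step."
    )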

# Load the quantized model once at startup
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,      # context window, in tokens
    n_threads=6,     # CPU threads used for inference
    use_mlock=True,  # lock model pages in RAM to avoid swapping
    use_mmap=True,   # memory-map the weights instead of copying them
)

# Wrap the user message in Mistral's [INST] ... [/INST] instruction template
def format_prompt(user_input):
    return f"[INST] {user_input.strip()} [/INST]"

# Non-streaming response; max_tokens=32 keeps latency low at the cost of
# short answers. The history argument Gradio passes is ignored, so each
# turn is answered without prior context.
def chat_fn(message, history=None):
    prompt = format_prompt(message)
    output = llm(prompt, max_tokens=32, stop=["</s>"])
    return output["choices"][0]["text"].strip()
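
# Optional streaming variant (a minimal sketch, not wired into the UI below):
# llama-cpp-python yields partial completion chunks when called with
# stream=True, and Gradio's ChatInterface accepts a generator function
# that yields the reply as it grows.
def chat_fn_stream(message, history=None):
    prompt = format_prompt(message)
    reply = ""
    for chunk in llm(prompt, max_tokens=32, stop=["</s>"], stream=True):
        reply += chunk["choices"][0]["text"]
        yield reply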

# Gradio interface
gr.ChatInterface(
    fn=chat_fn,
    title="🦙 Mistral 7B v0.3 (Fast)",
    description="Chatbot using Mistral 7B with reduced token generation for faster responses.",
    theme="default",
).launch()
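
# launch() serves on http://127.0.0.1:7860 by default; pass share=True to
# get a temporary public URL (useful when running on a remote machine).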