Spaces:

GIGAParviz
/

Parviz_Mind

Sleeping

GIGAParviz committed on Aug 20, 2024

Commit

4b4114f

verified ·

1 Parent(s): cf98681

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,45 +1,44 @@
 import gradio as gr
-import transformers
 import torch
-model_id = "unsloth/llama-2-7b-bnb-4bit"
-pipeline = transformers.pipeline(
-    "text-generation",
-    model=model_id,
-    model_kwargs={"torch_dtype": torch.bfloat16},
-    device="cpu",
 )
 def generate_response(user_input):
-    messages = [
-    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
-    {"role": "user", "content": str(user_input)},
-    ]
-    prompt = pipeline.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-    )
-    terminators = [
-        pipeline.tokenizer.eos_token_id,
-        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
-    ]
-    outputs = pipeline(
-        prompt,
-        max_new_tokens=256,
-        eos_token_id=terminators,
-        do_sample=True,
-        temperature=0.6,
-        top_p=0.9,
-    )
-    return outputs[0]
 # Gradio interface
 iface = gr.Interface(
@@ -50,4 +49,5 @@ iface = gr.Interface(
     description="Ask a question in Persian or English."
 )
-iface.launch()

 import gradio as gr
+from unsloth import FastLanguageModel
+from transformers import BitsAndBytesConfig
 import torch
+# Model configuration
+model_name = "unsloth/llama-3-8b-bnb-4bit"
+max_seq_length = 512
+dtype = None
+load_in_4bit = True
+# Load model and tokenizer
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=model_name,
+    max_seq_length=2048,
+    dtype=None,
+    device_map="cpu",
+    load_in_4bit=True,
 )
+device = torch.device('cpu')
+model = FastLanguageModel().to(device)
+FastLanguageModel.for_inference(model)
+# Define Alpaca prompt format
+alpaca_prompt = """
+### Instruction:
+{0}
+### Input:
+{1}
+### Response:
+{2}
+"""
+# Function to generate response
 def generate_response(user_input):
+    prompt = alpaca_prompt.format(user_input, "", "")
+    inputs = tokenizer([prompt], return_tensors="pt").to("cpu")
+    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
+    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+    return response[0]
 # Gradio interface
 iface = gr.Interface(
     description="Ask a question in Persian or English."
 )
+# Launch the interface
+iface.launch()