moraxgiga committed
Commit 043ca42 · verified · 1 Parent(s): 9a25f52

Update app.py

Files changed (1):
  1. app.py +54 -34
app.py CHANGED
@@ -1,47 +1,67 @@
import gradio as gr
- import torch, os
from transformers import AutoModelForCausalLM, AutoTokenizer
- from transformers import StoppingCriteria, TextIteratorStreamer
from threading import Thread

- torch.set_num_threads(2)
HF_TOKEN = os.environ.get("HF_TOKEN")

- # Loading the tokenizer and model from Hugging Face's model hub.
- tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", use_auth_token=HF_TOKEN)
- model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", use_auth_token=HF_TOKEN)

def count_tokens(text):
    return len(tokenizer.tokenize(text))

- # Function to generate model predictions.
def predict(message, history):
-
    formatted_prompt = f"<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"
    model_inputs = tokenizer(formatted_prompt, return_tensors="pt")
-
-     streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
-
-     generate_kwargs = dict(
-         model_inputs,
-         streamer=streamer,
-         max_new_tokens=2048 - count_tokens(formatted_prompt),
-         top_p=0.2,
-         top_k=20,
-         temperature=0.1,
-         repetition_penalty=2.0,
-         length_penalty=-0.5,
-         num_beams=1
-     )
-     t = Thread(target=model.generate, kwargs=generate_kwargs)
-     t.start()  # Starting the generation in a separate thread.
-     partial_message = ""
-     for new_token in streamer:
-         partial_message += new_token
-         yield partial_message
-
- # Setting up the Gradio chat interface.
- gr.ChatInterface(predict,
-                  title="Gemma 2b Instruct Chat",
-                  description=None
-                  ).launch()  # Launching the web interface.
 
import gradio as gr
+ import torch, os
from transformers import AutoModelForCausalLM, AutoTokenizer
from threading import Thread

+ # Set the number of threads for PyTorch
+ torch.set_num_threads(3)
+
+ # Your Hugging Face token and model identifiers
HF_TOKEN = os.environ.get("HF_TOKEN")
+ MODEL_NAME = "google/gemma-2b-it"
+
+ # Load the tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=HF_TOKEN)
+
+ # Load the model and switch it to evaluation mode
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, use_auth_token=HF_TOKEN).eval()

+ # Apply dynamic quantization (int8 Linear weights; see the sketch after this diff)
+ quantized_model = torch.quantization.quantize_dynamic(
+     model,
+     {torch.nn.Linear},  # Specify the layer types to quantize
+     dtype=torch.qint8   # Target datatype for quantized weights
+ )

def count_tokens(text):
+     """Count tokens in the input text."""
    return len(tokenizer.tokenize(text))

def predict(message, history):
+     """Generate a reply with the quantized model (history is currently unused)."""
    formatted_prompt = f"<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"
    model_inputs = tokenizer(formatted_prompt, return_tensors="pt")
+
+     # Generation settings; the quantized model is used below
+     generate_kwargs = {
+         "input_ids": model_inputs["input_ids"],
+         "attention_mask": model_inputs["attention_mask"],
+         "max_new_tokens": 2048 - count_tokens(formatted_prompt),
+         "do_sample": True,  # Required for top_p/top_k/temperature to take effect
+         "top_p": 0.2,
+         "top_k": 20,
+         "temperature": 0.1,
+         "repetition_penalty": 2.0,
+         "length_penalty": -0.5,
+         "num_beams": 1,
+         "return_dict_in_generate": True,
+         "output_scores": True
+     }
+
+     with torch.no_grad():  # No gradients needed at inference time
+         output = quantized_model.generate(**generate_kwargs)
+
+     # Decode only the newly generated tokens; output.sequences also contains the prompt
+     new_tokens = output.sequences[0][model_inputs["input_ids"].shape[-1]:]
+     generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
+     return generated_text
+
+ # Setting up the Gradio interface
+ interface = gr.Interface(fn=predict,
+                          inputs=[gr.Textbox(label="Your message"), gr.Textbox(label="History", value="")],
+                          outputs="text",
+                          title="Quantized Gemma 2B Chat",
+                          description="This is a Gradio interface for interacting with a quantized version of the Gemma 2B model.")
+
+ # Launch the interface
+ interface.launch()
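
Note on the quantization step above: torch.quantization.quantize_dynamic swaps every nn.Linear for an int8 counterpart whose weights are stored quantized and dequantized on the fly at inference, which targets CPU serving like this Space. A minimal self-contained sketch of the mechanism, using a small stand-in module rather than Gemma (the toy module and its layer sizes are illustrative assumptions, not anything from the commit):

import torch
import torch.nn as nn

# Toy stand-in for the transformer: any nn.Module containing nn.Linear
# layers is handled the same way by dynamic quantization.
toy = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4))

quantized = torch.quantization.quantize_dynamic(
    toy,
    {nn.Linear},        # layer types to replace
    dtype=torch.qint8,  # int8 storage for the weights
)

# The Linear layers are now torch.nn.quantized.dynamic.Linear modules.
dynamic_linears = [m for m in quantized.modules()
                   if isinstance(m, torch.nn.quantized.dynamic.Linear)]
print(f"{len(dynamic_linears)} Linear layers quantized")  # -> 2

# Inputs and outputs stay float32; only the weight storage and the
# matmul kernel change.
out = quantized(torch.randn(1, 16))
print(out.shape)  # torch.Size([1, 4])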
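
A side note on the prompt formatting in predict: the hand-built <start_of_turn> string matches Gemma's chat format, and on transformers versions that ship chat templates the tokenizer can produce it directly, which avoids drift if the format ever changes. A hedged sketch, assuming the gemma-2b-it tokenizer loaded above and a transformers version with apply_chat_template:

# Equivalent prompt via the tokenizer's built-in chat template.
# Note: with tokenize=False Gemma's template also prepends the BOS token,
# so pass add_special_tokens=False when re-tokenizing the string.
messages = [{"role": "user", "content": message}]
formatted_prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
model_inputs = tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=False)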