webpluging

Paused

App Files Community

ranamhamoud committed on Apr 17, 2024

Commit

297485e

verified ·

1 Parent(s): 1661753

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -38

app.py CHANGED Viewed

@@ -1,82 +1,82 @@
 import os
 from threading import Thread
 from typing import Iterator
 from mongoengine import connect, Document, StringField, SequenceField
 import gradio as gr
 import spaces
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
 from peft import PeftModel
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-DESCRIPTION = """\
 # ✨Storytell AI🧑🏽‍💻
-Welcome to the **Storytell AI** space, crafted with care by Ranam & George. Dive into the world of educational storytelling with our [Storytell](https://huggingface.co/ranamhamoud/storytell) model. This iteration of the Llama 2 model with 7 billion parameters is fine-tuned to generate educational stories that engage and educate. Enjoy a journey of discovery and creativity—your storytelling lesson begins here! You can prompt this model to explain any computer science concept. **Please check the examples below**.
 """
 LICENSE = """
-<p/>
 ---
-As a derivate work of [Llama-2-7b-chat](https://huggingface.co/meta-llama/Llama-2-7b-chat) by Meta,
 this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/USE_POLICY.md).
 """
 if not torch.cuda.is_available():
     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
-if torch.cuda.is_available():
-    bnb_config = BitsAndBytesConfig(
-    load_in_8bit=True,
-    bnb_4bit_compute_dtype=torch.float16,
-    )
-    model_id = "meta-llama/Llama-2-7b-chat-hf"
-    base_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto",quantization_config=bnb_config)
-    model = PeftModel.from_pretrained(base_model,"ranamhamoud/storytell")
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    tokenizer.pad_token = tokenizer.eos_token
 PASSWORD = os.environ.get("MONGO_PASS")
-connect(host = f"mongodb+srv://ranamhammoud11:{PASSWORD}@stories.zf5v52a.mongodb.net/")
 class Story(Document):
     message = StringField()
     content = StringField()
     story_id = SequenceField(primary_key=True)
 def make_prompt(entry):
-    return f"TELL A STORY,RELATE TO COMPUTER SCIENCE,INCLUDE ASSESMENTS. MAKE IT REALISTIC AND AROUND 800 WORDS: {entry} "
 @spaces.GPU
 def generate(
     message: str,
     chat_history: list[tuple[str, str]],
-    max_new_tokens: int = 1024,
-    temperature: float = 0.1,
-    top_p: float = 0.6,
-    top_k: int = 20,
     repetition_penalty: float = 1.0,
 ) -> Iterator[str]:
     conversation = []
     for user, assistant in chat_history:
         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
     conversation.append({"role": "user", "content": make_prompt(message)})
     enc = tokenizer(make_prompt(message), return_tensors="pt", padding=True, truncation=True)
-    input_ids = enc.input_ids
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(model.device)
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         {"input_ids": input_ids},
@@ -98,26 +98,29 @@ def generate(
         yield "".join(outputs)
     final_story = "".join(outputs)
     try:
-        saved_story = Story(message=message, content=final_story).save()
         yield f"{final_story}\n\n Story saved with ID: {saved_story.story_id}"
     except Exception as e:
         yield f"Failed to save story: {str(e)}"
 chat_interface = gr.ChatInterface(
     fn=generate,
     stop_btn=None,
     examples=[
         ["Can you explain briefly to me what is the Python programming language?"],
-         ["Could you please provide an explanation about the concept of recursion?"],
         ["Could you explain what a URL is?"]
     ],
 )
 with gr.Blocks(css="style.css") as demo:
     gr.Markdown(DESCRIPTION)
     chat_interface.render()
     gr.Markdown(LICENSE)
 if __name__ == "__main__":
     demo.queue(max_size=20)
-    demo.launch(share=True)

 import os
+import torch
 from threading import Thread
 from typing import Iterator
 from mongoengine import connect, Document, StringField, SequenceField
 import gradio as gr
 import spaces
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from peft import PeftModel
+# Constants
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+# Description and License Texts
+DESCRIPTION = """
 # ✨Storytell AI🧑🏽‍💻
+Welcome to the **Storytell AI** space, crafted with care by Ranam & George. Dive into the world of educational storytelling with our model. This iteration of the Llama 2 model with 7 billion parameters is fine-tuned to generate educational stories that engage and educate. Enjoy a journey of discovery and creativity—your storytelling lesson begins here! You can prompt this model to explain any computer science concept. **Please check the examples below**.
 """
 LICENSE = """
 ---
+As a derivative work of [Llama-2-7b-chat](https://huggingface.co/meta-llama/Llama-2-7b-chat) by Meta,
 this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/USE_POLICY.md).
 """
+# GPU Check and add CPU warning
 if not torch.cuda.is_available():
     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
+# Model and Tokenizer Configuration
+model_id = "meta-llama/Llama-2-7b-chat-hf"
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=False,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+base_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=bnb_config)
+model = PeftModel.from_pretrained(base_model, "ranamhamoud/storytell")
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+tokenizer.pad_token = tokenizer.eos_token
+# MongoDB Connection
 PASSWORD = os.environ.get("MONGO_PASS")
+connect(host=f"mongodb+srv://ranamhammoud11:{PASSWORD}@stories.zf5v52a.mongodb.net/")
+# MongoDB Document
 class Story(Document):
     message = StringField()
     content = StringField()
     story_id = SequenceField(primary_key=True)
+# Utility function for prompts
 def make_prompt(entry):
+    return  f"### Human: {entry} ### Assistant:"
+    # f"TELL A STORY, RELATE TO COMPUTER SCIENCE, INCLUDE ASSESMENTS. MAKE IT REALISTIC AND AROUND 800 WORDS: {entry}"
+# Gradio Function
 @spaces.GPU
 def generate(
     message: str,
     chat_history: list[tuple[str, str]],
+    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
+    temperature: float = 0.3,
+    top_p: float = 0.7,
+    top_k: int = 20,
     repetition_penalty: float = 1.0,
 ) -> Iterator[str]:
     conversation = []
     for user, assistant in chat_history:
         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
     conversation.append({"role": "user", "content": make_prompt(message)})
     enc = tokenizer(make_prompt(message), return_tensors="pt", padding=True, truncation=True)
+    input_ids = enc.input_ids.to(model.device)
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         {"input_ids": input_ids},
         yield "".join(outputs)
     final_story = "".join(outputs)
     try:
+        saved_story = Story(message=message, content=final_story).save()
         yield f"{final_story}\n\n Story saved with ID: {saved_story.story_id}"
     except Exception as e:
         yield f"Failed to save story: {str(e)}"
+# Gradio Interface Setup
 chat_interface = gr.ChatInterface(
     fn=generate,
     stop_btn=None,
     examples=[
         ["Can you explain briefly to me what is the Python programming language?"],
+        ["Could you please provide an explanation about the concept of recursion?"],
         ["Could you explain what a URL is?"]
     ],
 )
+# Gradio Web Interface
 with gr.Blocks(css="style.css") as demo:
     gr.Markdown(DESCRIPTION)
     chat_interface.render()
     gr.Markdown(LICENSE)
+# Main Execution
 if __name__ == "__main__":
     demo.queue(max_size=20)
+    demo.launch(share=True)