Do0rMaMu committed on
Commit 7eebe0e · verified · 1 Parent(s): 801d6e4

Update main.py

Files changed (1)
  1. main.py +21 -13
main.py CHANGED
@@ -1,24 +1,32 @@
-from ctransformers import AutoModelForCausalLM
-from fastapi import FastAPI, Form
+from fastapi import FastAPI
 from pydantic import BaseModel
 
-# Model loading
-llm = AutoModelForCausalLM.from_pretrained(
-    "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf",
-    model_type='llama',
-    max_new_tokens=1096,
-    threads=3,
+# Assuming Llama class has been correctly imported and set up
+from llama_cpp import Llama
+
+# Model loading with specified path and configuration
+llm = Llama(
+    model_path="Meta-Llama-3-8B-Instruct.Q4_K_M.gguf",  # Update the path as necessary
+    n_ctx=4096,  # Maximum number of tokens for context (input + output)
+    n_threads=4,  # Number of CPU cores used
 )
-# Pydantic object
+
+# Pydantic object for validation
 class Validation(BaseModel):
-    user_prompt: str  # User's prompt
-    system_prompt: str  # System's instruction
+    user_prompt: str  # User's input prompt
+    system_prompt: str  # System's guiding prompt
 
-# FastAPI application
+# FastAPI application initialization
 app = FastAPI()
 
 # Endpoint for generating responses
 @app.post("/generate_response")
 async def generate_response(item: Validation):
+    # Construct the complete prompt using the given system and user prompts
     prompt = f"<s>[INST] <<SYS>> \n {item.system_prompt}<</SYS>> \n {item.user_prompt} [/INST]"
-    return llm.generate(prompt, do_sample=True)  # Adjusted to include the generation method with do_sample if needed
+
+    # Call the Llama model to generate a response
+    output = llm(prompt, max_tokens=1024, stop=["Q:", "\n"], echo=True)  # Update parameters as needed
+
+    # Extract and return the text from the response
+    return output['choices'][0]['text']
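
For reference, a minimal sketch of how the updated endpoint could be exercised once the app is served. This assumes main.py is launched locally with uvicorn on port 8000; the launch command, host, port, and example prompt texts below are assumptions for illustration, not part of the commit.

# Assumed launch command (not part of the commit): uvicorn main:app --host 0.0.0.0 --port 8000
import requests

# Example payload matching the Validation model; prompt texts are illustrative only
payload = {
    "system_prompt": "You are a helpful assistant.",
    "user_prompt": "Summarize what llama-cpp-python is in one sentence.",
}

# FastAPI parses the JSON body into the Validation model; the endpoint returns the generated text
resp = requests.post("http://localhost:8000/generate_response", json=payload)
print(resp.json())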