Do0rMaMu committed on
Commit 7eebe0e · verified · 1 Parent(s): 801d6e4

Update main.py

Files changed (1)
  1. main.py +21 -13
main.py CHANGED
@@ -1,24 +1,32 @@
-from ctransformers import AutoModelForCausalLM
-from fastapi import FastAPI, Form
+from fastapi import FastAPI
 from pydantic import BaseModel
 
-# Model loading
-llm = AutoModelForCausalLM.from_pretrained(
-    "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf",
-    model_type='llama',
-    max_new_tokens=1096,
-    threads=3,
+# Assuming Llama class has been correctly imported and set up
+from llama_cpp import Llama
+
+# Model loading with specified path and configuration
+llm = Llama(
+    model_path="Meta-Llama-3-8B-Instruct.Q4_K_M.gguf",  # Update the path as necessary
+    n_ctx=4096,  # Maximum number of tokens for context (input + output)
+    n_threads=4,  # Number of CPU cores used
 )
-# Pydantic object
+
+# Pydantic object for validation
 class Validation(BaseModel):
-    user_prompt: str  # User's prompt
-    system_prompt: str  # System's instruction
+    user_prompt: str  # User's input prompt
+    system_prompt: str  # System's guiding prompt
 
-# FastAPI application
+# FastAPI application initialization
 app = FastAPI()
 
 # Endpoint for generating responses
 @app.post("/generate_response")
 async def generate_response(item: Validation):
+    # Construct the complete prompt using the given system and user prompts
     prompt = f"<s>[INST] <<SYS>> \n {item.system_prompt}<</SYS>> \n {item.user_prompt} [/INST]"
-    return llm.generate(prompt, do_sample=True)  # Adjusted to include the generation method with do_sample if needed
+
+    # Call the Llama model to generate a response
+    output = llm(prompt, max_tokens=1024, stop=["Q:", "\n"], echo=True)  # Update parameters as needed
+
+    # Extract and return the text from the response
+    return output['choices'][0]['text']
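
For reference, a minimal sketch of how the updated endpoint could be exercised once the app is served. This assumes main.py is launched locally with uvicorn on port 8000; the launch command, host, port, and example prompt texts below are assumptions for illustration, not part of the commit.

# Assumed launch command (not part of the commit): uvicorn main:app --host 0.0.0.0 --port 8000
import requests

# Example payload matching the Validation model; prompt texts are illustrative only
payload = {
    "system_prompt": "You are a helpful assistant.",
    "user_prompt": "Summarize what llama-cpp-python is in one sentence.",
}

# FastAPI parses the JSON body into the Validation model; the endpoint returns the generated text
resp = requests.post("http://localhost:8000/generate_response", json=payload)
print(resp.json())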