daniellefranca96 committed
Commit 73c7429 · 1 Parent(s): 7aaa05b

Update main.py

Files changed (1):
  1. main.py +12 -1
main.py CHANGED
@@ -2,6 +2,8 @@ from fastapi import FastAPI
 from pydantic import BaseModel
 import requests
 from llama_cpp import Llama
+import threading
+import gc
 
 llms = {
     "TinyLLama 1b 4_K_M 2048": {
@@ -39,6 +41,12 @@ llms = {
 
 #Fast API
 app = FastAPI()
+llm = None
+
+def clean_memory():
+    llm = None
+    gc.collect()
+
 
 @app.post("/llm_on_cpu")
 async def stream(item: dict):
@@ -54,4 +62,7 @@ async def stream(item: dict):
     llm = Llama(model_path="./code/"+model['file'], n_ctx=nctx, verbose=True, n_threads=8)
 
     prompt = f"{prefix}{user.replace('{prompt}', item['prompt'])}{suffix}"
-    return llm(prompt, max_tokens=max_tokens)
+    result = llm(prompt, max_tokens=max_tokens)
+    thread = threading.Thread(target=clean_memory)
+    thread.start()
+    return result
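
Note on the new cleanup path: inside clean_memory() the assignment llm = None rebinds a function-local name, and the endpoint likewise assigns its Llama instance to a local llm, so the module-level llm = None added just after app = FastAPI() is never updated. As committed, the background thread therefore effectively just runs gc.collect(), with the model itself being released when the request-local reference goes out of scope. Below is a minimal sketch of the pattern the commit appears to aim for, assuming the model handle is meant to live in the module-level llm; the global declarations and the release_model_in_background helper are illustrative additions, not part of this commit.

import gc
import threading

llm = None  # module-level reference to the currently loaded Llama model


def clean_memory():
    global llm      # rebind the module-level name, not a function-local one
    llm = None      # drop the last reference to the loaded model
    gc.collect()    # then ask the garbage collector to reclaim its memory


def release_model_in_background():
    # Hypothetical helper mirroring the endpoint's new tail: run the cleanup
    # off the request path so the HTTP response is not delayed by gc.collect().
    threading.Thread(target=clean_memory).start()

For reference, a hypothetical client call against the endpoint, assuming the service runs locally on port 8000 and that the request body carries the model name under a "model" key (the "prompt" key is the one read by the endpoint):

import requests

resp = requests.post(
    "http://localhost:8000/llm_on_cpu",
    json={"model": "TinyLLama 1b 4_K_M 2048", "prompt": "Hello!"},
)
print(resp.json())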