Update app.py
app.py CHANGED
@@ -86,10 +86,16 @@ if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):
     embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
     vectordb = FAISS.load_local(CFG.Output_folder + '/faiss_index_ml_papers', embeddings, allow_dangerous_deserialization=True)
 
-
+
 def build_model(model_repo = CFG.model_name):
     tokenizer = AutoTokenizer.from_pretrained(model_repo)
     model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+        print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+    else:
+        device = torch.device("cpu")
+        print("Using CPU")
     device = torch.device("cuda")
     model = model.to(device)
 
@@ -107,12 +113,7 @@ terminators = [
 ]
 
 
-
-# device = torch.device("cuda")
-# print(f"Using GPU: {torch.cuda.get_device_name(device)}")
-# else:
-# device = torch.device("cpu")
-# print("Using CPU")
+
 
 pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)
 
@@ -167,7 +168,7 @@ qa_chain = RetrievalQA.from_chain_type(
     verbose = False
 )
 
-
+
 def wrap_text_preserve_newlines(text, width=1500):
     # Split the input text into lines based on newline characters
     lines = text.split('\n')
@@ -180,7 +181,7 @@ def wrap_text_preserve_newlines(text, width=1500):
 
     return wrapped_text
 
-
+
 def process_llm_response(llm_response):
     ans = wrap_text_preserve_newlines(llm_response['result'])
 
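For context, a minimal sketch of how build_model reads after this commit, with the new device check folded in. It assumes the imports and the CFG configuration object defined earlier in app.py (torch, AutoTokenizer, AutoModelForCausalLM, CFG.model_name); only the part of the function visible in this diff is shown.

def build_model(model_repo = CFG.model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_repo)
    model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
    # Added in this commit: check whether a GPU is available and report which device was found.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"Using GPU: {torch.cuda.get_device_name(device)}")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    # Kept from the previous version: device is still assigned "cuda" unconditionally here,
    # so the branch above currently only affects what gets printed.
    device = torch.device("cuda")
    model = model.to(device)
    # ... rest of build_model unchanged (not shown in this diff)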