Update app.py
app.py CHANGED
@@ -311,7 +311,7 @@ if not os.path.exists(f"{CFG.Embeddings_path}/index.faiss"):
 embeddings = HuggingFaceInstructEmbeddings(model_name=CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
 vectordb = FAISS.load_local(f"{CFG.Output_folder}/faiss_index_ml_papers", embeddings, allow_dangerous_deserialization=True)
 
-
+
 def build_model(model_repo=CFG.model_name):
     tokenizer = AutoTokenizer.from_pretrained(model_repo)
     model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
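Note on the unchanged `build_model` helper above: `attn_implementation="flash_attention_2"` only works when the `flash-attn` package is installed and the GPU is Ampere or newer; otherwise `from_pretrained` raises at load time. A guarded fallback would keep the Space booting on other hardware. This is a suggested sketch, not part of the commit; it reuses `CFG` from app.py, and the `sdpa` fallback is an assumption about acceptable behavior:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def build_model(model_repo=CFG.model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_repo)
    try:
        # Fast path: FlashAttention-2 kernels (needs the flash-attn wheel
        # and an Ampere-or-newer GPU).
        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            attn_implementation="flash_attention_2",
            torch_dtype=torch.bfloat16,
        )
    except (ImportError, ValueError):
        # Suggested fallback (not in this commit): PyTorch's built-in
        # scaled-dot-product attention, available on any CUDA GPU.
        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            attn_implementation="sdpa",
            torch_dtype=torch.bfloat16,
        )
    return tokenizer, model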
@@ -340,16 +340,37 @@ Question: {question}
 PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
 
 retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": CFG.k})
-
+
 
 def process_llm_response(llm_response):
     ans = textwrap.fill(llm_response['result'], width=1500)
     sources_used = ' \n'.join([f"{source.metadata['source'].split('/')[-1][:-4]} - page: {str(source.metadata['page'])}" for source in llm_response['source_documents']])
     return f"{ans}\n\nSources:\n{sources_used}"
 
-
+
+
+
+
+
+@spaces.GPU
 def llm_ans(message, history):
+    tok, model = build_model()
+    terminators = [tok.eos_token_id, 32007, 32011, 32001, 32000]
+    pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)
+    llm = HuggingFacePipeline(pipeline=pipe)
+    qa_chain = RetrievalQA(llm=llm, retriever=retriever, prompt_template=PROMPT, return_source_documents=True, verbose=False)
+
+
     llm_response = qa_chain.invoke(message)
     return process_llm_response(llm_response)
 
-
+
+demo = gr.ChatInterface(
+    fn=llm_ans,
+    examples=[["Write me a poem about Machine Learning."]],
+    # multimodal=False,
+    stop_btn="Stop Generation",
+    title="Chat With LLMs",
+    description="Now Running Phi3-ORPO",
+)
+demo.launch()
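Review note on the chain construction added at new line 361: in released LangChain versions, `RetrievalQA` is not built by passing `llm=`, `retriever=`, and `prompt_template=` to the class directly (the class expects a `combine_documents_chain`), so that call will likely fail validation at runtime. The documented path is the `RetrievalQA.from_chain_type` factory, with the prompt routed through `chain_type_kwargs`. A sketch of the equivalent construction, reusing the `llm`, `retriever`, and `PROMPT` objects from this diff:

from langchain.chains import RetrievalQA

# "stuff" concatenates the k retrieved chunks into PROMPT's {context} slot.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=False,
)

The rest of the flow is unchanged: `qa_chain.invoke(message)` still returns a dict carrying `result` and `source_documents`, which is exactly what `process_llm_response` consumes.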
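A smaller maintainability point: the hard-coded terminator ids (32000, 32001, 32007, 32011) are Phi-3 special tokens, and they will silently stop matching if `CFG.model_name` ever points at a different model. Resolving them through the tokenizer is safer. A sketch, assuming Phi-3's usual special-token names (`<|end|>`, `<|endoftext|>`, `<|assistant|>`, `<|user|>`):

# Resolve stop-token ids by name instead of hard-coding integer ids.
stop_token_names = ["<|end|>", "<|endoftext|>", "<|assistant|>", "<|user|>"]
terminators = [tok.eos_token_id] + [
    tid
    for tid in tok.convert_tokens_to_ids(stop_token_names)
    if tid is not None and tid != tok.unk_token_id
]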
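Design note on the `@spaces.GPU` change: on ZeroGPU hardware the GPU is only attached for the duration of the decorated call, which is why the model, pipeline, and chain are all rebuilt inside `llm_ans`; the trade-off is that every chat turn pays the full model-load latency. On a persistent-GPU Space the usual pattern is to build once and reuse. A minimal sketch of that variant, reusing the imports and globals already present in app.py (the module-level cache is illustrative, not part of this commit):

_QA_CHAIN = None  # built lazily on the first request, then reused

def llm_ans(message, history):
    global _QA_CHAIN
    if _QA_CHAIN is None:
        tok, model = build_model()
        pipe = pipeline(
            task="text-generation",
            model=model,
            tokenizer=tok,
            eos_token_id=[tok.eos_token_id, 32007, 32011, 32001, 32000],
            do_sample=True,
            max_new_tokens=CFG.max_new_tokens,
            temperature=CFG.temperature,
            top_p=CFG.top_p,
            repetition_penalty=CFG.repetition_penalty,
        )
        llm = HuggingFacePipeline(pipeline=pipe)
        _QA_CHAIN = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            chain_type_kwargs={"prompt": PROMPT},
            return_source_documents=True,
        )
    return process_llm_response(_QA_CHAIN.invoke(message))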