# findemov3.4 / app.py
import gradio as gr
import torch
from unsloth import FastLanguageModel
from transformers import pipeline
from langchain_huggingface.llms import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory

# Model loading configuration for Unsloth's FastLanguageModel.
max_seq_length = 2048  # context length; Unsloth handles RoPE scaling internally
dtype = None  # None for auto-detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = True  # 4-bit quantization to reduce memory usage; set to False for full precision
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = "Danielrahmai1991/finbro-v0.1.0-llama-3-8B-instruct-1m",
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
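
# Switch the Unsloth model into its optimized inference mode, then wrap it in a
# standard transformers text-generation pipeline so LangChain can drive it.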
FastLanguageModel.for_inference(model)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)

gpu_llm = HuggingFacePipeline(
    pipeline=pipe,
    batch_size=5,  # adjust as needed based on GPU memory and model size
    model_kwargs={
        "temperature": 0.75,
        "max_length": 512,
        "max_new_tokens": 256,
        "repetition_penalty": 1.15,
        "trust_remote_code": True,
    },
)
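
# Stateless chain: a simple Alpaca-style prompt with no conversation history.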
alpaca_prompt_simple = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{question}
### Input:
### Response:
"""
prompt = PromptTemplate.from_template(alpaca_prompt_simple)
llm_chain_model = LLMChain(prompt=prompt, llm=gpu_llm.bind(skip_prompt=True))
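
# Example (hypothetical question) of invoking the stateless chain directly:
#   llm_chain_model.invoke({"question": "What is compound interest?"})["text"]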
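
# One illustrative Q&A pair used to prime the chat prompt in the "Without memory" path.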
examples = [
{
"query": "what is forex?",
"answer": "Forex is an abbreviation for foreign exchange. It involves trading currencies from different countries with one another at the current market price."
},
]
example_prompt = ChatPromptTemplate.from_messages(
[
("human", "{query}"),
("ai", "{answer}"),
]
)
few_shot_prompt = FewShotChatMessagePromptTemplate(
example_prompt=example_prompt,
examples=examples,
)
# Memory-backed chain: the same Alpaca-style prompt, extended with {chat_history}
# and a ConversationBufferMemory so follow-up questions keep context.
alpaca_prompt_memory = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
{chat_history}
### Instruction:
{question}
### Input:
### Response:
"""
prompt = PromptTemplate(
input_variables=["chat_history", "question"], template=alpaca_prompt_memory
)
memory = ConversationBufferMemory(memory_key="chat_history")
llm_chain_memory = LLMChain(
llm=gpu_llm.bind(skip_prompt=True),
prompt=prompt,
verbose=True,
memory=memory,
)
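
# Example (hypothetical follow-up questions): both calls share the same
# ConversationBufferMemory, so the second question can refer back to the first:
#   llm_chain_memory.predict(question="What is a stock?")
#   llm_chain_memory.predict(question="How is it different from a bond?")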
# question = "give me a suggestion about investment"
def greet(question, model_type):
    """Route the question to the memory-backed or the stateless few-shot chain."""
    print(f"question is {question}")
    if model_type == "With memory":
        print("With memory")
        response_of_llm = llm_chain_memory.predict(question=question)
    else:
        print("Without memory")
        query = question
        final_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", "You are a financial AI assistant."),
                few_shot_prompt,
                ("human", "{userInput}"),
            ]
        )
        # The formatted chat prompt is passed to the stateless chain as its question.
        messages = final_prompt.format(userInput=query)
        ai_out = llm_chain_model.invoke(messages)
        response_of_llm = ai_out["text"]
    print(f"out is: {response_of_llm}")
    return response_of_llm
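
# Gradio UI: a question text box plus a dropdown that chooses between the
# memory-backed chain and the stateless few-shot chain.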
demo = gr.Interface(
    fn=greet,
    inputs=[
        "text",
        gr.Dropdown(
            ["With memory", "Without memory"],
            label="Memory status",
            info="Using memory makes the output slower but stronger",
        ),
    ],
    outputs="text",
)
demo.launch(debug=True, share=True)
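
# Note: share=True exposes the demo via a temporary public Gradio link; debug=True
# keeps the process attached so errors appear in the console.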