import gradio as gr

from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory

from unsloth import FastLanguageModel

# Settings for loading the model with Unsloth
max_seq_length = 2048
dtype = None  # None lets Unsloth auto-detect float16 vs. bfloat16 for the GPU
load_in_4bit = True  # 4-bit quantization so the 8B model fits in less VRAM
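
# Load the finance-tuned Llama-3 8B instruct model in 4-bit via Unsloth.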
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Danielrahmai1991/finbro-v0.1.0-llama-3-8B-instruct-1m",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
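
# Enable Unsloth's optimized inference mode, then expose the model through a
# standard transformers text-generation pipeline so LangChain can call it.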
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import pipeline

FastLanguageModel.for_inference(model)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
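
# Wrap the pipeline as a LangChain LLM. Generation settings go in
# pipeline_kwargs so they are forwarded on every call; temperature only has
# an effect when do_sample=True.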
gpu_llm = HuggingFacePipeline(
    pipeline=pipe,
    batch_size=5,
    pipeline_kwargs={
        "temperature": 0.75,
        "do_sample": True,
        "max_new_tokens": 256,
        "repetition_penalty": 1.15,
    },
)
from langchain_core.prompts import PromptTemplate
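
# Alpaca-style template used by the chain without conversation memory.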
alpaca_prompt_simple = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. |
|
|
|
### Instruction: |
|
{question} |
|
|
|
### Input: |
|
|
|
|
|
### Response: |
|
""" |
|
|
|
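
# skip_prompt=True tells the pipeline wrapper to strip the echoed prompt from
# the output, so only the newly generated response is returned.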
prompt = PromptTemplate.from_template(alpaca_prompt_simple)
llm_chain_model = LLMChain(prompt=prompt, llm=gpu_llm.bind(skip_prompt=True))

from langchain.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
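
# One worked question/answer pair used as a few-shot example to steer the
# model toward short, factual finance answers.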
examples = [
    {
        "query": "what is forex?",
        "answer": "Forex is an abbreviation for foreign exchange. It involves trading currencies from different countries with one another at the current market price.",
    },
]

example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{query}"),
        ("ai", "{answer}"),
    ]
)
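
# Renders every entry in `examples` as a human/AI message pair inside the
# final chat prompt.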
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)
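
# Variant of the Alpaca template with a {chat_history} slot, used by the
# conversation-memory chain.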
alpaca_prompt_memory = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. |
|
{chat_history} |
|
|
|
### Instruction: |
|
|
|
{question} |
|
|
|
|
|
|
|
### Input: |
|
|
|
### Response: |
|
""" |
|
|
|
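
# ConversationBufferMemory keeps the full dialogue and injects it into the
# prompt through the "chat_history" variable on each call.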
prompt = PromptTemplate(
    input_variables=["chat_history", "question"], template=alpaca_prompt_memory
)
memory = ConversationBufferMemory(memory_key="chat_history")

llm_chain_memory = LLMChain(
    llm=gpu_llm.bind(skip_prompt=True),
    prompt=prompt,
    verbose=True,
    memory=memory,
)
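
# Gradio callback: route the question to the memory chain or the few-shot
# chain, depending on the dropdown selection.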
def greet(question, model_type):
    print(f"question is {question}")
    if model_type == "With memory":
        print("With memory")
        response_of_llm = llm_chain_memory.predict(question=question)
    else:
        print("Without memory")
        query = question
        final_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", "You are a financial AI assistant."),
                few_shot_prompt,
                ("human", "{userInput}"),
            ]
        )
        # Format the chat messages into a single string and run it through the
        # no-memory chain; LLMChain returns the completion under the "text" key.
        messages = final_prompt.format(userInput=query)
        ai_out = llm_chain_model.invoke(messages)
        response_of_llm = ai_out["text"]

    print(f"out is: {response_of_llm}")
    return response_of_llm
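
# Two inputs: a free-text question and a dropdown that toggles memory use.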
demo = gr.Interface(fn=greet, inputs=["text", gr.Dropdown( |
|
["With memory", "Without memory"], label="Memory status", info="With using memory, the output will be slow but strong" |
|
),], outputs="text") |
|
demo.launch(debug=True, share=True) |
|
|