# Gradio demo that serves the finbro Llama-3-8B financial assistant through
# LangChain, with an optional conversation-memory mode.
import gradio as gr
import torch
from unsloth import FastLanguageModel
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Danielrahmai1991/finbro-v0.1.0-llama-3-8B-instruct-1m",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
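# from_pretrained returns the Unsloth-patched model together with its tokenizer.
# With load_in_4bit=True the weights are loaded in 4-bit (via bitsandbytes),
# which keeps this 8B-parameter model within a single consumer/Colab GPU.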
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import pipeline
FastLanguageModel.for_inference(model)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
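# FastLanguageModel.for_inference() switches the model into Unsloth's faster
# inference mode; the transformers pipeline above then handles tokenization,
# generation and decoding. Quick sanity check (illustrative, uncomment to try):
# print(pipe("What is forex?")[0]["generated_text"])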
gpu_llm = HuggingFacePipeline(
    pipeline=pipe,
    batch_size=5,  # adjust as needed based on GPU memory and model size.
    # Note: with a pre-built pipeline, model_kwargs is generally not applied at
    # generation time; sampling parameters are better set when constructing `pipe`.
    model_kwargs={"temperature": 0.75, "max_length": 512, "max_new_tokens": 256, "repetition_penalty": 1.15, "trust_remote_code": True},
)
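# gpu_llm is a LangChain LLM backed by the local transformers pipeline, so it can
# be dropped into the chains below. Illustrative direct call:
# print(gpu_llm.invoke("What is forex?"))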
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
alpaca_prompt_simple = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{question}
### Input:
### Response:
"""
prompt = PromptTemplate.from_template(alpaca_prompt_simple)
llm_chain_model = LLMChain(prompt=prompt, llm=gpu_llm.bind(skip_prompt=True))
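# llm_chain_model fills {question} into the Alpaca template and runs the pipeline;
# binding skip_prompt=True strips the prompt text so only the model's answer is
# returned. Illustrative call:
# print(llm_chain_model.invoke({"question": "What is forex?"})["text"])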
from langchain.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
examples = [
    {
        "query": "what is forex?",
        "answer": "Forex is an abbreviation for foreign exchange. It involves trading currencies from different countries with one another at the current market price.",
    },
]
example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{query}"),
        ("ai", "{answer}"),
    ]
)
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)
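# The few-shot template renders each example as a human/ai message pair, roughly:
# few_shot_prompt.format_messages()
# -> [HumanMessage("what is forex?"), AIMessage("Forex is an abbreviation ...")]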
# with memory
from langchain.memory import ConversationBufferMemory
alpaca_prompt_memory = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
{chat_history}
### Instruction:
{question}
### Input:
### Response:
"""
prompt = PromptTemplate(
    input_variables=["chat_history", "question"], template=alpaca_prompt_memory
)
memory = ConversationBufferMemory(memory_key="chat_history")
llm_chain_memory = LLMChain(
    llm=gpu_llm.bind(skip_prompt=True),
    prompt=prompt,
    verbose=True,
    memory=memory,
)
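# The memory-backed chain injects prior turns into {chat_history}, so follow-up
# questions can refer back to earlier answers. Illustrative usage:
# llm_chain_memory.predict(question="What is forex?")
# llm_chain_memory.predict(question="Is it suitable for beginners?")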
# question = "give me suggestions about investment"
def greet(question, model_type):
    """Answer a question with either the memory-backed chain or the stateless few-shot chain."""
    print(f"question is {question}")
    if model_type == "With memory":
        print("With memory")
        response_of_llm = llm_chain_memory.predict(question=question)
    else:
        print("Without memory")
        query = question
        final_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", "You are a financial AI assistant."),
                few_shot_prompt,
                ("human", "{userInput}"),
            ]
        )
        # Render the few-shot chat prompt to a string and pass it through the
        # Alpaca-template chain as {question}.
        messages = final_prompt.format(userInput=query)
        ai_out = llm_chain_model.invoke(messages)
        response_of_llm = ai_out["text"]
    print(f"out is: {response_of_llm}")
    return response_of_llm
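# The same function can be exercised outside the UI (illustrative):
# print(greet("give me suggestions about investment", "Without memory"))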
demo = gr.Interface(fn=greet, inputs=["text", gr.Dropdown(
    ["With memory", "Without memory"], label="Memory status", info="With memory, responses are slower but can draw on the conversation history."
)], outputs="text")
demo.launch(debug=True, share=True)