import gradio as gr

from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory

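# Load the fine-tuned Llama-3 8B instruct model with Unsloth in 4-bit for fast inference.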
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048  # any length works; RoPE scaling is handled automatically
dtype = None  # None for auto-detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = True  # 4-bit quantization to reduce memory usage; set to False for full precision

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Danielrahmai1991/finbro-v0.1.0-llama-3-8B-instruct-1m",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

from langchain_huggingface.llms import HuggingFacePipeline
from transformers import pipeline
FastLanguageModel.for_inference(model)

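# Wrap the Unsloth model and tokenizer in a standard transformers text-generation pipeline.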
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)


gpu_llm = HuggingFacePipeline(
    pipeline=pipe,
    batch_size=5,  # adjust as needed based on GPU memory and model size.
    # Generation settings belong in pipeline_kwargs; model_kwargs is only applied when
    # the pipeline is built via from_model_id, so it was silently ignored here.
    pipeline_kwargs={"do_sample": True, "temperature": 0.75, "max_new_tokens": 256, "repetition_penalty": 1.15},
)

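# Alpaca-style prompt for the stateless ("Without memory") chain.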
alpaca_prompt_simple = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{question}

### Input:


### Response:
"""

prompt = PromptTemplate.from_template(alpaca_prompt_simple)
llm_chain_model = LLMChain(prompt=prompt, llm=gpu_llm.bind(skip_prompt=True))


from langchain.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate



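# Single few-shot example that seeds the "Without memory" chat prompt.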
examples = [
    {
        "query": "what is forex?",
        "answer": "Forex is an abbreviation for foreign exchange. It involves trading currencies from different countries with one another at the current market price.",
    },
]
example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{query}"),
        ("ai", "{answer}"),
    ]
)


few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)




# Chain with conversation memory: ConversationBufferMemory stores prior turns and injects them into {chat_history}.

alpaca_prompt_memory = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
{chat_history}

### Instruction:
{question}

### Input:


### Response:
"""

prompt = PromptTemplate(
    input_variables=["chat_history", "question"], template=alpaca_prompt_memory
)
memory = ConversationBufferMemory(memory_key="chat_history")

llm_chain_memory = LLMChain(
    llm=gpu_llm.bind(skip_prompt=True),
    prompt=prompt,
    verbose=True,
    memory=memory,
)

# question = "give me a suggestion about investment"

def greet(question, model_type):
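    """Route the question to the memory-backed chain or the stateless few-shot chain."""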
    print(f"question is {question}")
    if model_type == "With memory":
        print("With memory")
        response_of_llm = llm_chain_memory.predict(question=question)
    else:
        print("Without memory")
        query = question
        final_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", "You are a financial AI assistant."),
                few_shot_prompt,
                ("human", "{userInput}"),
            ]
        )
        messages = final_prompt.format(userInput=query)
        
        ai_out = llm_chain_model.invoke(messages)
        response_of_llm = ai_out['text']

    print(f"out is: {response_of_llm}")
    return response_of_llm

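# Gradio UI: a question textbox plus a dropdown that switches between the two chains.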
demo = gr.Interface(
    fn=greet,
    inputs=["text", gr.Dropdown(["With memory", "Without memory"], label="Memory status",
                                info="With memory enabled, responses are slower but keep conversation context")],
    outputs="text",
)
demo.launch(debug=True, share=True)