Danielrahmai1991 committed on
Commit 0c46c89 · verified · Parent: e8a768a

Create app.py

Files changed (1)
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
+ import gradio as gr
+ 
+ from langchain_community.llms import LlamaCpp
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import LLMChain
+ from langchain_core.callbacks import StreamingStdOutCallbackHandler
+ from langchain.retrievers import TFIDFRetriever
+ from langchain.chains import RetrievalQA
+ from langchain.memory import ConversationBufferMemory
+ 
+ from unsloth import FastLanguageModel
+ import torch
+ 
+ max_seq_length = 2048  # Choose any! Unsloth handles RoPE scaling internally.
+ dtype = None  # None for auto detection. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
+ load_in_4bit = True  # Use 4-bit quantization to reduce memory usage. Can be False.
+ 
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name="Danielrahmai1991/finbro-v0.1.0-llama-3-8B-instruct-1m",
+     max_seq_length=max_seq_length,
+     dtype=dtype,
+     load_in_4bit=load_in_4bit,
+     # token = "hf_...",  # needed for gated models like meta-llama/Llama-2-7b-hf
+ )
+ 
+ from langchain_huggingface.llms import HuggingFacePipeline
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ from langchain_community.llms import HuggingFaceEndpoint  # imported but not used below
+ 
+ # Switch the fine-tuned model into inference mode and wrap it in a transformers pipeline.
+ FastLanguageModel.for_inference(model)
+ pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
+ 
+ gpu_llm = HuggingFacePipeline(
+     pipeline=pipe,
+     batch_size=5,  # adjust as needed based on GPU memory and model size
+     model_kwargs={"temperature": 0.75, "max_length": 512, "max_new_tokens": 256, "repetition_penalty": 1.15, "trust_remote_code": True},
+ )
+ 
+ from langchain_core.prompts import PromptTemplate
+ 
+ alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+ 
+ ### Instruction:
+ {question}
+ 
+ ### Input:
+ 
+ 
+ ### Response:
+ """
+ 
+ prompt = PromptTemplate.from_template(alpaca_prompt)
+ 
+ gpu_chain = prompt | gpu_llm.bind(stop=["\n\n"])
+ 
+ # question = "give me a suggestion about investment"
+ 
+ def greet(question, model_type):
+     print(f"question is {question}")
+     # Note: both branches currently invoke the same stateless chain;
+     # the template defined in the else branch is not used.
+     if model_type == "With memory":
+         response_of_llm = gpu_chain.invoke({"question": question})
+         print("chain invoked (with memory selected)")
+     else:
+         template = """You are the financial expert:
+ ### Instruction:
+ {question}
+ ### Input:
+ ### Response:
+ """
+         response_of_llm = gpu_chain.invoke({"question": question})
+ 
+     print(f"out is: {response_of_llm}")
+     return response_of_llm
+ 
+ demo = gr.Interface(
+     fn=greet,
+     inputs=[
+         "text",
+         gr.Dropdown(
+             ["With memory", "Without memory"],
+             label="Memory status",
+             info="With memory, the output is slower but stronger",
+         ),
+     ],
+     outputs="text",
+ )
+ demo.launch(debug=True, share=True)
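
Both dropdown branches in greet currently invoke the same stateless gpu_chain, and ConversationBufferMemory is imported but never wired in. A minimal sketch of one way a memory-backed path could work, assuming the gpu_chain defined above (the chat_history and greet_with_memory names are hypothetical, not part of this commit):

from collections import deque

chat_history = deque(maxlen=6)  # keep only the last few (question, answer) turns

def greet_with_memory(question):
    # Fold earlier turns into the instruction so the model sees the conversation so far.
    context = "\n".join(f"User: {q}\nAssistant: {a}" for q, a in chat_history)
    full_question = f"{context}\nUser: {question}" if context else question
    answer = gpu_chain.invoke({"question": full_question})  # the chain returns a plain string
    chat_history.append((question, answer))
    return answer

With a helper like this, the "With memory" branch of greet could call greet_with_memory(question) instead of invoking gpu_chain directly.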