Ankitnau25 committed
Commit a2585c8 · Parent(s): 4a0c081

Add application file

Files changed (3)
  1. Dockerfile +17 -0
  2. app.py +201 -0
  3. requirements.txt +11 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
FROM python:3.9

RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
EXPOSE 7860
ENV GRADIO_SERVER_NAME="0.0.0.0"

CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,201 @@
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

import re
from typing import Any, Dict, Iterator, List, Optional

import gradio as gr
from langchain_core.callbacks import CallbackManagerForLLMRun
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import AIMessage, AIMessageChunk, BaseMessage, HumanMessage
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult

# Load the fine-tuned model in 4-bit.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Ankitnau25/govtbot-llama3.1-v1",
    max_seq_length=8192,
    load_in_4bit=True,
    # token="hf_...",  # needed only for gated models such as meta-llama/Llama-2-7b-hf
)

# Attach the chat template to the tokenizer.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="alpaca",  # supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
    map_eos_token=True,  # maps <|im_end|> to </s>
)

FastLanguageModel.for_inference(model)  # enable Unsloth's native 2x faster inference
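The loading call above and the `.to("cuda")` below assume a GPU runtime, since Unsloth's 4-bit path needs CUDA. A hedged sketch that only makes that assumption explicit (it does not add CPU support):

import torch

# Fail fast with a clear message when no GPU is available (assumption: the
# Space is configured with GPU hardware; this adds no CPU fallback).
if not torch.cuda.is_available():
    raise RuntimeError("A CUDA GPU is required: inputs are moved to 'cuda' in predict().")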

def predict(inp_text: str) -> str:
    """Generate a single reply for the given prompt and return only the response text."""
    messages = [
        {"from": "human", "value": inp_text},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # must be set for generation
        return_tensors="pt",
    ).to("cuda")
    model.generation_config.pad_token_id = tokenizer.pad_token_id
    outputs = model.generate(
        input_ids=inputs,
        use_cache=True,
        temperature=0.1,
        max_new_tokens=512,
    )
    result = tokenizer.batch_decode(outputs)
    return filter_user_assistant_msgs(result[0])
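As a hedged aside (not part of the committed file), the same model can also generate token by token with transformers' TextIteratorStreamer; a minimal sketch of what a streaming variant of predict() could look like:

from threading import Thread

from transformers import TextIteratorStreamer

def predict_stream(inp_text: str):
    """Sketch: yield decoded text chunks as they are generated."""
    messages = [{"from": "human", "value": inp_text}]
    inputs = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(
        target=model.generate,
        kwargs=dict(input_ids=inputs, streamer=streamer, use_cache=True, max_new_tokens=512),
    ).start()
    for chunk in streamer:
        yield chunk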

def filter_user_assistant_msgs(text: str) -> str:
    """Extract the assistant's reply from the decoded Alpaca-style output.

    Falls back to the full decoded text when the pattern does not match.
    """
    msg_pattern = r".*Response:\n(.*?)<\|im_end\|>"
    match = re.match(msg_pattern, text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return text
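A hedged illustration of what that regex is meant to pull out; the sample string below is made up, not real model output:

sample = (
    "### Instruction:\nHi\n\n"
    "### Response:\nHello! How can I help?<|im_end|>"
)
assert filter_user_assistant_msgs(sample) == "Hello! How can I help?"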


# Custom LangChain chat model that wraps the local Unsloth model.
class CustomChatModelAdvanced(BaseChatModel):
    """A custom chat model wrapping the locally loaded Unsloth model.

    `_generate` delegates to `predict()` above; `_stream` is still the
    character-echo placeholder from the LangChain custom chat model example.

    Example:

        .. code-block:: python

            model = CustomChatModelAdvanced(model_name="unsloth_llama3.1", n=4)
            result = model.invoke([HumanMessage(content="hello")])
            result = model.batch([[HumanMessage(content="hello")],
                                  [HumanMessage(content="world")]])
    """

    model_name: str
    """The name of the model."""
    n: int
    """The number of characters of the last message echoed by `_stream`."""

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Generate a reply for the latest message using the local model.

        Args:
            messages: the prompt composed of a list of messages.
            stop: a list of strings on which the model should stop generating.
                If generation stops due to a stop token, the stop token itself
                should be included in the output; this is not enforced here.
            run_manager: a run manager with callbacks for the LLM.
        """
        last_message = messages[-1]
        # Pass the message text (not the message object) to the local model.
        tokens = predict(last_message.content)
        message = AIMessage(
            content=tokens,
            additional_kwargs={},  # used for additional payload, e.g. function calls
            response_metadata={
                "time_in_seconds": 3,  # placeholder value kept from the LangChain example
            },
        )
        generation = ChatGeneration(message=message)
        return ChatResult(generations=[generation])

    def _stream(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[ChatGenerationChunk]:
        """Stream the output of the model.

        Note: this is still the placeholder from the LangChain example and only
        echoes the first `n` characters of the last message; it does not stream
        from the local model. If `_stream` were removed, streaming requests
        would be handled automatically through `_generate`.

        Args:
            messages: the prompt composed of a list of messages.
            stop: a list of strings on which the model should stop generating.
            run_manager: a run manager with callbacks for the LLM.
        """
        last_message = messages[-1]
        tokens = last_message.content[: self.n]

        for token in tokens:
            chunk = ChatGenerationChunk(message=AIMessageChunk(content=token))
            if run_manager:
                # Optional in newer versions of LangChain; on_llm_new_token
                # is otherwise called automatically.
                run_manager.on_llm_new_token(token, chunk=chunk)
            yield chunk

        # Attach some response metadata in a final empty chunk.
        chunk = ChatGenerationChunk(
            message=AIMessageChunk(content="", response_metadata={"time_in_sec": 3})
        )
        if run_manager:
            run_manager.on_llm_new_token(token, chunk=chunk)
        yield chunk

    @property
    def _llm_type(self) -> str:
        """Get the type of language model used by this chat model."""
        return "unsloth-govtbot-chat-model"

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Return a dictionary of identifying parameters.

        This information is used by the LangChain callback system, which is
        used for tracing, to make it possible to monitor LLMs.
        """
        return {
            # The model name lets users set custom token-counting rules in LLM
            # monitoring applications (e.g. per-token pricing in LangSmith).
            "model_name": self.model_name,
        }


llm_model = CustomChatModelAdvanced(model_name="unsloth_llama3.1", n=4)
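A hedged one-off check of the wrapper outside the UI; the question is invented, and invoke() is the standard LangChain entry point:

reply = llm_model.invoke([HumanMessage(content="How do I apply for a ration card?")])
print(reply.content)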


def predict_chat(message, history):
    """Gradio chat handler: convert the (user, bot) history into LangChain messages."""
    history_langchain_format = []
    for human, ai in history:
        history_langchain_format.append(HumanMessage(content=human))
        history_langchain_format.append(AIMessage(content=ai))
    history_langchain_format.append(HumanMessage(content=message))
    # invoke() replaces the deprecated direct-call syntax llm_model(...).
    response = llm_model.invoke(history_langchain_format)
    return response.content


gr.ChatInterface(predict_chat).launch(debug=True)
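A hedged headless sanity check, to run in place of the blocking launch() call rather than after it; the example history is invented:

history = [("Hello!", "Hello! How can I help you today?")]  # one prior (user, bot) turn
print(predict_chat("Which services does this bot cover?", history))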
requirements.txt ADDED
@@ -0,0 +1,11 @@
unsloth[colab-new] @ git+https://github.com/sebdg/unsloth.git
xformers<0.0.27
trl<0.9.0
peft
accelerate
bitsandbytes
gradio
gradio[oauth]
tensorboard
langchain
langchain-community