E-slam committed on
Commit
1a2217b
·
verified ·
1 Parent(s): 5d98430

Delete Allam_Backend_HF.py

Browse files
Files changed (1) hide show
  1. Allam_Backend_HF.py +0 -267
Allam_Backend_HF.py DELETED
@@ -1,267 +0,0 @@
1
- import pandas as pd
2
- import faiss
3
- import numpy as np
4
- import torch
5
- import requests
6
- import os
7
- #import huggingface_hub
8
# Hugging Face access token read from the process environment; it is sent as
# the bearer token by embed_single_text().
hf_token = os.environ.get("hf_token")

# Article table loaded from the Excel workbook, plus its text column as a
# plain Python list.
df = pd.read_excel("Allam_SA_Articles.xlsx")
input_texts = df["Article_text"].tolist()

# Article embeddings loaded from disk.
# NOTE(review): presumably row i of this array is the embedding of row i of
# df (query_search uses FAISS ids as df row positions) - confirm.
MOJ_embeddings = np.load("Allam_embeddings.npy")
14
-
15
-
16
def embed_single_text(query):
    """Fetch the embedding of ``query`` from the hosted E5 embeddings endpoint.

    Args:
        query: Text to embed.

    Returns:
        A ``torch.Tensor`` built from the JSON body of the response, or
        ``None`` when the endpoint answers with a non-200 status (the status
        code is printed in that case).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the endpoint does not answer within the timeout.
    """
    response = requests.get(
        "https://allam-llm-e5-embeddings.hf.space/e5_embeddings",
        # Let requests URL-encode the query; interpolating it into the URL
        # by hand breaks on characters such as '&', '#' or '+'.
        params={"query": query},
        headers={"Authorization": f"Bearer {hf_token}"},
        # Without a timeout a stalled endpoint blocks the caller forever.
        timeout=30,
    )

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    return torch.tensor(response.json())
30
-
31
-
32
# FAISS index over the article embeddings. IndexFlatIP is a flat (exhaustive)
# index that scores candidates by inner product.
dimension = MOJ_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(MOJ_embeddings)
36
-
37
def query_search(query, K):
    """Return the FAISS ids of the ``K`` stored embeddings closest to ``query``.

    The ids are used by callers as row positions into ``df``.

    Args:
        query: Text to search for.
        K: Number of neighbours to request from the index.

    Returns:
        A list of at most ``K`` integer ids, best match first.

    Raises:
        RuntimeError: If no embedding could be obtained for ``query``.
    """
    query_embedding = embed_single_text(query)
    if query_embedding is None:
        # embed_single_text() signals an HTTP failure with None; fail with a
        # clear message instead of an obscure error from inside FAISS.
        raise RuntimeError("Could not compute an embedding for the query")

    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim);
    # the embedding arrives as a torch tensor and may be 1-D.
    query_matrix = np.ascontiguousarray(
        np.atleast_2d(np.asarray(query_embedding, dtype=np.float32))
    )
    _, indices = index.search(query_matrix, K)

    # FAISS pads the result with -1 when fewer than K vectors are indexed.
    return [int(idx) for idx in indices[0] if idx >= 0]
48
-
49
- from sklearn.feature_extraction.text import TfidfVectorizer
50
- from sklearn.metrics.pairwise import cosine_similarity
51
-
52
def return_top5_chunks(query):
    """Build a text block with the five chunks most similar to ``query``.

    The 15 nearest articles (per ``query_search``) are split into chunks of
    at most 150 whitespace-separated words. The chunks are ranked against
    ``query`` by TF-IDF cosine similarity and the best five are formatted.

    Args:
        query: The user question.

    Returns:
        A string of ``"Index: <row label>,\\nChunk: <text>\\n"`` entries
        separated by ``"##########\\n"`` lines, best match first. An empty
        string is returned when the retrieved articles contain no text.
    """
    max_words = 150
    top_k = 5

    matching_indices = query_search(query, 15)
    relevant_rows = df.iloc[matching_indices]

    # Collect (chunk, row label) pairs directly. Adding a temporary column to
    # a slice of df would trigger pandas' SettingWithCopy warning.
    chunked_texts = []
    for row_label, article_text in relevant_rows['Article_text'].items():
        words = article_text.split()
        chunked_texts.extend(
            (' '.join(words[start:start + max_words]), row_label)
            for start in range(0, len(words), max_words)
        )

    if not chunked_texts:
        # Nothing to rank; the similarity step cannot run on zero documents.
        return ''

    documents = [chunk for chunk, _ in chunked_texts]
    # The query is appended as the last document so it shares the vocabulary.
    tfidf_matrix = TfidfVectorizer().fit_transform(documents + [query])
    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
    best_first = similarities.argsort()[-top_k:][::-1]

    return "##########\n".join(
        f"Index: {chunked_texts[i][1]},\nChunk: {chunked_texts[i][0]}\n"
        for i in best_first
    )
91
-
92
-
93
- import requests
94
-
95
-
96
# IBM Cloud API key. It is read from the environment so that the secret is
# never stored in source control. The key that was previously hard-coded here
# has been published and should be revoked and rotated.
api_key = os.getenv("IBM_API_KEY")
if not api_key:
    raise RuntimeError("IBM_API_KEY environment variable is not set")

# Exchange the API key for an IAM bearer token.
url = "https://iam.cloud.ibm.com/identity/token"

headers = {
    "Content-Type": "application/x-www-form-urlencoded"
}

data = {
    "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
    "apikey": api_key
}

# Fail fast on a stalled or rejected token request instead of surfacing a
# KeyError on 'access_token' below.
response = requests.post(url, headers=headers, data=data, timeout=30)
response.raise_for_status()
token_info = response.json()

# NOTE(review): the token is fetched once at import time; IAM access tokens
# are time-limited, so a long-running process presumably needs to refresh it
# - confirm against the IBM Cloud IAM documentation.
access_token = token_info['access_token']
112
-
113
-
114
-
115
-
116
def allam_response(context, query):
    """Ask the ALLaM model to answer ``query`` using only ``context``.

    Sends one greedy-decoding text-generation request (up to 900 new tokens)
    to the IBM Cloud text-generation endpoint, authenticated with the
    module-level ``access_token``.

    Args:
        context: Text the model is told to answer from.
        query: The user question.

    Returns:
        The ``generated_text`` of the first result in the response.

    Raises:
        Exception: If the endpoint answers with a non-200 status.
        requests.exceptions.RequestException: On connection errors or when
            the endpoint does not answer within the timeout.
    """
    url = "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"

    input_text_base = f"""
[Context]: {context}
[System]:
You are an Arabic frindley chatbot named مستنير.
You will be provided with an Arabic context ,
Your task is to extract and Answer for the questions only from the context provided
elaborate on the answer from the context
At the end of your response mention the Article : مادة
if no answer is found apologize

Question: {query}
"""

    body = {
        "input": input_text_base,
        "parameters": {
            "decoding_method": "greedy",
            "max_new_tokens": 900,
            "min_new_tokens": 0,
            "stop_sequences": [],
            "repetition_penalty": 1
        },
        "model_id": "sdaia/allam-1-13b-instruct",
        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }

    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {access_token}"
    }

    # Generation can be slow, but without any timeout a stalled connection
    # would block the caller forever: 10 s to connect, 120 s to read.
    response = requests.post(url, headers=headers, json=body, timeout=(10, 120))

    if response.status_code != 200:
        raise Exception("Non-200 response: " + str(response.text))

    payload = response.json()
    return payload['results'][0]['generated_text']
158
-
159
-
160
-
161
- import json
162
-
163
- import re
164
-
165
def index_num(text):
    """Extract the numeric chunk index from the model's JSON-style reply.

    Accepts both ``{"Index": "12"}`` and ``{"Index": 12}``.

    Args:
        text: Raw text returned by the model.

    Returns:
        The index as an ``int``, or ``None`` when the reply carries no
        numeric index (for example ``{"Index": "not_found"}``, which the
        prompt explicitly allows).
    """
    # The quote before the digits is optional because the model does not
    # always quote numbers.
    match = re.search(r'"Index"\s*:\s*"?(\d+)', text)
    # Calling int() on a missing match would raise TypeError, so report
    # "no index" explicitly instead.
    return int(match.group(1)) if match else None
171
-
172
def get_top_matching_chunk(text, query, max_words=500):
    """Return the chunk of ``text`` most similar to ``query``.

    ``text`` is split into consecutive chunks of at most ``max_words``
    whitespace-separated words, and the chunks are ranked against ``query``
    by TF-IDF cosine similarity.

    Args:
        text: Text to split and search.
        query: The user question.
        max_words: Maximum number of words per chunk.

    Returns:
        The best-matching chunk, with words joined by single spaces. An
        empty string is returned when ``text`` contains no words.
    """
    words = text.split()
    chunks = [
        ' '.join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]

    if not chunks:
        # Nothing to rank; the similarity step cannot run on zero documents.
        return ''
    if len(chunks) == 1:
        # Only one candidate: it is the answer, no vectorising needed.
        return chunks[0]

    # The query is appended as the last document so it shares the vocabulary.
    tfidf_matrix = TfidfVectorizer().fit_transform(chunks + [query])
    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

    return chunks[similarities.argmax()]
188
-
189
def reformat_indentation(text, indent_spaces=4):
    """Re-indent every line of ``text`` with ``indent_spaces`` spaces.

    Each line is stripped of its leading and trailing whitespace and then
    prefixed with the new indent. Lines are re-joined with ``"\\n"``, so a
    trailing newline in ``text`` is not preserved.
    """
    prefix = ' ' * indent_spaces
    return '\n'.join(prefix + raw_line.strip() for raw_line in text.splitlines())
197
-
198
def return_index_num(data_text, query):
    """Ask the ALLaM model which chunk in ``data_text`` answers ``query``.

    The prompt asks the model to reply with ``{"Index": "<n>"}`` or
    ``{"Index": "not_found"}``. The raw reply is returned unparsed.

    Args:
        data_text: Candidate chunks separated by ``##########`` lines.
        query: The user question.

    Returns:
        The ``generated_text`` of the first result in the response.

    Raises:
        Exception: If the endpoint answers with a non-200 status.
        requests.exceptions.RequestException: On connection errors or when
            the endpoint does not answer within the timeout.
    """
    url = "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"

    sys_prompt = """
Identify the **first** Index chunk with the answer to a given question.
Chunks are seperated by ##########
Respond only with **Json** format **do not return any words**:

{"Index": "extracted_Index"}

Or:

{"Index": "not_found"}

**No additional text allowed**.

"""
    sys_prompt += f"Question : {query}"

    input_text = f"""
[Context]: {data_text.strip()}
[System]: {sys_prompt.strip()}
"""

    # Strip the leading/trailing whitespace of every prompt line.
    input_text = reformat_indentation(input_text, indent_spaces=0)

    body = {
        "input": input_text,
        "parameters": {
            "decoding_method": "greedy",
            # The expected reply is a tiny JSON object, so 20 tokens suffice.
            "max_new_tokens": 20,
            "repetition_penalty": 1
        },
        "model_id": "sdaia/allam-1-13b-instruct",
        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }

    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        # access_token is the module-level IAM bearer token.
        "Authorization": f"Bearer {access_token}"
    }

    # Without a timeout a stalled connection would block the caller forever:
    # 10 s to connect, 60 s to read.
    response = requests.post(url, headers=headers, json=body, timeout=(10, 60))

    if response.status_code != 200:
        raise Exception("Non-200 response: " + str(response.text))

    payload = response.json()
    return payload['results'][0]['generated_text']
250
-
251
-
252
-
253
def allam_llm(q):
    """Answer question ``q`` with the retrieval + generation pipeline.

    Steps:
        1. Retrieve the five best chunks for ``q``.
        2. Ask the model which chunk's article holds the answer.
        3. Pick the best chunk of that article and generate the answer from it.

    If step 2 yields no usable index (the model replied ``not_found``, or the
    index is not a row label of ``df``), the retrieved chunks themselves are
    used as context, so the model can still answer or apologise as its
    prompt instructs.

    Args:
        q: The user question.

    Returns:
        The model's generated answer text.
    """
    chunks_text = return_top5_chunks(q)
    targeted_chunk = return_index_num(chunks_text, q)

    try:
        index_number = index_num(targeted_chunk)
    except TypeError:
        # index_num() found no numeric index in the model's reply.
        index_number = None

    if index_number is not None and index_number in df.index:
        text_to_chunk = df.loc[index_number, 'Article_text']
        context = get_top_matching_chunk(text_to_chunk, q)
    else:
        # No article could be pinned down; fall back to the retrieved chunks.
        context = chunks_text

    return allam_response(context, q)