Update app.py
Browse files
app.py
CHANGED
@@ -1,343 +1,3 @@
|
|
1 |
-
# import gradio as gr
|
2 |
-
# from langchain_mistralai.chat_models import ChatMistralAI
|
3 |
-
# from langchain.prompts import ChatPromptTemplate
|
4 |
-
# import os
|
5 |
-
# from pathlib import Path
|
6 |
-
# from typing import List, Dict, Optional
|
7 |
-
# import json
|
8 |
-
# import faiss
|
9 |
-
# import numpy as np
|
10 |
-
# from langchain.schema import Document
|
11 |
-
# from sentence_transformers import SentenceTransformer
|
12 |
-
# import pickle
|
13 |
-
# import re
|
14 |
-
|
15 |
-
# os.environ.get('HUGGINGFACE_TOKEN')
|
16 |
-
|
17 |
-
# class RAGLoader:
|
18 |
-
# def __init__(self,
|
19 |
-
# docs_folder: str = "./docs",
|
20 |
-
# splits_folder: str = "./splits",
|
21 |
-
# index_folder: str = "./index",):
|
22 |
-
# # model_name: str = "intfloat/multilingual-e5-large")
|
23 |
-
# """
|
24 |
-
# Initialise le RAG Loader
|
25 |
-
|
26 |
-
# Args:
|
27 |
-
# docs_folder: Dossier contenant les documents sources
|
28 |
-
# splits_folder: Dossier où seront stockés les morceaux de texte
|
29 |
-
# index_folder: Dossier où sera stocké l'index FAISS
|
30 |
-
# model_name: Nom du modèle SentenceTransformer à utiliser
|
31 |
-
# """
|
32 |
-
# self.docs_folder = Path(docs_folder)
|
33 |
-
# self.splits_folder = Path(splits_folder)
|
34 |
-
# self.index_folder = Path(index_folder)
|
35 |
-
# # self.model_name = model_name
|
36 |
-
|
37 |
-
# # Créer les dossiers s'ils n'existent pas
|
38 |
-
# self.splits_folder.mkdir(parents=True, exist_ok=True)
|
39 |
-
# self.index_folder.mkdir(parents=True, exist_ok=True)
|
40 |
-
|
41 |
-
# # Chemins des fichiers
|
42 |
-
# self.splits_path = self.splits_folder / "splits.json"
|
43 |
-
# self.index_path = self.index_folder / "faiss.index"
|
44 |
-
# self.documents_path = self.index_folder / "documents.pkl"
|
45 |
-
|
46 |
-
# # Initialiser le modèle
|
47 |
-
# # self.model = None
|
48 |
-
# self.index = None
|
49 |
-
# self.indexed_documents = None
|
50 |
-
|
51 |
-
# def encode(self,payload):
|
52 |
-
# token = os.environ.get('HUGGINGFACE_TOKEN')
|
53 |
-
# API_URL = "https://api-inference.huggingface.co/models/intfloat/multilingual-e5-large"
|
54 |
-
# headers = {"Authorization": "Bearer {token}"}
|
55 |
-
# response = requests.post(API_URL, headers=headers, json=payload)
|
56 |
-
# return response.json()
|
57 |
-
|
58 |
-
# def load_and_split_texts(self) -> List[Document]:
|
59 |
-
# """
|
60 |
-
# Charge les textes du dossier docs, les découpe en morceaux et les sauvegarde
|
61 |
-
# dans un fichier JSON unique.
|
62 |
-
|
63 |
-
# Returns:
|
64 |
-
# Liste de Documents contenant les morceaux de texte et leurs métadonnées
|
65 |
-
# """
|
66 |
-
# documents = []
|
67 |
-
|
68 |
-
# # Vérifier d'abord si les splits existent déjà
|
69 |
-
# if self._splits_exist():
|
70 |
-
# print("Chargement des splits existants...")
|
71 |
-
# return self._load_existing_splits()
|
72 |
-
|
73 |
-
# print("Création de nouveaux splits...")
|
74 |
-
# # Parcourir tous les fichiers du dossier docs
|
75 |
-
# for file_path in self.docs_folder.glob("*.txt"):
|
76 |
-
# with open(file_path, 'r', encoding='utf-8') as file:
|
77 |
-
# text = file.read()
|
78 |
-
|
79 |
-
# # Découper le texte en phrases
|
80 |
-
# # chunks = [chunk.strip() for chunk in text.split('.') if chunk.strip()]
|
81 |
-
# chunks = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
|
82 |
-
|
83 |
-
# # Créer un Document pour chaque morceau
|
84 |
-
# for i, chunk in enumerate(chunks):
|
85 |
-
# doc = Document(
|
86 |
-
# page_content=chunk,
|
87 |
-
# metadata={
|
88 |
-
# 'source': file_path.name,
|
89 |
-
# 'chunk_id': i,
|
90 |
-
# 'total_chunks': len(chunks)
|
91 |
-
# }
|
92 |
-
# )
|
93 |
-
# documents.append(doc)
|
94 |
-
|
95 |
-
# # Sauvegarder tous les splits dans un seul fichier JSON
|
96 |
-
# self._save_splits(documents)
|
97 |
-
|
98 |
-
# print(f"Nombre total de morceaux créés: {len(documents)}")
|
99 |
-
# return documents
|
100 |
-
|
101 |
-
# def _splits_exist(self) -> bool:
|
102 |
-
# """Vérifie si le fichier de splits existe"""
|
103 |
-
# return self.splits_path.exists()
|
104 |
-
|
105 |
-
# def _save_splits(self, documents: List[Document]):
|
106 |
-
# """Sauvegarde tous les documents découpés dans un seul fichier JSON"""
|
107 |
-
# splits_data = {
|
108 |
-
# 'splits': [
|
109 |
-
# {
|
110 |
-
# 'text': doc.page_content,
|
111 |
-
# 'metadata': doc.metadata
|
112 |
-
# }
|
113 |
-
# for doc in documents
|
114 |
-
# ]
|
115 |
-
# }
|
116 |
-
|
117 |
-
# with open(self.splits_path, 'w', encoding='utf-8') as f:
|
118 |
-
# json.dump(splits_data, f, ensure_ascii=False, indent=2)
|
119 |
-
|
120 |
-
# def _load_existing_splits(self) -> List[Document]:
|
121 |
-
# """Charge les splits depuis le fichier JSON unique"""
|
122 |
-
# with open(self.splits_path, 'r', encoding='utf-8') as f:
|
123 |
-
# splits_data = json.load(f)
|
124 |
-
|
125 |
-
# documents = [
|
126 |
-
# Document(
|
127 |
-
# page_content=split['text'],
|
128 |
-
# metadata=split['metadata']
|
129 |
-
# )
|
130 |
-
# for split in splits_data['splits']
|
131 |
-
# ]
|
132 |
-
|
133 |
-
# print(f"Nombre de splits chargés: {len(documents)}")
|
134 |
-
# return documents
|
135 |
-
|
136 |
-
# def load_index(self) -> bool:
|
137 |
-
# """
|
138 |
-
# Charge l'index FAISS et les documents associés s'ils existent
|
139 |
-
|
140 |
-
# Returns:
|
141 |
-
# bool: True si l'index a été chargé, False sinon
|
142 |
-
# """
|
143 |
-
# if not self._index_exists():
|
144 |
-
# print("Aucun index trouvé.")
|
145 |
-
# return False
|
146 |
-
|
147 |
-
# print("Chargement de l'index existant...")
|
148 |
-
# try:
|
149 |
-
# # Charger l'index FAISS
|
150 |
-
# self.index = faiss.read_index(str(self.index_path))
|
151 |
-
|
152 |
-
# # Charger les documents associés
|
153 |
-
# with open(self.documents_path, 'rb') as f:
|
154 |
-
# self.indexed_documents = pickle.load(f)
|
155 |
-
|
156 |
-
# print(f"Index chargé avec {self.index.ntotal} vecteurs")
|
157 |
-
# return True
|
158 |
-
|
159 |
-
# except Exception as e:
|
160 |
-
# print(f"Erreur lors du chargement de l'index: {e}")
|
161 |
-
# return False
|
162 |
-
|
163 |
-
# def create_index(self, documents: Optional[List[Document]] = None) -> bool:
|
164 |
-
# """
|
165 |
-
# Crée un nouvel index FAISS à partir des documents.
|
166 |
-
# Si aucun document n'est fourni, charge les documents depuis le fichier JSON.
|
167 |
-
|
168 |
-
# Args:
|
169 |
-
# documents: Liste optionnelle de Documents à indexer
|
170 |
-
|
171 |
-
# Returns:
|
172 |
-
# bool: True si l'index a été créé avec succès, False sinon
|
173 |
-
# """
|
174 |
-
# try:
|
175 |
-
# # # Initialiser le modèle si nécessaire
|
176 |
-
# # if self.model is None:
|
177 |
-
# # print("Chargement du modèle...")
|
178 |
-
# # self.model = SentenceTransformer(self.model_name)
|
179 |
-
|
180 |
-
# # Charger les documents si non fournis
|
181 |
-
# if documents is None:
|
182 |
-
# documents = self.load_and_split_texts()
|
183 |
-
|
184 |
-
# if not documents:
|
185 |
-
# print("Aucun document à indexer.")
|
186 |
-
# return False
|
187 |
-
|
188 |
-
# print("Création des embeddings...")
|
189 |
-
# texts = [doc.page_content for doc in documents]
|
190 |
-
# embeddings = self.encode(texts)
|
191 |
-
|
192 |
-
# # Initialiser l'index FAISS
|
193 |
-
# dimension = embeddings.shape[1]
|
194 |
-
# self.index = faiss.IndexFlatL2(dimension)
|
195 |
-
|
196 |
-
# # Ajouter les vecteurs à l'index
|
197 |
-
# self.index.add(np.array(embeddings).astype('float32'))
|
198 |
-
|
199 |
-
# # Sauvegarder l'index
|
200 |
-
# print("Sauvegarde de l'index...")
|
201 |
-
# faiss.write_index(self.index, str(self.index_path))
|
202 |
-
|
203 |
-
# # Sauvegarder les documents associés
|
204 |
-
# self.indexed_documents = documents
|
205 |
-
# with open(self.documents_path, 'wb') as f:
|
206 |
-
# pickle.dump(documents, f)
|
207 |
-
|
208 |
-
# print(f"Index créé avec succès : {self.index.ntotal} vecteurs")
|
209 |
-
# return True
|
210 |
-
|
211 |
-
# except Exception as e:
|
212 |
-
# print(f"Erreur lors de la création de l'index: {e}")
|
213 |
-
# return False
|
214 |
-
|
215 |
-
# def _index_exists(self) -> bool:
|
216 |
-
# """Vérifie si l'index et les documents associés existent"""
|
217 |
-
# return self.index_path.exists() and self.documents_path.exists()
|
218 |
-
|
219 |
-
# def get_retriever(self, k: int = 10):
|
220 |
-
# """
|
221 |
-
# Crée un retriever pour l'utilisation avec LangChain
|
222 |
-
|
223 |
-
# Args:
|
224 |
-
# k: Nombre de documents similaires à retourner
|
225 |
-
|
226 |
-
# Returns:
|
227 |
-
# Callable: Fonction de recherche compatible avec LangChain
|
228 |
-
# """
|
229 |
-
# if self.index is None:
|
230 |
-
# if not self.load_index():
|
231 |
-
# if not self.create_index():
|
232 |
-
# raise ValueError("Impossible de charger ou créer l'index")
|
233 |
-
|
234 |
-
# # if self.model is None:
|
235 |
-
# # self.model = SentenceTransformer(self.model_name)
|
236 |
-
|
237 |
-
# def retriever_function(query: str) -> List[Document]:
|
238 |
-
# # Créer l'embedding de la requête
|
239 |
-
# query_embedding = self.encode([query])[0]
|
240 |
-
|
241 |
-
# # Rechercher les documents similaires
|
242 |
-
# distances, indices = self.index.search(
|
243 |
-
# np.array([query_embedding]).astype('float32'),
|
244 |
-
# k
|
245 |
-
# )
|
246 |
-
|
247 |
-
# # Retourner les documents trouvés
|
248 |
-
# results = []
|
249 |
-
# for idx in indices[0]:
|
250 |
-
# if idx != -1: # FAISS retourne -1 pour les résultats invalides
|
251 |
-
# results.append(self.indexed_documents[idx])
|
252 |
-
|
253 |
-
# return results
|
254 |
-
|
255 |
-
# return retriever_function
|
256 |
-
|
257 |
-
# # Initialize the RAG system
|
258 |
-
# llm = ChatMistralAI(model="mistral-large-latest", mistral_api_key="QK0ZZpSxQbCEVgOLtI6FARQVmBYc6WGP")
|
259 |
-
# rag_loader = RAGLoader()
|
260 |
-
# retriever = rag_loader.get_retriever(k=10)
|
261 |
-
|
262 |
-
# prompt_template = ChatPromptTemplate.from_messages([
|
263 |
-
# ("system", """أنت مساعد مفيد يجيب على الأسئلة باللغة العربية باستخدام المعلومات المقدمة.
|
264 |
-
# استخدم المعلومات التالية للإجابة على السؤال:
|
265 |
-
|
266 |
-
# {context}
|
267 |
-
|
268 |
-
# إذا لم تكن المعلومات كافية للإجابة على السؤال بشكل كامل، قم بتوضيح ذلك.
|
269 |
-
# أجب بشكل موجز ودقيق."""),
|
270 |
-
# ("human", "{question}")
|
271 |
-
# ])
|
272 |
-
|
273 |
-
# def process_question(question: str) -> tuple[str, str]:
|
274 |
-
# """
|
275 |
-
# Process a question and return both the answer and the relevant context
|
276 |
-
# """
|
277 |
-
# relevant_docs = retriever(question)
|
278 |
-
# context = "\n".join([doc.page_content for doc in relevant_docs])
|
279 |
-
|
280 |
-
# prompt = prompt_template.format_messages(
|
281 |
-
# context=context,
|
282 |
-
# question=question
|
283 |
-
# )
|
284 |
-
|
285 |
-
# response = llm(prompt)
|
286 |
-
# return response.content, context
|
287 |
-
|
288 |
-
# def gradio_interface(question: str) -> tuple[str, str]:
|
289 |
-
# """
|
290 |
-
# Gradio interface function that returns both answer and context as a tuple
|
291 |
-
# """
|
292 |
-
# return process_question(question)
|
293 |
-
|
294 |
-
# # Custom CSS for right-aligned text in textboxes
|
295 |
-
# custom_css = """
|
296 |
-
# .rtl-text {
|
297 |
-
# text-align: right !important;
|
298 |
-
# direction: rtl !important;
|
299 |
-
# }
|
300 |
-
# .rtl-text textarea {
|
301 |
-
# text-align: right !important;
|
302 |
-
# direction: rtl !important;
|
303 |
-
# }
|
304 |
-
# """
|
305 |
-
|
306 |
-
# # Define the Gradio interface
|
307 |
-
# with gr.Blocks(css=custom_css) as iface:
|
308 |
-
# with gr.Column():
|
309 |
-
# input_text = gr.Textbox(
|
310 |
-
# label="السؤال",
|
311 |
-
# placeholder="اكتب سؤالك هنا...",
|
312 |
-
# lines=2,
|
313 |
-
# elem_classes="rtl-text"
|
314 |
-
# )
|
315 |
-
|
316 |
-
# answer_box = gr.Textbox(
|
317 |
-
# label="الإجابة",
|
318 |
-
# lines=4,
|
319 |
-
# elem_classes="rtl-text"
|
320 |
-
# )
|
321 |
-
|
322 |
-
# context_box = gr.Textbox(
|
323 |
-
# label="السياق المستخدم",
|
324 |
-
# lines=8,
|
325 |
-
# elem_classes="rtl-text"
|
326 |
-
# )
|
327 |
-
|
328 |
-
# submit_btn = gr.Button("إرسال")
|
329 |
-
|
330 |
-
# submit_btn.click(
|
331 |
-
# fn=gradio_interface,
|
332 |
-
# inputs=input_text,
|
333 |
-
# outputs=[answer_box, context_box]
|
334 |
-
# )
|
335 |
-
|
336 |
-
# # Launch the interface
|
337 |
-
# if __name__ == "__main__":
|
338 |
-
# iface.launch(share=True)
|
339 |
-
|
340 |
-
|
341 |
import gradio as gr
|
342 |
from langchain_mistralai.chat_models import ChatMistralAI
|
343 |
from langchain.prompts import ChatPromptTemplate
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
from langchain_mistralai.chat_models import ChatMistralAI
|
3 |
from langchain.prompts import ChatPromptTemplate
|