Spaces:

Trabis
/

RAG_loi

Running on CPU Upgrade

App Files Files Community

Trabis committed 1 day ago

Commit

56f8c62

•

1 Parent(s): 84ef393

Upload 2 files

Browse files

Files changed (2) hide show

RAG_GRADIO.py +336 -0
requirements.txt +90 -0

RAG_GRADIO.py ADDED Viewed

	@@ -0,0 +1,336 @@

+import gradio as gr
+from langchain_mistralai.chat_models import ChatMistralAI
+from langchain.prompts import ChatPromptTemplate
+import os
+from pathlib import Path
+from typing import List, Dict, Optional
+import json
+import faiss
+import numpy as np
+from langchain.schema import Document
+from sentence_transformers import SentenceTransformer
+import pickle
+import re
+class RAGLoader:
+    def __init__(self,
+                 docs_folder: str = "./docs",
+                 splits_folder: str = "./splits",
+                 index_folder: str = "./index",
+                 model_name: str = "intfloat/multilingual-e5-large"):
+        """
+        Initialise le RAG Loader
+        Args:
+            docs_folder: Dossier contenant les documents sources
+            splits_folder: Dossier où seront stockés les morceaux de texte
+            index_folder: Dossier où sera stocké l'index FAISS
+            model_name: Nom du modèle SentenceTransformer à utiliser
+        """
+        self.docs_folder = Path(docs_folder)
+        self.splits_folder = Path(splits_folder)
+        self.index_folder = Path(index_folder)
+        self.model_name = model_name
+        # Créer les dossiers s'ils n'existent pas
+        self.splits_folder.mkdir(parents=True, exist_ok=True)
+        self.index_folder.mkdir(parents=True, exist_ok=True)
+        # Chemins des fichiers
+        self.splits_path = self.splits_folder / "splits.json"
+        self.index_path = self.index_folder / "faiss.index"
+        self.documents_path = self.index_folder / "documents.pkl"
+        # Initialiser le modèle
+        self.model = None
+        self.index = None
+        self.indexed_documents = None
+    def load_and_split_texts(self) -> List[Document]:
+        """
+        Charge les textes du dossier docs, les découpe en morceaux et les sauvegarde
+        dans un fichier JSON unique.
+        Returns:
+            Liste de Documents contenant les morceaux de texte et leurs métadonnées
+        """
+        documents = []
+        # Vérifier d'abord si les splits existent déjà
+        if self._splits_exist():
+            print("Chargement des splits existants...")
+            return self._load_existing_splits()
+        print("Création de nouveaux splits...")
+        # Parcourir tous les fichiers du dossier docs
+        for file_path in self.docs_folder.glob("*.txt"):
+            with open(file_path, 'r', encoding='utf-8') as file:
+                text = file.read()
+                # Découper le texte en phrases
+                # chunks = [chunk.strip() for chunk in text.split('.') if chunk.strip()]
+                chunks = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
+                # Créer un Document pour chaque morceau
+                for i, chunk in enumerate(chunks):
+                    doc = Document(
+                        page_content=chunk,
+                        metadata={
+                            'source': file_path.name,
+                            'chunk_id': i,
+                            'total_chunks': len(chunks)
+                        }
+                    )
+                    documents.append(doc)
+        # Sauvegarder tous les splits dans un seul fichier JSON
+        self._save_splits(documents)
+        print(f"Nombre total de morceaux créés: {len(documents)}")
+        return documents
+    def _splits_exist(self) -> bool:
+        """Vérifie si le fichier de splits existe"""
+        return self.splits_path.exists()
+    def _save_splits(self, documents: List[Document]):
+        """Sauvegarde tous les documents découpés dans un seul fichier JSON"""
+        splits_data = {
+            'splits': [
+                {
+                    'text': doc.page_content,
+                    'metadata': doc.metadata
+                }
+                for doc in documents
+            ]
+        }
+        with open(self.splits_path, 'w', encoding='utf-8') as f:
+            json.dump(splits_data, f, ensure_ascii=False, indent=2)
+    def _load_existing_splits(self) -> List[Document]:
+        """Charge les splits depuis le fichier JSON unique"""
+        with open(self.splits_path, 'r', encoding='utf-8') as f:
+            splits_data = json.load(f)
+        documents = [
+            Document(
+                page_content=split['text'],
+                metadata=split['metadata']
+            )
+            for split in splits_data['splits']
+        ]
+        print(f"Nombre de splits chargés: {len(documents)}")
+        return documents
+    def load_index(self) -> bool:
+        """
+        Charge l'index FAISS et les documents associés s'ils existent
+        Returns:
+            bool: True si l'index a été chargé, False sinon
+        """
+        if not self._index_exists():
+            print("Aucun index trouvé.")
+            return False
+        print("Chargement de l'index existant...")
+        try:
+            # Charger l'index FAISS
+            self.index = faiss.read_index(str(self.index_path))
+            # Charger les documents associés
+            with open(self.documents_path, 'rb') as f:
+                self.indexed_documents = pickle.load(f)
+            print(f"Index chargé avec {self.index.ntotal} vecteurs")
+            return True
+        except Exception as e:
+            print(f"Erreur lors du chargement de l'index: {e}")
+            return False
+    def create_index(self, documents: Optional[List[Document]] = None) -> bool:
+        """
+        Crée un nouvel index FAISS à partir des documents.
+        Si aucun document n'est fourni, charge les documents depuis le fichier JSON.
+        Args:
+            documents: Liste optionnelle de Documents à indexer
+        Returns:
+            bool: True si l'index a été créé avec succès, False sinon
+        """
+        try:
+            # Initialiser le modèle si nécessaire
+            if self.model is None:
+                print("Chargement du modèle...")
+                self.model = SentenceTransformer(self.model_name)
+            # Charger les documents si non fournis
+            if documents is None:
+                documents = self.load_and_split_texts()
+            if not documents:
+                print("Aucun document à indexer.")
+                return False
+            print("Création des embeddings...")
+            texts = [doc.page_content for doc in documents]
+            embeddings = self.model.encode(texts, show_progress_bar=True)
+            # Initialiser l'index FAISS
+            dimension = embeddings.shape[1]
+            self.index = faiss.IndexFlatL2(dimension)
+            # Ajouter les vecteurs à l'index
+            self.index.add(np.array(embeddings).astype('float32'))
+            # Sauvegarder l'index
+            print("Sauvegarde de l'index...")
+            faiss.write_index(self.index, str(self.index_path))
+            # Sauvegarder les documents associés
+            self.indexed_documents = documents
+            with open(self.documents_path, 'wb') as f:
+                pickle.dump(documents, f)
+            print(f"Index créé avec succès : {self.index.ntotal} vecteurs")
+            return True
+        except Exception as e:
+            print(f"Erreur lors de la création de l'index: {e}")
+            return False
+    def _index_exists(self) -> bool:
+        """Vérifie si l'index et les documents associés existent"""
+        return self.index_path.exists() and self.documents_path.exists()
+    def get_retriever(self, k: int = 5):
+        """
+        Crée un retriever pour l'utilisation avec LangChain
+        Args:
+            k: Nombre de documents similaires à retourner
+        Returns:
+            Callable: Fonction de recherche compatible avec LangChain
+        """
+        if self.index is None:
+            if not self.load_index():
+                if not self.create_index():
+                    raise ValueError("Impossible de charger ou créer l'index")
+        if self.model is None:
+            self.model = SentenceTransformer(self.model_name)
+        def retriever_function(query: str) -> List[Document]:
+            # Créer l'embedding de la requête
+            query_embedding = self.model.encode([query])[0]
+            # Rechercher les documents similaires
+            distances, indices = self.index.search(
+                np.array([query_embedding]).astype('float32'),
+                k
+            )
+            # Retourner les documents trouvés
+            results = []
+            for idx in indices[0]:
+                if idx != -1:  # FAISS retourne -1 pour les résultats invalides
+                    results.append(self.indexed_documents[idx])
+            return results
+        return retriever_function
+# Initialize the RAG system
+llm = ChatMistralAI(model="mistral-large-latest", mistral_api_key="QK0ZZpSxQbCEVgOLtI6FARQVmBYc6WGP")
+rag_loader = RAGLoader()
+retriever = rag_loader.get_retriever(k=5)
+prompt_template = ChatPromptTemplate.from_messages([
+    ("system", """أنت مساعد مفيد يجيب على الأسئلة باللغة العربية باستخدام المعلومات المقدمة.
+    استخدم المعلومات التالية للإجابة على السؤال:
+    {context}
+    إذا لم تكن المعلومات كافية للإجابة على السؤال بشكل كامل، قم بتوضيح ذلك.
+    أجب بشكل موجز ودقيق."""),
+    ("human", "{question}")
+])
+def process_question(question: str) -> tuple[str, str]:
+    """
+    Process a question and return both the answer and the relevant context
+    """
+    relevant_docs = retriever(question)
+    context = "\n".join([doc.page_content for doc in relevant_docs])
+    prompt = prompt_template.format_messages(
+        context=context,
+        question=question
+    )
+    response = llm(prompt)
+    return response.content, context
+def gradio_interface(question: str) -> tuple[str, str]:
+    """
+    Gradio interface function that returns both answer and context as a tuple.
+    """
+    # Replace with your actual function to process the question
+    return process_question(question)
+# Custom CSS: right-align the text and switch it to right-to-left direction in
+# the textareas of the elements with ids question-box, answer-box and
+# context-box (the elem_id values given to the three Textbox components)
+custom_css = """
+#question-box textarea, #answer-box textarea, #context-box textarea {
+    text-align: right !important;
+    direction: rtl !important;
+}
+"""
+# Test question
+question = "هل يجوز لرجل السلطة اقتناء عقار داخل مجال عمله"
+answer, context = process_question(question)  # Ensure `process_question` is defined
+# Print results for testing
+print("الإجابة:", answer)
+print("\nالسياق المستخدم:", context)
+# Define the Gradio interface with custom CSS
+with gr.Blocks(css=custom_css) as iface:
+    with gr.Column():
+        input_text = gr.Textbox(
+            label="السؤال",
+            placeholder="اكتب سؤالك هنا...",
+            lines=2,
+            elem_id="question-box"
+        )
+        answer_box = gr.Textbox(
+            label="الإجابة",
+            lines=4,
+            elem_id="answer-box"
+        )
+        context_box = gr.Textbox(
+            label="السياق المستخدم",
+            lines=8,
+            elem_id="context-box"
+        )
+        submit_btn = gr.Button("إرسال")
+        # Link submit button to processing function
+        submit_btn.click(
+            fn=gradio_interface,
+            inputs=input_text,
+            outputs=[answer_box, context_box]
+        )
+# Launch the interface
+if __name__ == "__main__":
+    iface.launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,90 @@

+aiohappyeyeballs==2.4.3
+aiohttp==3.10.10
+aiosignal==1.3.1
+altair==5.4.1
+annotated-types==0.7.0
+anyio==4.6.2.post1
+attrs==24.2.0
+blinker==1.8.2
+cachetools==5.5.0
+certifi==2024.7.4
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+distro==1.9.0
+einops==0.8.0
+faiss-cpu==1.9.0
+filelock==3.16.1
+frozenlist==1.4.1
+fsspec==2024.9.0
+gitdb==4.0.11
+GitPython==3.1.43
+greenlet==3.1.1
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+httpx-sse==0.4.0
+huggingface-hub==0.26.0
+idna==3.7
+Jinja2==3.1.4
+jiter==0.6.1
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+langchain==0.3.4
+langchain-core==0.3.12
+langchain-mistralai==0.2.0
+langchain-openai==0.2.3
+langchain-text-splitters==0.3.0
+langsmith==0.1.136
+markdown-it-py==3.0.0
+MarkupSafe==3.0.1
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.1.0
+narwhals==1.9.4
+networkx==3.4.2
+numpy==1.26.4
+openai==1.52.0
+orjson==3.10.6
+packaging==24.1
+pandas==2.2.3
+pillow==10.4.0
+propcache==0.2.0
+protobuf==5.28.2
+pyarrow==17.0.0
+pydantic==2.8.2
+pydantic_core==2.20.1
+pydeck==0.9.1
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+pytz==2024.2
+PyYAML==6.0.1
+referencing==0.35.1
+regex==2024.9.11
+requests==2.32.3
+requests-toolbelt==1.0.0
+rich==13.9.2
+rpds-py==0.20.0
+safetensors==0.4.5
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+SQLAlchemy==2.0.36
+streamlit==1.39.0
+streamlit_arabic_support_wrapper==1.1
+sympy==1.13.1
+tenacity==8.5.0
+tiktoken==0.8.0
+tokenizers==0.20.1
+toml==0.10.2
+torch==2.5.0
+tornado==6.4.1
+tqdm==4.66.5
+transformers==4.45.2
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.2
+watchdog==5.0.3
+yarl==1.15.5