Upload 3 files
- app.py +54 -0
- rag_pipeline.py +54 -0
- utils.py +62 -0
app.py
ADDED
@@ -0,0 +1,54 @@
import gradio as gr
from rag_pipeline import RAGPipeline
from utils import process_documents
import time

# Single pipeline instance shared by all requests; models are loaded once at startup.
rag = RAGPipeline()

def log_message(msg, logs):
    # Append one message line to the accumulated log string.
    return logs + msg + "\n"

def upload_and_index(files, logs):
    logs = log_message("[RAG] بدء معالجة الملفات...", logs)
    all_chunks = []
    for file in files:
        logs = log_message(f"[RAG] معالجة الملف: {file.name}", logs)
        chunks = process_documents(file.name)
        all_chunks.extend(chunks)
        logs = log_message(f"[RAG] تم استخراج {len(chunks)} مقطع من {file.name}", logs)

    logs = log_message(f"[RAG] بناء الفهرس لـ {len(all_chunks)} مقطع...", logs)
    start = time.time()
    rag.build_index(all_chunks)
    duration = time.time() - start
    logs = log_message(f"[RAG] تم بناء الفهرس في {duration:.2f} ثانية.", logs)
    # Return the log twice: once for the visible textbox and once to update the gr.State,
    # so answer_question continues from the same accumulated log.
    return logs, logs, gr.update(visible=True), gr.update(visible=True)

def answer_question(question, logs):
    logs = log_message(f"[RAG] استلام السؤال: {question}", logs)
    start = time.time()
    answer, sources = rag.answer(question)
    duration = time.time() - start
    logs = log_message(f"[RAG] تمت الإجابة في {duration:.2f} ثانية.", logs)
    logs = log_message(f"[RAG] المصادر: {sources}", logs)
    return answer, logs, logs

with gr.Blocks() as demo:
    logs = gr.State("")
    gr.Markdown("# نظام استرجاع المعرفة (RAG) للغة العربية باستخدام mT5")

    with gr.Row():
        files_input = gr.File(file_types=['.pdf', '.docx', '.txt'], file_count="multiple", label="رفع الملفات")
        upload_btn = gr.Button("رفع وبناء الفهرس")

    logs_output = gr.Textbox(label="سجل العمليات", lines=12, interactive=False, value="")
    question_input = gr.Textbox(label="اكتب سؤالك هنا", visible=False)
    ask_btn = gr.Button("إرسال السؤال", visible=False)
    answer_output = gr.Textbox(label="الإجابة", lines=5)

    # The log State is listed in the outputs so the accumulated log survives between events.
    upload_btn.click(upload_and_index, inputs=[files_input, logs],
                     outputs=[logs_output, logs, question_input, ask_btn])
    ask_btn.click(answer_question, inputs=[question_input, logs],
                  outputs=[answer_output, logs_output, logs])

demo.launch()
rag_pipeline.py
ADDED
@@ -0,0 +1,54 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, models
import numpy as np
import torch

class RAGPipeline:
    def __init__(self):
        print("[RAG] تحميل النماذج العربية...")

        # SentenceTransformer for retrieval: Arabic BERT encoder + mean pooling.
        word_embedding_model = models.Transformer('asafaya/bert-base-arabic')
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
        self.embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

        # Generative QA model (multilingual, Arabic-capable).
        self.tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
        self.model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

        self.index = None
        self.chunks = []
        self.chunk_embeddings = []

        print("[RAG] تم تحميل النماذج بنجاح.")

    def build_index(self, chunks, logs=None):
        self.chunks = chunks
        self.chunk_embeddings = self.embedder.encode(chunks, convert_to_numpy=True)
        if logs is not None:
            logs.append(f"[RAG] تم بناء الفهرس بـ {self.chunk_embeddings.shape[0]} مقطع.")
        self.index = np.array(self.chunk_embeddings)

    def answer(self, question):
        question_embedding = self.embedder.encode([question], convert_to_numpy=True)
        # Dot-product similarity between the question and every chunk embedding;
        # the five highest-scoring chunks form the context.
        similarities = np.dot(self.index, question_embedding.T).squeeze()
        top_idx = similarities.argsort()[-5:][::-1]
        sources = [self.chunks[i] for i in top_idx]
        context = " ".join(sources)

        # Prompt for generative QA (triple-quoted so the multi-line template is valid Python).
        prompt = f"""أجب عن السؤال التالي اعتماداً على النص:

السياق:
{context}

السؤال: {question}
الإجابة:"""

        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        outputs = self.model.generate(inputs["input_ids"], max_length=200)
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return answer.strip(), sources
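
A minimal standalone sketch, not part of the uploaded files, of how RAGPipeline could be exercised outside the Gradio UI; the two Arabic chunks and the question are illustrative placeholders, and it assumes the asafaya/bert-base-arabic and google/mt5-small checkpoints download successfully.

from rag_pipeline import RAGPipeline

# Hypothetical sample chunks, for illustration only.
sample_chunks = [
    "الذكاء الاصطناعي فرع من علوم الحاسوب.",
    "تُستخدم نماذج اللغة في الإجابة عن الأسئلة.",
]

pipeline = RAGPipeline()             # loads the retrieval and generation models
pipeline.build_index(sample_chunks)  # embeds the chunks and stores them as the index
answer, sources = pipeline.answer("ما هو الذكاء الاصطناعي؟")
print(answer)
print(sources)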
utils.py
ADDED
@@ -0,0 +1,62 @@
import os
import re
from docx import Document
from PyPDF2 import PdfReader

def read_txt_file(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        # Fall back to the Windows Arabic encoding for non-UTF-8 text files.
        with open(file_path, "r", encoding="cp1256") as f:
            return f.read()

def read_docx_file(file_path):
    doc = Document(file_path)
    full_text = [para.text for para in doc.paragraphs]
    return '\n'.join(full_text)

def read_pdf_file(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages.
        text += page.extract_text() or ""
    return text

def split_arabic_text(text, chunk_size=500):
    # Split on sentence-ending punctuation (including the Arabic question mark),
    # then pack sentences into chunks of roughly chunk_size characters.
    sentences = re.split(r'(?<=[.؟!])\s+', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    # Overlap is obtained by prefixing each chunk with the preceding one,
    # so neighbouring chunks share context.
    overlapped_chunks = []
    for i in range(len(chunks)):
        start = max(0, i - 1)
        overlapped_chunks.append(" ".join(chunks[start:i+1]))

    return overlapped_chunks

def process_documents(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".txt":
        raw_text = read_txt_file(file_path)
    elif ext == ".docx":
        raw_text = read_docx_file(file_path)
    elif ext == ".pdf":
        raw_text = read_pdf_file(file_path)
    else:
        return []

    clean_text = raw_text.replace('\n', ' ').replace('\r', ' ').strip()
    chunks = split_arabic_text(clean_text, chunk_size=500)
    return chunks
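
A small sketch, again illustrative only, of how split_arabic_text packs sentences and overlaps neighbouring chunks; the input text is invented for the example.

from utils import split_arabic_text

# Invented three-sentence text; chunk_size is kept tiny so each sentence becomes its own chunk.
text = "الجملة الأولى. الجملة الثانية؟ الجملة الثالثة!"
for chunk in split_arabic_text(text, chunk_size=20):
    print(chunk)
# Every chunk after the first is prefixed with the preceding chunk,
# so adjacent chunks share context when they are embedded.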