ramy2018 committed
Commit b6f8046 · verified · 1 Parent(s): 7153947

Upload 3 files

Files changed (3):
  1. app.py           +54 −0
  2. rag_pipeline.py  +54 −0
  3. utils.py         +62 −0
app.py ADDED
@@ -0,0 +1,54 @@
import gradio as gr
from rag_pipeline import RAGPipeline
from utils import process_documents
import time

rag = RAGPipeline()

def log_message(msg, logs):
    """Append one line to the running log string."""
    return logs + msg + "\n"

def upload_and_index(files, logs):
    logs = log_message("[RAG] بدء معالجة الملفات...", logs)  # "Starting file processing..."
    all_chunks = []
    for file in files:
        logs = log_message(f"[RAG] معالجة الملف: {file.name}", logs)  # "Processing file: ..."
        chunks = process_documents(file.name)
        all_chunks.extend(chunks)
        logs = log_message(f"[RAG] تم استخراج {len(chunks)} مقطع من {file.name}", logs)  # "Extracted N chunks from ..."

    logs = log_message(f"[RAG] بناء الفهرس لـ {len(all_chunks)} مقطع...", logs)  # "Building the index for N chunks..."
    start = time.time()
    rag.build_index(all_chunks)
    duration = time.time() - start
    logs = log_message(f"[RAG] تم بناء الفهرس في {duration:.2f} ثانية.", logs)  # "Index built in N seconds."
    # Return the log text twice: once into the gr.State so later callbacks
    # see the accumulated log, once for the visible Textbox.
    return logs, logs, gr.update(visible=True), gr.update(visible=True)

def answer_question(question, logs):
    logs = log_message(f"[RAG] استلام السؤال: {question}", logs)  # "Question received: ..."
    start = time.time()
    answer, sources = rag.answer(question)
    duration = time.time() - start
    logs = log_message(f"[RAG] تم الإجابة في {duration:.2f} ثانية.", logs)  # "Answered in N seconds."
    logs = log_message(f"[RAG] المصادر: {sources}", logs)  # "Sources: ..."
    return answer, logs, logs

with gr.Blocks() as demo:
    logs = gr.State("")
    # UI strings are in Arabic; the demo targets Arabic documents.
    gr.Markdown("# نظام استرجاع المعرفة (RAG) للغة العربية باستخدام mT5")  # "Arabic RAG system using mT5"

    with gr.Row():
        files_input = gr.File(file_types=['.pdf', '.docx', '.txt'], file_count="multiple", label="رفع الملفات")  # "Upload files"
        upload_btn = gr.Button("رفع وبناء الفهرس")  # "Upload and build index"

    logs_output = gr.Textbox(label="سجل العمليات", lines=12, interactive=False, value="")  # "Operation log"
    question_input = gr.Textbox(label="اكتب سؤالك هنا", visible=False)  # "Type your question here"
    ask_btn = gr.Button("إرسال السؤال", visible=False)  # "Send question"
    answer_output = gr.Textbox(label="الإجابة", lines=5)  # "Answer"

    # The gr.State is listed in the outputs so the log persists across callbacks.
    upload_btn.click(upload_and_index, inputs=[files_input, logs],
                     outputs=[logs, logs_output, question_input, ask_btn])
    ask_btn.click(answer_question, inputs=[question_input, logs],
                  outputs=[answer_output, logs, logs_output])

demo.launch()
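
Running this Space presumably also needs a requirements.txt (not included in this commit) listing gradio, transformers, sentence-transformers, torch, numpy, python-docx, PyPDF2 and sentencepiece. As a quick sanity check, the two callbacks can be driven without the UI; a minimal sketch, assuming a local Arabic text file sample.txt exists and that demo.launch() is skipped (e.g. guarded behind if __name__ == "__main__"):

from types import SimpleNamespace

# Hypothetical smoke test, not part of the commit. SimpleNamespace mimics the
# .name attribute of the file objects Gradio passes to the callback.
fake_file = SimpleNamespace(name="sample.txt")
logs, _, _, _ = upload_and_index([fake_file], "")
answer, logs, _ = answer_question("ما موضوع المستند؟", logs)  # "What is the document about?"
print(answer)
print(logs)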
rag_pipeline.py ADDED
@@ -0,0 +1,54 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, models
import numpy as np

class RAGPipeline:
    def __init__(self):
        print("[RAG] تحميل النماذج العربية...")  # "Loading the Arabic models..."

        # SentenceTransformer for retrieval: Arabic BERT encoder + mean pooling.
        word_embedding_model = models.Transformer('asafaya/bert-base-arabic')
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
        self.embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

        # Generative QA model. Note: base mT5 is a pretrain-only checkpoint,
        # so answer quality without task fine-tuning will be limited.
        self.tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
        self.model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

        self.index = None
        self.chunks = []
        self.chunk_embeddings = []

        print("[RAG] تم تحميل النماذج بنجاح.")  # "Models loaded successfully."

    def build_index(self, chunks, logs=None):
        self.chunks = chunks
        # L2-normalise so the dot product in answer() is cosine similarity.
        self.chunk_embeddings = self.embedder.encode(
            chunks, convert_to_numpy=True, normalize_embeddings=True)
        if logs is not None:
            logs.append(f"[RAG] تم بناء الفهرس بـ {self.chunk_embeddings.shape[0]} مقطع.")  # "Index built with N chunks."
        self.index = np.array(self.chunk_embeddings)

    def answer(self, question):
        question_embedding = self.embedder.encode(
            [question], convert_to_numpy=True, normalize_embeddings=True)
        similarities = np.dot(self.index, question_embedding.T).squeeze()
        # Take the 5 most similar chunks, best match first.
        top_idx = similarities.argsort()[-5:][::-1]
        sources = [self.chunks[i] for i in top_idx]
        context = " ".join(sources)

        # Prompt for generative QA: "Answer the following question based on
        # the text: / Context: ... / Question: ... / Answer:"
        prompt = f"""أجب عن السؤال التالي اعتماداً على النص:

السياق:
{context}

السؤال: {question}
الإجابة:"""

        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        outputs = self.model.generate(inputs["input_ids"], max_length=200)
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return answer.strip(), sources
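
The pipeline can also be exercised on its own before wiring it into the UI. A minimal sketch (the sample sentences are illustrative, not from the commit):

from rag_pipeline import RAGPipeline

# Index three toy Arabic facts and ask about one of them.
rag = RAGPipeline()
rag.build_index([
    "القاهرة هي عاصمة جمهورية مصر العربية.",       # "Cairo is the capital of Egypt."
    "تقع باريس في فرنسا وهي عاصمتها.",             # "Paris is in France and is its capital."
    "يبلغ عدد سكان مصر أكثر من مئة مليون نسمة.",   # "Egypt has over 100 million people."
])
answer, sources = rag.answer("ما هي عاصمة مصر؟")   # "What is the capital of Egypt?"
print("الإجابة:", answer)      # the generated answer
print("المصادر:", sources[:1])  # best-matching chunk first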
utils.py ADDED
@@ -0,0 +1,62 @@
import os
import re
from docx import Document
from PyPDF2 import PdfReader

def read_txt_file(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        # Fall back to Windows-1256, the common legacy Arabic encoding.
        with open(file_path, "r", encoding="cp1256") as f:
            return f.read()

def read_docx_file(file_path):
    doc = Document(file_path)
    return '\n'.join(para.text for para in doc.paragraphs)

def read_pdf_file(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""  # extract_text() may return None
    return text

def split_arabic_text(text, chunk_size=500, overlap=50):
    # Split on sentence-final punctuation, including the Arabic question mark.
    sentences = re.split(r'(?<=[.؟!])\s+', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    # Prepend the tail of the previous chunk so context isn't lost at
    # chunk boundaries; `overlap` is the number of carried-over characters.
    overlapped_chunks = []
    for i, chunk in enumerate(chunks):
        if i == 0:
            overlapped_chunks.append(chunk)
        else:
            overlapped_chunks.append(chunks[i - 1][-overlap:] + " " + chunk)

    return overlapped_chunks

def process_documents(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".txt":
        raw_text = read_txt_file(file_path)
    elif ext == ".docx":
        raw_text = read_docx_file(file_path)
    elif ext == ".pdf":
        raw_text = read_pdf_file(file_path)
    else:
        return []  # unsupported file type

    clean_text = raw_text.replace('\n', ' ').replace('\r', ' ').strip()
    return split_arabic_text(clean_text, chunk_size=500)
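
A quick way to see the chunker's behaviour is to feed it a short string with a small chunk_size: each window after the first starts with the tail of its predecessor. A minimal sketch (the sample text is illustrative):

from utils import split_arabic_text

# Three short Arabic sentences: "This is a first sentence. And this is a
# second sentence? And this is a third sentence!"
text = "هذه جملة أولى. وهذه جملة ثانية؟ وهذه جملة ثالثة!"
for i, chunk in enumerate(split_arabic_text(text, chunk_size=30, overlap=10)):
    print(i, "→", chunk)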