Upload 3 files
- app.py +54 -0
- rag_pipeline.py +54 -0
- utils.py +62 -0
app.py
ADDED
@@ -0,0 +1,54 @@
import gradio as gr
from rag_pipeline import RAGPipeline
from utils import process_documents
import time

# Single pipeline instance shared by all requests; models are loaded once at startup.
rag = RAGPipeline()

def log_message(msg, logs):
    # Append one message line to the accumulated log string.
    return logs + msg + "\n"

def upload_and_index(files, logs):
    logs = log_message("[RAG] بدء معالجة الملفات...", logs)
    all_chunks = []
    for file in files:
        logs = log_message(f"[RAG] معالجة الملف: {file.name}", logs)
        chunks = process_documents(file.name)
        all_chunks.extend(chunks)
        logs = log_message(f"[RAG] تم استخراج {len(chunks)} مقطع من {file.name}", logs)

    logs = log_message(f"[RAG] بناء الفهرس لـ {len(all_chunks)} مقطع...", logs)
    start = time.time()
    rag.build_index(all_chunks)
    duration = time.time() - start
    logs = log_message(f"[RAG] تم بناء الفهرس في {duration:.2f} ثانية.", logs)
    # Return the log twice: once for the visible textbox and once to update the gr.State,
    # so answer_question continues from the same accumulated log.
    return logs, logs, gr.update(visible=True), gr.update(visible=True)

def answer_question(question, logs):
    logs = log_message(f"[RAG] استلام السؤال: {question}", logs)
    start = time.time()
    answer, sources = rag.answer(question)
    duration = time.time() - start
    logs = log_message(f"[RAG] تمت الإجابة في {duration:.2f} ثانية.", logs)
    logs = log_message(f"[RAG] المصادر: {sources}", logs)
    return answer, logs, logs

with gr.Blocks() as demo:
    logs = gr.State("")
    gr.Markdown("# نظام استرجاع المعرفة (RAG) للغة العربية باستخدام mT5")

    with gr.Row():
        files_input = gr.File(file_types=['.pdf', '.docx', '.txt'], file_count="multiple", label="رفع الملفات")
        upload_btn = gr.Button("رفع وبناء الفهرس")

    logs_output = gr.Textbox(label="سجل العمليات", lines=12, interactive=False, value="")
    question_input = gr.Textbox(label="اكتب سؤالك هنا", visible=False)
    ask_btn = gr.Button("إرسال السؤال", visible=False)
    answer_output = gr.Textbox(label="الإجابة", lines=5)

    # The log State is listed in the outputs so the accumulated log survives between events.
    upload_btn.click(upload_and_index, inputs=[files_input, logs],
                     outputs=[logs_output, logs, question_input, ask_btn])
    ask_btn.click(answer_question, inputs=[question_input, logs],
                  outputs=[answer_output, logs_output, logs])

demo.launch()
rag_pipeline.py
ADDED
@@ -0,0 +1,54 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, models
import numpy as np
import torch

class RAGPipeline:
    def __init__(self):
        print("[RAG] تحميل النماذج العربية...")

        # SentenceTransformer for retrieval: Arabic BERT encoder + mean pooling.
        word_embedding_model = models.Transformer('asafaya/bert-base-arabic')
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
        self.embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

        # Generative QA model (multilingual, Arabic-capable).
        self.tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
        self.model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

        self.index = None
        self.chunks = []
        self.chunk_embeddings = []

        print("[RAG] تم تحميل النماذج بنجاح.")

    def build_index(self, chunks, logs=None):
        self.chunks = chunks
        self.chunk_embeddings = self.embedder.encode(chunks, convert_to_numpy=True)
        if logs is not None:
            logs.append(f"[RAG] تم بناء الفهرس بـ {self.chunk_embeddings.shape[0]} مقطع.")
        self.index = np.array(self.chunk_embeddings)

    def answer(self, question):
        question_embedding = self.embedder.encode([question], convert_to_numpy=True)
        # Dot-product similarity between the question and every chunk embedding;
        # the five highest-scoring chunks form the context.
        similarities = np.dot(self.index, question_embedding.T).squeeze()
        top_idx = similarities.argsort()[-5:][::-1]
        sources = [self.chunks[i] for i in top_idx]
        context = " ".join(sources)

        # Prompt for generative QA (triple-quoted so the multi-line template is valid Python).
        prompt = f"""أجب عن السؤال التالي اعتماداً على النص:

السياق:
{context}

السؤال: {question}
الإجابة:"""

        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        outputs = self.model.generate(inputs["input_ids"], max_length=200)
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return answer.strip(), sources
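
A minimal standalone sketch, not part of the uploaded files, of how RAGPipeline could be exercised outside the Gradio UI; the two Arabic chunks and the question are illustrative placeholders, and it assumes the asafaya/bert-base-arabic and google/mt5-small checkpoints download successfully.

from rag_pipeline import RAGPipeline

# Hypothetical sample chunks, for illustration only.
sample_chunks = [
    "الذكاء الاصطناعي فرع من علوم الحاسوب.",
    "تُستخدم نماذج اللغة في الإجابة عن الأسئلة.",
]

pipeline = RAGPipeline()             # loads the retrieval and generation models
pipeline.build_index(sample_chunks)  # embeds the chunks and stores them as the index
answer, sources = pipeline.answer("ما هو الذكاء الاصطناعي؟")
print(answer)
print(sources)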
utils.py
ADDED
@@ -0,0 +1,62 @@
import os
import re
from docx import Document
from PyPDF2 import PdfReader

def read_txt_file(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        # Fall back to the Windows Arabic encoding for non-UTF-8 text files.
        with open(file_path, "r", encoding="cp1256") as f:
            return f.read()

def read_docx_file(file_path):
    doc = Document(file_path)
    full_text = [para.text for para in doc.paragraphs]
    return '\n'.join(full_text)

def read_pdf_file(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages.
        text += page.extract_text() or ""
    return text

def split_arabic_text(text, chunk_size=500):
    # Split on sentence-ending punctuation (including the Arabic question mark),
    # then pack sentences into chunks of roughly chunk_size characters.
    sentences = re.split(r'(?<=[.؟!])\s+', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    # Overlap is obtained by prefixing each chunk with the preceding one,
    # so neighbouring chunks share context.
    overlapped_chunks = []
    for i in range(len(chunks)):
        start = max(0, i - 1)
        overlapped_chunks.append(" ".join(chunks[start:i+1]))

    return overlapped_chunks

def process_documents(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".txt":
        raw_text = read_txt_file(file_path)
    elif ext == ".docx":
        raw_text = read_docx_file(file_path)
    elif ext == ".pdf":
        raw_text = read_pdf_file(file_path)
    else:
        return []

    clean_text = raw_text.replace('\n', ' ').replace('\r', ' ').strip()
    chunks = split_arabic_text(clean_text, chunk_size=500)
    return chunks
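
A small sketch, again illustrative only, of how split_arabic_text packs sentences and overlaps neighbouring chunks; the input text is invented for the example.

from utils import split_arabic_text

# Invented three-sentence text; chunk_size is kept tiny so each sentence becomes its own chunk.
text = "الجملة الأولى. الجملة الثانية؟ الجملة الثالثة!"
for chunk in split_arabic_text(text, chunk_size=20):
    print(chunk)
# Every chunk after the first is prefixed with the preceding chunk,
# so adjacent chunks share context when they are embedded.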