Update rag_pipeline.py
rag_pipeline.py CHANGED (+26 -42)
@@ -14,64 +14,48 @@ class RAGPipeline:
         self.tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
         self.model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
 
-        self.index = None
         self.chunks = []
-        self.chunk_embeddings = []
-        self.summaries = []
+        self.embeddings = []
 
         print("[RAG] Models loaded successfully.")
 
+    def build_index(self, chunks):
+        self.chunks = chunks
+        self.embeddings = self.embedder.encode(chunks, convert_to_numpy=True)
+
+    def retrieve_passages(self, question, top_k=5):
+        if len(self.chunks) == 0 or len(self.embeddings) == 0:
+            return []
+        question_embedding = self.embedder.encode([question], convert_to_numpy=True)
+        similarities = np.dot(self.embeddings, question_embedding.T).squeeze()
+        top_indices = similarities.argsort()[-top_k:][::-1]
+        return [self.chunks[i] for i in top_indices]
+
     def summarize_text(self, text):
-        prompt = f"Summarize the text …"
+        prompt = f"Summarize the following text:\n{text}"
         try:
             inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
             summary_ids = self.model.generate(inputs["input_ids"], max_length=256)
             return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()
-        except Exception as e:
-            print(f"[RAG] Summarization error: {e}")
+        except Exception:
             return ""
 
-    def build_index(self, chunks, logs=None):
-        self.chunks = chunks
-        self.chunk_embeddings = self.embedder.encode(chunks, convert_to_numpy=True)
-        self.index = np.array(self.chunk_embeddings)
-        self.summaries = []
-        if logs is not None:
-            logs.append(f"[RAG] Built the index for {len(self.chunk_embeddings)} chunks.")
-
-    def summarize_all_chunks(self, max_chunks=20):
-        self.summaries = []
-        total = min(max_chunks, len(self.chunks))
-        print(f"[RAG] Summarizing {total} of {len(self.chunks)} chunks...")
-        for i, chunk in enumerate(self.chunks[:total]):
-            print(f"[RAG] Summarizing chunk {i+1}/{total}")
-            summary = self.summarize_text(chunk)
-            self.summaries.append(summary)
-
-    def answer(self, question):
-        question_embedding = self.embedder.encode([question], convert_to_numpy=True)
-        similarities = np.dot(self.index, question_embedding.T).squeeze()
-        top_idx = similarities.argsort()[-5:][::-1]
-        relevant_summaries = [
-            self.summaries[i]
-            for i in top_idx
-            if i < len(self.summaries) and self.summaries[i].strip()
-        ]
-        combined_summary = " ".join(relevant_summaries).strip()
-
-        …
-
-        qa_prompt = f"Answer the following question based on the text:\n\n{combined_summary}\n\nQuestion: {question}\nAnswer:"
+    def generate_answer_from_passages(self, question, passages):
+        context = " ".join(passages)
+        summary = self.summarize_text(context)
+
+        prompt = f"Answer the following question based on the text:\n\n{summary}\n\nQuestion: {question}\nAnswer:"
         try:
-            inputs = self.tokenizer(qa_prompt, return_tensors="pt", truncation=True, max_length=512)
+            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
             output_ids = self.model.generate(inputs["input_ids"], max_length=200)
             answer = self.tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
-        except Exception as e:
-            print(f"[RAG] Error generating answer: {e}")
+        except Exception:
             answer = ""
 
-        return answer, combined_summary
+        return answer, summary
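For reference, the new retrieval path scores every chunk with a raw dot product against the question embedding and keeps the top_k highest-scoring indices from argsort. Below is a minimal standalone sketch of that step, assuming a sentence-transformers model such as all-MiniLM-L6-v2 (the hunk never shows which model self.embedder wraps, so the model name and the sample chunks here are placeholders):

import numpy as np
from sentence_transformers import SentenceTransformer

# Assumed embedder; the diff does not show how self.embedder is constructed.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

chunks = [
    "Paris is the capital of France.",
    "Photosynthesis converts light into chemical energy.",
    "The Eiffel Tower stands in Paris.",
]
embeddings = embedder.encode(chunks, convert_to_numpy=True)  # shape (n_chunks, dim)

question_embedding = embedder.encode(["Where is the Eiffel Tower?"], convert_to_numpy=True)
similarities = np.dot(embeddings, question_embedding.T).squeeze()  # one score per chunk

top_k = 2
top_indices = similarities.argsort()[-top_k:][::-1]  # highest scores first
print([chunks[i] for i in top_indices])

Note that a dot product equals cosine similarity only when the vectors have unit length; passing normalize_embeddings=True to encode would make the ranking independent of vector magnitude.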
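A hypothetical end-to-end call sequence for the reworked class, assuming the constructor still takes no arguments as the hunk suggests:

from rag_pipeline import RAGPipeline

# Build the index once, then retrieve and answer per question.
pipeline = RAGPipeline()
pipeline.build_index([
    "First document chunk ...",
    "Second document chunk ...",
])

question = "What does the document say about the first topic?"
passages = pipeline.retrieve_passages(question, top_k=3)
answer, summary = pipeline.generate_answer_from_passages(question, passages)
print(answer)

Since both prompts are truncated at 512 tokens, a long passage list is clipped before summarization, so top_k effectively bounds how much context reaches flan-t5.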