Spaces:

muhammadshaheryar
/

RAG-REV-01

Sleeping

App Files Files Community

muhammadshaheryar committed on Nov 12, 2024

Commit

cf5502d

verified ·

1 Parent(s): 77f5320

Create app.py

Browse files

Files changed (1) hide show

app.py +101 -0

app.py ADDED Viewed

	@@ -0,0 +1,101 @@

# Runtime dependencies. Install them through the environment's requirements
# file rather than from this module:
#   transformers sentence-transformers faiss-cpu PyMuPDF pandas python-docx
#   xlrd openpyxl streamlit beautifulsoup4
# `!pip install ...` is IPython/Jupyter shell-escape syntax and is a SyntaxError
# in a plain .py module, so the command is kept here only as documentation.
# beautifulsoup4 is listed because this file imports `bs4`.
import os
import tempfile

import faiss
import fitz  # PyMuPDF
import pandas as pd
import streamlit as st
from bs4 import BeautifulSoup
from docx import Document
from sentence_transformers import SentenceTransformer
from transformers import DPRQuestionEncoder, DPRContextEncoder, AutoTokenizer, pipeline
# Shared retrieval state, filled by index_documents() and read by
# retrieve_and_generate().
document_texts = []    # text of every indexed document, in insertion order
document_mapping = {}  # integer document position -> document text
# Sentence-embedding model and the flat L2 FAISS index that stores its vectors.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
index = faiss.IndexFlatL2(384)  # 384-dimensional embeddings for this model
def load_text_from_files(file_path):
    """Extract plain text from ``file_path`` according to its extension.

    Supported extensions are ``.pdf``, ``.docx``, ``.csv``, ``.xlsx`` and
    ``.html``. Matching is case-insensitive, so ``REPORT.PDF`` is handled the
    same way as ``report.pdf``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text returned by the matching ``extract_text_from_*`` helper, or
        ``""`` when the extension is not one of the supported ones.
    """
    # Compare against a lowercased copy only; the helpers still receive the
    # path exactly as given so the file can be opened on any filesystem.
    lowered = file_path.lower()
    if lowered.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    if lowered.endswith(".docx"):
        return extract_text_from_docx(file_path)
    if lowered.endswith(".csv"):
        return extract_text_from_csv(file_path)
    if lowered.endswith(".xlsx"):
        return extract_text_from_xlsx(file_path)
    if lowered.endswith(".html"):
        return extract_text_from_html(file_path)
    return ""
def extract_text_from_pdf(file_path):
    """Return the text of all pages of the PDF at ``file_path``.

    Each page's ``get_text()`` output is concatenated in page order with no
    separator inserted between pages.
    """
    with fitz.open(file_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)
def extract_text_from_docx(file_path):
    """Return the text of each entry in the document's ``paragraphs``, space-joined."""
    paragraphs = Document(file_path).paragraphs
    return " ".join(paragraph.text for paragraph in paragraphs)
def extract_text_from_csv(file_path):
    """Flatten the CSV at ``file_path`` into one space-separated string.

    Every cell of a row is converted with ``str`` and joined by a space; the
    resulting row strings are then joined by a space as well.
    """
    frame = pd.read_csv(file_path)

    def _row_to_text(row):
        # One data row -> its cells as a single space-separated string.
        return " ".join(str(cell) for cell in row)

    row_texts = frame.apply(_row_to_text, axis=1)
    return " ".join(row_texts)
def extract_text_from_xlsx(file_path):
    """Flatten the spreadsheet at ``file_path`` into one space-separated string.

    The workbook is loaded with ``pd.read_excel`` using its default options.
    Every cell of a row is converted with ``str`` and joined by a space; the
    resulting row strings are then joined by a space as well.
    """
    frame = pd.read_excel(file_path)

    def _row_to_text(row):
        # One data row -> its cells as a single space-separated string.
        return " ".join(str(cell) for cell in row)

    row_texts = frame.apply(_row_to_text, axis=1)
    return " ".join(row_texts)
def extract_text_from_html(file_path):
    """Return the visible text of the HTML document at ``file_path``.

    Args:
        file_path: Path of the HTML file to parse.

    Returns:
        The result of ``BeautifulSoup.get_text()`` for the parsed document.
    """
    # Open in binary mode: text mode decodes with the platform's default
    # encoding, which can raise UnicodeDecodeError or garble pages saved in a
    # different charset. Given raw bytes, BeautifulSoup does its own encoding
    # detection.
    with open(file_path, "rb") as file:
        soup = BeautifulSoup(file, "html.parser")
    return soup.get_text()
def index_documents(uploaded_files):
    """Extract text from each uploaded file and add it to the FAISS index.

    Each file is written to a temporary directory, converted to text with
    ``load_text_from_files`` and, when it yields non-blank text, embedded as a
    single vector. The text is recorded in ``document_texts`` and in
    ``document_mapping`` under the id FAISS assigns to that vector.

    NOTE(review): a whole document becomes one embedding; long documents are
    presumably truncated by the embedding model's maximum sequence length —
    confirm, and consider chunking if retrieval quality matters.

    Args:
        uploaded_files: Iterable of file-like objects exposing ``name`` and
            either ``getvalue()`` or ``read()`` (e.g. Streamlit uploads).

    Returns:
        None. The module-level ``index``, ``document_texts`` and
        ``document_mapping`` are updated in place.
    """
    # A fresh temporary directory always exists and is writable, unlike a
    # hard-coded location, and it is removed again once indexing is done.
    with tempfile.TemporaryDirectory() as staging_dir:
        for file in uploaded_files:
            # basename() drops any directory components from the
            # client-supplied name so the file cannot land outside staging_dir.
            file_path = os.path.join(staging_dir, os.path.basename(file.name))
            # getvalue() returns the full buffer regardless of the current
            # read position; read() would return b"" for a stream that has
            # already been consumed.
            data = file.getvalue() if hasattr(file, "getvalue") else file.read()
            with open(file_path, "wb") as f:
                f.write(data)
            text = load_text_from_files(file_path)
            if not text or not text.strip():
                continue  # nothing worth embedding
            # A flat FAISS index numbers vectors sequentially, so the next id
            # is the current vector count; keying the mapping on it keeps
            # search results and stored texts aligned.
            doc_id = index.ntotal
            embeddings = embedding_model.encode([text])
            index.add(embeddings)
            document_texts.append(text)
            document_mapping[doc_id] = text
# DPR encoders/tokenizer and the GPT-2 text-generation pipeline.
# In the code visible in this file only `generator` is referenced afterwards
# (by retrieve_and_generate); the DPR objects are loaded but not used there.
_DPR_QUESTION_CHECKPOINT = "facebook/dpr-question_encoder-single-nq-base"
question_encoder = DPRQuestionEncoder.from_pretrained(_DPR_QUESTION_CHECKPOINT)
question_tokenizer = AutoTokenizer.from_pretrained(_DPR_QUESTION_CHECKPOINT)
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
generator = pipeline("text-generation", model="gpt2")
def retrieve_and_generate(query):
    """Generate a response to ``query`` conditioned on the closest documents.

    Up to five documents nearest to the query embedding are looked up in the
    FAISS index, joined with spaces and appended to the query as
    ``"{query} [SEP] {context}"``. That prompt is then continued by the
    text-generation pipeline.

    Args:
        query: The user's question.

    Returns:
        The ``generated_text`` of the first generated sequence. With the
        pipeline's default settings this includes the prompt itself.
    """
    top_k = 5
    max_new_tokens = 150

    # Never ask FAISS for more neighbours than it holds: missing results come
    # back with id -1, which is not a key of document_mapping.
    k = min(top_k, index.ntotal)
    retrieved_texts = []
    if k > 0:
        query_embeddings = embedding_model.encode([query])
        _, neighbor_ids = index.search(query_embeddings, k=k)
        retrieved_texts = [
            document_mapping[int(idx)] for idx in neighbor_ids[0] if idx >= 0
        ]
    context = " ".join(retrieved_texts)
    prompt = f"{query} [SEP] {context}"

    # Keep prompt + continuation inside the model's context window by trimming
    # the tail of the prompt (the context) when it is too long.
    tokenizer = generator.tokenizer
    prompt_budget = tokenizer.model_max_length - max_new_tokens
    if prompt_budget > 0:
        token_ids = tokenizer(prompt)["input_ids"]
        if len(token_ids) > prompt_budget:
            prompt = tokenizer.decode(token_ids[:prompt_budget])

    # max_new_tokens bounds only the continuation; max_length would also count
    # the prompt tokens and leave no room to generate after a long context.
    response = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)
    return response[0]['generated_text']
# Streamlit page: upload documents, index them, then answer one question.
st.title("Electrical Engineering RAG System")
st.write("Upload your files, ask questions, and get responses based on your data.")
uploaded_files = st.file_uploader(
    "Upload Documents",
    type=["pdf", "docx", "csv", "xlsx", "html"],
    accept_multiple_files=True,
)
# The question box is only rendered once at least one file has been uploaded.
if uploaded_files:
    index_documents(uploaded_files)
    st.write("Files uploaded successfully! You can now ask questions.")
    user_query = st.text_input("Ask a question:")
    if user_query:
        st.write("Answer:", retrieve_and_generate(user_query))