muhammadshaheryar committed on
Commit cf5502d · verified · 1 Parent(s): 77f5320

Create app.py

Files changed (1)
  1. app.py +101 -0
app.py ADDED
@@ -0,0 +1,101 @@
# Install necessary libraries beforehand if not already installed, e.g.:
#   pip install transformers sentence-transformers faiss-cpu PyMuPDF pandas python-docx xlrd openpyxl streamlit beautifulsoup4

import faiss
import fitz  # PyMuPDF
import pandas as pd
from transformers import DPRQuestionEncoder, DPRContextEncoder, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
from docx import Document
import streamlit as st
import os
from bs4 import BeautifulSoup

# Initialize models and FAISS index
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
index = faiss.IndexFlatL2(384)  # 384-dimensional embeddings for this model
document_texts = []
document_mapping = {}

# Function to load and convert files to text
def load_text_from_files(file_path):
    if file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    elif file_path.endswith(".csv"):
        return extract_text_from_csv(file_path)
    elif file_path.endswith(".xlsx"):
        return extract_text_from_xlsx(file_path)
    elif file_path.endswith(".html"):
        return extract_text_from_html(file_path)
    else:
        return ""

def extract_text_from_pdf(file_path):
    text = ""
    with fitz.open(file_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    return " ".join([para.text for para in doc.paragraphs])

def extract_text_from_csv(file_path):
    df = pd.read_csv(file_path)
    return " ".join(df.apply(lambda row: " ".join(map(str, row)), axis=1))

def extract_text_from_xlsx(file_path):
    df = pd.read_excel(file_path)
    return " ".join(df.apply(lambda row: " ".join(map(str, row)), axis=1))

def extract_text_from_html(file_path):
    with open(file_path, "r") as file:
        soup = BeautifulSoup(file, "html.parser")
    return soup.get_text()

# Indexing uploaded documents
def index_documents(uploaded_files):
    global document_texts, document_mapping
    os.makedirs("uploads", exist_ok=True)  # local folder for the uploaded files
    for file in uploaded_files:
        file_path = os.path.join("uploads", file.name)
        with open(file_path, "wb") as f:
            f.write(file.read())
        text = load_text_from_files(file_path)
        if text:
            document_texts.append(text)
            embeddings = embedding_model.encode([text])
            index.add(embeddings)
            document_mapping[len(document_texts) - 1] = text

# Load retrieval and generation models
# (The DPR encoders and tokenizer are loaded here but are not used by the retrieval
#  below, which relies on the SentenceTransformer embeddings; generation uses GPT-2.)
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
question_tokenizer = AutoTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
generator = pipeline("text-generation", model="gpt2")

# RAG pipeline function
def retrieve_and_generate(query):
    query_embeddings = embedding_model.encode([query])
    _, I = index.search(query_embeddings, k=5)  # Top-5 relevant contexts
    # FAISS pads the result with -1 when fewer than k vectors are indexed, so skip those ids
    retrieved_texts = [document_mapping[idx] for idx in I[0] if idx != -1]
    context = " ".join(retrieved_texts)
    # max_new_tokens (rather than max_length) so a long context does not exhaust the budget;
    # truncation keeps the prompt within GPT-2's context window
    response = generator(f"{query} [SEP] {context}", max_new_tokens=150, num_return_sequences=1, truncation=True)
    return response[0]['generated_text']

# Streamlit interface
st.title("Electrical Engineering RAG System")
st.write("Upload your files, ask questions, and get responses based on your data.")

uploaded_files = st.file_uploader("Upload Documents", accept_multiple_files=True, type=["pdf", "docx", "csv", "xlsx", "html"])

if uploaded_files:
    # Note: Streamlit reruns this script on every interaction, so the uploads are
    # re-indexed (and re-added to the FAISS index) on each rerun.
    index_documents(uploaded_files)
    st.write("Files uploaded successfully! You can now ask questions.")

user_query = st.text_input("Ask a question:")
if user_query:
    response = retrieve_and_generate(user_query)
    st.write("Answer:", response)
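Note: since app.py is a plain Python script run by Streamlit (for example with streamlit run app.py, or automatically on a Hugging Face Space), the packages listed in the comment at the top would normally be declared in a requirements.txt next to it rather than installed from inside the script. The embedding-plus-FAISS retrieval that the app is built around can also be tried in isolation; the snippet below is a minimal sketch, independent of app.py, using the same sentence-transformers model and index type. The example texts and variable names are purely illustrative.

# Minimal, self-contained sketch of the retrieval step used in app.py.
# The two example strings below are made up purely for illustration.
import faiss
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
texts = [
    "A transformer steps voltage up or down between two circuits.",
    "Ohm's law relates voltage, current, and resistance: V = I * R.",
]

# Encode the documents and add them to an L2 index (384-dimensional for this model).
embeddings = model.encode(texts)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Encode a query and retrieve the closest document.
query_vec = model.encode(["What does Ohm's law say?"])
distances, ids = index.search(query_vec, k=1)
print(texts[ids[0][0]])  # expected: the Ohm's law sentence

In the full app, the retrieved texts are concatenated and passed to the GPT-2 generator as context for the answer.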