Upload 2 files
- app.py +133 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,133 @@
import PyPDF2
import faiss
import numpy as np

# Specify file paths
file_path1 = '/content/AST-1.pdf'
file_path2 = '/content/AST-2.pdf'

# Step 1: Load the document files
def read_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            # extract_text() can return None for image-only pages
            text += page.extract_text() or ''
        return text

ast1_text = read_pdf(file_path1)
ast2_text = read_pdf(file_path2)

# Step 2: Split the loaded documents into chunks
# (split by a fixed number of words)
def chunk_text(text, chunk_size=200):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

ast1_chunks = chunk_text(ast1_text, chunk_size=100)
ast2_chunks = chunk_text(ast2_text, chunk_size=150)

# Label the chunks with their source document
ast1_chunks = [(chunk, 'AST-1') for chunk in ast1_chunks]
ast2_chunks = [(chunk, 'AST-2') for chunk in ast2_chunks]
all_chunks = ast1_chunks + ast2_chunks
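# Each chunk is stored as a (text, label) tuple so that retrieval can later
# be filtered to a single source document via the target_doc argument.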
# Load the embedding model and the LLM
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load a pre-trained embedding model from the MTEB leaderboard
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# The original upload used `tokenizer` and `generation_model` below without
# defining them; BART is an assumed choice, consistent with the
# summarization-style generation parameters used later.
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn')
generation_model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large-cnn')

# Embed the chunk text only, not the (text, label) tuples
embeddings = embedding_model.encode([chunk for chunk, _ in all_chunks], convert_to_tensor=True)

# Convert embeddings to numpy arrays
embeddings_np = np.array([embedding.cpu().numpy() for embedding in embeddings])

# Create a FAISS index
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Save the index
faiss.write_index(index, 'embeddings_index.faiss')

# Load the FAISS index back from the same path it was written to
stored_index = faiss.read_index('embeddings_index.faiss')
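# Note: IndexFlatL2 performs exact (brute-force) L2 search, which is adequate
# at this scale; the write/read round-trip above simply persists the index to
# disk before the retrieval step reloads it.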
# Function to retrieve chunks
def retrieve_chunks(query, top_k=10, use_mmr=False, diversity=0.5, target_doc='AST-1'):
    query_embedding = embedding_model.encode(query, convert_to_tensor=True).cpu().numpy()
    distances, indices = stored_index.search(np.array([query_embedding]), top_k)

    if use_mmr:
        # MMR-based retrieval: trade off relevance against redundancy
        from sklearn.metrics.pairwise import cosine_similarity

        selected_indices = []
        selected_distances = []
        candidate_indices = [i for i in indices[0] if all_chunks[i][1] == target_doc]
        candidate_distances = [distances[0][i] for i in range(len(indices[0])) if all_chunks[indices[0][i]][1] == target_doc]

        while len(selected_indices) < top_k and candidate_indices:
            if not selected_indices:
                # Seed the selection with the most relevant candidate
                selected_indices.append(candidate_indices.pop(0))
                selected_distances.append(candidate_distances.pop(0))
            else:
                remaining_embeddings = embeddings_np[candidate_indices]
                selected_embeddings = embeddings_np[selected_indices]

                similarities = cosine_similarity(remaining_embeddings, selected_embeddings)
                # Lower L2 distance means higher relevance, so negate the
                # distances before trading relevance off against similarity
                # to the already-selected chunks
                mmr_scores = (1 - diversity) * -np.array(candidate_distances) - diversity * np.max(similarities, axis=1)

                next_index = int(np.argmax(mmr_scores))
                selected_indices.append(candidate_indices.pop(next_index))
                selected_distances.append(candidate_distances.pop(next_index))

        return [all_chunks[i][0] for i in selected_indices]
    else:
        retrieved_chunks = []
        for idx in indices[0]:
            chunk, doc_label = all_chunks[idx]
            if doc_label == target_doc:
                retrieved_chunks.append(chunk)
            if len(retrieved_chunks) >= top_k:
                break
        return retrieved_chunks
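# Example call (hypothetical query, shown for illustration only):
#   retrieve_chunks("What does AST-1 cover?", top_k=3, use_mmr=True, target_doc='AST-1')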
# Generate a response by summarizing the query plus retrieved context
def generate_response(query, retrieved_chunks):
    context = " ".join(retrieved_chunks)  # retrieved_chunks is a list of strings
    input_text = f"Query: {query}\nContext: {context}"
    inputs = tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True)
    summary_ids = generation_model.generate(inputs['input_ids'], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def rag_system(query, use_mmr=False):
    retrieved_chunks = retrieve_chunks(query, top_k=3, use_mmr=use_mmr)
    response = generate_response(query, retrieved_chunks)
    return response
import gradio as gr

def query_rag_system(query, use_mmr):
    return rag_system(query, use_mmr=use_mmr)

interface = gr.Interface(
    fn=query_rag_system,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your query here..."),
        gr.Checkbox(label="Use MMR")
    ],
    outputs="text",
    title="RAG System",
    description="Enter a query to get a response from the RAG system. Optionally, enable MMR for more diverse retrieved context."
)

interface.launch()
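# Optional smoke test of the full pipeline without the UI (hypothetical
# query; uncomment and place before interface.launch() to run it):
# print(rag_system("Summarize the key points of AST-1", use_mmr=False))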
requirements.txt
ADDED
@@ -0,0 +1,5 @@
PyPDF2
gradio
transformers
sentence-transformers
faiss-cpu
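# scikit-learn (used for cosine_similarity in app.py) and torch are assumed
# to be installed transitively via sentence-transformers, so they are not
# pinned here.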