gourisankar85 committed on
Commit 2f435af · verified · 1 Parent(s): ba4cc80

Upload 2 files

Files changed (2)
  1. app.py +133 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,133 @@
import PyPDF2
import faiss
import numpy as np

# Specify file paths
file_path1 = '/content/AST-1.pdf'
file_path2 = '/content/AST-2.pdf'

# Step 1: Load the document files
def read_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text

ast1_text = read_pdf(file_path1)
ast2_text = read_pdf(file_path2)

# Step 2: Split the loaded documents into chunks (by a fixed number of words)
def chunk_text(text, chunk_size=200):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

ast1_chunks = chunk_text(ast1_text, chunk_size=100)
ast2_chunks = chunk_text(ast2_text, chunk_size=150)

# Label each chunk with its source document
ast1_chunks = [(chunk, 'AST-1') for chunk in ast1_chunks]
ast2_chunks = [(chunk, 'AST-2') for chunk in ast2_chunks]
all_chunks = ast1_chunks + ast2_chunks

# Step 3: Load the embedding model and the LLM
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the pre-trained embedding model from the MTEB leaderboard
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# The original upload referenced tokenizer/generation_model without loading them;
# facebook/bart-large-cnn is assumed here as the seq2seq checkpoint so the script runs.
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn')
generation_model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large-cnn')

# Embed only the chunk texts, not the (chunk, label) tuples
embeddings = embedding_model.encode([chunk for chunk, _ in all_chunks], convert_to_tensor=True)

# Convert embeddings to numpy arrays
embeddings_np = np.array([embedding.cpu().numpy() for embedding in embeddings])

# Create a FAISS index
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Save the index
faiss.write_index(index, 'embeddings_index.faiss')

# Load the FAISS index
stored_index = faiss.read_index('/content/embeddings_index.faiss')

# Function to retrieve chunks
def retrieve_chunks(query, top_k=10, use_mmr=False, diversity=0.5, target_doc='AST-1'):
    query_embedding = embedding_model.encode(query, convert_to_tensor=True).cpu().numpy()
    distances, indices = stored_index.search(np.array([query_embedding]), top_k)

    if use_mmr:
        # MMR-based retrieval: balance relevance against redundancy
        from sklearn.metrics.pairwise import cosine_similarity

        selected_indices = []
        selected_distances = []
        candidate_indices = [i for i in indices[0] if all_chunks[i][1] == target_doc]
        candidate_distances = [distances[0][j] for j in range(len(indices[0])) if all_chunks[indices[0][j]][1] == target_doc]

        while len(selected_indices) < top_k and candidate_indices:
            if not selected_indices:
                selected_indices.append(candidate_indices.pop(0))
                selected_distances.append(candidate_distances.pop(0))
            else:
                remaining_embeddings = embeddings_np[candidate_indices]
                selected_embeddings = embeddings_np[selected_indices]

                similarities = cosine_similarity(remaining_embeddings, selected_embeddings)
                # Relevance is the negated L2 distance (smaller distance = more relevant);
                # the original used the raw distance, which inverted the ranking.
                mmr_scores = (1 - diversity) * -np.array(candidate_distances) - diversity * np.max(similarities, axis=1)

                next_index = int(np.argmax(mmr_scores))
                selected_indices.append(candidate_indices.pop(next_index))
                selected_distances.append(candidate_distances.pop(next_index))

        return [all_chunks[i][0] for i in selected_indices]
    else:
        retrieved_chunks = []
        for idx in indices[0]:
            chunk, doc_label = all_chunks[idx]
            if doc_label == target_doc:
                retrieved_chunks.append(chunk)
            if len(retrieved_chunks) >= top_k:
                break
        return retrieved_chunks


# Generate a response from the query and the retrieved context
def generate_response(query, retrieved_chunks):
    context = " ".join(retrieved_chunks)  # retrieved_chunks is a list of strings
    input_text = f"Query: {query}\nContext: {context}"
    inputs = tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True)
    summary_ids = generation_model.generate(inputs['input_ids'], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def rag_system(query, use_mmr=False):
    retrieved_chunks = retrieve_chunks(query, top_k=3, use_mmr=use_mmr)
    response = generate_response(query, retrieved_chunks)
    return response

import gradio as gr

def query_rag_system(query, use_mmr):
    return rag_system(query, use_mmr=use_mmr)

interface = gr.Interface(
    fn=query_rag_system,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your query here..."),
        gr.Checkbox(label="Use MMR")
    ],
    outputs="text",
    title="RAG System",
    description="Enter a query to get a response from the RAG system. Optionally, use MMR for more diverse results."
)

interface.launch()
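
A minimal sketch for exercising the pipeline without the Gradio UI (it would go before interface.launch(), since launch() blocks; the query string is purely illustrative):

    # Hypothetical example query; any question about the indexed PDFs works
    answer = rag_system("What topics does AST-1 cover?", use_mmr=True)
    print(answer)
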
requirements.txt ADDED
@@ -0,0 +1,5 @@
PyPDF2
gradio
transformers
sentence-transformers
faiss-cpu