NaimaAqeel committed on
Commit
2737463
·
verified ·
1 Parent(s): de6a22c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -13
app.py CHANGED
@@ -9,7 +9,7 @@ import faiss
9
  import torch
10
 
11
  # ===============================
12
- # EMBEDDING MODEL SETUP
13
  # ===============================
14
  model_name = "sentence-transformers/all-MiniLM-L6-v2"
15
  tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -21,15 +21,12 @@ def get_embeddings(texts):
21
  inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
22
  with torch.no_grad():
23
  outputs = embedding_model(**inputs)
24
- embeddings = outputs.last_hidden_state[:, 0].cpu().numpy()
25
- # Normalize embeddings to unit length for cosine similarity
26
- embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
27
- return embeddings
28
 
29
  # ===============================
30
  # TEXT CHUNKING
31
  # ===============================
32
- def chunk_text(text, chunk_size=500, overlap=50):
33
  chunks = []
34
  start = 0
35
  while start < len(text):
@@ -44,7 +41,7 @@ def chunk_text(text, chunk_size=500, overlap=50):
44
  index_path = "faiss_index.pkl"
45
  document_texts_path = "document_texts.pkl"
46
  document_texts = []
47
- embedding_dim = 384 # For all-MiniLM-L6-v2
48
 
49
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
50
  try:
@@ -109,7 +106,7 @@ def upload_document(file):
109
  # ===============================
110
  qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
111
 
112
- def generate_answer_from_file(query, top_k=7):
113
  if not document_texts:
114
  return "No documents indexed yet."
115
 
@@ -118,20 +115,23 @@ def generate_answer_from_file(query, top_k=7):
118
  retrieved_chunks = [document_texts[i] for i in indices[0]]
119
  context = "\n\n".join(retrieved_chunks)
120
 
 
 
 
121
  prompt = (
122
- "You are a helpful and precise assistant reading student notes or textbook passages.\n\n"
123
- "Based on the context provided, answer the question accurately and in detail using full sentences.\n\n"
124
  "### Example\n"
125
  "Context:\nArtificial systems are created by people. These systems are designed to perform specific tasks, improve efficiency, and solve problems. Examples include knowledge systems, engineering systems, and social systems.\n\n"
126
  "Question: What is an Artificial System?\n"
127
- "Answer: Artificial systems are systems created by humans to perform specific tasks, improve efficiency, and solve problems. They include systems such as knowledge systems, engineering systems, and social systems.\n\n"
128
  "### Now answer this\n"
129
  f"Context:\n{context}\n\n"
130
  f"Question: {query}\n"
131
- "Answer:\nPlease answer ONLY based on the context above without adding extra information."
132
  )
133
 
134
- result = qa_pipeline(prompt, max_length=700, do_sample=False)[0]['generated_text']
135
  return result.strip()
136
 
137
  # ===============================
@@ -156,3 +156,4 @@ search_interface = gr.Interface(
156
  app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
157
  app.launch()
158
 
 
 
9
  import torch
10
 
11
  # ===============================
12
+ # EMBEDDING MODEL
13
  # ===============================
14
  model_name = "sentence-transformers/all-MiniLM-L6-v2"
15
  tokenizer = AutoTokenizer.from_pretrained(model_name)
 
21
  inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
22
  with torch.no_grad():
23
  outputs = embedding_model(**inputs)
24
+ return outputs.last_hidden_state[:, 0].cpu().numpy()
 
 
 
25
 
26
  # ===============================
27
  # TEXT CHUNKING
28
  # ===============================
29
+ def chunk_text(text, chunk_size=800, overlap=100):
30
  chunks = []
31
  start = 0
32
  while start < len(text):
 
41
  index_path = "faiss_index.pkl"
42
  document_texts_path = "document_texts.pkl"
43
  document_texts = []
44
+ embedding_dim = 384
45
 
46
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
47
  try:
 
106
  # ===============================
107
  qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
108
 
109
+ def generate_answer_from_file(query, top_k=10):
110
  if not document_texts:
111
  return "No documents indexed yet."
112
 
 
115
  retrieved_chunks = [document_texts[i] for i in indices[0]]
116
  context = "\n\n".join(retrieved_chunks)
117
 
118
+ print("\n--- Retrieved Context ---\n", context) # Debugging print
119
+
120
+ # Prompt Engineering
121
  prompt = (
122
+ "You are a helpful assistant reading student notes or textbook passages.\n\n"
123
+ "Based on the context provided, answer the question accurately and clearly.\n\n"
124
  "### Example\n"
125
  "Context:\nArtificial systems are created by people. These systems are designed to perform specific tasks, improve efficiency, and solve problems. Examples include knowledge systems, engineering systems, and social systems.\n\n"
126
  "Question: What is an Artificial System?\n"
127
+ "Answer: Artificial systems are systems created by humans to perform specific tasks, improve efficiency, and solve problems. They include systems like knowledge systems, engineering systems, and social systems.\n\n"
128
  "### Now answer this\n"
129
  f"Context:\n{context}\n\n"
130
  f"Question: {query}\n"
131
+ f"Answer:"
132
  )
133
 
134
+ result = qa_pipeline(prompt, max_length=512, do_sample=False)[0]['generated_text']
135
  return result.strip()
136
 
137
  # ===============================
 
156
  app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
157
  app.launch()
158
 
159
+