NaimaAqeel committed on
Commit de6a22c · verified · 1 Parent(s): 5b2f320

Update app.py

Files changed (1)
  1. app.py +12 -16
app.py CHANGED
@@ -9,7 +9,7 @@ import faiss
 import torch
 
 # ===============================
-# EMBEDDING MODEL
+# EMBEDDING MODEL SETUP
 # ===============================
 model_name = "sentence-transformers/all-MiniLM-L6-v2"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -22,16 +22,14 @@ def get_embeddings(texts):
     with torch.no_grad():
         outputs = embedding_model(**inputs)
     embeddings = outputs.last_hidden_state[:, 0].cpu().numpy()
+    # Normalize embeddings to unit length for cosine similarity
+    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
     return embeddings
 
-def normalize_embeddings(embeddings):
-    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
-    return embeddings / (norms + 1e-10)
-
 # ===============================
 # TEXT CHUNKING
 # ===============================
-def chunk_text(text, chunk_size=400, overlap=100):
+def chunk_text(text, chunk_size=500, overlap=50):
     chunks = []
     start = 0
     while start < len(text):
@@ -46,7 +44,7 @@ def chunk_text(text, chunk_size=400, overlap=100):
 index_path = "faiss_index.pkl"
 document_texts_path = "document_texts.pkl"
 document_texts = []
-embedding_dim = 384
+embedding_dim = 384  # For all-MiniLM-L6-v2
 
 if os.path.exists(index_path) and os.path.exists(document_texts_path):
     try:
@@ -78,7 +76,6 @@ def extract_text_from_docx(path):
     try:
         doc = Document(path)
         text = "\n".join([para.text for para in doc.paragraphs])
-        print(f"Extracted DOCX text preview: {text[:500]}")  # Preview first 500 chars for debug
     except Exception as e:
         print(f"DOCX error: {e}")
     return text
@@ -97,7 +94,6 @@ def upload_document(file):
 
     chunks = chunk_text(text)
     chunk_embeddings = get_embeddings(chunks)
-    chunk_embeddings = normalize_embeddings(chunk_embeddings)
     index.add(np.array(chunk_embeddings).astype('float32'))
     document_texts.extend(chunks)
 
@@ -113,30 +109,29 @@ def upload_document(file):
 # ===============================
 qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
 
-def generate_answer_from_file(query, top_k=5):
+def generate_answer_from_file(query, top_k=7):
     if not document_texts:
         return "No documents indexed yet."
 
     query_vector = get_embeddings(query).astype("float32")
-    query_vector = normalize_embeddings(query_vector)
     scores, indices = index.search(query_vector, k=top_k)
     retrieved_chunks = [document_texts[i] for i in indices[0]]
     context = "\n\n".join(retrieved_chunks)
 
     prompt = (
-        "You are a helpful assistant reading student notes or textbook passages.\n\n"
-        "Based on the context provided, answer the question accurately.\n\n"
+        "You are a helpful and precise assistant reading student notes or textbook passages.\n\n"
+        "Based on the context provided, answer the question accurately and in detail using full sentences.\n\n"
         "### Example\n"
         "Context:\nArtificial systems are created by people. These systems are designed to perform specific tasks, improve efficiency, and solve problems. Examples include knowledge systems, engineering systems, and social systems.\n\n"
         "Question: What is an Artificial System?\n"
-        "Answer: Artificial systems are systems created by humans to perform specific tasks, improve efficiency, and solve problems. They include systems like knowledge systems, engineering systems, and social systems.\n\n"
+        "Answer: Artificial systems are systems created by humans to perform specific tasks, improve efficiency, and solve problems. They include systems such as knowledge systems, engineering systems, and social systems.\n\n"
         "### Now answer this\n"
         f"Context:\n{context}\n\n"
         f"Question: {query}\n"
-        f"Answer:"
+        "Answer:\nPlease answer ONLY based on the context above without adding extra information."
    )
 
-    result = qa_pipeline(prompt, max_length=512, do_sample=False)[0]['generated_text']
+    result = qa_pipeline(prompt, max_length=700, do_sample=False)[0]['generated_text']
     return result.strip()
 
 # ===============================
@@ -160,3 +155,4 @@ search_interface = gr.Interface(
 
 app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
 app.launch()
+
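
After this commit, embeddings are L2-normalized inside get_embeddings itself, so the separate normalize_embeddings helper and its call sites are dropped. Below is a minimal sketch of how the retrieval step then behaves. It assumes the FAISS index is an inner-product index (faiss.IndexFlatIP), in which case index.search returns cosine similarities for unit-length vectors; the diff does not show how index is actually constructed, so that part (and the exact tokenizer call) is an assumption rather than the app's confirmed setup.

import numpy as np
import faiss
import torch
from transformers import AutoTokenizer, AutoModel

model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)

def get_embeddings(texts):
    # The tokenizer call is not shown in the diff; this is one plausible form.
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    # CLS-token embedding, as in the diffed code.
    embeddings = outputs.last_hidden_state[:, 0].cpu().numpy()
    # Normalize to unit length so an inner product equals cosine similarity.
    return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

embedding_dim = 384  # output size of all-MiniLM-L6-v2
index = faiss.IndexFlatIP(embedding_dim)  # assumption: inner-product index, not shown in the diff

chunks = ["Artificial systems are created by people.",
          "Natural systems occur without human design."]
index.add(get_embeddings(chunks).astype("float32"))

query_vector = get_embeddings(["What is an artificial system?"]).astype("float32")
scores, indices = index.search(query_vector, k=1)
print(chunks[indices[0][0]], float(scores[0][0]))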