gaur3009 committed
Commit d7086c2 · verified · 1 Parent(s): e0d703d

Update app.py

Files changed (1): app.py +41 -35
app.py CHANGED
@@ -7,57 +7,68 @@ from transformers import AutoTokenizer, AutoModel
 from weaviate.classes.init import Auth
 import cohere
 
-# Load credentials from environment variables or hardcode them temporarily
+# --- Configuration ---
 WEAVIATE_URL = "vgwhgmrlqrqqgnlb1avjaa.c0.us-west3.gcp.weaviate.cloud"
 WEAVIATE_API_KEY = "7VoeYTjkOS4aHINuhllGpH4JPgE2QquFmSMn"
 COHERE_API_KEY = "LEvCVeZkqZMW1aLYjxDqlstCzWi4Cvlt9PiysqT8"
 
-# Connect to Weaviate
+# --- Initialize Clients ---
 client = weaviate.connect_to_weaviate_cloud(
     cluster_url=WEAVIATE_URL,
     auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
     headers={"X-Cohere-Api-Key": COHERE_API_KEY}
 )
-
 cohere_client = cohere.Client(COHERE_API_KEY)
 
-# Load sentence-transformer model
+# --- Load Sentence Transformer ---
 tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
 model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
 
+# --- Utility Functions ---
 def load_pdf(file):
-    """Extract text from PDF file."""
+    """Extract text from a PDF file."""
     reader = PyPDF2.PdfReader(file)
     return ''.join([page.extract_text() for page in reader.pages if page.extract_text()])
 
 def get_embeddings(text):
-    """Generate mean pooled embedding for the input text."""
+    """Compute mean-pooled embeddings using a transformer."""
     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
     with torch.no_grad():
         embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
     return embeddings
 
 def upload_document_chunks(chunks):
-    """Insert document chunks into Weaviate collection with embeddings."""
-    doc_collection = client.collections.get("Document")
+    """Insert document chunks into Weaviate."""
+    try:
+        doc_collection = client.collections.get("Document")
+    except Exception as e:
+        raise RuntimeError("❌ Collection 'Document' not found. Make sure it's defined in your Weaviate schema.") from e
+
     for chunk in chunks:
-        embedding = get_embeddings(chunk)
-        doc_collection.data.insert(
-            properties={"content": chunk},
-            vector=embedding.tolist()
-        )
+        try:
+            embedding = get_embeddings(chunk)
+            doc_collection.data.insert(
+                properties={"content": chunk},
+                vector=embedding.tolist()
+            )
+        except Exception as e:
+            print(f"⚠️ Skipped chunk due to error: {e}")
 
 def query_answer(query):
-    """Search for top relevant document chunks based on query embedding."""
+    """Query Weaviate for top relevant document chunks."""
     query_embedding = get_embeddings(query)
-    results = client.collections.get("Document").query.near_vector(
-        near_vector=query_embedding.tolist(),
-        limit=3
-    )
-    return results.objects
+    try:
+        results = client.collections.get("Document").query.near_vector(
+            near_vector=query_embedding.tolist(),
+            limit=3
+        )
+        return results.objects
+    except Exception as e:
+        print(f"⚠️ Query error: {e}")
+        return []
 
 def generate_response(context, query):
-    """Generate answer using Cohere model based on context and query."""
+    """Generate a natural language response using Cohere."""
     response = cohere_client.generate(
         model='command',
         prompt=f"Context: {context}\n\nQuestion: {query}\nAnswer:",
@@ -66,25 +77,23 @@ def generate_response(context, query):
     return response.generations[0].text.strip()
 
 def qa_pipeline(pdf_file, query):
-    """Main pipeline for QA: parse PDF, embed chunks, query Weaviate, and generate answer."""
+    """Main QA pipeline."""
     try:
         document_text = load_pdf(pdf_file)
         document_chunks = [document_text[i:i+500] for i in range(0, len(document_text), 500)]
-
+
         upload_document_chunks(document_chunks)
         top_docs = query_answer(query)
-
-        context = ' '.join([doc.properties['content'] for doc in top_docs])
+        context = ' '.join([doc.properties['content'] for doc in top_docs if 'content' in doc.properties])
         answer = generate_response(context, query)
 
         return str(context), str(answer)
     finally:
-        client.close()  # ✅ Properly close client to avoid memory leaks
+        client.close()
 
-# Gradio UI
+# --- Gradio UI ---
 with gr.Blocks(theme="compact") as demo:
-    gr.Markdown(
-        """
+    gr.Markdown("""
        <div style="text-align: center; font-size: 28px; font-weight: bold; margin-bottom: 20px; color: #2D3748;">
        📄 Interactive QA Bot 🔍
        </div>
@@ -92,8 +101,7 @@ with gr.Blocks(theme="compact") as demo:
        Upload a PDF document, ask questions, and receive answers based on the document content.
        </p>
        <hr style="border: 1px solid #CBD5E0; margin: 20px 0;">
-    """
-    )
+    """)
 
     with gr.Row():
        with gr.Column(scale=1):
@@ -111,8 +119,7 @@ with gr.Blocks(theme="compact") as demo:
        outputs=[doc_segments_output, answer_output]
    )
 
-    gr.Markdown(
-        """
+    gr.Markdown("""
        <style>
        body {
            background-color: #EDF2F7;
@@ -141,7 +148,6 @@ with gr.Blocks(theme="compact") as demo:
            background-color: #FAFAFA;
        }
        </style>
-    """
-    )
+    """)
 
-demo.launch(share=True)  # ✅ Required for Hugging Face Spaces
+demo.launch(share=True)
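
Note: the updated upload_document_chunks raises a RuntimeError when the "Document" collection is missing, but the commit never shows that collection being created. Below is a minimal sketch of one way to define it with the weaviate-client v4 API, assuming vectors are supplied client-side by the MiniLM model (so no server-side vectorizer) and run once before the app starts; the property name "content" matches what app.py inserts and queries.

import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure, DataType, Property

WEAVIATE_URL = "..."      # same cluster URL as in app.py
WEAVIATE_API_KEY = "..."  # same API key as in app.py

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)

# Create the collection upload_document_chunks expects: a single text
# property, with vectors provided per-insert rather than by a vectorizer.
if not client.collections.exists("Document"):
    client.collections.create(
        "Document",
        properties=[Property(name="content", data_type=DataType.TEXT)],
        vectorizer_config=Configure.Vectorizer.none(),
    )

client.close()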