gaur3009 committed on
Commit
d37571a
·
verified ·
1 Parent(s): 64f9dc0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -19
app.py CHANGED
@@ -1,40 +1,44 @@
 
1
  import gradio as gr
2
  import PyPDF2
3
- from transformers import AutoTokenizer, AutoModel
4
  import torch
5
  import weaviate
6
- from weaviate import WeaviateClient
7
- from weaviate.auth import AuthApiKey
8
  from weaviate.classes.init import Auth
9
  import cohere
10
 
11
- weaviate_url = "vgwhgmrlqrqqgnlb1avjaa.c0.us-west3.gcp.weaviate.cloud"
12
- weaviate_api_key = "7VoeYTjkOS4aHINuhllGpH4JPgE2QquFmSMn"
 
 
13
 
 
14
  client = weaviate.connect_to_weaviate_cloud(
15
- cluster_url=weaviate_url,
16
- auth_credentials=Auth.api_key(weaviate_api_key),
 
17
  )
18
 
19
- cohere_client = cohere.Client("LEvCVeZkqZMW1aLYjxDqlstCzWi4Cvlt9PiysqT8")
20
-
21
- def load_pdf(file):
22
- reader = PyPDF2.PdfReader(file)
23
- text = ''
24
- for page in reader.pages:
25
- text += page.extract_text()
26
- return text
27
 
 
28
  tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
29
  model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
30
 
 
 
 
 
 
31
  def get_embeddings(text):
 
32
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
33
  with torch.no_grad():
34
  embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
35
  return embeddings
36
 
37
  def upload_document_chunks(chunks):
 
38
  doc_collection = client.collections.get("Document")
39
  for chunk in chunks:
40
  embedding = get_embeddings(chunk)
@@ -44,22 +48,25 @@ def upload_document_chunks(chunks):
44
  )
45
 
46
  def query_answer(query):
 
47
  query_embedding = get_embeddings(query)
48
- response = client.collections.get("Document").query.near_vector(
49
  near_vector=query_embedding.tolist(),
50
  limit=3
51
  )
52
- return response.objects
53
 
54
  def generate_response(context, query):
 
55
  response = cohere_client.generate(
56
  model='command',
57
- prompt=f"Context: {context}\n\nQuestion: {query}?\nAnswer:",
58
  max_tokens=100
59
  )
60
  return response.generations[0].text.strip()
61
 
62
  def qa_pipeline(pdf_file, query):
 
63
  document_text = load_pdf(pdf_file)
64
  document_chunks = [document_text[i:i+500] for i in range(0, len(document_text), 500)]
65
 
@@ -71,6 +78,7 @@ def qa_pipeline(pdf_file, query):
71
 
72
  return context, answer
73
 
 
74
  with gr.Blocks(theme="compact") as demo:
75
  gr.Markdown(
76
  """
@@ -133,4 +141,4 @@ with gr.Blocks(theme="compact") as demo:
133
  """
134
  )
135
 
136
- demo.launch(share = True)
 
1
+ import os
2
  import gradio as gr
3
  import PyPDF2
 
4
  import torch
5
  import weaviate
6
+ from transformers import AutoTokenizer, AutoModel
 
7
  from weaviate.classes.init import Auth
8
  import cohere
9
 
10
# Load credentials from environment variables.
# Secrets must never be written into this file: the values that were
# previously hard-coded here are public in the repository history and should
# be treated as compromised (rotate them).
def _require_env(name):
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or empty, so that a
            misconfigured deployment fails at start-up with a clear message
            rather than later with an opaque authentication error.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Missing required environment variable: {name}")
    return value


WEAVIATE_URL = _require_env("WEAVIATE_URL")
WEAVIATE_API_KEY = _require_env("WEAVIATE_API_KEY")
COHERE_API_KEY = _require_env("COHERE_API_KEY")

# Connect to Weaviate. The Cohere key is also sent as the
# "X-Cohere-Api-Key" request header on this connection.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
    headers={"X-Cohere-Api-Key": COHERE_API_KEY},
)

cohere_client = cohere.Client(COHERE_API_KEY)

# Load sentence-transformer model (tokenizer and encoder share one checkpoint).
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
27
 
28
def load_pdf(file):
    """Extract text from PDF file.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts; it is passed through
            unchanged.

    Returns:
        The text of every page concatenated in page order with no separator.
        Pages whose extraction yields an empty/falsy result are skipped.
    """
    reader = PyPDF2.PdfReader(file)
    # Extract each page exactly once: text extraction is the expensive step,
    # so the result is reused for both the emptiness check and the output.
    page_texts = (page.extract_text() for page in reader.pages)
    return ''.join(text for text in page_texts if text)
32
+
33
  def get_embeddings(text):
34
+ """Generate mean pooled embedding for the input text."""
35
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
36
  with torch.no_grad():
37
  embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
38
  return embeddings
39
 
40
  def upload_document_chunks(chunks):
41
+ """Insert document chunks into Weaviate collection with embeddings."""
42
  doc_collection = client.collections.get("Document")
43
  for chunk in chunks:
44
  embedding = get_embeddings(chunk)
 
48
  )
49
 
50
  def query_answer(query):
51
+ """Search for top relevant document chunks based on query embedding."""
52
  query_embedding = get_embeddings(query)
53
+ results = client.collections.get("Document").query.near_vector(
54
  near_vector=query_embedding.tolist(),
55
  limit=3
56
  )
57
+ return results.objects
58
 
59
  def generate_response(context, query):
60
+ """Generate answer using Cohere model based on context and query."""
61
  response = cohere_client.generate(
62
  model='command',
63
+ prompt=f"Context: {context}\n\nQuestion: {query}\nAnswer:",
64
  max_tokens=100
65
  )
66
  return response.generations[0].text.strip()
67
 
68
  def qa_pipeline(pdf_file, query):
69
+ """Main pipeline for QA: parse PDF, embed chunks, query Weaviate, and generate answer."""
70
  document_text = load_pdf(pdf_file)
71
  document_chunks = [document_text[i:i+500] for i in range(0, len(document_text), 500)]
72
 
 
78
 
79
  return context, answer
80
 
81
+ # Gradio UI
82
  with gr.Blocks(theme="compact") as demo:
83
  gr.Markdown(
84
  """
 
141
  """
142
  )
143
 
144
+ demo.launch()