gaur3009 committed on
Commit
40781f0
·
verified ·
1 Parent(s): 5a750b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -9
app.py CHANGED
@@ -5,6 +5,7 @@ import torch
5
  import weaviate
6
  import cohere
7
 
 
8
  auth_config = weaviate.AuthApiKey(api_key="16LRz5YwOtnq8ov51Lhg1UuAollpsMgspulV")
9
  client = weaviate.Client(
10
  url="https://wkoll9rds3orbu9fhzfr2a.c0.asia-southeast1.gcp.weaviate.cloud",
@@ -12,6 +13,7 @@ client = weaviate.Client(
12
  )
13
  cohere_client = cohere.Client("LEvCVeZkqZMW1aLYjxDqlstCzWi4Cvlt9PiysqT8")
14
 
 
15
  def load_pdf(file):
16
  reader = PyPDF2.PdfReader(file)
17
  text = ''
@@ -19,15 +21,18 @@ def load_pdf(file):
19
  text += reader.pages[page].extract_text()
20
  return text
21
 
 
22
  tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
23
  model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
24
 
 
25
  def get_embeddings(text):
26
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
27
  with torch.no_grad():
28
  embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
29
  return embeddings
30
 
 
31
  def upload_document_chunks(chunks):
32
  for idx, chunk in enumerate(chunks):
33
  embedding = get_embeddings(chunk)
@@ -37,6 +42,7 @@ def upload_document_chunks(chunks):
37
  vector=embedding.tolist()
38
  )
39
 
 
40
  def query_answer(query):
41
  query_embedding = get_embeddings(query)
42
  result = client.query.get("Document", ["content"])\
@@ -45,6 +51,7 @@ def query_answer(query):
45
  .do()
46
  return result
47
 
 
48
  def generate_response(context, query):
49
  response = cohere_client.generate(
50
  model='command',
@@ -53,32 +60,90 @@ def generate_response(context, query):
53
  )
54
  return response.generations[0].text.strip()
55
 
 
56
  def qa_pipeline(pdf_file, query):
57
  document_text = load_pdf(pdf_file)
58
  document_chunks = [document_text[i:i+500] for i in range(0, len(document_text), 500)]
59
 
 
60
  upload_document_chunks(document_chunks)
61
 
 
62
  response = query_answer(query)
63
  context = ' '.join([doc['content'] for doc in response['data']['Get']['Document']])
64
 
 
65
  answer = generate_response(context, query)
66
 
67
  return context, answer
68
 
69
with gr.Blocks() as demo:
    # Page title.
    gr.Markdown("# Interactive QA Bot")

    # Inputs: the PDF to process and the question to ask about it.
    pdf_input = gr.File(label="Upload a PDF file", file_types=[".pdf"])
    query_input = gr.Textbox(label="Ask a question")

    # Outputs: the retrieved segments and the generated answer.
    doc_segments_output = gr.Textbox(label="Retrieved Document Segments")
    answer_output = gr.Textbox(label="Answer")

    # Clicking the button runs the whole pipeline on the current inputs.
    submit_button = gr.Button("Submit")
    submit_button.click(
        qa_pipeline,
        inputs=[pdf_input, query_input],
        outputs=[doc_segments_output, answer_output],
    )

demo.launch()
 
5
  import weaviate
6
  import cohere
7
 
8
+ # Initialize Weaviate and Cohere clients
9
  auth_config = weaviate.AuthApiKey(api_key="16LRz5YwOtnq8ov51Lhg1UuAollpsMgspulV")
10
  client = weaviate.Client(
11
  url="https://wkoll9rds3orbu9fhzfr2a.c0.asia-southeast1.gcp.weaviate.cloud",
 
13
  )
14
  cohere_client = cohere.Client("LEvCVeZkqZMW1aLYjxDqlstCzWi4Cvlt9PiysqT8")
15
 
16
+ # Function to extract text from uploaded PDF
17
  def load_pdf(file):
18
  reader = PyPDF2.PdfReader(file)
19
  text = ''
 
21
  text += reader.pages[page].extract_text()
22
  return text
23
 
24
+ # Initialize transformer model and tokenizer
25
  tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
26
  model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
27
 
28
+ # Function to get embeddings for text
29
  def get_embeddings(text):
30
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
31
  with torch.no_grad():
32
  embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
33
  return embeddings
34
 
35
+ # Upload document chunks to Weaviate
36
  def upload_document_chunks(chunks):
37
  for idx, chunk in enumerate(chunks):
38
  embedding = get_embeddings(chunk)
 
42
  vector=embedding.tolist()
43
  )
44
 
45
+ # Query Weaviate for relevant document chunks
46
  def query_answer(query):
47
  query_embedding = get_embeddings(query)
48
  result = client.query.get("Document", ["content"])\
 
51
  .do()
52
  return result
53
 
54
+ # Generate answer using Cohere
55
  def generate_response(context, query):
56
  response = cohere_client.generate(
57
  model='command',
 
60
  )
61
  return response.generations[0].text.strip()
62
 
63
+ # Function to handle the full pipeline: uploading PDF, generating embeddings, answering queries
64
  def qa_pipeline(pdf_file, query):
65
  document_text = load_pdf(pdf_file)
66
  document_chunks = [document_text[i:i+500] for i in range(0, len(document_text), 500)]
67
 
68
+ # Upload document chunks to Weaviate
69
  upload_document_chunks(document_chunks)
70
 
71
+ # Query Weaviate for document segments related to the query
72
  response = query_answer(query)
73
  context = ' '.join([doc['content'] for doc in response['data']['Get']['Document']])
74
 
75
+ # Generate response from the retrieved context
76
  answer = generate_response(context, query)
77
 
78
  return context, answer
79
 
80
+ # Define Gradio interface with enhanced UI
81
# Custom styling for the page. It is handed to ``gr.Blocks`` through its
# ``css`` argument rather than embedded as a ``<style>`` tag in a Markdown
# component, because ``css`` is the documented hook for app-wide CSS.
_PAGE_CSS = """
body {
    background-color: #EDF2F7;
}
input[type="file"] {
    background-color: #3182CE;
    color: white;
    padding: 8px;
    border-radius: 5px;
}
button {
    background-color: #3182CE;
    color: white;
    padding: 10px;
    font-size: 16px;
    border-radius: 5px;
    cursor: pointer;
}
button:hover {
    background-color: #2B6CB0;
}
textarea {
    border: 2px solid #CBD5E0;
    border-radius: 8px;
    padding: 10px;
    background-color: #FAFAFA;
}
"""

# Header banner shown at the top of the page. Kept flush-left so no line can be
# interpreted as an indented Markdown code block.
# NOTE(review): the emoji here and in the labels below were mis-decoded in the
# reviewed copy; the folder and magnifying-glass glyphs are best-guess
# reconstructions -- confirm against the original file.
_HEADER_HTML = """
<div style="text-align: center; font-size: 28px; font-weight: bold; margin-bottom: 20px; color: #2D3748;">
📄 Interactive QA Bot 🔍
</div>
<p style="text-align: center; font-size: 16px; color: #4A5568;">
Upload a PDF document, ask questions, and receive answers based on the document content.<br>
Powered by <b>Weaviate</b> for document retrieval and <b>Cohere</b> for generating answers.
</p>
<hr style="border: 1px solid #CBD5E0; margin: 20px 0;">
"""

# NOTE(review): theme="compact" is kept from the original; confirm that the
# installed Gradio version recognises this theme name.
with gr.Blocks(theme="compact", css=_PAGE_CSS) as demo:
    gr.Markdown(_HEADER_HTML)

    with gr.Row():
        # Left column: the inputs and the submit button.
        with gr.Column(scale=1):
            pdf_input = gr.File(label="📁 Upload PDF", file_types=[".pdf"], show_label=True)
            query_input = gr.Textbox(
                label="❓ Ask a Question",
                placeholder="Enter your question here...",
                lines=1,
            )
            submit_button = gr.Button("🔍 Submit")

        # Right column (twice as wide): retrieved segments and the answer.
        with gr.Column(scale=2):
            doc_segments_output = gr.Textbox(
                label="📜 Retrieved Document Segments",
                placeholder="Document segments will be displayed here...",
                lines=10,
            )
            answer_output = gr.Textbox(
                label="💬 Answer",
                placeholder="The answer will appear here...",
                lines=3,
            )

    # Clicking the button runs the whole pipeline on the current inputs and
    # fills both output boxes from its (context, answer) return value.
    submit_button.click(
        qa_pipeline,
        inputs=[pdf_input, query_input],
        outputs=[doc_segments_output, answer_output],
    )

# Launch the Gradio interface
demo.launch()