gaur3009 committed
Commit c50dda1 · verified · 1 Parent(s): a5762bc

Update app.py

Files changed (1): app.py +24 -27
app.py CHANGED
@@ -2,21 +2,23 @@ import gradio as gr
 import PyPDF2
 from transformers import AutoTokenizer, AutoModel
 import torch
-import weaviate
+from weaviate import WeaviateClient
+from weaviate.auth import AuthApiKey
 import cohere
 
-auth_config = weaviate.AuthApiKey(api_key="7VoeYTjkOS4aHINuhllGpH4JPgE2QquFmSMn")
-client = weaviate.Client(
-    "https://vgwhgmrlqrqqgnlb1avjaa.c0.us-west3.gcp.weaviate.cloud",
-    auth_client_secret=auth_config
+auth = AuthApiKey(api_key="7VoeYTjkOS4aHINuhllGpH4JPgE2QquFmSMn")
+client = WeaviateClient(
+    url="https://vgwhgmrlqrqqgnlb1avjaa.c0.us-west3.gcp.weaviate.cloud",
+    auth_client=auth
 )
+
 cohere_client = cohere.Client("LEvCVeZkqZMW1aLYjxDqlstCzWi4Cvlt9PiysqT8")
 
 def load_pdf(file):
     reader = PyPDF2.PdfReader(file)
     text = ''
-    for page in range(len(reader.pages)):
-        text += reader.pages[page].extract_text()
+    for page in reader.pages:
+        text += page.extract_text()
     return text
 
 tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
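
For reference, the new imports target the v4 weaviate-client package. A minimal sketch of an equivalent connection using that package's connect_to_weaviate_cloud helper is below; reading the API key from an environment variable is an assumption for illustration, not something this commit does.

import os
import weaviate
from weaviate.auth import AuthApiKey

# Sketch: connect to the Weaviate Cloud cluster referenced above (v4 client).
# WEAVIATE_API_KEY is an assumed environment variable, not part of this commit.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url="https://vgwhgmrlqrqqgnlb1avjaa.c0.us-west3.gcp.weaviate.cloud",
    auth_credentials=AuthApiKey(os.environ["WEAVIATE_API_KEY"]),
)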
@@ -29,21 +31,21 @@ def get_embeddings(text):
     return embeddings
 
 def upload_document_chunks(chunks):
-    for idx, chunk in enumerate(chunks):
+    doc_collection = client.collections.get("Document")
+    for chunk in chunks:
         embedding = get_embeddings(chunk)
-        client.data_object.create(
-            {"content": chunk},
-            "Document",
+        doc_collection.data.insert(
+            properties={"content": chunk},
             vector=embedding.tolist()
         )
 
 def query_answer(query):
     query_embedding = get_embeddings(query)
-    result = client.query.get("Document", ["content"])\
-        .with_near_vector({"vector": query_embedding.tolist()})\
-        .with_limit(3)\
-        .do()
-    return result
+    response = client.collections.get("Document").query.near_vector(
+        near_vector=query_embedding.tolist(),
+        limit=3
+    )
+    return response.objects
 
 def generate_response(context, query):
     response = cohere_client.generate(
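
The body of get_embeddings sits outside these hunks. With sentence-transformers/all-MiniLM-L6-v2, a typical mean-pooling implementation looks like the sketch below; it assumes a model loaded next to the tokenizer via AutoModel.from_pretrained and is an illustration rather than the exact code in app.py.

import torch

def get_embeddings(text):
    # Tokenize and encode without tracking gradients.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool the token embeddings into one 384-dimensional vector per input.
    return outputs.last_hidden_state.mean(dim=1).squeeze(0)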
@@ -58,10 +60,9 @@ def qa_pipeline(pdf_file, query):
     document_chunks = [document_text[i:i+500] for i in range(0, len(document_text), 500)]
 
     upload_document_chunks(document_chunks)
+    top_docs = query_answer(query)
 
-    response = query_answer(query)
-    context = ' '.join([doc['content'] for doc in response['data']['Get']['Document']])
-
+    context = ' '.join([doc.properties['content'] for doc in top_docs])
     answer = generate_response(context, query)
 
     return context, answer
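
generate_response is only partially visible in the diff. A hedged sketch of a typical Cohere call for this retrieve-then-generate flow follows; the model name, prompt wording, and max_tokens are assumptions, not values taken from app.py.

def generate_response(context, query):
    # Ask Cohere to answer the question using only the retrieved context.
    response = cohere_client.generate(
        model="command",  # assumed model name
        prompt=f"Context: {context}\n\nQuestion: {query}\nAnswer:",
        max_tokens=200,
    )
    return response.generations[0].text.strip()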
@@ -81,17 +82,13 @@ with gr.Blocks(theme="compact") as demo:
 
     with gr.Row():
         with gr.Column(scale=1):
-            pdf_input = gr.File(label="📁 Upload PDF", file_types=[".pdf"], show_label=True)
-            query_input = gr.Textbox(
-                label="❓ Ask a Question",
-                placeholder="Enter your question here...",
-                lines=1
-            )
+            pdf_input = gr.File(label="📁 Upload PDF", file_types=[".pdf"])
+            query_input = gr.Textbox(label="❓ Ask a Question", placeholder="Enter your question here...")
             submit_button = gr.Button("🔍 Submit")
 
         with gr.Column(scale=2):
-            doc_segments_output = gr.Textbox(label="📜 Retrieved Document Segments", placeholder="Document segments will be displayed here...", lines=10)
-            answer_output = gr.Textbox(label="💬 Answer", placeholder="The answer will appear here...", lines=3)
+            doc_segments_output = gr.Textbox(label="📜 Retrieved Document Segments", lines=10)
+            answer_output = gr.Textbox(label="💬 Answer", lines=3)
 
     submit_button.click(
         qa_pipeline,
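
The hunk ends inside the submit_button.click(...) call; the remaining arguments are unchanged context not shown here. For orientation, a typical wiring for this layout would look like the sketch below (the actual inputs and outputs lists in app.py are not visible in this diff).

# Hypothetical event wiring implied by the components above.
submit_button.click(
    qa_pipeline,
    inputs=[pdf_input, query_input],
    outputs=[doc_segments_output, answer_output],
)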
 