Update app.py
app.py CHANGED
@@ -2,21 +2,23 @@ import gradio as gr
 import PyPDF2
 from transformers import AutoTokenizer, AutoModel
 import torch
-import
+from weaviate import WeaviateClient
+from weaviate.auth import AuthApiKey
 import cohere
 
-
-client =
-
-
+auth = AuthApiKey(api_key="7VoeYTjkOS4aHINuhllGpH4JPgE2QquFmSMn")
+client = WeaviateClient(
+    url="https://vgwhgmrlqrqqgnlb1avjaa.c0.us-west3.gcp.weaviate.cloud",
+    auth_client=auth
 )
+
 cohere_client = cohere.Client("LEvCVeZkqZMW1aLYjxDqlstCzWi4Cvlt9PiysqT8")
 
 def load_pdf(file):
     reader = PyPDF2.PdfReader(file)
     text = ''
-    for page in
-        text +=
+    for page in reader.pages:
+        text += page.extract_text()
     return text
 
 tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
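Note on the connection block above: with the 4.x Python client, Weaviate Cloud connections are usually opened through a connection helper rather than by constructing WeaviateClient directly. A minimal sketch, assuming weaviate-client >= 4.6 and with the cluster URL and API key read from environment variables instead of hard-coded strings (not part of this commit):

    # Sketch only: v4-style Weaviate Cloud connection (assumes weaviate-client >= 4.6).
    # Reads the cluster URL and API key from the environment rather than hard-coding them.
    import os
    import weaviate
    from weaviate.auth import AuthApiKey

    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=os.environ["WEAVIATE_URL"],
        auth_credentials=AuthApiKey(os.environ["WEAVIATE_API_KEY"]),
    )
    # ... use client.collections exactly as in the hunks below ...
    client.close()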
@@ -29,21 +31,21 @@ def get_embeddings(text):
     return embeddings
 
 def upload_document_chunks(chunks):
-
+    doc_collection = client.collections.get("Document")
+    for chunk in chunks:
         embedding = get_embeddings(chunk)
-
-            {"content": chunk},
-            "Document",
+        doc_collection.data.insert(
+            properties={"content": chunk},
             vector=embedding.tolist()
         )
 
 def query_answer(query):
     query_embedding = get_embeddings(query)
-
-
-
-
-    return
+    response = client.collections.get("Document").query.near_vector(
+        near_vector=query_embedding.tolist(),
+        limit=3
+    )
+    return response.objects
 
 def generate_response(context, query):
     response = cohere_client.generate(
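The new upload_document_chunks and query_answer both fetch a "Document" collection, so the cluster must already contain it. A one-time bootstrap sketch with the v4 collections API (my assumption: weaviate-client 4.x, with vectors supplied client-side by the MiniLM embeddings as in the diff):

    # Hypothetical one-time setup for the "Document" collection used above.
    # Assumes the `client` object from app.py and weaviate-client 4.x.
    from weaviate.classes.config import Configure, DataType, Property

    if not client.collections.exists("Document"):
        client.collections.create(
            name="Document",
            properties=[Property(name="content", data_type=DataType.TEXT)],
            vectorizer_config=Configure.Vectorizer.none(),  # embeddings are provided client-side
        )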
@@ -58,10 +60,9 @@ def qa_pipeline(pdf_file, query):
     document_chunks = [document_text[i:i+500] for i in range(0, len(document_text), 500)]
 
     upload_document_chunks(document_chunks)
+    top_docs = query_answer(query)
 
-
-    context = ' '.join([doc['content'] for doc in response['data']['Get']['Document']])
-
+    context = ' '.join([doc.properties['content'] for doc in top_docs])
     answer = generate_response(context, query)
 
     return context, answer
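For reference, the chunking in qa_pipeline is a fixed 500-character window with no overlap; the same expression run on a short string (window shortened here purely for illustration):

    # Fixed-size character chunking, as used in qa_pipeline (window shortened for illustration).
    document_text = "Weaviate stores one vector per chunk; Cohere generates the final answer."
    chunk_size = 20  # qa_pipeline uses 500

    document_chunks = [document_text[i:i + chunk_size]
                       for i in range(0, len(document_text), chunk_size)]

    print(document_chunks)
    # ['Weaviate stores one ', 'vector per chunk; Co', 'here generates the f', 'inal answer.']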
@@ -81,17 +82,13 @@ with gr.Blocks(theme="compact") as demo:
 
     with gr.Row():
         with gr.Column(scale=1):
-            pdf_input = gr.File(label="📄 Upload PDF", file_types=[".pdf"]
-            query_input = gr.Textbox(
-                label="❓ Ask a Question",
-                placeholder="Enter your question here...",
-                lines=1
-            )
+            pdf_input = gr.File(label="📄 Upload PDF", file_types=[".pdf"])
+            query_input = gr.Textbox(label="❓ Ask a Question", placeholder="Enter your question here...")
             submit_button = gr.Button("🔍 Submit")
 
         with gr.Column(scale=2):
-            doc_segments_output = gr.Textbox(label="📑 Retrieved Document Segments",
-            answer_output = gr.Textbox(label="💬 Answer",
+            doc_segments_output = gr.Textbox(label="📑 Retrieved Document Segments", lines=10)
+            answer_output = gr.Textbox(label="💬 Answer", lines=3)
 
     submit_button.click(
         qa_pipeline,
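The hunk ends just inside submit_button.click(...); the remaining arguments are not shown in this diff. With the components defined above, the wiring would typically look like the sketch below (a hypothetical completion, not taken from the commit):

    # Hypothetical completion of the click wiring; the diff is cut off after `qa_pipeline,`.
    submit_button.click(
        qa_pipeline,
        inputs=[pdf_input, query_input],
        outputs=[doc_segments_output, answer_output],
    )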