gaur3009 committed on
Commit
eb15225
·
verified ·
1 Parent(s): 40781f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -13
app.py CHANGED
@@ -5,7 +5,6 @@ import torch
5
  import weaviate
6
  import cohere
7
 
8
- # Initialize Weaviate and Cohere clients
9
  auth_config = weaviate.AuthApiKey(api_key="16LRz5YwOtnq8ov51Lhg1UuAollpsMgspulV")
10
  client = weaviate.Client(
11
  url="https://wkoll9rds3orbu9fhzfr2a.c0.asia-southeast1.gcp.weaviate.cloud",
@@ -13,7 +12,6 @@ client = weaviate.Client(
13
  )
14
  cohere_client = cohere.Client("LEvCVeZkqZMW1aLYjxDqlstCzWi4Cvlt9PiysqT8")
15
 
16
- # Function to extract text from uploaded PDF
17
  def load_pdf(file):
18
  reader = PyPDF2.PdfReader(file)
19
  text = ''
@@ -21,18 +19,15 @@ def load_pdf(file):
21
  text += reader.pages[page].extract_text()
22
  return text
23
 
24
- # Initialize transformer model and tokenizer
25
  tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
26
  model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
27
 
28
- # Function to get embeddings for text
29
  def get_embeddings(text):
30
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
31
  with torch.no_grad():
32
  embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
33
  return embeddings
34
 
35
- # Upload document chunks to Weaviate
36
  def upload_document_chunks(chunks):
37
  for idx, chunk in enumerate(chunks):
38
  embedding = get_embeddings(chunk)
@@ -42,7 +37,6 @@ def upload_document_chunks(chunks):
42
  vector=embedding.tolist()
43
  )
44
 
45
- # Query Weaviate for relevant document chunks
46
  def query_answer(query):
47
  query_embedding = get_embeddings(query)
48
  result = client.query.get("Document", ["content"])\
@@ -51,7 +45,6 @@ def query_answer(query):
51
  .do()
52
  return result
53
 
54
- # Generate answer using Cohere
55
  def generate_response(context, query):
56
  response = cohere_client.generate(
57
  model='command',
@@ -60,24 +53,19 @@ def generate_response(context, query):
60
  )
61
  return response.generations[0].text.strip()
62
 
63
- # Function to handle the full pipeline: uploading PDF, generating embeddings, answering queries
64
  def qa_pipeline(pdf_file, query):
65
  document_text = load_pdf(pdf_file)
66
  document_chunks = [document_text[i:i+500] for i in range(0, len(document_text), 500)]
67
 
68
- # Upload document chunks to Weaviate
69
  upload_document_chunks(document_chunks)
70
 
71
- # Query Weaviate for document segments related to the query
72
  response = query_answer(query)
73
  context = ' '.join([doc['content'] for doc in response['data']['Get']['Document']])
74
 
75
- # Generate response from the retrieved context
76
  answer = generate_response(context, query)
77
 
78
  return context, answer
79
 
80
- # Define Gradio interface with enhanced UI
81
  with gr.Blocks(theme="compact") as demo:
82
  gr.Markdown(
83
  """
@@ -145,5 +133,4 @@ with gr.Blocks(theme="compact") as demo:
145
  """
146
  )
147
 
148
- # Launch the Gradio interface
149
  demo.launch()
 
5
  import weaviate
6
  import cohere
7
 
 
8
  auth_config = weaviate.AuthApiKey(api_key="16LRz5YwOtnq8ov51Lhg1UuAollpsMgspulV")
9
  client = weaviate.Client(
10
  url="https://wkoll9rds3orbu9fhzfr2a.c0.asia-southeast1.gcp.weaviate.cloud",
 
12
  )
13
  cohere_client = cohere.Client("LEvCVeZkqZMW1aLYjxDqlstCzWi4Cvlt9PiysqT8")
14
 
 
15
  def load_pdf(file):
16
  reader = PyPDF2.PdfReader(file)
17
  text = ''
 
19
  text += reader.pages[page].extract_text()
20
  return text
21
 
 
22
  tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
23
  model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
24
 
 
25
  def get_embeddings(text):
26
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
27
  with torch.no_grad():
28
  embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
29
  return embeddings
30
 
 
31
  def upload_document_chunks(chunks):
32
  for idx, chunk in enumerate(chunks):
33
  embedding = get_embeddings(chunk)
 
37
  vector=embedding.tolist()
38
  )
39
 
 
40
  def query_answer(query):
41
  query_embedding = get_embeddings(query)
42
  result = client.query.get("Document", ["content"])\
 
45
  .do()
46
  return result
47
 
 
48
  def generate_response(context, query):
49
  response = cohere_client.generate(
50
  model='command',
 
53
  )
54
  return response.generations[0].text.strip()
55
 
 
56
  def qa_pipeline(pdf_file, query):
57
  document_text = load_pdf(pdf_file)
58
  document_chunks = [document_text[i:i+500] for i in range(0, len(document_text), 500)]
59
 
 
60
  upload_document_chunks(document_chunks)
61
 
 
62
  response = query_answer(query)
63
  context = ' '.join([doc['content'] for doc in response['data']['Get']['Document']])
64
 
 
65
  answer = generate_response(context, query)
66
 
67
  return context, answer
68
 
 
69
  with gr.Blocks(theme="compact") as demo:
70
  gr.Markdown(
71
  """
 
133
  """
134
  )
135
 
 
136
  demo.launch()