cogcorp committed on
Commit
6b24be8
·
1 Parent(s): 7cac9bf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -89
app.py CHANGED
@@ -54,89 +54,6 @@ def text_to_chunks(texts, word_length=150, start_page=1):
54
  return chunks
55
 
56
 
57
- class SemanticSearch:
58
-
59
- def __init__(self):
60
- self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
61
- self.fitted = False
62
-
63
- def fit(self, data, batch=1000, n_neighbors=5):
64
- self.data = data
65
- self.embeddings = self.get_text_embedding(data, batch=batch)
66
- n_neighbors = min(n_neighbors, len(self.embeddings))
67
- self.nn = NearestNeighbors(n_neighbors=n_neighbors)
68
- self.nn.fit(self.embeddings)
69
- self.fitted = True
70
-
71
- def __call__(self, text, return_data=True):
72
- inp_emb = self.use([text])
73
- neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
74
-
75
- if return_data:
76
- return [self.data[i] for i in neighbors]
77
- else:
78
- return neighbors
79
-
80
- def get_text_embedding(self, texts, batch=1000):
81
- embeddings = []
82
- for i in range(0, len(texts), batch):
83
- text_batch = texts[i:(i+batch
84
- import urllib.request
85
- import fitz
86
- import re
87
- import numpy as np
88
- import tensorflow_hub as hub
89
- import openai
90
- import gradio as gr
91
- import os
92
- from sklearn.neighbors import NearestNeighbors
93
-
94
- def download_pdf(url, output_path):
95
- urllib.request.urlretrieve(url, output_path)
96
-
97
-
98
- def preprocess(text):
99
- text = text.replace('\n', ' ')
100
- text = re.sub('\s+', ' ', text)
101
- return text
102
-
103
-
104
- def pdf_to_text(path, start_page=1, end_page=None):
105
- doc = fitz.open(path)
106
- total_pages = doc.page_count
107
-
108
- if end_page is None:
109
- end_page = total_pages
110
-
111
- text_list = []
112
-
113
- for i in range(start_page-1, end_page):
114
- text = doc.load_page(i).get_text("text")
115
- text = preprocess(text)
116
- text_list.append(text)
117
-
118
- doc.close()
119
- return text_list
120
-
121
-
122
- def text_to_chunks(texts, word_length=150, start_page=1):
123
- text_toks = [t.split(' ') for t in texts]
124
- page_nums = []
125
- chunks = []
126
-
127
- for idx, words in enumerate(text_toks):
128
- for i in range(0, len(words), word_length):
129
- chunk = words[i:i+word_length]
130
- if (i+word_length) > len(words) and (len(chunk) < word_length) and (
131
- len(text_toks) != (idx+1)):
132
- text_toks[idx+1] = chunk + text_toks[idx+1]
133
- continue
134
- chunk = ' '.join(chunk).strip()
135
- chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
136
- chunks.append(chunk)
137
- return chunks
138
-
139
-
140
  class SemanticSearch:
141
 
142
  def __init__(self):
@@ -178,7 +95,7 @@ def load_recommender(path, start_page=1):
178
  return 'Corpus Loaded.'
179
 
180
 
181
- def generate_text(openAI_key, prompt, engine="text-davinci-003"):
182
  openai.api_key = openAI_key
183
  completions = openai.Completion.create(
184
  engine=engine,
@@ -205,11 +122,11 @@ def generate_answer(question, openAI_key):
205
  "with the same name, create separate answers for each. Only include information found in the results and "\
206
  "don't add any additional information. Make sure the answer is correct and don't output false content. "\
207
  "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "\
208
- "search results which has nothing to do with the question. Only answer what is asked. The "\
209
  "answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
210
 
211
  prompt += f"Query: {question}\nAnswer:"
212
- answer = generate_text(openAI_key, prompt, "text-davinci-003")
213
  return answer
214
 
215
 
@@ -245,8 +162,7 @@ recommender = SemanticSearch()
245
  title = 'PDF GPT'
246
  description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
247
 
248
- with gr.Interface(fn=question_answer, inputs
249
- =[url, file, question, openAI_key], outputs=[answer], title=title, description=description) as iface:
250
- iface.launch()
251
 
252
 
 
54
  return chunks
55
 
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  class SemanticSearch:
58
 
59
  def __init__(self):
 
95
  return 'Corpus Loaded.'
96
 
97
 
98
+ def generate_text(openAI_key, prompt, engine="davinci"):
99
  openai.api_key = openAI_key
100
  completions = openai.Completion.create(
101
  engine=engine,
 
122
  "with the same name, create separate answers for each. Only include information found in the results and "\
123
  "don't add any additional information. Make sure the answer is correct and don't output false content. "\
124
  "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "\
125
+ "search results which have nothing to do with the question. Only answer what is asked. The "\
126
  "answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
127
 
128
  prompt += f"Query: {question}\nAnswer:"
129
+ answer = generate_text(openAI_key, prompt, "davinci")
130
  return answer
131
 
132
 
 
162
  title = 'PDF GPT'
163
  description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
164
 
165
+ with gr.Interface(fn=question_answer, inputs=[url, file, question, openAI_key], outputs=[answer], title=title, description=description) as iface:
166
+ iface.launch()
 
167
 
168