cogcorp committed on
Commit
6e868cd
·
1 Parent(s): 5c2bb8b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -158
app.py CHANGED
@@ -1,171 +1,94 @@
1
- import urllib.request
2
- import fitz
3
- import re
4
- import numpy as np
5
- import tensorflow_hub as hub
6
- import openai
7
- import gradio as gr
8
  import os
9
  import zipfile
10
- from sklearn.neighbors import NearestNeighbors
11
-
12
# The OpenAI key is taken from the 'OpenAPI' environment variable; the lookup
# yields None when the variable is not set.
openai.api_key = os.environ.get('OpenAPI')
13
-
14
def download_pdf(url, output_path):
    """Download the resource at *url* and save it to *output_path*.

    Args:
        url: Address of the PDF. Only ``http`` and ``https`` are accepted.
        output_path: Local file path the download is written to.

    Raises:
        ValueError: If *url* does not use the http or https scheme.
    """
    from urllib.parse import urlparse

    # The URL is typed in by the end user. urlretrieve also understands
    # schemes such as file://, which would let a caller copy local files
    # into the corpus, so anything that is not plain web traffic is rejected.
    scheme = urlparse(url).scheme.lower()
    if scheme not in ('http', 'https'):
        raise ValueError(f'Unsupported URL scheme for PDF download: {url!r}')
    urllib.request.urlretrieve(url, output_path)
16
-
17
def extract_zip(file):
    """Copy every ``.pdf`` member of a zip archive into the flat ``pdfs`` folder.

    Members are written as ``pdfs/<basename>``, dropping any directory
    structure inside the archive, so that a plain directory listing of
    ``pdfs`` finds every extracted document. Two members that share a
    basename overwrite each other (the later one wins).

    Args:
        file: Path (or file-like object) of the zip archive.
    """
    import shutil

    # Create the folder up front so it exists even when the archive
    # contains no PDF at all.
    os.makedirs('pdfs', exist_ok=True)
    with zipfile.ZipFile(file, 'r') as zip_ref:
        for member in zip_ref.namelist():
            filename = os.path.basename(member)
            # Directory entries have an empty basename and are skipped here too.
            if not filename.endswith('.pdf'):
                continue
            target = os.path.join('pdfs', filename)
            with zip_ref.open(member) as src, open(target, 'wb') as dst:
                shutil.copyfileobj(src, dst)
23
-
24
def preprocess(text):
    """Collapse every run of whitespace in *text* into a single space.

    Newlines count as whitespace, so they are folded in the same pass.
    Leading and trailing whitespace is reduced to one space, not removed.

    Args:
        text: Raw text, e.g. as extracted from a PDF page.

    Returns:
        The text with whitespace runs replaced by single spaces.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'\s+', ' ', text)
28
-
29
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the preprocessed text of a range of pages from a PDF.

    Args:
        path: Path of the PDF file, opened with ``fitz.open``.
        start_page: 1-based number of the first page to read.
        end_page: 1-based number of the last page to read (inclusive).
            ``None`` means "through the last page". Values beyond the end
            of the document are clamped to the page count.

    Returns:
        A list with one preprocessed text string per page read.
    """
    doc = fitz.open(path)
    try:
        total_pages = doc.page_count

        if end_page is None:
            end_page = total_pages
        # Clamp the range to pages that exist so an oversized end_page or a
        # start_page below 1 cannot address a page outside the document.
        first_index = max(start_page - 1, 0)
        last_index = min(end_page, total_pages)

        text_list = []
        for i in range(first_index, last_index):
            text = doc.load_page(i).get_text("text")
            text_list.append(preprocess(text))
        return text_list
    finally:
        # Close the document even when a page fails to load.
        doc.close()
45
-
46
def text_to_chunks(texts, word_length=150, start_page=1):
    """Split per-page texts into labelled chunks of at most *word_length* words.

    Each page is split on single spaces. A short trailing piece of a page
    (fewer than *word_length* words) is not emitted on its own unless the
    page is the last one: it is prepended to the next page's words instead
    and is therefore labelled with that next page's number.

    Args:
        texts: List of page texts, in page order.
        word_length: Maximum number of space-separated words per chunk.
        start_page: Page number assigned to ``texts[0]``.

    Returns:
        A list of strings of the form ``[Page no. N] "chunk text"``.
    """
    pages = [page.split(' ') for page in texts]
    final_page = len(pages) - 1
    chunks = []

    for page_idx, tokens in enumerate(pages):
        for offset in range(0, len(tokens), word_length):
            piece = tokens[offset:offset + word_length]
            short_tail = (offset + word_length > len(tokens)
                          and len(piece) < word_length)
            if short_tail and page_idx != final_page:
                # Carry the leftover words over to the following page.
                pages[page_idx + 1] = piece + pages[page_idx + 1]
                continue
            body = ' '.join(piece).strip()
            chunks.append(f'[Page no. {page_idx + start_page}] "{body}"')
    return chunks
61
-
62
class SemanticSearch:
    """Nearest-neighbour lookup over text chunks using sentence embeddings.

    The encoder is loaded from TF Hub in ``__init__``; ``fit`` embeds a
    corpus and builds a ``NearestNeighbors`` index over it; calling the
    instance with a query returns the closest corpus entries.
    """

    def __init__(self):
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=15):
        """Embed *data* and index it for neighbour queries.

        Args:
            data: Sequence of text chunks.
            batch: Number of chunks embedded per encoder call.
            n_neighbors: Number of neighbours returned per query; capped at
                the corpus size.

        Raises:
            ValueError: If *data* is empty.
        """
        # Fail early with a clear message instead of an error from deep
        # inside np.vstack when there is nothing to embed.
        if len(data) == 0:
            raise ValueError('Cannot fit SemanticSearch on an empty corpus.')
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the corpus entries nearest to *text*.

        Args:
            text: Query string.
            return_data: When true, return the matching chunks; otherwise
                return their indices into the fitted corpus.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if not getattr(self, 'fitted', False):
            raise RuntimeError('SemanticSearch must be fitted before it is queried.')
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed *texts* in slices of *batch* items and stack the results row-wise."""
        embeddings = [
            self.use(texts[i:i + batch])
            for i in range(0, len(texts), batch)
        ]
        return np.vstack(embeddings)
93
-
94
# Single shared search index, created when the module is loaded.
recommender = SemanticSearch()


def load_recommender(paths, start_page=1):
    """Rebuild the shared search index from the PDFs listed in *paths*.

    Entries that do not end in ``.pdf`` are ignored.

    Args:
        paths: File paths to consider.
        start_page: First page to read from each PDF, also used as the
            page number of the first chunk label.

    Returns:
        The status string ``'Corpus Loaded.'``.
    """
    global recommender
    corpus = []
    for pdf_path in (p for p in paths if p.endswith('.pdf')):
        pages = pdf_to_text(pdf_path, start_page=start_page)
        corpus.extend(text_to_chunks(pages, start_page=start_page))
    recommender.fit(corpus)
    return 'Corpus Loaded.'
105
-
106
def generate_text(messages, engine='gpt-3.5-turbo', max_tokens=2048, temperature=0.8):
    """Send a chat completion request and return the reply text.

    Args:
        messages: Either a plain prompt string, which is sent as the user
            message after a fixed system message, or an already-built list
            of chat message dicts, which is sent as-is.
        engine: Name of the chat model.
        max_tokens: Upper bound on the length of the generated reply.
        temperature: Sampling temperature.

    Returns:
        The content of the first choice in the response.
    """
    # The request previously referenced an undefined name instead of the
    # `messages` argument, so every call failed with a NameError.
    if isinstance(messages, str):
        chat_messages = [
            {"role": "system", "content": "You are a research assistant"},
            {"role": "user", "content": messages},
        ]
    else:
        chat_messages = list(messages)

    response = openai.ChatCompletion.create(
        model=engine,
        messages=chat_messages,
        max_tokens=max_tokens,
        n=1,
        temperature=temperature,
    )
    return response.choices[0].message['content']
116
-
117
-
118
def generate_answer(question):
    """Answer *question* using the chunks the search index ranks closest to it.

    The prompt is a preamble line, the user's question, and one
    ``Assistant:`` line per retrieved chunk; it is passed to
    ``generate_text`` and the reply is returned unchanged.
    """
    topn_chunks = recommender(question)

    prompt_lines = [
        "You are a helpful assistant.\n",
        "User: " + question + "\n",
    ]
    prompt_lines.extend("Assistant: " + c + "\n" for c in topn_chunks)

    return generate_text("".join(prompt_lines))
129
 
 
 
 
 
130
 
 
 
 
131
 
132
def question_answer(urls, file, question):
    """Build the corpus from the given URLs and/or zip upload and answer *question*.

    Args:
        urls: Comma-separated PDF URLs; may be an empty string.
        file: Uploaded zip file object exposing a ``name`` path, or ``None``.
        question: The user's question.

    Returns:
        The generated answer, or a string starting with ``[ERROR]`` when
        the inputs are incomplete.
    """
    if urls.strip() == '' and file is None:
        return '[ERROR]: Both URLs and PDFs are empty. Provide at least one.'

    # Validate the question before any download or embedding work is done.
    if question.strip() == '':
        return '[ERROR]: Question field is empty'

    paths = []
    for index, url in enumerate(part.strip() for part in urls.split(',')):
        if not url:
            continue  # tolerate stray commas / an empty URL field
        # One file per URL: a single shared name would let each download
        # overwrite the previous one.
        output_path = f'corpus_{index}.pdf'
        download_pdf(url, output_path)
        paths.append(output_path)

    if file is not None:
        extract_zip(file.name)  # extract the PDFs from the zip file
        paths.extend(os.path.join('pdfs', pdf_file) for pdf_file in os.listdir('pdfs'))

    load_recommender(paths)

    return generate_answer(question)
154
-
155
# Text shown at the top of the Gradio page.
title = 'Cognitive AI Agent - Asks the Expert'
description = """ This cognitive agent allows you to chat with your PDF files as a single corpus of knowledge. Add your relevant PDFs to a zip file and upload. 🛑PROOF OF CONCEPT🛑 """

# Input widgets, listed in the positional order of
# question_answer(urls, file, question).
_url_box = gr.inputs.Textbox(label="Enter PDF URLs here, separated by commas")
_zip_upload = gr.inputs.File(label="Upload a zip file containing PDF files")
_question_box = gr.inputs.Textbox(label="Enter your question here")

iface = gr.Interface(
    fn=question_answer,
    inputs=[_url_box, _zip_upload, _question_box],
    outputs=gr.outputs.Textbox(label="Generated Answer"),
    title=title,
    description=description,
)
iface.launch()
170
 
171
 
 
 
 
 
 
 
 
 
1
  import os
2
  import zipfile
3
+ import openai
4
+ import gradio as gr
5
+ from gradio import components as grc
6
+
7
# Set up OpenAI API credentials.
# The key is read from the environment rather than written into the source:
# a key committed to a repository is readable by anyone with access to it and
# should be revoked and replaced. 'OpenAPI' is the environment variable name
# the previous revision of this file used.
openai.api_key = os.getenv('OpenAPI')
9
+
10
def extract_text_from_pdf(pdf_path):
    """Send the contents of a PDF file to the completion model and return its reply.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The stripped text of the first completion choice.
    """
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()

    # PDF files are binary, so a strict UTF-8 decode raises
    # UnicodeDecodeError on real documents; undecodable bytes are replaced
    # instead so the call can proceed.
    # NOTE(review): the decoded bytes are the raw PDF container, not its
    # readable text - a PDF parser is presumably needed for real extraction;
    # confirm the intended approach.
    prompt = pdf_bytes.decode("utf-8", errors="replace")

    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=2048,
        temperature=0.7,
        n=1,
        stop=None,
        timeout=120,
    )
    return response.choices[0].text.strip()
24
+
25
def extract_text_from_zip(zip_file):
    """Run ``extract_text_from_pdf`` on every ``.pdf`` member of a zip archive.

    Args:
        zip_file: Path or file-like object of the zip archive.

    Returns:
        The extracted texts concatenated in archive order, each followed
        by a newline; an empty string when the archive holds no PDFs.
    """
    import tempfile

    texts = []
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        for file_name in zip_ref.namelist():
            if not file_name.endswith(".pdf"):
                continue
            # extract_text_from_pdf opens a *path*, so the member's bytes are
            # written to a temporary file first instead of being passed directly.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                tmp.write(zip_ref.read(file_name))
                tmp_path = tmp.name
            try:
                texts.append(extract_text_from_pdf(tmp_path))
            finally:
                os.remove(tmp_path)
    return "".join(text + "\n" for text in texts)
34
+
35
def split_text_into_chunks(text, max_tokens=2048):
    """Split *text* into whitespace-delimited chunks of bounded size.

    Despite the parameter name, the limit is measured in characters, not
    model tokens: words are added to the current chunk while the chunk's
    length (each word counted with one trailing space) plus the next word
    stays within *max_tokens*.

    A single word longer than the limit is never split; it becomes a chunk
    of its own.

    Args:
        text: The text to split.
        max_tokens: Maximum chunk size in characters.

    Returns:
        A list of non-empty chunk strings with words joined by single spaces.
    """
    chunks = []
    current_words = []
    current_len = 0  # sum of len(word) + 1 over current_words

    for word in text.split():
        if current_len + len(word) <= max_tokens:
            current_words.append(word)
            current_len += len(word) + 1
            continue
        # Flush only when something was collected: an oversized first word
        # must not produce an empty chunk.
        if current_words:
            chunks.append(" ".join(current_words))
        current_words = [word]
        current_len = len(word) + 1

    if current_words:
        chunks.append(" ".join(current_words))
    return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
def process_files_and_query(zip_file, query):
    """Answer *query* against the PDFs contained in an uploaded zip archive.

    The text obtained from the archive is split into chunks, the completion
    model is queried once per chunk, and the per-chunk replies are joined
    with single spaces.

    Args:
        zip_file: The uploaded archive: an object with a ``name`` path
            attribute, a path string, or a file-like object.
        query: The user's question.

    Returns:
        The combined answer, or a string starting with ``[ERROR]`` when the
        upload or the query is missing.
    """
    if zip_file is None:
        return "[ERROR]: No ZIP file uploaded."
    if query is None or not query.strip():
        return "[ERROR]: Query field is empty"

    # Open the upload from its on-disk path when it has one. The copy to a
    # fixed "uploaded.zip" was never read back, and a shared file name would
    # be overwritten by concurrent requests, so it is no longer written.
    upload_name = getattr(zip_file, "name", None)
    zip_source = upload_name if isinstance(upload_name, str) else zip_file

    # Extract text from PDFs in the ZIP archive and split it into chunks.
    corpus = extract_text_from_zip(zip_source)
    chunks = split_text_into_chunks(corpus)

    # Perform one OpenAI API query per chunk.
    responses = []
    for chunk in chunks:
        prompt = chunk + "\nQuery: " + query
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=prompt,
            max_tokens=2048,
            temperature=0.7,
            n=1,
            stop=None,
            timeout=120,
        )
        responses.append(response.choices[0].text.strip())

    # Combine the responses into a single answer.
    return " ".join(responses)
82
 
83
# Widgets for the two inputs (in the positional order of
# process_files_and_query) and for the single text output.
zip_file_input = grc.File(label="Upload ZIP File")
query_input = grc.Textbox(label="Enter your query")
output = grc.Textbox(label="Answer")

# Wire the widgets to the handler and start the app.
iface = gr.Interface(
    fn=process_files_and_query,
    inputs=[zip_file_input, query_input],
    outputs=output,
    title="PDF Search",
    description="Upload a ZIP file containing PDFs, enter your query, and get the answer.",
)
iface.launch()
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94