File size: 6,024 Bytes
47125e3
d68b3f4
47125e3
 
 
 
 
 
 
 
 
195e8f3
47125e3
b522f05
 
47125e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3aae288
195e8f3
 
 
 
3aae288
 
 
 
 
 
 
 
a7df561
 
 
 
 
47125e3
ec3fe15
47125e3
ec3fe15
 
 
 
 
 
 
 
 
 
 
 
 
 
47125e3
 
 
 
 
 
ec3fe15
47125e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec3fe15
 
47125e3
 
 
 
 
 
 
 
f1125bf
ec3fe15
47125e3
 
ec3fe15
47125e3
 
 
ec3fe15
 
 
47125e3
 
 
 
ec3fe15
 
 
 
 
 
47125e3
ec3fe15
 
 
 
47125e3
ec3fe15
 
 
47125e3
 
 
 
 
 
 
 
 
ec3fe15
47125e3
 
 
 
 
ec3fe15
 
47125e3
 
 
ec3fe15
 
47125e3
ec3fe15
47125e3
 
ec3fe15
 
 
 
47125e3
 
 
973e299
da2f69e
 
ec3fe15
47125e3
 
 
ebaad60
a7df561
ec3fe15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import fitz
import uuid
import re
import numpy as np
import tensorflow_hub as hub
import openai
import gradio as gr
import shutil
import os
from sklearn.neighbors import NearestNeighbors
from tempfile import NamedTemporaryFile
from PyPDF2 import PdfReader

# OpenAI API key, read from the `OpenAPI` environment variable when this module
# is imported; a KeyError is raised right here if the variable is not set.
openAI_key = os.environ['OpenAPI']

class SemanticSearch:
    """Nearest-neighbour lookup over text chunks embedded with a TF Hub encoder.

    Call ``fit`` with the corpus first; afterwards the instance is callable
    with a query string and returns the closest corpus entries.
    """

    def __init__(self):
        """Load the sentence encoder from TF Hub and start in the unfitted state."""
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        """Embed ``data`` and build the nearest-neighbour index over it.

        Args:
            data: Sequence of text chunks that make up the corpus.
            batch: Number of chunks sent to the encoder per call.
            n_neighbors: Upper bound on results per query; capped at the
                number of chunks so small corpora can still be queried.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if len(data) == 0:
            raise ValueError('Cannot fit SemanticSearch on an empty corpus.')

        # Build everything in locals first so a failure while embedding or
        # indexing cannot leave the instance with mismatched data and index.
        embeddings = self.get_text_embedding(data, batch=batch)
        index = NearestNeighbors(n_neighbors=min(n_neighbors, len(embeddings)))
        index.fit(embeddings)

        self.data = data
        self.embeddings = embeddings
        self.nn = index
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the corpus entries closest to ``text``.

        Args:
            text: Query string.
            return_data: When true, return the matching chunks themselves;
                otherwise return their indices into the fitted corpus.

        Raises:
            RuntimeError: If the index has not been built with ``fit`` yet.
        """
        # The flag can be set from outside the class, so also make sure the
        # index object itself exists before querying it.
        if not getattr(self, 'fitted', False) or getattr(self, 'nn', None) is None:
            raise RuntimeError('SemanticSearch must be fitted before it can be queried.')

        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Encode ``texts`` in slices of ``batch`` items and stack the results.

        Args:
            texts: Sequence of strings to encode.
            batch: Maximum number of strings passed to the encoder at once.

        Returns:
            A 2-D array with one row per input string.

        Raises:
            ValueError: If ``texts`` is empty.
        """
        embeddings = [
            self.use(texts[start:start + batch])
            for start in range(0, len(texts), batch)
        ]
        if not embeddings:
            raise ValueError('Cannot embed an empty list of texts.')
        return np.vstack(embeddings)

def pdf_to_text(pdf_path, start_page=1):
    """Extract the text of a PDF from ``start_page`` through the last page.

    Args:
        pdf_path: Path (or file object) handed to ``PdfReader``.
        start_page: 1-based number of the first page to read. Values below 1
            are treated as 1, so the default reads the whole document.

    Returns:
        The concatenated text of the selected pages; pages that yield no
        text contribute an empty string.
    """
    pdf = PdfReader(pdf_path)
    # Page numbers are 1-based for callers, while ``pdf.pages`` is 0-indexed.
    first_index = max(start_page - 1, 0)
    return ''.join(
        pdf.pages[i].extract_text() or ''
        for i in range(first_index, len(pdf.pages))
    )

def text_to_chunks(text, start_page=1, chunk_size=512):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        start_page: Accepted for call compatibility; not used by the split.
        chunk_size: Maximum length of each piece.

    Returns:
        The pieces in order; the last one may be shorter than ``chunk_size``.
    """
    pieces = []
    for offset in range(0, len(text), chunk_size):
        pieces.append(text[offset:offset + chunk_size])
    return pieces



def unique_filename(basename):
    """Return ``basename`` with ``_<random hex>`` inserted before its extension."""
    stem, extension = os.path.splitext(basename)
    token = uuid.uuid4().hex
    return f"{stem}_{token}{extension}"


def load_recommender(paths, start_page=1):
    """Fit the shared ``recommender`` on the text of every PDF in ``paths``.

    Args:
        paths: Iterable of PDF file paths that together form the corpus.
        start_page: First page to read from each PDF, forwarded to
            ``pdf_to_text`` and ``text_to_chunks``.

    Returns:
        The status string ``'Corpus Loaded.'``.

    Raises:
        ValueError: If ``paths`` is empty or no text could be extracted.
    """
    chunks = []
    for path in paths:
        # Every document's text is always extracted: queries are answered with
        # the text chunks themselves, so each document must contribute its
        # chunks to the corpus that gets fitted below.
        texts = pdf_to_text(path, start_page=start_page)
        chunks.extend(text_to_chunks(texts, start_page=start_page))

    if not chunks:
        raise ValueError('No text could be extracted from the supplied PDFs.')

    recommender.fit(chunks)
    return 'Corpus Loaded.'


def generate_text(openAI_key, prompt, engine="gpt-3.5-turbo"):
    """Send ``prompt`` to the OpenAI chat endpoint and return the reply text.

    Args:
        openAI_key: API key assigned to ``openai.api_key`` before the call.
        prompt: Content of the user message.
        engine: Model name passed as ``model``.

    Returns:
        The ``content`` of the first choice's message.
    """
    openai.api_key = openAI_key

    system_turn = {'role': 'system', 'content': 'You are a helpful assistant.'}
    user_turn = {'role': 'user', 'content': prompt}

    response = openai.ChatCompletion.create(
        model=engine,
        messages=[system_turn, user_turn],
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    first_choice = response.choices[0]
    return first_choice.message['content']


def generate_answer(question, openAI_key):
    """Answer ``question`` from the chunks the recommender finds most relevant.

    Args:
        question: The user's query; used both for retrieval and in the prompt.
        openAI_key: API key forwarded to ``generate_text``.

    Returns:
        The model's reply as returned by ``generate_text``.
    """
    topn_chunks = recommender(question)

    sections = ['search results:\n\n']
    sections.extend(c + '\n\n' for c in topn_chunks)
    # The instructions end before the query: the query is appended exactly
    # once below, with the real question interpolated.
    sections.append(
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Make sure the answer is correct and don't output false content. "
        "If you do not know the answer - answer 'information not provided' "
        "Answer should be short and concise. Answer step-by-step. \n\n"
    )
    sections.append(f"Query: {question}\nAnswer:")

    prompt = ''.join(sections)
    return generate_text(openAI_key, prompt, "gpt-3.5-turbo")


def main_loop(url: str, files: list, question: str, openAI_key=None):
    """Build the corpus from a URL and/or uploaded PDFs and answer ``question``.

    Args:
        url: Optional PDF URL; ignored when blank.
        files: Optional list of uploaded file objects exposing ``.name``.
        question: The user's query. The literal ``exit`` (any case, with
            surrounding whitespace ignored) short-circuits without an answer.
        openAI_key: OpenAI API key. When omitted, the ``OpenAPI`` environment
            variable is used instead.

    Returns:
        A pair ``(answer, keep_running)``: ``('', False)`` for ``exit``,
        otherwise the generated answer and ``True``.
    """
    # 'exit' needs no corpus, so return before any download, copy or embedding.
    if question.strip().lower() == 'exit':
        return '', False

    paths = []

    if url.strip() != '':
        # NOTE(review): download_pdf is neither defined nor imported in the
        # visible part of this file - confirm it exists before relying on URLs.
        download_pdf(url, 'corpus.pdf')
        paths.append('corpus.pdf')
    if files is not None and len(files) > 0:
        for file in files:
            old_file_name = file.name
            # Drops the 8 characters just before the 4-character extension;
            # presumably a random suffix added by the upload widget - TODO confirm.
            file_name = old_file_name[:-12] + old_file_name[-4:]
            file_name = unique_filename(file_name)  # Ensure the new file name is unique

            # Copy the content of the old file to the new file and delete the old file
            with open(old_file_name, 'rb') as src, open(file_name, 'wb') as dst:
                shutil.copyfileobj(src, dst)
            os.remove(old_file_name)

            paths.append(file_name)

    load_recommender(paths)

    if openAI_key is None:
        # Callers that pass only url, files and question get the configured key.
        openAI_key = os.environ['OpenAPI']

    answer = generate_answer(question, openAI_key)
    return answer, True


def on_click(*args):
    """Run ``main_loop`` on the components' stored values and keep the answer text.

    The positional ``args`` are accepted and ignored. ``main_loop`` returns an
    ``(answer, flag)`` pair; only the answer text is written to ``answer.value``.
    """
    reply, _ = main_loop(url.value, files.value, question.value, openAI_key)
    answer.value = reply

# Shared search index used by load_recommender and generate_answer.
recommender = SemanticSearch()

title = 'Cognitive pdfGPT'
description = """ Why use Cognitive Ask an Expert?
This is Cognitive Chat. Here you can upload multiple PDF files and query them as a single corpus of knowledge. 🛑DO NOT USE CONFIDENTIAL INFORMATION """


def _answer_question(url_text, uploaded_files, question_text):
    """Button handler: run ``main_loop`` and return only the answer text.

    ``main_loop`` returns an ``(answer, flag)`` pair, while the button has a
    single output Textbox, so only the answer is forwarded. The module-level
    API key is passed explicitly because the UI has no input for it.
    """
    reply, _keep_running = main_loop(url_text, uploaded_files, question_text, openAI_key)
    return reply


with gr.Blocks() as demo:
    gr.Markdown(f'<center><h1>{title}</h1></center>')
    gr.Markdown(description)

    with gr.Row():

        with gr.Group():
            files = gr.Files(label='➡️ Upload your PDFs ⬅️  NO CONFIDENTIAL FILES ', file_types=['.pdf'])
            url = gr.Textbox(label=' ')
            question = gr.Textbox(label='🔤 Enter your question here 🔤')
            btn = gr.Button(value='Submit')
            btn.style(full_width=False)

        with gr.Group():
            gr.Image("logo.jpg")
            answer = gr.Textbox(label='The answer to your question is :')

        btn.click(_answer_question, inputs=[url, files, question], outputs=[answer])


demo.launch(share=False, debug=True, auth=None, auth_message=None)