import os
import shutil
import uuid
import urllib.request

import numpy as np
import tensorflow_hub as hub
import openai
import gradio as gr
from sklearn.neighbors import NearestNeighbors
from PyPDF2 import PdfReader

openAI_key = os.environ['OpenAPI']


class SemanticSearch:
    def __init__(self):
        # Universal Sentence Encoder: one 512-d embedding per text chunk.
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        # Embed the query and return its nearest chunks (or their indices).
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        embeddings = []
        for i in range(0, len(texts), batch):
            text_batch = texts[i:(i + batch)]
            embeddings.append(self.use(text_batch))
        return np.vstack(embeddings)


def download_pdf(url, output_path):
    # Minimal implementation of the helper referenced in main_loop below;
    # it was called but never defined in the original source.
    urllib.request.urlretrieve(url, output_path)


def pdf_to_text(pdf_path, start_page=1):
    # start_page is 1-based; PyPDF2's pages list is 0-indexed.
    pdf = PdfReader(pdf_path)
    text = ''
    for i in range(start_page - 1, len(pdf.pages)):
        text += pdf.pages[i].extract_text()
    return text


def text_to_chunks(text, chunk_size=512):
    # Split the text into fixed-size character chunks.
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


def unique_filename(basename):
    # Append a unique ID to the filename, before the extension.
    base, ext = os.path.splitext(basename)
    return base + "_" + uuid.uuid4().hex + ext


def load_recommender(paths, start_page=1):
    global recommender
    # Chunks are always re-extracted so retrieved indices can be mapped back
    # to text, even when cached embeddings are available.
    chunks = []
    for path in paths:
        texts = pdf_to_text(path, start_page=start_page)
        chunks.extend(text_to_chunks(texts))

    # Cache embeddings for the whole corpus under a name derived from its files.
    corpus_id = "_".join(os.path.basename(p) for p in paths)
    embeddings_file = f"{corpus_id}_{start_page}.npy"
    if os.path.isfile(embeddings_file):
        recommender.data = chunks
        recommender.embeddings = np.load(embeddings_file)
        # Rebuild the nearest-neighbour index over the cached embeddings.
        recommender.nn = NearestNeighbors(n_neighbors=min(5, len(recommender.embeddings)))
        recommender.nn.fit(recommender.embeddings)
        recommender.fitted = True
        print("Embeddings loaded from file")
    else:
        recommender.fit(chunks)
        np.save(embeddings_file, recommender.embeddings)
    return 'Corpus Loaded.'


def generate_text(openAI_key, prompt, engine="gpt-3.5-turbo"):
    openai.api_key = openAI_key
    messages = [{'role': 'system', 'content': 'You are a helpful assistant.'},
                {'role': 'user', 'content': prompt}]
    completions = openai.ChatCompletion.create(
        model=engine,
        messages=messages,
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    return completions.choices[0].message['content']
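
# Note: openai.ChatCompletion.create above is the pre-1.0 openai-python API,
# which this script appears to target. With openai>=1.0 this call would need
# to be ported to the client-based interface (OpenAI().chat.completions.create).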


def generate_answer(question, openAI_key):
    # Retrieve the most relevant chunks and build a grounded prompt.
    topn_chunks = recommender(question)
    prompt = 'search results:\n\n'
    for c in topn_chunks:
        prompt += c + '\n\n'
    prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. " \
              "Make sure the answer is correct and don't output false content. " \
              "If you do not know the answer, reply 'information not provided'. " \
              "The answer should be short and concise. Answer step-by-step."
    prompt += f"\n\nQuery: {question}\nAnswer:"
    return generate_text(openAI_key, prompt, "gpt-3.5-turbo")


def main_loop(url: str, files: list, question: str):
    paths = []
    if url.strip() != '':
        download_pdf(url, 'corpus.pdf')
        paths.append('corpus.pdf')
    if files is not None and len(files) > 0:
        for file in files:
            old_file_name = file.name
            # Drop the random suffix Gradio appends to temp-file names,
            # keep the extension, then make the result unique.
            file_name = old_file_name[:-12] + old_file_name[-4:]
            file_name = unique_filename(file_name)
            # Copy the uploaded temp file to the new name, then delete the original.
            with open(old_file_name, 'rb') as src, open(file_name, 'wb') as dst:
                shutil.copyfileobj(src, dst)
            os.remove(old_file_name)
            paths.append(file_name)
    load_recommender(paths)
    if question.strip().lower() == 'exit':
        return ''
    return generate_answer(question, openAI_key)


recommender = SemanticSearch()

title = 'Cognitive pdfGPT'
description = """Why use Cognitive Ask an Expert? This is Cognitive Chat.
Here you can upload multiple PDF files and query them as a single corpus of knowledge.

🛑 DO NOT USE CONFIDENTIAL INFORMATION"""

with gr.Blocks() as demo:
    gr.Markdown(f'<center><h1>{title}</h1></center>')
    gr.Markdown(description)
    with gr.Row():
        with gr.Group():
            files = gr.Files(label='➡️ Upload your PDFs ⬅️ NO CONFIDENTIAL FILES ', file_types=['.pdf'])
            url = gr.Textbox(label='🔗 Enter a PDF URL here (optional)')
            question = gr.Textbox(label='🔤 Enter your question here 🔤')
            btn = gr.Button(value='Submit')
            btn.style(full_width=False)
        with gr.Group():
            gr.Image("logo.jpg")
            answer = gr.Textbox(label='The answer to your question is :')
    btn.click(main_loop, inputs=[url, files, question], outputs=[answer])

demo.launch(share=False, debug=True, auth=None, auth_message=None)
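
# Usage sketch (assumptions: this file is saved as app.py; the OpenAI key is
# exposed via the 'OpenAPI' environment variable read at the top of the file):
#   export OpenAPI=sk-...
#   python app.py
# Gradio then serves a local UI where PDFs can be uploaded (or fetched from a
# URL) and questions are answered against the combined corpus.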