import urllib.request
import fitz
import re
import numpy as np
import tensorflow_hub as hub
import openai
import gradio as gr
import os
import zipfile
from sklearn.neighbors import NearestNeighbors

# The OpenAI credential is read from the 'OpenAPI' environment variable; the
# key is None when that variable is not set.
openai.api_key = os.environ.get('OpenAPI')

def download_pdf(url, output_path):
    """Fetch the resource at ``url`` and store it at ``output_path``.

    Nothing is caught here: any error raised by ``urlretrieve`` propagates to
    the caller.
    """
    urllib.request.urlretrieve(url, filename=output_path)

def extract_zip(file):
    """Extract every PDF in a zip archive directly into the ``pdfs`` directory.

    Archive members are selected by a ``.pdf`` suffix on their base name. Each
    match is written as ``pdfs/<basename>``, so a PDF stored inside a
    sub-folder of the archive ends up at the top level of ``pdfs`` instead of
    in a nested directory. When two members share a base name the later one
    overwrites the earlier one.

    Args:
        file: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    Returns:
        List of the file paths written, in archive order.
    """
    import shutil  # local import: only this function needs it

    extracted = []
    with zipfile.ZipFile(file, 'r') as zip_ref:
        for member in zip_ref.infolist():
            # Directory entries end in '/', so their base name is '' and they
            # are skipped by the suffix test.
            filename = os.path.basename(member.filename)
            if not filename.endswith('.pdf'):
                continue
            os.makedirs('pdfs', exist_ok=True)
            target = os.path.join('pdfs', filename)
            # Stream the member to disk under its base name only; this is what
            # flattens the archive's folder structure.
            with zip_ref.open(member) as src, open(target, 'wb') as dst:
                shutil.copyfileobj(src, dst)
            extracted.append(target)
    return extracted

def preprocess(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines, tabs and repeated spaces are all covered by ``\\s+``, so no
    separate newline replacement is needed. Leading and trailing whitespace
    is reduced to one space, not removed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', text)

def pdf_to_text(path, start_page=1, end_page=None):
    """Return the preprocessed text of each page in a PDF.

    Args:
        path: Location of the PDF, passed to ``fitz.open``.
        start_page: 1-based number of the first page to read.
        end_page: 1-based number of the last page to read (inclusive).
            ``None`` or a value past the end of the document means "through
            the last page".

    Returns:
        A list with one whitespace-normalised string per page read.

    Raises:
        ValueError: If ``start_page`` is smaller than 1.
    """
    # start_page - 1 is used as the page index below; a value under 1 would
    # become a negative index rather than the page the caller meant.
    if start_page < 1:
        raise ValueError(f'start_page must be >= 1, got {start_page}')

    doc = fitz.open(path)
    try:
        total_pages = doc.page_count
        if end_page is None or end_page > total_pages:
            end_page = total_pages

        return [
            preprocess(doc.load_page(page_index).get_text("text"))
            for page_index in range(start_page - 1, end_page)
        ]
    finally:
        # Close the document even when reading a page fails.
        doc.close()

def text_to_chunks(texts, word_length=150, start_page=1):
    """Split per-page texts into labelled chunks of at most ``word_length`` words.

    Each chunk is formatted as ``[Page no. N] "<words>"``. When the final
    piece of a page is shorter than ``word_length`` and another page follows,
    that piece is carried over and prepended to the next page's words, so it
    is labelled with the next page's number. The short tail of the last page
    is emitted as its own chunk.

    Words are obtained with ``str.split()``, so empty pages and stray
    leading/trailing spaces contribute no empty tokens and never yield an
    empty ``""`` chunk.

    Args:
        texts: Sequence of page texts, in page order.
        word_length: Maximum number of words per chunk.
        start_page: Page number assigned to ``texts[0]``.

    Returns:
        List of chunk strings.
    """
    pages = [page_text.split() for page_text in texts]
    last_index = len(pages) - 1
    chunks = []
    carry = []  # short tail of the previous page, waiting to be merged

    for idx, page_words in enumerate(pages):
        words = carry + page_words
        carry = []
        for i in range(0, len(words), word_length):
            piece = words[i:i + word_length]
            # Only the final piece of a page can be short; hand it to the
            # next page unless this is the last page.
            if len(piece) < word_length and idx != last_index:
                carry = piece
                break
            body = ' '.join(piece)
            chunks.append(f'[Page no. {idx + start_page}] "{body}"')
    return chunks

class SemanticSearch:
    """Nearest-neighbour search over sentence-encoder embeddings of text chunks."""

    def __init__(self):
        """Load the encoder from TF Hub and mark the index as not yet fitted."""
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=15):
        """Embed ``data`` and build the nearest-neighbour index over it.

        Args:
            data: Non-empty sequence of text chunks.
            batch: Number of chunks embedded per encoder call.
            n_neighbors: Number of neighbours returned per query; capped at
                the number of chunks.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if len(data) == 0:
            raise ValueError('Cannot fit SemanticSearch on an empty corpus.')

        # Build everything first and publish it afterwards, so a failure
        # part-way through does not leave a half-updated index behind.
        embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(embeddings))
        index = NearestNeighbors(n_neighbors=n_neighbors)
        index.fit(embeddings)

        self.data = data
        self.embeddings = embeddings
        self.nn = index
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the fitted chunks closest to ``text``.

        Args:
            text: Query string.
            return_data: When true return the chunk strings, otherwise the
                neighbour indices into the fitted data.

        Raises:
            RuntimeError: If ``fit`` has not completed successfully yet.
        """
        if not self.fitted:
            raise RuntimeError('SemanticSearch.fit() must be called before searching.')

        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed ``texts`` in slices of ``batch`` items and stack the results.

        Returns:
            A single array made by vertically stacking the per-batch encoder
            outputs.
        """
        embeddings = [
            self.use(texts[start:start + batch])
            for start in range(0, len(texts), batch)
        ]
        return np.vstack(embeddings)

# Shared module-level search index: fitted by load_recommender() and queried by
# generate_answer(). Constructing it runs SemanticSearch.__init__, which loads
# the encoder through hub.load when this module is executed.
recommender = SemanticSearch()

def load_recommender(paths, start_page=1):
    """Chunk every ``.pdf`` path in ``paths`` and fit the shared recommender.

    Paths that do not end in ``.pdf`` are ignored. ``start_page`` is forwarded
    both to the text extraction and to the chunk labelling.

    Returns:
        The status string ``'Corpus Loaded.'``.
    """
    corpus_chunks = []
    for pdf_path in paths:
        if not pdf_path.endswith('.pdf'):
            continue
        page_texts = pdf_to_text(pdf_path, start_page=start_page)
        corpus_chunks.extend(text_to_chunks(page_texts, start_page=start_page))

    recommender.fit(corpus_chunks)
    return 'Corpus Loaded.'

def generate_text(messages, engine='gpt-3.5-turbo', max_tokens=2048, temperature=0.8):
    """Send a chat request to OpenAI and return the first reply's text.

    Args:
        messages: Either a prompt string, which is sent as the user message
            after a fixed "research assistant" system message, or an iterable
            of ready-made chat message dicts, which is sent as given.
        engine: Model name passed as ``model``.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.

    Returns:
        The ``content`` of the first choice's message.
    """
    # The body previously read an undefined name ``prompt``; the prompt
    # actually arrives through the ``messages`` parameter.
    if isinstance(messages, str):
        chat_messages = [
            {"role": "system", "content": "You are a research assistant"},
            {"role": "user", "content": messages},
        ]
    else:
        chat_messages = list(messages)

    # NOTE(review): openai.ChatCompletion looks like the pre-1.0 SDK
    # interface -- confirm against the pinned openai version.
    response = openai.ChatCompletion.create(
        model=engine,
        messages=chat_messages,
        max_tokens=max_tokens,
        n=1,
        temperature=temperature,
    )
    return response.choices[0].message['content']


def generate_answer(question):
    """Answer ``question`` using the chunks the recommender retrieves for it.

    The prompt is a fixed preamble, the question on a ``User:`` line, and one
    ``Assistant:`` line per retrieved chunk; it is passed to ``generate_text``
    and the generated text is returned.
    """
    retrieved = recommender(question)

    header = "You are a helpful assistant.\n" + "User: " + question + "\n"
    context = "".join("Assistant: " + piece + "\n" for piece in retrieved)

    return generate_text(header + context)


def question_answer(urls, file, question):
    """Build the corpus from URLs and/or a zip upload, then answer ``question``.

    Args:
        urls: Comma-separated PDF URLs; blank entries are ignored.
        file: Uploaded zip file object exposing ``.name``, or ``None``.
        question: The question to answer.

    Returns:
        The generated answer, or a string starting with ``[ERROR]:`` when the
        inputs are unusable.
    """
    import shutil  # local import: only this function needs it

    # Drop blank entries so a trailing comma never reaches the downloader.
    url_list = [entry.strip() for entry in urls.split(',') if entry.strip()]

    if not url_list and file is None:
        return '[ERROR]: Both URLs and PDFs are empty. Provide at least one.'

    # Validate the question before downloading and embedding anything.
    if question.strip() == '':
        return '[ERROR]: Question field is empty'

    paths = []
    for index, url in enumerate(url_list):
        # One file per URL; a single shared name would let each download
        # overwrite the previous one.
        target = f'corpus_{index}.pdf'
        download_pdf(url, target)
        paths.append(target)

    if file is not None:
        # Start from an empty extraction directory so PDFs left over from an
        # earlier upload are not mixed into this corpus.
        shutil.rmtree('pdfs', ignore_errors=True)
        extract_zip(file.name)  # extract the PDFs from the zip file
        # Walk recursively: extraction may preserve the archive's folders.
        # os.walk yields nothing if the directory was never created.
        for root, _dirs, names in os.walk('pdfs'):
            for name in sorted(names):
                if name.endswith('.pdf'):
                    paths.append(os.path.join(root, name))

    if not paths:
        return '[ERROR]: No PDF files were found in the uploaded zip file.'

    load_recommender(paths)
    return generate_answer(question)

# Heading and blurb passed to the Gradio interface below.
title = 'Cognitive AI Agent - Asks the Expert'
description = """ This cognitive agent allows you to chat with your PDF files as a single corpus of knowledge.  Add your relevant PDFs to a zip file and upload. 🛑PROOF OF CONCEPT🛑 """

# The three inputs map positionally onto question_answer(urls, file, question);
# the string it returns goes to the single output textbox.
# NOTE(review): the gr.inputs / gr.outputs namespaces appear to be deprecated
# in newer Gradio releases -- confirm against the pinned gradio version.
iface = gr.Interface(
    fn=question_answer,
    inputs=[
        gr.inputs.Textbox(label="Enter PDF URLs here, separated by commas"),
        gr.inputs.File(label="Upload a zip file containing PDF files"),
        gr.inputs.Textbox(label="Enter your question here"),
    ],
    outputs=gr.outputs.Textbox(label="Generated Answer"),
    title=title,
    description=description
)
# Runs at module level (no __main__ guard), so executing this file launches
# the interface.
iface.launch()