import torch
import re
import pandas as pd
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
import gradio as gr
import space

# Load the LED tokenizer and build the DistilBART summarization pipeline.
# `summarizer` is the object the summarization helpers in this file call.
# NOTE(review): `led_tokenizer` is not referenced anywhere else in the visible
# part of this file — confirm whether it is still needed.
led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")

# Load the LED seq2seq model separately from the pipeline above.
# NOTE(review): `model` is likewise not used by the visible code; only the
# `summarizer` pipeline is called — confirm whether this load is required.
model = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")

# Move the model to CUDA if available. Only `model` is moved here; the
# `summarizer` pipeline above is created without a `device` argument.
if torch.cuda.is_available():
    model = model.to("cuda")

# Pattern for every character that is not an ASCII letter, a digit or whitespace
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9\s]')


# Function to strip everything except alphanumeric characters and whitespace
def clean_text(text):
    """Return *text* with all characters other than a-z, A-Z, 0-9 and whitespace removed."""
    return _DISALLOWED_CHARS.sub('', text)

# Function to pull the plain text out of a single PDF file
def extract_text(pdf_file):
    """Return the text of every page of *pdf_file*, joined by single spaces.

    Pages for which the reader yields no text contribute an empty string.
    Returns ``None`` (after printing a message) when the file is encrypted
    or when any error occurs while opening or reading it.
    """
    try:
        with open(pdf_file, 'rb') as handle:
            reader = PdfReader(handle)
            if not reader.is_encrypted:
                page_texts = [page.extract_text() or '' for page in reader.pages]
                return ' '.join(page_texts)
            print(f"Skipping encrypted file: {pdf_file}")
            return None
    except Exception as e:
        # Best effort: report the problem and let the caller skip this file.
        print(f"Error extracting text from {pdf_file}: {e}")
        return None

# Function to label each text with the module-level `classifier`
def classify_texts(texts):
    """Return the ``"label"`` entry of ``classifier(text)`` for each text, in order.

    Texts are passed to ``classifier`` one at a time.

    NOTE(review): ``classifier`` is not defined in the visible part of this
    file, so a non-empty input raises ``NameError`` unless it is provided
    elsewhere — confirm where it is supposed to come from.
    """
    labels = []
    for text in texts:
        labels.append(classifier(text)["label"])
    return labels

# Function to summarize each text with the summarizer pipeline
# NOTE(review): the decorator needs the `spaces` package, but the file's
# import block has `import space` — confirm the import name.
@spaces.GPU
def summarize_texts(texts):
    """Summarize every text in *texts* and return the summaries in order.

    Each text is passed on its own to the module-level ``summarizer``
    pipeline with deterministic decoding (``do_sample=False``),
    ``min_length=30`` and ``max_length=100``.

    Args:
        texts: Iterable of strings to summarize.

    Returns:
        A list containing one summary string per input text.
    """
    return [
        summarizer(
            text,
            max_length=100,
            min_length=30,
            do_sample=False,
            # Whole-document inputs can exceed the model's maximum input
            # length; truncate them instead of failing inside the model.
            truncation=True,
        )[0]['summary_text']
        for text in texts
    ]

# Function to extract a title-like summary from the beginning of the text
# NOTE(review): the decorator needs the `spaces` package, but the file's
# import block has `import space` — confirm the import name.
@spaces.GPU
def extract_title(text):
    """Return a very short summary of *text* to serve as a title.

    The text is summarized by the module-level ``summarizer`` pipeline with
    deterministic decoding (``do_sample=False``), ``min_length=5`` and
    ``max_length=20``.

    Args:
        text: The passage (typically the start of a document) to title.

    Returns:
        The generated title string.
    """
    result = summarizer(
        text,
        max_length=20,
        min_length=5,
        do_sample=False,
        # Guard against inputs longer than the model's maximum input length,
        # matching what summarize_texts does.
        truncation=True,
    )
    return result[0]['summary_text']

# Function to process PDF files
# NOTE(review): the decorator needs the `spaces` package, but the file's
# import block has `import space` — confirm the import name.
@spaces.GPU
def process_files(pdf_files):
    """Build a CSV dataset with Title, Abstract and Content columns from PDFs.

    For each PDF the text is extracted; a title is generated from the first
    512 words of the raw text, and an abstract is generated from the cleaned
    text. Files that cannot be read, are encrypted, or contain no extractable
    text are skipped.

    Args:
        pdf_files: Iterable of PDF file paths, or ``None`` when nothing was
            provided.

    Returns:
        The path of the CSV file that was written (``'processed_pdfs.csv'``).
    """
    rows = []
    # Treat "nothing uploaded" (None) the same as an empty list.
    for pdf_file in pdf_files or []:
        text = extract_text(pdf_file)
        # Skip unreadable/encrypted files and files with no extractable text
        # (e.g. scanned pages), which would otherwise yield an empty title input.
        if text is None or not text.strip():
            continue

        # Use the first 512 words for the title; maxsplit keeps the split
        # cheap on long documents, and the slice drops the unsplit remainder.
        title_text = ' '.join(text.split(maxsplit=512)[:512])
        title = extract_title(title_text)

        # Clean the entire text at once
        cleaned_text = clean_text(text)
        abstract = summarize_texts([cleaned_text])[0]

        rows.append([title, abstract, cleaned_text])

    df = pd.DataFrame(rows, columns=['Title', 'Abstract', 'Content'])
    # NOTE(review): fixed file name relative to the working directory, so
    # successive or concurrent runs overwrite the same file.
    output_file_path = 'processed_pdfs.csv'
    df.to_csv(output_file_path, index=False)
    return output_file_path

# Gradio interface
# Components are created from the top-level `gr.File` class. Multi-file upload
# is requested with `file_count="multiple"`, and `type="filepath"` makes the
# handler receive plain paths, which is what `process_files` opens.
pdf_input = gr.File(label="Upload PDF Files", file_count="multiple", type="filepath")
csv_output = gr.File(label="Download CSV")

gr.Interface(
    fn=process_files,
    inputs=pdf_input,
    outputs=csv_output,
    title="Dataset creation",
    description="Upload PDF files and get a summarized CSV file.",
    article="""<p>This app creates a dataset from research papers using AI models.</p>
                <p>It uses models for classification and summarization to extract titles, abstracts, and content from PDFs.</p>"""
).launch(share=True)