import re

import pandas as pd
import PyPDF2
from concurrent.futures import ThreadPoolExecutor
from transformers import pipeline, AutoTokenizer
import gradio as gr

# Load the LED tokenizer and model.
# Note: this checkpoint was trained for long-document summarization, so the
# text-classification pipeline built on it carries an untrained classification
# head; classify_text() below is also never called by process_pdf().
led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
classifier = pipeline(
    "text-classification",
    model="allenai/led-base-16384-multi_lexsum-source-long",
    tokenizer=led_tokenizer,
    framework="pt",
)

# Load the summarization model and tokenizer
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    tokenizer="sshleifer/distilbart-cnn-12-6",
    framework="pt",
)

# Clean text by keeping only alphanumeric characters and spaces
def clean_text(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Extract text from a PDF file; returns None for encrypted or unreadable files
def extract_text(pdf_file):
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        if pdf_reader.is_encrypted:
            print(f"Skipping encrypted file: {pdf_file}")
            return None
        text = ''
        for page in pdf_reader.pages:
            text += page.extract_text() or ''
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_file}: {e}")
        return None

# Split text into chunks of a specified number of words
def split_text(text, chunk_size=1024):
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield ' '.join(words[i:i + chunk_size])

# Classify text using the LED model (defined for completeness; unused below)
def classify_text(text):
    try:
        return classifier(text)[0]['label']
    except IndexError:
        return "Unable to classify"

# Summarize text using the summarization model
def summarize_text(text, max_length=100, min_length=30):
    try:
        return summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']
    except IndexError:
        return "Unable to summarize"

# Extract a title-like summary from the beginning of the text
def extract_title(text, max_length=20):
    try:
        return summarizer(text, max_length=max_length, min_length=5, do_sample=False)[0]['summary_text']
    except IndexError:
        return "Unable to extract title"

# Process one PDF file: extract a title, a chunked abstract, and cleaned text
def process_pdf(pdf_file):
    text = extract_text(pdf_file)

    # Skip encrypted or unreadable files
    if text is None:
        return None

    # Extract a title from the first 512 words of the text
    title_text = ' '.join(text.split()[:512])
    title = extract_title(title_text)

    # Accumulate per-chunk results
    combined_abstract = []
    combined_cleaned_text = []

    # Split the text into chunks and process each chunk
    for chunk in split_text(text, chunk_size=512):
        # Summarize the text chunk
        abstract = summarize_text(chunk)
        combined_abstract.append(abstract)

        # Clean the text chunk
        cleaned_text = clean_text(chunk)
        combined_cleaned_text.append(cleaned_text)

    # Combine results from all chunks
    final_abstract = ' '.join(combined_abstract)
    final_cleaned_text = ' '.join(combined_cleaned_text)

    return [title, final_abstract, final_cleaned_text]

# Process multiple PDF files in parallel
def process_pdfs(files):
    data = []
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_pdf, files))
    data.extend(result for result in results if result is not None)
    return data

# Gradio interface function: build a DataFrame and write it to CSV
def gradio_interface(files):
    # Newer Gradio versions pass plain file paths; older ones pass tempfile
    # wrappers with a .name attribute, so handle both.
    paths = [getattr(f, "name", f) for f in files]
    data = process_pdfs(paths)
    df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])
    csv_path = "/content/drive/My Drive/path_to_output/output.csv"  # Adjust this to your actual path
    df.to_csv(csv_path, index=False)
    return csv_path
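# Illustrative direct use without the UI (a sketch; the file names below are
# placeholders, not files that ship with this script):
#
#     rows = process_pdfs(["paper1.pdf", "paper2.pdf"])
#     df = pd.DataFrame(rows, columns=["Title", "Abstract", "Content"])
#     df.to_csv("output.csv", index=False)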
# Gradio app setup
# (gr.inputs.File is deprecated; current Gradio releases expose gr.File directly)
gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(file_count="multiple", file_types=[".pdf"]),
    outputs="text",
    title="PDF Research Paper Dataset Creator",
    description="Upload PDF research papers to create a dataset with title, abstract, and content.",
).launch()
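# launch() blocks and serves the app on a local URL; in hosted notebooks such
# as Colab, launch(share=True) can be used to get a temporary public link.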