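# Page-header residue from the hosting site's file viewer, kept as comments
# so the module remains valid Python:
#   Creatingdataset / app.py
#   Yoxas's picture
#   Update app.py
#   015e0a1 verified
#   raw
#   history blame
#   3.1 kB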
import torch
import re
import pandas as pd
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
import gradio as gr
import spaces
# Tokenizer for the LED multi-LexSum checkpoint.
# NOTE(review): `led_tokenizer` is not referenced anywhere else in the visible code.
led_tokenizer = AutoTokenizer.from_pretrained(
    "allenai/led-base-16384-multi_lexsum-source-long"
)

# DistilBART summarization pipeline (PyTorch backend); this is the object the
# summarize/title helpers below call.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    tokenizer="sshleifer/distilbart-cnn-12-6",
    framework="pt",
)

# LED seq2seq weights, loaded on their own rather than through a pipeline.
# NOTE(review): `model` is not referenced anywhere else in the visible code.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "allenai/led-base-16384-multi_lexsum-source-long"
)

# Move the LED model to the GPU only when CUDA is available at import time.
if torch.cuda.is_available():
    model = model.to("cuda")
# Strip everything except ASCII letters, digits and whitespace
def clean_text(text):
    """Return *text* with every character removed that is not an ASCII
    letter, a digit, or whitespace.

    Whitespace (spaces, tabs, newlines) is left exactly as it was; nothing is
    collapsed or replaced.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    stripped = disallowed.sub('', text)
    return stripped
# Read a PDF from disk and return the text of all its pages
def extract_text(pdf_file):
    """Return the text of every page of *pdf_file*, joined by single spaces.

    Args:
        pdf_file: Path of the PDF, opened with ``open(pdf_file, 'rb')``.

    Returns:
        The joined page text, or ``None`` when the PDF is encrypted or when
        any exception is raised while opening or reading it. Both cases
        print a message instead of raising.
    """
    try:
        with open(pdf_file, 'rb') as handle:
            reader = PdfReader(handle)

            # Encrypted documents are reported and skipped, not decrypted.
            if reader.is_encrypted:
                print(f"Skipping encrypted file: {pdf_file}")
                return None

            page_texts = []
            for page in reader.pages:
                # A page with no extractable text contributes an empty string.
                page_texts.append(page.extract_text() or '')
            return ' '.join(page_texts)
    except Exception as e:
        # Best effort: one unreadable file must not abort the caller's batch.
        print(f"Error extracting text from {pdf_file}: {e}")
        return None
# Label each text with the `classifier` callable
def classify_texts(texts):
    """Return the ``"label"`` entry of ``classifier(text)`` for each text.

    Args:
        texts: Iterable of strings to classify.

    Returns:
        A list of labels, one per input, in input order.

    NOTE(review): ``classifier`` is not defined in the visible part of this
    file, so any non-empty input raises ``NameError`` unless it is defined
    elsewhere -- confirm which model was intended.
    """
    labels = []
    for text in texts:
        prediction = classifier(text)
        labels.append(prediction["label"])
    return labels
# Function to summarize each text with the summarization pipeline
@spaces.GPU
def summarize_texts(texts):
    """Summarize every string in *texts* with the module-level ``summarizer``.

    Args:
        texts: Iterable of strings to summarize.

    Returns:
        A list with one summary string per input, in input order. Each
        summary is generated deterministically (``do_sample=False``) with a
        length between 30 and 100 tokens.
    """
    # truncation=True clips inputs that exceed the model's maximum input
    # length instead of letting the pipeline fail on them. Callers in this
    # file pass the full text of a PDF, which is routinely longer than that.
    return [
        summarizer(
            text,
            max_length=100,
            min_length=30,
            do_sample=False,
            truncation=True,
        )[0]['summary_text']
        for text in texts
    ]
# Function to extract a title-like summary from the beginning of the text
@spaces.GPU
def extract_title(text):
    """Return a short, title-like summary of *text*.

    Args:
        text: The leading portion of a document.

    Returns:
        The summary string produced by the module-level ``summarizer`` with
        a length between 5 and 20 tokens and sampling disabled.
    """
    # truncation=True keeps an over-long input from failing inside the
    # pipeline; only the part that fits the model's input window is used.
    result = summarizer(
        text,
        max_length=20,
        min_length=5,
        do_sample=False,
        truncation=True,
    )
    return result[0]['summary_text']
# Function to process PDF files
@spaces.GPU
def process_files(pdf_files):
    """Turn the given PDFs into a CSV of titles, abstracts and cleaned content.

    For each PDF whose text can be extracted, a title is summarized from the
    first 512 words, the full text is cleaned with ``clean_text``, and an
    abstract is summarized from the cleaned text.

    Args:
        pdf_files: Iterable of PDF paths as accepted by ``extract_text``.
            ``None`` or an empty iterable produces a CSV with only the header.

    Returns:
        The path of the written CSV file (``'processed_pdfs.csv'``), with the
        columns ``Title``, ``Abstract`` and ``Content``.
    """
    rows = []
    for pdf_file in pdf_files or []:
        text = extract_text(pdf_file)
        if text is None:
            # Unreadable or encrypted file; extract_text already reported it.
            continue

        # Use the first 512 whitespace-separated words for the title.
        # maxsplit bounds the splitting work, and the slice drops the unsplit
        # remainder that split() leaves as the final element.
        leading_words = text.split(maxsplit=512)[:512]
        if not leading_words:
            # A PDF with no extractable text (e.g. scanned images) has
            # nothing to summarize.
            print(f"Skipping file with no extractable text: {pdf_file}")
            continue
        title = extract_title(' '.join(leading_words))

        # Clean the entire text at once
        cleaned_text = clean_text(text)
        abstract = summarize_texts([cleaned_text])[0]
        rows.append([title, abstract, cleaned_text])

    df = pd.DataFrame(rows, columns=['Title', 'Abstract', 'Content'])
    output_file_path = 'processed_pdfs.csv'
    df.to_csv(output_file_path, index=False)
    return output_file_path
# Gradio interface
# Components are built directly from `gr.File`; `gr.Interface` has no
# `inputs`/`outputs` namespaces. `file_count="multiple"` allows several
# uploads, and `type="filepath"` makes the callback receive path strings,
# which is what `extract_text` passes to `open()`.
# NOTE(review): `type="filepath"` assumes Gradio 4 or later -- confirm the
# version pinned for this Space.
pdf_input = gr.File(label="Upload PDF Files", file_count="multiple", type="filepath")
csv_output = gr.File(label="Download CSV")

gr.Interface(
    fn=process_files,
    inputs=pdf_input,
    outputs=csv_output,
    title="Dataset creation",
    description="Upload PDF files and get a summarized CSV file.",
    article="""<p>This app creates a dataset from research papers using AI models.</p>
<p>It uses models for classification and summarization to extract titles, abstracts, and content from PDFs.</p>"""
).launch(share=True)