Spaces:

Yoxas
/

Creatingdataset

Runtime error

App Files Files Community

Creatingdataset / app.py

Yoxas

Update app.py

015e0a1 verified about 1 year ago

raw

history blame

3.1 kB

	import torch
	import re
	import pandas as pd
	from PyPDF2 import PdfReader
	from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
	import gradio as gr
	import spaces

	# Hub checkpoints loaded at import time.
	_LED_CHECKPOINT = "allenai/led-base-16384-multi_lexsum-source-long"
	_SUMMARIZER_CHECKPOINT = "sshleifer/distilbart-cnn-12-6"

	# LED tokenizer, then the summarization pipeline that the functions in this
	# file call as `summarizer`.
	led_tokenizer = AutoTokenizer.from_pretrained(_LED_CHECKPOINT)
	summarizer = pipeline(
	    "summarization",
	    model=_SUMMARIZER_CHECKPOINT,
	    tokenizer=_SUMMARIZER_CHECKPOINT,
	    framework="pt",
	)

	# The LED seq2seq model is loaded on its own and moved to the GPU when one is
	# visible. NOTE(review): neither `led_tokenizer` nor `model` is referenced by
	# any function in the visible source -- confirm whether they are still needed.
	model = AutoModelForSeq2SeqLM.from_pretrained(_LED_CHECKPOINT)
	if torch.cuda.is_available():
	    model = model.to("cuda")

	def clean_text(text):
	    """Return *text* stripped of everything except ASCII letters, digits and whitespace."""
	    # Anything outside a-z, A-Z, 0-9 and whitespace is deleted outright
	    # (not replaced by a space), so "a-b" becomes "ab".
	    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
	    return disallowed.sub('', text)

	def extract_text(pdf_file):
	    """Extract the text of every page of a PDF, joined by single spaces.

	    Args:
	        pdf_file: Path to the PDF (``str``, ``bytes`` or ``os.PathLike``), or
	            an upload object that exposes the path as a ``name`` attribute
	            (for example a temporary-file wrapper).

	    Returns:
	        The concatenated page text, or ``None`` when the file is encrypted
	        or any error occurs while opening/reading it. The reason is printed.
	    """
	    # open() only accepts real paths. Upload widgets may hand over a wrapper
	    # object instead, whose path lives in `.name`. Path-likes are passed
	    # through untouched because pathlib's `.name` is only the last component.
	    if isinstance(pdf_file, (str, bytes)) or hasattr(pdf_file, "__fspath__"):
	        pdf_path = pdf_file
	    else:
	        pdf_path = getattr(pdf_file, "name", pdf_file)

	    try:
	        with open(pdf_path, 'rb') as file:
	            pdf_reader = PdfReader(file)
	            if pdf_reader.is_encrypted:
	                print(f"Skipping encrypted file: {pdf_path}")
	                return None
	            # A page may yield no text; substitute '' so join() never sees None.
	            return ' '.join(page.extract_text() or '' for page in pdf_reader.pages)
	    except Exception as e:
	        # Deliberate best effort: report the problem and return None instead
	        # of raising, so the caller can skip this file.
	        print(f"Error extracting text from {pdf_path}: {e}")
	        return None

	def classify_texts(texts):
	    """Return the ``"label"`` entry of ``classifier(text)`` for each text, in order.

	    NOTE(review): ``classifier`` is not defined anywhere in the visible
	    source, so calling this with a non-empty *texts* raises ``NameError``
	    unless the name is provided elsewhere -- confirm before relying on it.
	    """
	    labels = []
	    for text in texts:
	        labels.append(classifier(text)["label"])
	    return labels

	@spaces.GPU
	def summarize_texts(texts):
	    """Summarize each text with the module-level ``summarizer`` pipeline.

	    Args:
	        texts: Iterable of strings to summarize. Each one is sent to the
	            pipeline individually.

	    Returns:
	        A list with one summary string per input, in input order. Summaries
	        are generated deterministically (``do_sample=False``) with
	        ``min_length=30`` and ``max_length=100``.
	    """
	    summaries = []
	    for text in texts:
	        # truncation=True clips input that is longer than the model's
	        # maximum input length instead of letting the model fail on it;
	        # callers in this file pass the text of an entire PDF.
	        result = summarizer(
	            text,
	            max_length=100,
	            min_length=30,
	            do_sample=False,
	            truncation=True,
	        )
	        summaries.append(result[0]['summary_text'])
	    return summaries

	@spaces.GPU
	def extract_title(text):
	    """Produce a short, title-like summary of *text*.

	    Args:
	        text: The passage to condense, typically the opening of a document.

	    Returns:
	        The summary string produced by the module-level ``summarizer``
	        pipeline with ``min_length=5`` and ``max_length=20``, generated
	        deterministically (``do_sample=False``).
	    """
	    # truncation=True clips input that is longer than the model's maximum
	    # input length instead of letting the model fail on it.
	    result = summarizer(
	        text,
	        max_length=20,
	        min_length=5,
	        do_sample=False,
	        truncation=True,
	    )
	    return result[0]['summary_text']

	@spaces.GPU
	def process_files(pdf_files):
	    """Build a CSV dataset (Title, Abstract, Content) from PDF files.

	    Args:
	        pdf_files: Iterable of PDFs in whatever form ``extract_text``
	            accepts. ``None`` (nothing uploaded) is treated as empty.

	    Returns:
	        The path of the CSV that was written, ``'processed_pdfs.csv'``.
	        Files whose text cannot be extracted, or is blank, are left out.
	    """
	    data = []
	    for pdf_file in pdf_files or []:
	        text = extract_text(pdf_file)
	        # None means unreadable or encrypted. Blank text has nothing to
	        # summarize, so both cases are skipped.
	        if text is None or not text.strip():
	            continue

	        # The title is generated from the first 512 words. maxsplit avoids
	        # splitting the whole document; the slice drops the unsplit
	        # remainder that split() leaves as the final element.
	        title_text = ' '.join(text.split(maxsplit=512)[:512])
	        title = extract_title(title_text)

	        # Clean the entire text at once
	        cleaned_text = clean_text(text)
	        abstract = summarize_texts([cleaned_text])[0]

	        data.append([title, abstract, cleaned_text])

	    df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])
	    output_file_path = 'processed_pdfs.csv'
	    df.to_csv(output_file_path, index=False)
	    return output_file_path

	# Gradio interface
	# Components are created from the top-level gr.File class; gr.Interface has
	# no `inputs` / `outputs` namespaces, so looking them up fails at startup.
	# file_count="multiple" lets the user upload several PDFs in one request.
	pdf_input = gr.File(label="Upload PDF Files", file_count="multiple")
	csv_output = gr.File(label="Download CSV")

	gr.Interface(
	    fn=process_files,
	    inputs=pdf_input,
	    outputs=csv_output,
	    title="Dataset creation",
	    description="Upload PDF files and get a summarized CSV file.",
	    article="""<p>This app creates a dataset from research papers using AI models.</p>
	<p>It uses models for classification and summarization to extract titles, abstracts, and content from PDFs.</p>""",
	).launch(share=True)