# --- Page-scrape residue from the original Hugging Face Spaces listing,
# --- preserved as comments so the file remains valid Python:
# Spaces: Runtime error / Runtime error
# File size: 4,220 Bytes
# (commit hashes and a column ruler followed here)
import os
import re
import pandas as pd
import PyPDF2
from concurrent.futures import ThreadPoolExecutor
from transformers import pipeline, AutoTokenizer
import gradio as gr
# Load the LED tokenizer and model
# NOTE(review): this checkpoint is an LED summarization model
# (led-base-16384-multi_lexsum-source-long); wiring it into a
# "text-classification" pipeline looks questionable — confirm it actually
# exposes a classification head, otherwise classify_text() will misbehave.
led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
classifier = pipeline("text-classification", model="allenai/led-base-16384-multi_lexsum-source-long", tokenizer=led_tokenizer, framework="pt")
# Load the summarization model and tokenizer
# DistilBART-CNN is used for both chunk abstracts and title generation below.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
# Function to clean text by keeping only alphanumeric characters and spaces
def clean_text(text):
    """Return *text* with everything except letters, digits, and whitespace removed."""
    kept = re.findall(r'[a-zA-Z0-9\s]', text)
    return ''.join(kept)
# Function to extract text from PDF files
def extract_text(pdf_file):
    """Concatenate the text of every page in a PDF.

    Returns None (after printing a notice) for encrypted files or when
    extraction fails for any reason; pages with no extractable text
    contribute an empty string.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_file)
        if reader.is_encrypted:
            print(f"Skipping encrypted file: {pdf_file}")
            return None
        page_texts = (page.extract_text() or '' for page in reader.pages)
        return ''.join(page_texts)
    except Exception as e:
        print(f"Error extracting text from {pdf_file}: {e}")
        return None
# Function to split text into chunks of a specified size
def split_text(text, chunk_size=1024):
    """Yield space-joined chunks of *text*, each at most *chunk_size* words long."""
    words = text.split()
    slices = (words[start:start + chunk_size]
              for start in range(0, len(words), chunk_size))
    for piece in slices:
        yield ' '.join(piece)
# Function to classify text using LED model
def classify_text(text):
    """Return the top label predicted by the classification pipeline.

    Falls back to a sentinel string when the pipeline produces no results.
    """
    try:
        top_result = classifier(text)[0]
        return top_result['label']
    except IndexError:
        return "Unable to classify"
# Function to summarize text using the summarizer model
def summarize_text(text, max_length=100, min_length=30):
    """Summarize *text* with the DistilBART pipeline.

    Length bounds are forwarded to the pipeline; decoding is greedy
    (do_sample=False). Returns a sentinel string if no summary is produced.
    """
    try:
        outputs = summarizer(text, max_length=max_length,
                             min_length=min_length, do_sample=False)
        return outputs[0]['summary_text']
    except IndexError:
        return "Unable to summarize"
# Function to extract a title-like summary from the beginning of the text
def extract_title(text, max_length=20):
    """Generate a very short summary of *text* to serve as a title.

    Uses the same summarizer as summarize_text but with tight length
    bounds (min 5 tokens). Returns a sentinel string on empty output.
    """
    try:
        generated = summarizer(text, max_length=max_length,
                               min_length=5, do_sample=False)
        return generated[0]['summary_text']
    except IndexError:
        return "Unable to extract title"
# Function to process each PDF file and extract relevant information
def process_pdf(pdf_file):
    """Turn one PDF into a [title, abstract, cleaned_text] row.

    Returns None when text extraction fails (e.g. encrypted file).
    The abstract and cleaned text are built chunk-by-chunk (512 words
    per chunk) and joined with single spaces.
    """
    text = extract_text(pdf_file)
    if text is None:
        # Encrypted or unreadable file — caller filters these out.
        return None

    # The first 512 words stand in for the paper's head when generating a title.
    head = ' '.join(text.split()[:512])
    title = extract_title(head)

    abstracts = []
    cleaned_parts = []
    for chunk in split_text(text, chunk_size=512):
        abstracts.append(summarize_text(chunk))
        cleaned_parts.append(clean_text(chunk))

    return [title, ' '.join(abstracts), ' '.join(cleaned_parts)]
# Function to handle multiple PDF files in parallel
def process_pdfs(files):
    """Process many PDFs concurrently, keeping only successful extractions."""
    with ThreadPoolExecutor() as pool:
        rows = pool.map(process_pdf, files)
        # Consume the lazy map inside the pool's context; drop failures.
        return [row for row in rows if row is not None]
# Gradio interface function
def gradio_interface(files, csv_path="/content/drive/My Drive/path_to_output/output.csv"):
    """Build a Title/Abstract/Content dataset from uploaded PDFs and save it as CSV.

    Args:
        files: Gradio file objects; each must expose a ``.name`` filesystem path.
        csv_path: Destination for the CSV. Defaults to the original hard-coded
            Google Drive path for backward compatibility, but is now a
            parameter so the app is usable outside Colab.

    Returns:
        The path of the written CSV file (displayed by the Gradio text output).
    """
    data = process_pdfs([file.name for file in files])
    df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])
    df.to_csv(csv_path, index=False)
    return csv_path
# Gradio app setup
# Gradio app setup.
# FIX: the ``gr.inputs`` namespace was removed in Gradio 3.x; referencing
# ``gr.inputs.File`` raises AttributeError at startup (the likely cause of the
# Space's "Runtime error"). The component is now ``gr.File``.
gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(file_count="multiple", file_types=[".pdf"]),
    outputs="text",
    title="PDF Research Paper Dataset Creator",
    description="Upload PDF research papers to create a dataset with title, abstract, and content."
).launch()