File size: 3,103 Bytes
1ffeef0
2899092
 
b096875
1ffeef0
adb34bf
015e0a1
b096875
1ffeef0
70637e5
 
2899092
1ffeef0
 
 
 
 
 
 
2899092
 
 
1ffeef0
791ccd1
 
f8af002
1ffeef0
 
 
 
 
 
f8af002
 
 
2899092
1ffeef0
 
 
2899092
1ffeef0
 
 
 
2899092
 
1ffeef0
 
 
2899092
1ffeef0
 
b096875
1ffeef0
b096875
 
 
 
 
1ffeef0
b096875
 
1ffeef0
 
b096875
1ffeef0
b096875
70637e5
b096875
 
 
1ffeef0
b096875
1ffeef0
 
b096875
2899092
1ffeef0
 
50d5b5b
b096875
 
1ffeef0
 
b096875
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import torch
import re
import pandas as pd
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
import gradio as gr
import spaces

# Hugging Face checkpoint ids used by this app
LED_CHECKPOINT = "allenai/led-base-16384-multi_lexsum-source-long"
SUMMARIZER_CHECKPOINT = "sshleifer/distilbart-cnn-12-6"

# Tokenizer for the LED checkpoint, then the summarization pipeline
# (model and tokenizer both from the DistilBART checkpoint, PyTorch framework)
led_tokenizer = AutoTokenizer.from_pretrained(LED_CHECKPOINT)
summarizer = pipeline(
    "summarization",
    model=SUMMARIZER_CHECKPOINT,
    tokenizer=SUMMARIZER_CHECKPOINT,
    framework="pt",
)

# The LED seq2seq model is loaded on its own, outside the pipeline.
# NOTE(review): `led_tokenizer` and `model` are not referenced by any function
# visible in this file — confirm they are still needed.
model = AutoModelForSeq2SeqLM.from_pretrained(LED_CHECKPOINT)

# Only the LED model is moved to CUDA, and only when CUDA is available
model = model.to("cuda") if torch.cuda.is_available() else model

# Matches any character that is not an ASCII letter, a digit, or whitespace
_NON_ALNUM_OR_SPACE = re.compile(r'[^a-zA-Z0-9\s]')


# Function to strip everything except ASCII letters, digits and whitespace
def clean_text(text):
    """Return *text* with all characters other than a-z, A-Z, 0-9 and whitespace removed."""
    return _NON_ALNUM_OR_SPACE.sub('', text)

# Function to extract text from PDF files
def extract_text(pdf_file):
    """Extract the text of every page of a PDF as one space-joined string.

    Args:
        pdf_file: Location of the PDF. Either something ``open()`` accepts
            directly (``str``, ``bytes``, an ``os.PathLike`` object, ...) or
            an upload wrapper that exposes the path as a ``.name`` attribute.

    Returns:
        The page texts joined by single spaces (a page for which
        ``extract_text()`` returns nothing contributes an empty string), or
        ``None`` when the file is encrypted or any error occurs while opening
        or reading it.
    """
    # Path-like values are passed through untouched; for any other object use
    # its ``.name`` when present (pathlib paths are excluded from this branch
    # because their ``.name`` is only the final path component).
    if isinstance(pdf_file, (str, bytes)) or hasattr(pdf_file, "__fspath__"):
        path = pdf_file
    else:
        path = getattr(pdf_file, "name", pdf_file)

    try:
        with open(path, 'rb') as file:
            pdf_reader = PdfReader(file)
            if pdf_reader.is_encrypted:
                print(f"Skipping encrypted file: {path}")
                return None
            return ' '.join(page.extract_text() or '' for page in pdf_reader.pages)
    except Exception as e:
        # Deliberate best-effort: one unreadable PDF must not abort the batch,
        # so the error is reported and the file is skipped by the caller.
        print(f"Error extracting text from {path}: {e}")
        return None

# Function to label each text with the module-level `classifier`
def classify_texts(texts):
    """Return the ``"label"`` entry of ``classifier(text)`` for each text, in order.

    NOTE(review): no ``classifier`` is defined in the visible part of this
    file, so a non-empty input is expected to raise NameError unless the name
    is provided elsewhere — confirm before using this function.
    """
    labels = []
    for text in texts:
        prediction = classifier(text)
        labels.append(prediction["label"])
    return labels

# Function to summarize each text with the summarizer pipeline
@spaces.GPU
def summarize_texts(texts):
    """Summarize each text with the module-level ``summarizer`` pipeline.

    Args:
        texts: Iterable of strings to summarize.

    Returns:
        A list with one summary string per input, in input order. Each call
        requests ``min_length=30``, ``max_length=100`` and ``do_sample=False``.
    """
    # truncation=True lets the pipeline cut over-long inputs down to the
    # model's maximum input length; callers pass whole-document text here,
    # which can otherwise make the call fail.
    return [
        summarizer(
            text,
            max_length=100,
            min_length=30,
            do_sample=False,
            truncation=True,
        )[0]['summary_text']
        for text in texts
    ]

# Function to extract a title-like summary from the beginning of the text
@spaces.GPU
def extract_title(text):
    """Produce a short, title-like summary of *text*.

    Args:
        text: The text to derive a title from (typically the opening of a
            document).

    Returns:
        The ``summary_text`` of the first result from the module-level
        ``summarizer``, requested with ``min_length=5``, ``max_length=20``
        and ``do_sample=False``.
    """
    # truncation=True keeps an over-long input from exceeding the model's
    # maximum input length instead of letting the call fail on it.
    result = summarizer(
        text,
        max_length=20,
        min_length=5,
        do_sample=False,
        truncation=True,
    )
    return result[0]['summary_text']

# Function to process PDF files
@spaces.GPU
def process_files(pdf_files):
    """Build a CSV dataset with Title, Abstract and Content columns from PDFs.

    For every PDF whose text can be extracted, the title is generated from
    the first 512 whitespace-separated words of the raw text, the abstract is
    a summary of the cleaned full text, and the content is the cleaned full
    text itself.

    Args:
        pdf_files: Iterable of PDF inputs accepted by ``extract_text``.
            ``None`` or an empty iterable produces a CSV with only the header.

    Returns:
        Path of the written CSV file (``'processed_pdfs.csv'``).
    """
    rows = []
    for pdf_file in pdf_files or []:
        text = extract_text(pdf_file)
        if text is None:
            continue

        cleaned_text = clean_text(text)
        words = text.split()
        # Nothing to summarize (e.g. a scanned PDF with no text layer): skip
        # the file instead of failing on an empty input.
        if not words or not cleaned_text.strip():
            print(f"Skipping file with no extractable text: {pdf_file}")
            continue

        # Use the opening 512 words for the title. The previous
        # ``text.split(maxsplit=512)[0]`` selected only the very first word.
        title = extract_title(' '.join(words[:512]))
        abstract = summarize_texts([cleaned_text])[0]

        rows.append([title, abstract, cleaned_text])

    df = pd.DataFrame(rows, columns=['Title', 'Abstract', 'Content'])
    output_file_path = 'processed_pdfs.csv'
    df.to_csv(output_file_path, index=False)
    return output_file_path

# Gradio interface
# Components are built from the top-level `gr.File` class: the `Interface`
# class has no `inputs` / `outputs` attributes to build them from, and
# accepting several files is requested with `file_count="multiple"`.
pdf_input = gr.File(label="Upload PDF Files", file_count="multiple")
csv_output = gr.File(label="Download CSV")

# Wire the upload component to process_files and expose the resulting CSV
gr.Interface(
    fn=process_files,
    inputs=pdf_input,
    outputs=csv_output,
    title="Dataset creation",
    description="Upload PDF files and get a summarized CSV file.",
    article="""<p>This app creates a dataset from research papers using AI models.</p>
                <p>It uses models for classification and summarization to extract titles, abstracts, and content from PDFs.</p>"""
).launch(share=True)