import re
import PyPDF2
import pandas as pd
from transformers import pipeline, AutoTokenizer
import gradio as gr
import spaces
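
# Dependency note (a sketch of assumptions, not pinned versions): this file
# expects PyPDF2, pandas, transformers, and gradio to be installed; the
# `spaces` import and the @spaces.GPU decorators below assume the app runs
# on a Hugging Face Space with ZeroGPU hardware.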

# Function to clean text by keeping only alphanumeric characters and spaces
def clean_text(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)
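
# Quick illustrative check (not part of the app flow):
#   clean_text("Hello, world!")  ->  "Hello world"
# Punctuation is stripped; whitespace (including newlines) is preserved.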

# Function to extract text from PDF files
def extract_text(pdf_file):
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ''
    for page in pdf_reader.pages:
        # extract_text() can return None for pages with no extractable text
        text += page.extract_text() or ''
    return text

# Function to split text into chunks of a specified number of words
def split_text(text, chunk_size=1024):
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield ' '.join(words[i:i + chunk_size])
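
# Illustrative example: chunking is by whitespace-separated words, not model
# tokens, so chunk_size is only an approximation of the token budget:
#   list(split_text("a b c d e", chunk_size=2))  ->  ["a b", "c d", "e"]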

# Load the LED tokenizer once at import time so it can be reused across calls
led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")

# Function to classify text using LED model
@spaces.GPU(duration=120)
def classify_text(text):
    classifier = pipeline("text-classification", model="allenai/led-base-16384-multi_lexsum-source-long", tokenizer=led_tokenizer, framework="pt")
    try:
        return classifier(text)[0]['label']
    except IndexError:
        return "Unable to classify"

# Function to summarize text using the DistilBART CNN model
@spaces.GPU(duration=120)
def summarize_text(text, max_length=100, min_length=30):
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
    try:
        return summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']
    except IndexError:
        return "Unable to summarize"

# Function to extract a title-like summary from the beginning of the text
@spaces.GPU(duration=120)
def extract_title(text, max_length=20):
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
    try:
        return summarizer(text, max_length=max_length, min_length=5, do_sample=False)[0]['summary_text']
    except IndexError:
        return "Unable to extract title"

# Function to process PDF files and generate summaries
@spaces.GPU(duration=120)
def process_pdfs(pdf_files):
    data = []

    for pdf_file in pdf_files:
        text = extract_text(pdf_file)
        
        # Extract a title from the beginning of the text
        title_text = ' '.join(text.split()[:512])  # Use the first 512 words for title extraction
        title = extract_title(title_text)
        
        # Initialize placeholders for combined results
        combined_abstract = []
        combined_cleaned_text = []

        # Split text into chunks and process each chunk
        for chunk in split_text(text, chunk_size=512):
            # Summarize the text chunk
            abstract = summarize_text(chunk)
            combined_abstract.append(abstract)
            
            # Clean the text chunk
            cleaned_text = clean_text(chunk)
            combined_cleaned_text.append(cleaned_text)

        # Combine results from all chunks
        final_abstract = ' '.join(combined_abstract)
        final_cleaned_text = ' '.join(combined_cleaned_text)
        
        # Append the data to the list
        data.append([title, final_abstract, final_cleaned_text])

    # Create a DataFrame from the data list
    df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])

    # Save the DataFrame to a CSV file in the current working directory
    csv_file_path = 'processed_pdfs.csv'
    df.to_csv(csv_file_path, index=False)

    return csv_file_path
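
# Illustrative local usage outside the Gradio app (hypothetical file names):
#   csv_path = process_pdfs(["paper1.pdf", "paper2.pdf"])
#   df = pd.read_csv(csv_path)  # columns: Title, Abstract, Content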

# Gradio interface
pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
csv_output = gr.File(label="Download CSV")

gr.Interface(
    fn=process_pdfs, 
    inputs=pdf_input, 
    outputs=csv_output,
    title="Dataset creation",
    description="Upload PDF files and get a summarized CSV file.",
    article="""<p>This is an experimental app that allows you to create a dataset from research papers.</p>
                <p>This app uses the allenai/led-base-16384-multi_lexsum-source-long and sshleifer/distilbart-cnn-12-6 AI models.</p>
                <p>The output file is a CSV with 3 columns: title, abstract, and content.</p>"""
).launch()