import os
import re
import pandas as pd
import PyPDF2
from concurrent.futures import ThreadPoolExecutor
from transformers import pipeline, AutoTokenizer
import gradio as gr
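# Note: PyPDF2 is in maintenance mode; its actively developed successor is the
# "pypdf" package, which exposes the same PdfReader API used below.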

# Load the LED tokenizer and a text-classification pipeline built on it.
# Note: this checkpoint was fine-tuned for summarization (Multi-LexSum), so the
# classification head is freshly initialized; labels from classify_text() below
# are not meaningful without further fine-tuning.
led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
classifier = pipeline("text-classification", model="allenai/led-base-16384-multi_lexsum-source-long", tokenizer=led_tokenizer, framework="pt")

# Load the summarization model and tokenizer
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
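# Note: distilbart-cnn-12-6 accepts roughly 1024 input tokens, which is why the
# extracted text is summarized in ~512-word chunks further below.
# Quick smoke test (illustrative; any short passage works):
#   summarizer("Transformer models process text with self-attention layers.",
#              max_length=20, min_length=5, do_sample=False)[0]['summary_text']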

# Function to clean text by keeping only alphanumeric characters and spaces
def clean_text(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)
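# Example (illustrative): clean_text("Hello, world!") -> 'Hello world'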

# Function to extract text from PDF files
def extract_text(pdf_file):
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        if pdf_reader.is_encrypted:
            print(f"Skipping encrypted file: {pdf_file}")
            return None
        text = ''
        for page in pdf_reader.pages:
            text += page.extract_text() or ''
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_file}: {e}")
        return None

# Function to split text into chunks of a specified number of words
def split_text(text, chunk_size=1024):
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield ' '.join(words[i:i + chunk_size])
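# Example (illustrative):
#   list(split_text("a b c d e", chunk_size=2)) -> ['a b', 'c d', 'e']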

# Function to classify text using the LED model (currently unused by process_pdf)
def classify_text(text):
    try:
        return classifier(text)[0]['label']
    except Exception:
        # Catch broadly: the pipeline can fail on empty or over-long inputs.
        return "Unable to classify"

# Function to summarize text using the summarizer model
def summarize_text(text, max_length=100, min_length=30):
    try:
        return summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']
    except Exception:
        # Catch broadly: the pipeline can fail on empty or over-long chunks.
        return "Unable to summarize"

# Function to extract a title-like summary from the beginning of the text
def extract_title(text, max_length=20):
    try:
        return summarizer(text, max_length=max_length, min_length=5, do_sample=False)[0]['summary_text']
    except Exception:
        return "Unable to extract title"

# Function to process each PDF file and extract relevant information
def process_pdf(pdf_file):
    text = extract_text(pdf_file)

    # Skip encrypted or unreadable files
    if text is None:
        return None

    # Extract a title from the beginning of the text
    title_text = ' '.join(text.split()[:512])  # Use the first 512 words as a rough token budget for the title pass
    title = extract_title(title_text)

    # Initialize placeholders for combined results
    combined_abstract = []
    combined_cleaned_text = []

    # Split text into chunks and process each chunk
    for chunk in split_text(text, chunk_size=512):
        # Summarize the text chunk
        abstract = summarize_text(chunk)
        combined_abstract.append(abstract)

        # Clean the text chunk
        cleaned_text = clean_text(chunk)
        combined_cleaned_text.append(cleaned_text)

    # Combine results from all chunks
    final_abstract = ' '.join(combined_abstract)
    final_cleaned_text = ' '.join(combined_cleaned_text)

    return [title, final_abstract, final_cleaned_text]
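# Usage sketch ("sample_paper.pdf" is a hypothetical path; substitute your own):
#   row = process_pdf("sample_paper.pdf")
#   if row is not None:
#       title, abstract, content = row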

# Function to handle multiple PDF files in parallel.
# Note: threads mainly overlap the PDF parsing and I/O; model inference is
# largely serialized by the GIL and the shared pipelines, so gains may be modest.
def process_pdfs(files):
    data = []
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_pdf, files))
        data.extend(result for result in results if result is not None)
    return data
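# e.g. process_pdfs(["a.pdf", "b.pdf"]) -> [[title, abstract, content], ...]
# (hypothetical file names; encrypted/unreadable PDFs are silently dropped)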

# Gradio interface function
def gradio_interface(files):
    # With gr.File(type="filepath"), Gradio passes a list of path strings directly
    data = process_pdfs(files)
    df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])
    csv_path = "/content/drive/My Drive/path_to_output/output.csv"  # Adjust this to your actual path
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)  # Ensure the output directory exists
    df.to_csv(csv_path, index=False)
    return csv_path

# Gradio app setup (the gr.inputs namespace was removed in Gradio 3+; use gr.File directly)
gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(file_count="multiple", file_types=[".pdf"], type="filepath"),
    outputs=gr.File(label="Dataset CSV"),  # serve the generated CSV as a download
    title="PDF Research Paper Dataset Creator",
    description="Upload PDF research papers to create a dataset with title, abstract, and content."
).launch()
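# Tip: when running in a notebook such as Colab, .launch(share=True) also
# prints a temporary public URL for the app.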