Yoxas committed on
Commit
a456478
·
verified ·
1 Parent(s): 7040885

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -86
app.py DELETED
@@ -1,86 +0,0 @@
1
- import torch
2
- import re
3
- import pandas as pd
4
- from gradio import Interface, File
5
- import spaces
6
- import gradio as gr
7
- from PyPDF2 import PdfReader
8
- from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
9
-
10
- # Load the tokenizer and model
11
- led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
12
- summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
13
-
14
- # Load the model separately
15
- model = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
16
-
17
- # Move the model to CUDA if available
18
- if torch.cuda.is_available():
19
- model = model.to("cuda")
20
-
21
- # Function to clean text by keeping only alphanumeric characters and spaces
22
- def clean_text(text):
23
- return re.sub(r'[^a-zA-Z0-9\s]', '', text)
24
-
25
- # Function to extract text from PDF files
26
- def extract_text(pdf_file):
27
- try:
28
- with open(pdf_file, 'rb') as file:
29
- pdf_reader = PdfReader(file)
30
- if pdf_reader.is_encrypted:
31
- print(f"Skipping encrypted file: {pdf_file}")
32
- return None
33
- return ' '.join(page.extract_text() or '' for page in pdf_reader.pages)
34
- except Exception as e:
35
- print(f"Error extracting text from {pdf_file}: {e}")
36
- return None
37
-
38
- # Function to classify text using LED model in batches
39
- def classify_texts(texts):
40
- return [classifier(text)["label"] for text in texts]
41
-
42
- # Function to summarize text using the summarizer model in batches
43
- @spaces.GPU
44
- def summarize_texts(texts):
45
- return [summarizer(text, max_length=100, min_length=30, do_sample=False)[0]['summary_text'] for text in texts]
46
-
47
- # Function to extract a title-like summary from the beginning of the text
48
- @spaces.GPU
49
- def extract_title(text):
50
- return summarizer(text, max_length=20, min_length=5, do_sample=False)[0]['summary_text']
51
-
52
- # Function to process PDF files
53
- @spaces.GPU
54
- def process_files(pdf_files):
55
- data = []
56
- for pdf_file in pdf_files:
57
- text = extract_text(pdf_file)
58
- if text is None:
59
- continue
60
-
61
- title_text = text.split(maxsplit=512)[0]
62
- title = extract_title(title_text)
63
-
64
- # Clean the entire text at once
65
- cleaned_text = clean_text(text)
66
-
67
- data.append([title, summarize_texts([cleaned_text])[0], cleaned_text])
68
-
69
- df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])
70
- output_file_path = 'processed_pdfs.csv'
71
- df.to_csv(output_file_path, index=False)
72
- return output_file_path
73
-
74
- # Gradio interface
75
- pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
76
- csv_output = gr.File(label="Download CSV")
77
-
78
- gr.Interface(
79
- fn=process_files,
80
- inputs=pdf_input,
81
- outputs=csv_output,
82
- title="Dataset creation",
83
- description="Upload PDF files and get a summarized CSV file.",
84
- article="""<p>This app creates a dataset from research papers using AI models.</p>
85
- <p>It uses models for classification and summarization to extract titles, abstracts, and content from PDFs.</p>"""
86
- ).launch(share=True)