Yoxas committed
Commit 70637e5 · verified · 1 Parent(s): 3fc8ca0

Update app.py

Files changed (1): app.py (+72 -67)
app.py CHANGED
@@ -1,10 +1,17 @@
 import os
 import re
-import PyPDF2
 import pandas as pd
 from transformers import pipeline, AutoTokenizer
 import gradio as gr
-import spaces

 # Function to clean text by keeping only alphanumeric characters and spaces
 def clean_text(text):
@@ -12,11 +19,18 @@ def clean_text(text):

 # Function to extract text from PDF files
 def extract_text(pdf_file):
-    pdf_reader = PyPDF2.PdfReader(pdf_file)
-    text = ''
-    for page_num in range(len(pdf_reader.pages)):
-        text += pdf_reader.pages[page_num].extract_text()
-    return text

 # Function to split text into chunks of a specified size
 def split_text(text, chunk_size=1024):
@@ -24,89 +38,80 @@ def split_text(text, chunk_size=1024):
     for i in range(0, len(words), chunk_size):
         yield ' '.join(words[i:i + chunk_size])

-# Load the LED tokenizer
-led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
-
 # Function to classify text using LED model
-@spaces.GPU(duration=120)
 def classify_text(text):
-    classifier = pipeline("text-classification", model="allenai/led-base-16384-multi_lexsum-source-long", tokenizer=led_tokenizer, framework="pt")
     try:
         return classifier(text)[0]['label']
     except IndexError:
         return "Unable to classify"

-# Function to summarize text using BGE-m3 model
-@spaces.GPU(duration=120)
 def summarize_text(text, max_length=100, min_length=30):
-    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
     try:
         return summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']
     except IndexError:
         return "Unable to summarize"

 # Function to extract a title-like summary from the beginning of the text
-@spaces.GPU(duration=120)
 def extract_title(text, max_length=20):
-    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
     try:
         return summarizer(text, max_length=max_length, min_length=5, do_sample=False)[0]['summary_text']
     except IndexError:
         return "Unable to extract title"

-# Function to process PDF files and generate summaries
-@spaces.GPU(duration=120)
-def process_pdfs(pdf_files):
-    data = []
-
-    for pdf_file in pdf_files:
-        text = extract_text(pdf_file)
-
-        # Extract a title from the beginning of the text
-        title_text = ' '.join(text.split()[:512])  # Take the first 512 tokens for title extraction
-        title = extract_title(title_text)
-
-        # Initialize placeholders for combined results
-        combined_abstract = []
-        combined_cleaned_text = []
-
-        # Split text into chunks and process each chunk
-        for chunk in split_text(text, chunk_size=512):
-            # Summarize the text chunk
-            abstract = summarize_text(chunk)
-            combined_abstract.append(abstract)
-
-            # Clean the text chunk
-            cleaned_text = clean_text(chunk)
-            combined_cleaned_text.append(cleaned_text)
-
-        # Combine results from all chunks
-        final_abstract = ' '.join(combined_abstract)
-        final_cleaned_text = ' '.join(combined_cleaned_text)
-
-        # Append the data to the list
-        data.append([title, final_abstract, final_cleaned_text])
-
-    # Create a DataFrame from the data list
-    df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])
-
-    # Save the DataFrame to a CSV file in the same folder as the source folder
-    csv_file_path = 'processed_pdfs.csv'
-    df.to_csv(csv_file_path, index=False)
-
-    return csv_file_path
-
-# Gradio interface
-pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
-csv_output = gr.File(label="Download CSV")

 gr.Interface(
-    fn=process_pdfs,
-    inputs=pdf_input,
-    outputs=csv_output,
-    title="Dataset creation",
-    description="Upload PDF files and get a summarized CSV file.",
-    article="""<p>This is an experimental app that allows you to create a dataset from research papers.</p>
-    <p>This app uses the allenai/led-base-16384-multi_lexsum-source-long and sshleifer/distilbart-cnn-12-6 AI models.</p>
-    <p>The output file is a CSV with 3 columns: title, abstract, and content.</p>"""
-).launch()
 
 import os
 import re
 import pandas as pd
+import PyPDF2
+from concurrent.futures import ThreadPoolExecutor
 from transformers import pipeline, AutoTokenizer
 import gradio as gr
+
+# Load the LED tokenizer and model
+led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
+classifier = pipeline("text-classification", model="allenai/led-base-16384-multi_lexsum-source-long", tokenizer=led_tokenizer, framework="pt")
+
+# Load the summarization model and tokenizer
+summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
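
Note: loading the tokenizer and both pipelines once at module level, instead of inside each function as before, avoids re-initializing the models on every call. One caveat: allenai/led-base-16384-multi_lexsum-source-long is a summarization checkpoint, so wrapping it in a "text-classification" pipeline makes transformers attach a freshly initialized, untrained classification head (transformers logs a warning about this), and classify_text is never actually called anywhere in this file. If classification is genuinely needed, a model trained for it is a sounder base. A minimal sketch of the zero-shot route, assuming facebook/bart-large-mnli and hypothetical candidate labels (none of this is part of the commit):

    zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

    def classify_text(text, labels=("research paper", "report", "other")):
        try:
            # The highest-scoring candidate label comes first in the result.
            return zero_shot(text, candidate_labels=list(labels))["labels"][0]
        except (IndexError, KeyError):
            return "Unable to classify"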
 
 # Function to clean text by keeping only alphanumeric characters and spaces
 def clean_text(text):
 

 # Function to extract text from PDF files
 def extract_text(pdf_file):
+    try:
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        if pdf_reader.is_encrypted:
+            print(f"Skipping encrypted file: {pdf_file}")
+            return None
+        text = ''
+        for page in pdf_reader.pages:
+            text += page.extract_text() or ''
+        return text
+    except Exception as e:
+        print(f"Error extracting text from {pdf_file}: {e}")
+        return None
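
Note: PyPDF2 is no longer maintained under that name; development continued as pypdf with a nearly identical reader API. Also, many "encrypted" PDFs carry only an owner password and become readable after decrypting with an empty string, so they need not be skipped outright. A sketch of both ideas, assuming pypdf is installed (pip install pypdf), not part of this commit:

    from pypdf import PdfReader  # maintained successor to PyPDF2

    def extract_text(pdf_file):
        try:
            pdf_reader = PdfReader(pdf_file)
            if pdf_reader.is_encrypted:
                # Empty-password decryption handles owner-password-only PDFs;
                # truly locked files will still fail and fall into the except.
                pdf_reader.decrypt('')
            return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
        except Exception as e:
            print(f"Error extracting text from {pdf_file}: {e}")
            return None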
 
 # Function to split text into chunks of a specified size
 def split_text(text, chunk_size=1024):
     words = text.split()
     for i in range(0, len(words), chunk_size):
         yield ' '.join(words[i:i + chunk_size])
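
Note: split_text counts words, but the models enforce token limits: sshleifer/distilbart-cnn-12-6 accepts at most 1024 tokens, and 512 English words typically tokenize to well over 512 tokens, so long chunks get truncated (or rejected, depending on the transformers version). A token-aware splitter avoids the mismatch. A sketch with a hypothetical helper, pairing the summarizer with its own tokenizer (not part of this commit):

    bart_tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

    def split_by_tokens(text, tokenizer=bart_tokenizer, max_tokens=1024):
        # Encode once, then slice the token ids into model-sized windows.
        ids = tokenizer.encode(text, add_special_tokens=False)
        for i in range(0, len(ids), max_tokens):
            yield tokenizer.decode(ids[i:i + max_tokens], skip_special_tokens=True)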
 # Function to classify text using LED model
 def classify_text(text):
     try:
         return classifier(text)[0]['label']
     except IndexError:
         return "Unable to classify"
 
+# Function to summarize text using the summarizer model
 def summarize_text(text, max_length=100, min_length=30):
     try:
         return summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']
     except IndexError:
         return "Unable to summarize"
 
 # Function to extract a title-like summary from the beginning of the text
 def extract_title(text, max_length=20):
     try:
         return summarizer(text, max_length=max_length, min_length=5, do_sample=False)[0]['summary_text']
     except IndexError:
         return "Unable to extract title"
 
+# Function to process each PDF file and extract relevant information
+def process_pdf(pdf_file):
+    text = extract_text(pdf_file)
+
+    # Skip files whose text could not be extracted (encrypted or unreadable)
+    if text is None:
+        return None
+
+    # Extract a title from the beginning of the text
+    title_text = ' '.join(text.split()[:512])  # Take the first 512 words for title extraction
+    title = extract_title(title_text)
+
+    # Initialize placeholders for combined results
+    combined_abstract = []
+    combined_cleaned_text = []
+
+    # Split text into chunks and process each chunk
+    for chunk in split_text(text, chunk_size=512):
+        # Summarize the text chunk
+        abstract = summarize_text(chunk)
+        combined_abstract.append(abstract)
+
+        # Clean the text chunk
+        cleaned_text = clean_text(chunk)
+        combined_cleaned_text.append(cleaned_text)
+
+    # Combine results from all chunks
+    final_abstract = ' '.join(combined_abstract)
+    final_cleaned_text = ' '.join(combined_cleaned_text)
+
+    return [title, final_abstract, final_cleaned_text]
+
+# Function to handle multiple PDF files in parallel
+def process_pdfs(files):
+    data = []
+    with ThreadPoolExecutor() as executor:
+        results = list(executor.map(process_pdf, files))
+    data.extend(result for result in results if result is not None)
+    return data
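
Note: the ThreadPoolExecutor mainly helps the PyPDF2 I/O; the summarization calls all funnel into the same module-level pipeline and are compute-bound, so on CPU the worker threads largely serialize on the model and the GIL. Batching the pipeline calls is usually the bigger speedup, since transformers pipelines accept lists of inputs. A sketch with a hypothetical helper (truncation enabled so over-long chunks do not error; not part of this commit):

    def summarize_chunks(chunks, max_length=100, min_length=30):
        # One batched pipeline call instead of one call per chunk.
        outputs = summarizer(list(chunks), max_length=max_length,
                             min_length=min_length, do_sample=False,
                             truncation=True, batch_size=8)
        return [out['summary_text'] for out in outputs]
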
+# Gradio interface function
+def gradio_interface(files):
+    data = process_pdfs([file.name for file in files])
+    df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])
+    csv_path = "/content/drive/My Drive/path_to_output/output.csv"  # Adjust this to your actual path
+    df.to_csv(csv_path, index=False)
+    return csv_path
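
Note: the hardcoded csv_path only exists on Colab with Google Drive mounted; on a Hugging Face Space or a local machine, df.to_csv fails because the directory does not exist. Writing to a temporary directory is portable. A sketch of the same function with only the path swapped (not part of this commit):

    import os
    import tempfile

    def gradio_interface(files):
        data = process_pdfs([file.name for file in files])
        df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])
        # A per-run temp dir exists on any host and avoids clobbering old runs.
        csv_path = os.path.join(tempfile.mkdtemp(), "processed_pdfs.csv")
        df.to_csv(csv_path, index=False)
        return csv_path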
 
+# Gradio app setup
 gr.Interface(
+    fn=gradio_interface,
+    inputs=gr.inputs.File(file_count="multiple", file_types=[".pdf"]),
+    outputs="text",
+    title="PDF Research Paper Dataset Creator",
+    description="Upload PDF research papers to create a dataset with title, abstract, and content."
+).launch()
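
Note: gr.inputs.File is the long-deprecated Gradio 2.x input namespace (removed in Gradio 4.x), and outputs="text" hands the user the CSV path as a plain string rather than a download. The gr.File component the removed code already used is the current API for both sides. A sketch against the Gradio 3.x/4.x component API:

    gr.Interface(
        fn=gradio_interface,
        inputs=gr.File(file_count="multiple", file_types=[".pdf"]),
        outputs=gr.File(label="Download CSV"),
        title="PDF Research Paper Dataset Creator",
        description="Upload PDF research papers to create a dataset with title, abstract, and content."
    ).launch()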