File size: 7,927 Bytes
dd5c0ad 3075f85 96ebfa6 c7ebf67 96ebfa6 c7ebf67 96ebfa6 3075f85 c7ebf67 3075f85 a92e01d c7ebf67 a92e01d 3075f85 a92e01d 96ebfa6 a92e01d 96ebfa6 a92e01d 96ebfa6 a92e01d 96ebfa6 a92e01d 96ebfa6 a92e01d 96ebfa6 d6d00d9 589fdb6 96ebfa6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
# my space: https://huggingface.co/spaces/vividsd/practice
import gradio as gr
from transformers import pipeline
from tempfile import NamedTemporaryFile
import PyPDF2
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_pages, extract_text
import pdfplumber
from PIL import Image
from pdf2image import convert_from_path
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import pytesseract
import os
import numpy as np
import torch
import sentencepiece
import soundfile as sf
from IPython.display import Audio
from datasets import load_dataset
from transformers import SpeechT5HifiGan
def read_pdf(pdf_path):
# create a PDF file object
pdfFileObj = open(pdf_path, 'rb')
# create a PDF reader object
pdfReaded = PyPDF2.PdfReader(pdfFileObj)
# Create the dictionary to extract text from each image
text_per_page = {}
# We extract the pages from the PDF
for pagenum, page in enumerate(extract_pages(pdf_path)):
print("Elaborating Page_" +str(pagenum))
# Initialize the variables needed for the text extraction from the page
pageObj = pdfReaded.pages[pagenum]
page_text = []
line_format = []
text_from_images = []
text_from_tables = []
page_content = []
# Initialize the number of the examined tables
table_num = 0
first_element= True
table_extraction_flag= False
# Open the pdf file
pdf = pdfplumber.open(pdf_path)
# Find the examined page
page_tables = pdf.pages[pagenum]
# Find the number of tables on the page
tables = page_tables.find_tables()
# Find all the elements
page_elements = [(element.y1, element) for element in page._objs]
# Sort all the elements as they appear in the page
page_elements.sort(key=lambda a: a[0], reverse=True)
# Find the elements that composed a page
for i,component in enumerate(page_elements):
# Extract the position of the top side of the element in the PDF
pos= component[0]
# Extract the element of the page layout
element = component[1]
# Check if the element is a text element
if isinstance(element, LTTextContainer):
# Check if the text appeared in a table
if table_extraction_flag == False:
# Use the function to extract the text and format for each text element
(line_text, format_per_line) = text_extraction(element)
# Append the text of each line to the page text
page_text.append(line_text)
# Append the format for each line containing text
line_format.append(format_per_line)
page_content.append(line_text)
else:
# Omit the text that appeared in a table
pass
# Check the elements for images
if isinstance(element, LTFigure):
# Crop the image from the PDF
crop_image(element, pageObj)
# Convert the cropped pdf to an image
convert_to_images('cropped_image.pdf')
# Extract the text from the image
image_text = image_to_text('PDF_image.png')
text_from_images.append(image_text)
page_content.append(image_text)
# Add a placeholder in the text and format lists
page_text.append('image')
line_format.append('image')
# Check the elements for tables
if isinstance(element, LTRect):
# If the first rectangular element
if first_element == True and (table_num+1) <= len(tables):
# Find the bounding box of the table
lower_side = page.bbox[3] - tables[table_num].bbox[3]
upper_side = element.y1
# Extract the information from the table
table = extract_table(pdf_path, pagenum, table_num)
# Convert the table information in structured string format
table_string = table_converter(table)
# Append the table string into a list
text_from_tables.append(table_string)
page_content.append(table_string)
# Set the flag as True to avoid the content again
table_extraction_flag = True
# Make it another element
first_element = False
# Add a placeholder in the text and format lists
page_text.append('table')
line_format.append('table')
# Check if we already extracted the tables from the page
if element.y0 >= lower_side and element.y1 <= upper_side:
pass
elif not isinstance(page_elements[i+1][1], LTRect):
table_extraction_flag = False
first_element = True
table_num+=1
# Create the key of the dictionary
dctkey = 'Page_'+str(pagenum)
# Add the list of list as the value of the page key
text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
# Closing the pdf file object
pdfFileObj.close()
return text_per_page
pdf_path = pdf_file.name
text_per_page = read_pdf(pdf_path)
page_0 = text_per_page['Page_0']
page_0_clean = [item for sublist in page_0 for item in sublist if isinstance(item, str)]
for i in range(len(page_0_clean)):
page_0_clean[i] = page_0_clean[i].replace('\n', ' ').strip()
#intead of cleaning the exact position as I did in my previous code, since I don't know it, then I try to identify the section of the abstract
abstract = 'abstract'
found_abstract = False
intro_string ='introduction'
extracted_abstract =""
extracted_abstract = extracted_text_string.replace("Abstract", "")
file = text.splitlines()
for lines in file:
lower_lines = lines.lower()
if lower_lines.strip()== abstract:
found_abstract = True
elif "1" in lower_lines.strip() and intro_string in lower_lines.strip():
found_abstract = False
#summarizing the abstract
from transformers import pipeline
summarizer = pipeline("summarization", model="Falconsai/text_summarization")
text1 = extracted_abstract
print(summarizer(text1, max_length=20, min_length=10, do_sample=False))
#in here, I try to save it differently, since on my previous code I had copied and pasted the summary and in here I don't know
sentence = summarized_text[0]['summary_text']
# generating the audio of the output by using my previous code
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
text = sentence
inputs = processor(text=sentence, return_tensors="pt")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
with torch.no_grad():
speech = vocoder(spectrogram)
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
Audio(speech, rate=16000)
# Creating the Gradio app
input_component = gr.File(file_types=["pdf"])
output_component = gr.Audio()
demo = gr.Interface(
fn=read_pdf,
inputs=input_component,
outputs=output_component,
title="Reading your abstract summary outloud",
description="Upload a PDF that contains an Abstract. Get your abstract summarized in 1 sentence and read outloud. We only accept with PDfs that contains the section Abstract"
)
demo.launch() |