Spaces:

vividsd
/

practice

Build error

App Files Files Community

practice / app.py

vividsd

Update app.py

dd5c0ad over 1 year ago

raw

history blame

7.93 kB

	# my space: https://huggingface.co/spaces/vividsd/practice

	import gradio as gr
	from transformers import pipeline
	from tempfile import NamedTemporaryFile
	import PyPDF2
	from PyPDF2 import PdfReader
	from pdfminer.high_level import extract_pages, extract_text
	import pdfplumber
	from PIL import Image
	from pdf2image import convert_from_path
	from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
	import pytesseract
	import os
	import numpy as np
	import torch
	import sentencepiece
	import soundfile as sf
	from IPython.display import Audio
	from datasets import load_dataset
	from transformers import SpeechT5HifiGan

	def read_pdf(pdf_path):
	    """Extract the text, image text and table text of every page of a PDF.

	    Each page is walked top-to-bottom through its pdfminer layout elements:
	    text containers are passed to ``text_extraction``, figures are cropped,
	    rendered and sent to ``image_to_text``, and the first rectangle of each
	    table found by pdfplumber triggers a table extraction. Text elements met
	    while a table is being processed are skipped so that table content is
	    not reported twice.

	    NOTE(review): ``text_extraction``, ``crop_image``, ``convert_to_images``,
	    ``image_to_text``, ``extract_table`` and ``table_converter`` are not
	    defined in the visible part of this file; they must be provided
	    elsewhere for this function to run.

	    Args:
	        pdf_path: Path of the PDF file to read.

	    Returns:
	        A dict keyed by ``'Page_<n>'`` (zero-based page index) whose values
	        are ``[page_text, line_format, text_from_images, text_from_tables,
	        page_content]``.
	    """
	    text_per_page = {}

	    # Open both handles once and release them even if extraction fails
	    # (previously pdfplumber was reopened for every page and never closed).
	    with open(pdf_path, 'rb') as pdfFileObj, pdfplumber.open(pdf_path) as pdf:
	        pdfReaded = PyPDF2.PdfReader(pdfFileObj)

	        for pagenum, page in enumerate(extract_pages(pdf_path)):
	            print("Elaborating Page_" + str(pagenum))
	            pageObj = pdfReaded.pages[pagenum]
	            page_text = []
	            line_format = []
	            text_from_images = []
	            text_from_tables = []
	            page_content = []
	            # Index of the table currently being examined on this page.
	            table_num = 0
	            first_element = True
	            table_extraction_flag = False
	            # Vertical extent of the last extracted table. Unknown until a
	            # table has been extracted on THIS page, so reset it per page
	            # instead of reading an unbound or stale value.
	            lower_side = None
	            upper_side = None
	            tables = pdf.pages[pagenum].find_tables()

	            # Order the layout elements from the top of the page downwards.
	            page_elements = [(element.y1, element) for element in page._objs]
	            page_elements.sort(key=lambda a: a[0], reverse=True)

	            for i, (_, element) in enumerate(page_elements):
	                # Plain text, unless it belongs to a table being skipped.
	                if isinstance(element, LTTextContainer):
	                    if not table_extraction_flag:
	                        line_text, format_per_line = text_extraction(element)
	                        page_text.append(line_text)
	                        line_format.append(format_per_line)
	                        page_content.append(line_text)

	                # Figures: crop, render to an image, then read its text.
	                if isinstance(element, LTFigure):
	                    crop_image(element, pageObj)
	                    convert_to_images('cropped_image.pdf')
	                    image_text = image_to_text('PDF_image.png')
	                    text_from_images.append(image_text)
	                    page_content.append(image_text)
	                    # Placeholders keep the text and format lists aligned.
	                    page_text.append('image')
	                    line_format.append('image')

	                # Rectangles are treated as table borders.
	                if isinstance(element, LTRect):
	                    # First rectangle of a not-yet-extracted table.
	                    if first_element and (table_num + 1) <= len(tables):
	                        lower_side = page.bbox[3] - tables[table_num].bbox[3]
	                        upper_side = element.y1
	                        table = extract_table(pdf_path, pagenum, table_num)
	                        table_string = table_converter(table)
	                        text_from_tables.append(table_string)
	                        page_content.append(table_string)
	                        # Skip the text elements that sit inside this table.
	                        table_extraction_flag = True
	                        first_element = False
	                        page_text.append('table')
	                        line_format.append('table')

	                    inside_table = (
	                        lower_side is not None
	                        and element.y0 >= lower_side
	                        and element.y1 <= upper_side
	                    )
	                    if not inside_table:
	                        # The table ends when the next element is not a
	                        # rectangle; the last element of the page has no
	                        # successor, so it always ends the table.
	                        is_last = i + 1 >= len(page_elements)
	                        if is_last or not isinstance(page_elements[i + 1][1], LTRect):
	                            table_extraction_flag = False
	                            first_element = True
	                            table_num += 1

	            dctkey = 'Page_' + str(pagenum)
	            text_per_page[dctkey] = [
	                page_text,
	                line_format,
	                text_from_images,
	                text_from_tables,
	                page_content,
	            ]

	    return text_per_page

	# Pipeline: uploaded PDF -> first page text -> Abstract section ->
	# short summary -> synthesised speech returned to the Gradio Audio output.

	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

	# Headings used to locate the abstract on the first page.
	abstract = 'abstract'
	intro_string = 'introduction'

	# Sample rate passed to the audio output (the value the original code used
	# when playing the generated speech).
	SAMPLE_RATE = 16000

	# Models and the speaker embedding are loaded once at start-up so that each
	# request only pays for inference.
	summarizer = pipeline("summarization", model="Falconsai/text_summarization")
	processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
	model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
	vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
	embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
	speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


	def extract_abstract(lines):
	    """Return the text found between the Abstract and Introduction headings.

	    Collection starts after a line that is exactly ``abstract`` (case
	    insensitive) and stops at the first following line that contains both
	    ``1`` and ``introduction``.

	    Args:
	        lines: Iterable of text lines in reading order.

	    Returns:
	        The abstract as a single space-joined string, or ``""`` when no
	        Abstract heading is present.
	    """
	    collected = []
	    found_abstract = False
	    for line in lines:
	        lower_line = line.lower().strip()
	        if lower_line == abstract:
	            found_abstract = True
	        elif "1" in lower_line and intro_string in lower_line:
	            if found_abstract:
	                # End of the abstract reached; nothing after it is needed.
	                break
	        elif found_abstract and lower_line:
	            collected.append(line.strip())
	    return " ".join(collected)


	def abstract_to_speech(pdf_file):
	    """Summarise the abstract of an uploaded PDF and read the summary aloud.

	    Args:
	        pdf_file: The value Gradio passes for the file input. Either an
	            object with a ``name`` attribute holding the path, or the path
	            itself, is accepted.

	    Returns:
	        A ``(sample_rate, waveform)`` tuple for the ``gr.Audio`` output,
	        where ``waveform`` is the generated speech as a NumPy array.

	    Raises:
	        gr.Error: If no file was uploaded, the PDF has no first page, or no
	            Abstract section was found on the first page.
	    """
	    if pdf_file is None:
	        raise gr.Error("Please upload a PDF file.")
	    pdf_path = getattr(pdf_file, "name", pdf_file)

	    text_per_page = read_pdf(pdf_path)
	    page_0 = text_per_page.get('Page_0')
	    if page_0 is None:
	        raise gr.Error("The uploaded PDF does not contain any pages.")

	    # Only page_content (the last entry) is read: page_text holds the same
	    # strings, so flattening every sublist would duplicate each line.
	    lines = []
	    for item in page_0[4]:
	        if isinstance(item, str):
	            lines.extend(item.splitlines())

	    extracted_abstract = extract_abstract(lines)
	    if not extracted_abstract:
	        raise gr.Error("No Abstract section was found on the first page of the PDF.")

	    # Summarise the abstract and keep the generated sentence.
	    summarized_text = summarizer(
	        extracted_abstract, max_length=20, min_length=10, do_sample=False
	    )
	    sentence = summarized_text[0]['summary_text']

	    # Passing the vocoder makes generate_speech return a waveform directly,
	    # so a separate spectrogram pass is not needed.
	    inputs = processor(text=sentence, return_tensors="pt")
	    with torch.no_grad():
	        speech = model.generate_speech(
	            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
	        )

	    return SAMPLE_RATE, speech.numpy()


	# Creating the Gradio app
	# File extensions must start with a dot to be recognised as extensions.
	input_component = gr.File(file_types=[".pdf"])
	output_component = gr.Audio()

	demo = gr.Interface(
	    fn=abstract_to_speech,
	    inputs=input_component,
	    outputs=output_component,
	    title="Reading your abstract summary outloud",
	    description="Upload a PDF that contains an Abstract. Get your abstract summarized in 1 sentence and read outloud. We only accept with PDfs that contains the section Abstract"
	)

	demo.launch()