import gradio as gr from transformers import pipeline from tempfile import NamedTemporaryFile from PyPDF2 import PdfReader from IPython.display import Audio import numpy as np from bark import SAMPLE_RATE, generate_audio, preload_models from scipy.io.wavfile import write as write_wav import torch def read_pdf(pdf_path): # create a PDF file object pdfFileObj = open('/content/Article_11', 'rb') # create a PDF reader object pdfReaded = PyPDF2.PdfReader(pdfFileObj) # Create the dictionary to extract text from each image text_per_page = {} # We extract the pages from the PDF for pagenum, page in enumerate(extract_pages(pdf_path)): print("Elaborating Page_" +str(pagenum)) # Initialize the variables needed for the text extraction from the page pageObj = pdfReaded.pages[pagenum] page_text = [] line_format = [] text_from_images = [] text_from_tables = [] page_content = [] # Initialize the number of the examined tables table_num = 0 first_element= True table_extraction_flag= False # Open the pdf file pdf = pdfplumber.open(pdf_path) # Find the examined page page_tables = pdf.pages[pagenum] # Find the number of tables on the page tables = page_tables.find_tables() # Find all the elements page_elements = [(element.y1, element) for element in page._objs] # Sort all the elements as they appear in the page page_elements.sort(key=lambda a: a[0], reverse=True) # Find the elements that composed a page for i,component in enumerate(page_elements): # Extract the position of the top side of the element in the PDF pos= component[0] # Extract the element of the page layout element = component[1] # Check if the element is a text element if isinstance(element, LTTextContainer): # Check if the text appeared in a table if table_extraction_flag == False: # Use the function to extract the text and format for each text element (line_text, format_per_line) = text_extraction(element) # Append the text of each line to the page text page_text.append(line_text) # Append the format for each line containing text line_format.append(format_per_line) page_content.append(line_text) else: # Omit the text that appeared in a table pass # Check the elements for images if isinstance(element, LTFigure): # Crop the image from the PDF crop_image(element, pageObj) # Convert the cropped pdf to an image convert_to_images('cropped_image.pdf') # Extract the text from the image image_text = image_to_text('PDF_image.png') text_from_images.append(image_text) page_content.append(image_text) # Add a placeholder in the text and format lists page_text.append('image') line_format.append('image') # Check the elements for tables if isinstance(element, LTRect): # If the first rectangular element if first_element == True and (table_num+1) <= len(tables): # Find the bounding box of the table lower_side = page.bbox[3] - tables[table_num].bbox[3] upper_side = element.y1 # Extract the information from the table table = extract_table(pdf_path, pagenum, table_num) # Convert the table information in structured string format table_string = table_converter(table) # Append the table string into a list text_from_tables.append(table_string) page_content.append(table_string) # Set the flag as True to avoid the content again table_extraction_flag = True # Make it another element first_element = False # Add a placeholder in the text and format lists page_text.append('table') line_format.append('table') # Check if we already extracted the tables from the page if element.y0 >= lower_side and element.y1 <= upper_side: pass elif not isinstance(page_elements[i+1][1], LTRect): table_extraction_flag = False first_element = True table_num+=1 # Create the key of the dictionary dctkey = 'Page_'+str(pagenum) # Add the list of list as the value of the page key text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content] # Closing the pdf file object pdfFileObj.close() return text_per_page pdf_path = pdf_file.name text_per_page = read_pdf(pdf_path) page_0 = text_per_page['Page_0'] page_0_clean = [item for sublist in page_0 for item in sublist if isinstance(item, str)] for i in range(len(page_0_clean)): page_0_clean[i] = page_0_clean[i].replace('\n', ' ').strip() #intead of cleaning the exact position as I did in my previous code, since I don't know it, then I try to identify the section of the abstract abstract = 'abstract' found_abstract = False intro_string ='introduction' extracted_abstract ="" extracted_abstract = extracted_text_string.replace("Abstract", "") file = text.splitlines() for lines in file: lower_lines = lines.lower() if lower_lines.strip()== abstract: found_abstract = True elif "1" in lower_lines.strip() and intro_string in lower_lines.strip(): found_abstract = False #summarizing the abstract from transformers import pipeline summarizer = pipeline("summarization", model="Falconsai/text_summarization") text1 = extracted_abstract print(summarizer(text1, max_length=20, min_length=10, do_sample=False)) #in here, I try to save it differently, since on my previous code I had copied and pasted the summary and in here I don't know sentence = summarized_text[0]['summary_text'] # generating the audio of the output by using my previous code from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts") model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts") text = sentence inputs = processor(text=sentence, return_tensors="pt") embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings) vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") with torch.no_grad(): speech = vocoder(spectrogram) speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder) Audio(speech, rate=16000) # Creating the Gradio app input_component = gr.File(file_types=["pdf"]) output_component = gr.Audio() demo = gr.Interface( fn=read_pdf, inputs=input_component, outputs=output_component, title="Reading your abstract summary outloud", description="Upload a PDF that contains an Abstract. Get your abstract summarized in 1 sentence and read outloud. We only accept with PDfs that contains the section Abstract" ) demo.launch()