Spaces:

vividsd
/

practice

Build error

App Files Files Community

vividsd committed on Dec 10, 2023

Commit

a92e01d

1 Parent(s): 589fdb6

Update app.py

Browse files

Files changed (1) hide show

app.py +162 -59

app.py CHANGED Viewed

@@ -8,79 +8,182 @@ from bark import SAMPLE_RATE, generate_audio, preload_models
 from scipy.io.wavfile import write as write_wav
 import torch
-def summarize_abstract_from_pdf(pdf_file_path):
-    abstract_string = 'abstract'
-    found_abstract = False
-    intro_string ='introduction'
-    extracted_text_string =""
-    # Read the PDF and extract the text of its first page only; the loop below keeps the lines from a line equal to "abstract" (lower-cased, stripped) up to a line containing both "1" and "introduction"
-    with open(pdf_file_path, 'rb') as pdf_file:
-        reader = PdfReader(pdf_file)
-        text = ""
-        text += reader.pages[0].extract_text()
-    file = text.splitlines()
     for lines in file:
       lower_lines = lines.lower()
-      if lower_lines.strip()== abstract_string:
         found_abstract = True
       elif "1" in lower_lines.strip() and intro_string in lower_lines.strip():
         found_abstract = False
-      if found_abstract == True:
-        extracted_text_string += lines
-    extracted_text_string = extracted_text_string.replace("Abstract", "")
-    summarizer = pipeline("summarization", "pszemraj/led-base-book-summary",device=0 if torch.cuda.is_available() else -1,)
-    # First pass: summarize the extracted abstract text with max_length=150
-    summarized_abstract = summarizer(extracted_text_string,
-    min_length=16,
-    max_length=150,
-    no_repeat_ngram_size=3,
-    encoder_no_repeat_ngram_size=3,
-    repetition_penalty=3.5,
-    num_beams=4,
-    early_stopping=True,
-    )
-    # Second pass: summarize the first summary again with max_length=25 to get a shorter text
-    summarized_abstract2 = summarizer(summarized_abstract[0]['summary_text'],
-    min_length=16,
-    max_length=25,
-    no_repeat_ngram_size=3,
-    encoder_no_repeat_ngram_size=3,
-    repetition_penalty=3.5,
-    num_beams=4,
-    early_stopping=True,
-    )
-    # Return the second-pass summary text as a string
-    return summarized_abstract2[0]['summary_text']
-def generate_audio_func(pdf_file):
-    pdf_file_path = pdf_file.name
-  # Generate audio from text
-  #call the summarize abstract function
-    text_prompt =  summarize_abstract_from_pdf(pdf_file_path)
-    audio_array = generate_audio(text_prompt)
-  # Create a temporary WAV file to save the audio
-    with NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
-        wav_file_path = temp_wav_file.name
-        write_wav(wav_file_path, 22050, (audio_array * 32767).astype(np.int16))
-    return wav_file_path
-# Create the Gradio app
 input_component = gr.File(file_types=["pdf"])
 output_component = gr.Audio()
 demo = gr.Interface(
-    fn=generate_audio_func,
     inputs=input_component,
     outputs=output_component,
     title="Reading your abstract summary outloud",

 from scipy.io.wavfile import write as write_wav
 import torch
+def read_pdf(pdf_path):
+  # create a PDF file object
+  pdfFileObj = open('/content/Article_11', 'rb')
+  # create a PDF reader object
+  pdfReaded = PyPDF2.PdfReader(pdfFileObj)
+  # Create the dictionary to extract text from each image
+  text_per_page = {}
+  # We extract the pages from the PDF
+  for pagenum, page in enumerate(extract_pages(pdf_path)):
+      print("Elaborating Page_" +str(pagenum))
+      # Initialize the variables needed for the text extraction from the page
+      pageObj = pdfReaded.pages[pagenum]
+      page_text = []
+      line_format = []
+      text_from_images = []
+      text_from_tables = []
+      page_content = []
+      # Initialize the number of the examined tables
+      table_num = 0
+      first_element= True
+      table_extraction_flag= False
+      # Open the pdf file
+      pdf = pdfplumber.open(pdf_path)
+      # Find the examined page
+      page_tables = pdf.pages[pagenum]
+      # Find the number of tables on the page
+      tables = page_tables.find_tables()
+      # Find all the elements
+      page_elements = [(element.y1, element) for element in page._objs]
+      # Sort all the elements as they appear in the page
+      page_elements.sort(key=lambda a: a[0], reverse=True)
+      # Find the elements that composed a page
+      for i,component in enumerate(page_elements):
+          # Extract the position of the top side of the element in the PDF
+          pos= component[0]
+          # Extract the element of the page layout
+          element = component[1]
+          # Check if the element is a text element
+          if isinstance(element, LTTextContainer):
+              # Check if the text appeared in a table
+              if table_extraction_flag == False:
+                  # Use the function to extract the text and format for each text element
+                  (line_text, format_per_line) = text_extraction(element)
+                  # Append the text of each line to the page text
+                  page_text.append(line_text)
+                  # Append the format for each line containing text
+                  line_format.append(format_per_line)
+                  page_content.append(line_text)
+              else:
+                  # Omit the text that appeared in a table
+                  pass
+          # Check the elements for images
+          if isinstance(element, LTFigure):
+              # Crop the image from the PDF
+              crop_image(element, pageObj)
+              # Convert the cropped pdf to an image
+              convert_to_images('cropped_image.pdf')
+              # Extract the text from the image
+              image_text = image_to_text('PDF_image.png')
+              text_from_images.append(image_text)
+              page_content.append(image_text)
+              # Add a placeholder in the text and format lists
+              page_text.append('image')
+              line_format.append('image')
+          # Check the elements for tables
+          if isinstance(element, LTRect):
+              # If the first rectangular element
+              if first_element == True and (table_num+1) <= len(tables):
+                  # Find the bounding box of the table
+                  lower_side = page.bbox[3] - tables[table_num].bbox[3]
+                  upper_side = element.y1
+                  # Extract the information from the table
+                  table = extract_table(pdf_path, pagenum, table_num)
+                  # Convert the table information in structured string format
+                  table_string = table_converter(table)
+                  # Append the table string into a list
+                  text_from_tables.append(table_string)
+                  page_content.append(table_string)
+                  # Set the flag as True to avoid the content again
+                  table_extraction_flag = True
+                  # Make it another element
+                  first_element = False
+                  # Add a placeholder in the text and format lists
+                  page_text.append('table')
+                  line_format.append('table')
+                  # Check if we already extracted the tables from the page
+                  if element.y0 >= lower_side and element.y1 <= upper_side:
+                      pass
+                  elif not isinstance(page_elements[i+1][1], LTRect):
+                      table_extraction_flag = False
+                      first_element = True
+                      table_num+=1
+      # Create the key of the dictionary
+      dctkey = 'Page_'+str(pagenum)
+      # Add the list of list as the value of the page key
+      text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
+  # Closing the pdf file object
+  pdfFileObj.close()
+  return text_per_page
+pdf_path = pdf_file.name
+text_per_page = read_pdf(pdf_path)
+page_0 = text_per_page['Page_0']
+page_0_clean = [item for sublist in page_0 for item in sublist if isinstance(item, str)]
+for i in range(len(page_0_clean)):
+    page_0_clean[i] = page_0_clean[i].replace('\n', ' ').strip()
+# Instead of cutting at a fixed position as in my previous code (the position is not known here), try to identify the abstract section by its heading. NOTE(review): `pdf_file`, `extracted_text_string` and `text` are not assigned before use in the visible code, `page_0_clean` is not used below, and the loop below only toggles `found_abstract` without collecting any lines - confirm the intended data flow.
+abstract = 'abstract'
+found_abstract = False
+intro_string ='introduction'
+extracted_abstract =""
+extracted_abstract = extracted_text_string.replace("Abstract", "")
+file = text.splitlines()
     for lines in file:
       lower_lines = lines.lower()
+      if lower_lines.strip()== abstract:
         found_abstract = True
       elif "1" in lower_lines.strip() and intro_string in lower_lines.strip():
         found_abstract = False
+# Summarize the abstract. NOTE(review): the summarizer result is only printed here, it is not stored in a variable.
+from transformers import pipeline
+summarizer = pipeline("summarization", model="Falconsai/text_summarization")
+text1 = extracted_abstract
+print(summarizer(text1, max_length=20, min_length=10, do_sample=False))
+# Here the summary is read from the pipeline output instead of being copied and pasted as in my previous code. NOTE(review): `summarized_text` is not assigned in the visible code - presumably it should hold the result of the summarizer call above; confirm.
+sentence = summarized_text[0]['summary_text']
+# Generate the audio of the summary, reusing my previous text-to-speech code. NOTE(review): `load_dataset`, `SpeechT5HifiGan` and `Audio` are not imported in the visible lines - verify they are imported elsewhere in the file.
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+text = sentence
+inputs = processor(text=sentence, return_tensors="pt")
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+with torch.no_grad():
+    speech = vocoder(spectrogram)
+speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+Audio(speech, rate=16000)
+# Creating the Gradio app
 input_component = gr.File(file_types=["pdf"])
 output_component = gr.Audio()
 demo = gr.Interface(
+    fn=read_pdf,
     inputs=input_component,
     outputs=output_component,
     title="Reading your abstract summary outloud",