Spaces:
Runtime error
Runtime error
File size: 6,323 Bytes
8c468f3 05baaca 8c468f3 05baaca c14f8d7 3b1b590 912db67 c14f8d7 3b1b590 c14f8d7 05baaca 47bdf28 ba7e56a f76c355 2fa0266 c14f8d7 eed1bb3 41bb5ba eed1bb3 70b9010 c14f8d7 05baaca c14f8d7 f76c355 c14f8d7 f76c355 c14f8d7 3b1b590 c14f8d7 3b1b590 c14f8d7 3b1b590 c14f8d7 3b1b590 c14f8d7 912db67 3b1b590 c14f8d7 c2b55f1 912db67 c14f8d7 2fa0266 c14f8d7 3b1b590 c14f8d7 912db67 c14f8d7 3b1b590 c14f8d7 3b1b590 e8cc3a6 47bdf28 eed1bb3 c14f8d7 3b1b590 c14f8d7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
# https://huggingface.co/spaces/Alioth86/SpeechAbstractor
# Note: I have recombined the functions I created for part 1 of the assessment
# and added a main function to connect them (I got some help from ChatGPT-4 for that main function).
# I created the input/output components, the title and the description,
# and all the Gradio features, following the instructions on the Gradio website.
# Everything was uploaded through git and git LFS.
# Imports
import PyPDF2
import pdfplumber
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import re
import torch
import transformers
from transformers import pipeline
import nltk
from datasets import load_dataset
import soundfile as sf
from IPython.display import Audio
import sentencepiece as spm
import os
import tempfile
import gradio as gr
#Here is the code
# Static metadata handed to the Gradio interface further down in this file.
title = "SpeechAbstractor"

# Introductory text for the app.
description = """
This app enables users to upload academic articles in PDF format, specifically focusing on abstracts.
It efficiently summarizes the abstract and provides an audio playback of the summarized content.
Below are some example PDFs for you to experiment with. Feel free to explore the functionality of SpeechAbstractor!
(Please note: it works only with articles with an abstract)."""

# Example inputs: one inner list per example, holding the PDF file name.
examples = [
    ["Article_7.pdf"],
    ["Article_11.pdf"],
]
# Functions reused from part 1 of the assessment
def text_extraction(element):
    """Return an element's text together with the distinct formats used in it.

    Args:
        element: A layout element that provides ``get_text()`` and can be
            iterated to yield its text lines.

    Returns:
        A ``(line_text, format_per_line)`` tuple. ``line_text`` is the value
        of ``element.get_text()``. ``format_per_line`` is a list holding each
        distinct font name and font size seen on the element's characters,
        in first-seen order.
    """
    line_text = element.get_text()

    # A dict serves as an ordered set: it removes duplicates exactly like a
    # set would, but yields a stable order instead of one that can differ
    # from run to run.
    seen_formats = {}
    for text_line in element:
        if not isinstance(text_line, LTTextContainer):
            continue
        for character in text_line:
            if isinstance(character, LTChar):
                seen_formats[character.fontname] = None
                seen_formats[character.size] = None

    return (line_text, list(seen_formats))
def read_pdf(pdf_pathy):
    """Extract the text elements of every page of a PDF.

    Args:
        pdf_pathy: Path of the PDF file to read.

    Returns:
        A dict keyed by ``'Page_<n>'`` (``n`` is the zero-based page index).
        Each value is the list ``[page_text, line_format, page_content]``:
        ``page_text`` and ``page_content`` both hold the text of each text
        element on the page, and ``line_format`` holds the matching format
        list returned by ``text_extraction`` for each of those elements.
    """
    text_per_pagy = {}
    for pagenum, page in enumerate(extract_pages(pdf_pathy)):
        print("Elaborating Page_" + str(pagenum))
        page_text = []
        line_format = []

        # Visit elements by descending y1. In pdfminer's coordinate system a
        # larger y sits higher on the page, so this is top-to-bottom order.
        # sorted() is stable, so elements sharing a y1 keep their layout order.
        ordered_elements = sorted(
            page, key=lambda element: element.y1, reverse=True
        )
        for element in ordered_elements:
            if isinstance(element, LTTextContainer):
                line_text, format_per_line = text_extraction(element)
                page_text.append(line_text)
                line_format.append(format_per_line)

        # page_content mirrors page_text; it stays a separate list object so
        # the shape of the returned value is unchanged for existing callers.
        text_per_pagy['Page_' + str(pagenum)] = [
            page_text,
            line_format,
            list(page_text),
        ]
    return text_per_pagy
def clean_text(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    trimmed = collapsed.strip()
    return trimmed
def extract_abstract(text_per_pagy):
    """Return the abstract from the first page that contains an abstract heading.

    Args:
        text_per_pagy: Mapping of page key to that page's text as one string.
            Pages are scanned in the mapping's iteration order; empty pages
            are skipped.

    Returns:
        The abstract prefixed with a single space (the format this function
        has always returned), or ``""`` when no page contains one of the
        heading variants.
    """
    start_variants = ("Abstract", "abstract", "ABSTRACT")
    end_markers = ("Introduction", "INTRODUCTION", "Background", "Contents", "Keywords")

    for page_text in text_per_pagy.values():
        if not page_text:
            continue

        # Drop every "- " sequence; presumably this re-joins words that were
        # hyphenated at a line end before whitespace was normalised.
        page_text = page_text.replace("- ", "")

        # Heading variants are tried in priority order, not by position.
        start_index = -1
        for variant in start_variants:
            found = page_text.find(variant)
            if found != -1:
                # Skip the heading plus the one separator character after it.
                start_index = found + len(variant) + 1
                break
        if start_index == -1:
            continue

        # Cut at the EARLIEST end marker after the heading. Taking the first
        # marker in list order would let e.g. a "Keywords" section leak into
        # the abstract whenever "Introduction" also appears later on the page.
        marker_positions = [
            position
            for position in (page_text.find(marker, start_index) for marker in end_markers)
            if position != -1
        ]
        end_index = min(marker_positions, default=len(page_text))

        return " " + page_text[start_index:end_index].strip()

    return ""
# Pipelines already loaded in this process, keyed by (task, model), so that a
# model is loaded once instead of on every request.
_PIPELINE_CACHE = {}


def _get_pipeline(task, model):
    """Return the pipeline for ``(task, model)``, loading it on first use."""
    key = (task, model)
    if key not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[key] = pipeline(task, model=model)
    return _PIPELINE_CACHE[key]


def main_function(uploaded_filepath):
    """Summarize the abstract of an uploaded PDF and read the summary aloud.

    Args:
        uploaded_filepath: Path of the uploaded PDF, or ``None`` when nothing
            was uploaded.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the first
        sentence of the generated summary and ``audio_path`` is the path of a
        WAV file containing that sentence as speech. When there is no file,
        no abstract, or no summary, ``text`` is a short message and
        ``audio_path`` is ``None``.
    """
    # Nothing to do without an upload.
    if uploaded_filepath is None:
        return "No file loaded", None

    # Read the PDF and flatten each page to one cleaned string. A new dict is
    # built rather than reassigning entries of the one being iterated.
    text_per_pagy = read_pdf(uploaded_filepath)
    cleaned_pages = {
        key: clean_text(' '.join(value[0]))
        for key, value in text_per_pagy.items()
    }

    abstract_text = extract_abstract(cleaned_pages)
    if not abstract_text.strip():
        # Do not feed an empty prompt to the summarizer.
        return "No abstract found in the uploaded PDF", None

    # Sentence tokenizer data: 'punkt' is used by older NLTK releases, while
    # newer releases look up 'punkt_tab' instead, so both are requested.
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)

    # Summarize the abstract with the chosen model and length bounds.
    summarizer = _get_pipeline("summarization", "pszemraj/long-t5-tglobal-base-sci-simplify")
    summary = summarizer(abstract_text, max_length=100, min_length=50, do_sample=False)[0]['summary_text']

    # Keep only the first sentence of the summary.
    sentences = nltk.tokenize.sent_tokenize(summary)
    if not sentences:
        return "The abstract could not be summarized", None
    first_sentence = sentences[0]

    # Turn the sentence into speech using a fixed speaker embedding.
    synthesiser = _get_pipeline("text-to-speech", "microsoft/speecht5_tts")
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    speech = synthesiser(first_sentence, forward_params={"speaker_embeddings": speaker_embedding})

    # Write to a unique temporary file: a fixed name would let concurrent
    # requests overwrite each other's audio. The handle is closed before
    # soundfile writes to the path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        audio_file_path = tmp_file.name
    sf.write(audio_file_path, speech["audio"], samplerate=speech["sampling_rate"])

    return first_sentence, audio_file_path
# Describe the UI to Gradio: one file upload in, summary text plus audio out.
iface = gr.Interface(
    fn=main_function,
    inputs=gr.File(type="filepath"),
    outputs=[
        gr.Textbox(label="Abstract Summary"),
        gr.Audio(label="Abstract Summary Audio", type="filepath"),
    ],
    title=title,
    description=description,
    examples=examples,
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|