mrsk1883 committed on
Commit
e9382d2
·
1 Parent(s): 8e5bd23

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -0
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyPDF2 import PdfReader
2
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
+ from gtts import gTTS
4
+ from IPython.display import Audio
5
+
6
+ # Download the model and tokenizer
7
+ model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
8
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
9
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
10
+
11
+ def summarize_pdf_abstract(pdf_path):
12
+ """
13
+ Reads a PDF file, extracts the abstract, and summarizes it in one sentence.
14
+
15
+ Args:
16
+ pdf_path: Path to the PDF file.
17
+
18
+ Returns:
19
+ A string containing the one-sentence summary of the abstract.
20
+ """
21
+
22
+ # Read the PDF file
23
+ reader = PdfReader(open(pdf_path, 'rb'))
24
+
25
+ # Extract the abstract
26
+ abstract_text = ""
27
+ for page in reader.pages:
28
+ # Search for keywords like "Abstract" or "Introduction"
29
+ if "Abstract" in page.extract_text() or "Introduction" in page.extract_text():
30
+ # Extract the text following the keyword
31
+ abstract_text = page.extract_text()
32
+ break
33
+
34
+ # Encode the abstract text
35
+ inputs = tokenizer(abstract_text, return_tensors="pt")
36
+
37
+ # Generate the summary
38
+ outputs = model.generate(**inputs)
39
+
40
+ # Decode the summary
41
+ summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
42
+
43
+ return summary
44
+
# Hard-coded location of the PDF to summarize.
pdf_path = "/content/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"

# Run the summarizer and show the result on stdout (heading, then summary).
summary = summarize_pdf_abstract(pdf_path)
print("One-sentence summary of the abstract:", summary, sep="\n")

# Language code handed to gTTS for the spoken version.
language = "en"

# Synthesize the summary as speech and write it to an MP3 file.
speech = gTTS(text=summary, lang=language)
speech.save("summary.mp3")

# Trailing expression that builds an IPython Audio object for the saved file.
# NOTE(review): this only renders a player when it is the last expression of
# a notebook cell; run as a plain script, the object is discarded.
Audio("summary.mp3")