aminaj committed on
Commit
6b0baa7
·
verified ·
1 Parent(s): ade9907

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -0
app.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from docx import Document
3
+
4
+ import fitz # PyMuPDF
5
+ from transformers import BartForConditionalGeneration, BartTokenizer, pipeline
6
+ import textwrap
7
+ import tempfile
8
+
9
+ # Functions for file reading
10
def read_txt(file):
    """Return the contents of an uploaded plain-text file as a string.

    Args:
        file: An in-memory binary buffer exposing ``getvalue()`` that returns
            the full content as ``bytes`` (the call site passes the object
            returned by ``st.sidebar.file_uploader``).

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is stripped, and
        bytes that are not valid UTF-8 are replaced with U+FFFD instead of
        raising, so a slightly mis-encoded file can still be summarized.
    """
    raw = file.getvalue()
    # "utf-8-sig" decodes plain UTF-8 as well, but drops a BOM if one exists
    # so it does not leak into the text handed to the summarizer.
    return raw.decode("utf-8-sig", errors="replace")
12
+
13
def read_docx(file):
    """Return the text of a .docx file as a single space-separated string.

    Args:
        file: Whatever ``docx.Document`` accepts as its source; the call site
            passes the uploaded file object.

    Returns:
        The ``text`` of every paragraph in the document, in document order,
        joined with single spaces.
    """
    paragraphs = Document(file).paragraphs
    return " ".join(paragraph.text for paragraph in paragraphs)
16
+
17
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The concatenation of ``page.get_text()`` for each page, in page
        order, with no separator added between pages.
    """
    # The context manager closes the document even when text extraction
    # raises part-way through, which the manual close() did not guarantee.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
25
+
26
def read_pdf(file):
    """Persist an uploaded PDF to a temporary file and extract its text.

    Args:
        file: A binary file-like object holding the PDF bytes. When it
            exposes ``getvalue()`` the whole buffer is used regardless of the
            current stream position; otherwise ``read()`` is used.

    Returns:
        A ``(file_path, text)`` tuple: the path of the temporary ``.pdf``
        copy and the text extracted from it. The temporary file is NOT
        deleted on success; the caller owns it and should remove it once it
        is no longer needed.

    Raises:
        Whatever ``extract_text_from_pdf`` raises; in that case the temporary
        file is removed before the exception propagates.
    """
    import os  # local import: only needed for cleanup on failure

    # getvalue() does not depend on the stream position, so a buffer that
    # has already been read once still yields its full content.
    data = file.getvalue() if hasattr(file, "getvalue") else file.read()

    # delete=False because the path must outlive this function; the `with`
    # still guarantees the handle is flushed and closed, even if write fails.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
        temp_file.write(data)
        file_path = temp_file.name

    try:
        text = extract_text_from_pdf(file_path)
    except Exception:
        # The caller never receives the path on failure, so clean up here
        # instead of leaving an orphaned temp file behind.
        try:
            os.remove(file_path)
        except OSError:
            pass
        raise

    return file_path, text
37
+
38
+ # Function for text summarization from pdf
39
@st.cache_resource
def _load_bart(model_name):
    """Load and cache the BART model/tokenizer pair for ``model_name``."""
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)
    return model, tokenizer


def text_summarizer_from_pdf(pdf_path):
    """Summarize the text of a PDF with ``facebook/bart-large-cnn``.

    Args:
        pdf_path: Path of the PDF file whose text should be summarized.

    Returns:
        The generated summary, wrapped to lines of at most 80 characters
        and joined with newlines.
    """
    pdf_text = extract_text_from_pdf(pdf_path)

    # Loading the checkpoint is by far the slowest step; the cached helper
    # makes it happen once instead of on every summary request.
    model, tokenizer = _load_bart("facebook/bart-large-cnn")

    # Input is truncated to 1024 tokens, so only the beginning of a long
    # document contributes to the summary.
    # NOTE(review): the "summarize: " prefix looks like a T5-style task
    # prefix; it is kept here so existing output is unchanged -- confirm
    # whether it is wanted for BART.
    inputs = tokenizer.encode(
        "summarize: " + pdf_text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )
    summary_ids = model.generate(
        inputs,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return "\n".join(textwrap.wrap(summary, width=80))
52
+
53
+ # Summarizer pipeline for txt and docx files
54
import os  # used only to delete the temporary PDF copy below


@st.cache_resource
def _load_summarizer():
    """Build the summarization pipeline once and reuse it across reruns."""
    return pipeline("summarization", model="facebook/bart-large-cnn")


# Streamlit re-executes this script on every interaction; without the cache
# the model would be reloaded each time a widget changes.
summarizer = _load_summarizer()

st.title("Text Summarizer")
st.subheader("📁 Upload a pdf, docx or text file to generate a short summary")

# Sidebar to upload file
uploaded_file = st.sidebar.file_uploader("Choose a file", type=["txt", "pdf", "docx"])

if uploaded_file:
    file_details = {"FileName:" : uploaded_file.name, "FileType:" : uploaded_file.type, "FileSize:" : uploaded_file.size}
    for key, value in file_details.items():
        st.sidebar.write(key, value)

    # Set only for PDFs; read_pdf leaves a temp file behind that this script
    # is responsible for deleting.
    temp_path = None
    try:
        # Check the file type and read the file
        if uploaded_file.type == "text/plain":
            text = read_txt(uploaded_file)
        elif uploaded_file.type == "application/pdf":
            temp_path, text = read_pdf(uploaded_file)
        elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            text = read_docx(uploaded_file)
        else:
            st.error("File type not supported. Please upload a txt, pdf or docx file.")
            st.stop()

        # Nothing to summarize (e.g. an empty file or a scanned PDF with no
        # text layer): say so instead of running the model on empty input.
        if not text.strip():
            st.warning("No readable text was found in the uploaded file.")
            st.stop()

        # Generate summary
        if st.button('Generate Summary'):
            with st.spinner("Generating summary..."):
                try:
                    if uploaded_file.type == "application/pdf":
                        summary = text_summarizer_from_pdf(temp_path)
                        st.success(summary)
                    else:
                        # truncation=True keeps inputs longer than the model's
                        # maximum length from failing inside the pipeline.
                        summary = summarizer(text, max_length=1000, min_length=30, do_sample=False, truncation=True)
                        st.success(summary[0]['summary_text'])
                except Exception as e:
                    st.error("Failed to generate summary. Your file may have some problem. Please try again!")
                    # Show the underlying error rather than discarding it.
                    st.exception(e)
    finally:
        # Runs on normal completion and also when st.stop() or an error
        # interrupts the script, so temp files do not accumulate per rerun.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup: a failed delete must not break the app.
                pass