File size: 2,957 Bytes
f2c8e06
421cd7c
 
f2c8e06
 
421cd7c
f2c8e06
 
 
421cd7c
f2c8e06
 
 
421cd7c
 
7c6a41d
 
 
 
 
 
 
 
 
421cd7c
 
 
432a8d9
 
421cd7c
 
 
 
432a8d9
 
421cd7c
f2c8e06
 
421cd7c
 
7c6a41d
 
 
 
9dca518
421cd7c
f2c8e06
7c6a41d
f2c8e06
421cd7c
f2c8e06
 
421cd7c
f2c8e06
 
421cd7c
 
f2c8e06
421cd7c
f2c8e06
 
 
421cd7c
f2c8e06
 
421cd7c
 
 
 
 
 
 
 
 
 
 
 
432a8d9
 
421cd7c
 
432a8d9
421cd7c
 
 
 
 
 
f2c8e06
 
421cd7c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import base64
import tempfile
import streamlit as st
from transformers import pipeline
from PyPDF2 import PdfReader

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the summarization model once per server process. Streamlit re-executes
# this script on every widget interaction, so without caching the checkpoint
# would be instantiated again on each rerun.
MODEL_CHECKPOINT = "MBZUAI/LaMini-Flan-T5-248M"


@st.cache_resource
def _load_summarization_model():
    """Return the ``(tokenizer, model)`` pair loaded from ``MODEL_CHECKPOINT``."""
    return (
        AutoTokenizer.from_pretrained(MODEL_CHECKPOINT),
        AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT),
    )


# Keep the original module-level names so the rest of the file is unaffected.
tokenizer, base_model = _load_summarization_model()

# Function to extract text from a PDF using PyPDF2
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    Pages whose extraction yields nothing are skipped. Page texts are joined
    without any separator. Returns ``None`` when the combined text is empty
    or consists only of whitespace.
    """
    per_page = (page.extract_text() for page in PdfReader(pdf_path).pages)
    combined = "".join(chunk for chunk in per_page if chunk)
    return combined if combined.strip() else None

# LLM pipeline for summarization
def llm_pipeline(input_text):
    """Summarize ``input_text`` and return the summary string.

    Builds a ``summarization`` pipeline around the module-level
    ``base_model`` and ``tokenizer``, configured with ``max_length=500`` and
    ``min_length=50``, runs it on the input, and returns the
    ``summary_text`` field of the first result.
    """
    length_limits = {"max_length": 500, "min_length": 50}
    summarizer = pipeline(
        'summarization',
        model=base_model,
        tokenizer=tokenizer,
        **length_limits,
    )
    outputs = summarizer(input_text)
    first_output = outputs[0]
    return first_output['summary_text']

@st.cache_data
def displayPDF(file_path):
    """Display the PDF stored at ``file_path`` inside the Streamlit page.

    The file's bytes are base64-encoded and embedded as a ``data:`` URI in an
    ``<iframe>``, which is emitted through ``st.markdown`` with raw HTML
    enabled.
    """
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
    base64_pdf = base64.b64encode(raw_bytes).decode('utf-8')
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
    st.markdown(pdf_display, unsafe_allow_html=True)

# Streamlit App
def _render_pdf_section():
    """Render the PDF uploader and, when requested, its preview and summary."""
    uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])
    # The button is only drawn once a file is present (short-circuit `or`).
    if uploaded_file is None or not st.button("Summarize PDF"):
        return

    col1, col2 = st.columns(2)

    # Persist the upload to disk because the preview and the text extractor
    # both work from a file path. `getvalue()` returns the full content
    # regardless of the current stream position, and omitting `dir` lets
    # `tempfile` choose the platform's temporary directory.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_filepath = temp_file.name

    try:
        with col1:
            st.info("Uploaded PDF File")
            displayPDF(temp_filepath)

        with col2:
            st.info("Summarization")
            input_text = extract_text_from_pdf(temp_filepath)
            if input_text:
                st.success(llm_pipeline(input_text))
            else:
                # Tell the user why nothing was produced instead of
                # leaving the column empty.
                st.warning("No extractable text was found in this PDF.")
    finally:
        # delete=False means the file outlives the `with` block above, so it
        # must be removed explicitly. Cleanup is best-effort: a failure to
        # delete must not mask the result or an earlier error.
        try:
            os.remove(temp_filepath)
        except OSError:
            pass


def _render_text_section():
    """Render the free-text input and, when requested, its summary."""
    st.header("Summarize Your Text")
    user_input = st.text_area("Enter your content here:", height=200)
    if not st.button("Summarize Text"):
        return

    if not user_input.strip():
        st.warning("Please enter some content to summarize.")
        return

    col1, col2 = st.columns(2)

    with col1:
        st.info("Original Content")
        st.write(user_input)

    with col2:
        st.info("Summarization")
        st.success(llm_pipeline(user_input))


def main():
    """Build the page: title, PDF upload section, then free-text section."""
    st.title('PDF Content Summarizer')

    # PDF Upload Section
    _render_pdf_section()

    # Text Input Section
    _render_text_section()

# Build the app only when this file is executed as the main module, not when
# it is imported.
if __name__ == '__main__':
    main()