import base64
import tempfile

import fitz  # PyMuPDF
import requests
import streamlit as st
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Load environment variables (e.g., a Hugging Face token) before model downloads
load_dotenv()

# Load the summarization model once; st.cache_resource keeps it across Streamlit reruns
@st.cache_resource
def load_model():
    tok = AutoTokenizer.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
    return tok, AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")

tokenizer, base_model = load_model()

# Extract text from a PDF using PyMuPDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:  # context manager closes the file
        for page in doc:
            text += page.get_text()
    # Scanned/image-only PDFs have no text layer; signal that with None
    return text if text.strip() else None
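
# Hedged fallback sketch (extra dependencies, not used by the app): when
# extract_text_from_pdf returns None, the PDF is likely scanned images, and
# OCR is the usual recourse. This renders each page with PyMuPDF and runs
# Tesseract on it; pytesseract and the tesseract binary are assumed to be
# installed.
def ocr_pdf(pdf_path):
    import io
    from PIL import Image
    import pytesseract
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=200)  # rasterize the page
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            text += pytesseract.image_to_string(img)
    return text if text.strip() else None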

# Scrape the main text of an online article
def scrape_article(url):
    # Some sites reject the default python-requests User-Agent, so send a
    # browser-like one; raises requests.HTTPError / ValueError on failure
    response = requests.get(url, timeout=10,
                            headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()  # Raise an error if the request fails
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the main content (<p> tags cover most article layouts)
    paragraphs = soup.find_all('p')
    article_text = "\n".join(para.get_text() for para in paragraphs)

    if not article_text.strip():
        raise ValueError("Unable to extract content from the page.")
    return article_text
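
# Hedged variant (not wired into the app): collecting every <p> tag also
# sweeps up navigation, comments, and footer text on many pages. When the
# target site wraps its story in an <article> element, restricting the
# search reduces that noise; the "article p" selector is an assumption
# about the site's markup, with find_all('p') kept as a fallback.
def scrape_article_body(url):
    response = requests.get(url, timeout=10,
                            headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.select("article p") or soup.find_all('p')
    return "\n".join(p.get_text() for p in paragraphs)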

# LLM pipeline for summarization
def llm_pipeline(input_text):
    pipe_sum = pipeline(
        'summarization',
        model=base_model,
        tokenizer=tokenizer,
        max_length=500,
        min_length=50,
        truncation=True,  # the model reads ~512 tokens; truncate longer inputs
    )
    result = pipe_sum(input_text)
    return result[0]['summary_text']
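
# Optional sketch, not wired into the UI: because llm_pipeline truncates
# anything beyond the model's ~512-token window, long PDFs and articles lose
# their tail. A common workaround is to summarize fixed-size chunks and then
# summarize the joined chunk summaries. The 350-word chunk size here is an
# assumption, not a tuned value.
def summarize_long_text(input_text, chunk_words=350):
    words = input_text.split()
    chunks = [" ".join(words[i:i + chunk_words])
              for i in range(0, len(words), chunk_words)]
    if not chunks:
        return ""
    partial = [llm_pipeline(chunk) for chunk in chunks]
    # A single chunk needs no second pass
    return llm_pipeline(" ".join(partial)) if len(partial) > 1 else partial[0]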

# Render a PDF inline via a base64-encoded <iframe>
@st.cache_data
def display_pdf(file_path):
    with open(file_path, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
    st.markdown(pdf_display, unsafe_allow_html=True)

# Streamlit App
def main():
    st.title('AI Content Summarizer')

    # PDF Upload Section
    st.header("PDF content Summarizer")
    uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])
    if uploaded_file is not None:
        if st.button("Summarize PDF"):
            col1, col2 = st.columns(2)

            # Save the uploaded file to a temporary location
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                temp_file.write(uploaded_file.read())
                temp_filepath = temp_file.name

            with col1:
                st.info("Uploaded PDF File")
                display_pdf(temp_filepath)

            with col2:
                st.info("Summarization")
                input_text = extract_text_from_pdf(temp_filepath)
                if input_text:  # Proceed only if text extraction succeeded
                    summary = llm_pipeline(input_text)
                    st.success(summary)
                else:
                    st.error("No extractable text found; the PDF may contain only scanned images.")
    st.header("Summarize Online Articles")
    url = st.text_input("Enter the URL of the article:")
    if st.button("Summarize Article"):
        if url.strip():
            st.info("Fetching and Summarizing Article...")
            article_text = scrape_article(url)
            if "Error:" in article_text:
                st.error(article_text)
            else: 
                col1, col2 = st.columns(2)
                with col1:
                    st.info("Original Article Content")
                    st.write(article_text[:1000] + "..." if len(article_text) > 1000 else article_text)
                with col2:
                    st.info("Summarized Content")
                    summary = llm_pipeline(article_text)
                    st.success(summary)
        else:
            st.warning("Please enter a valid URL.")

    # Text Input Section
    st.header("Summarize Your Text")
    user_input = st.text_area("Enter your content here:", height=200)
    if st.button("Summarize Text"):
        if user_input.strip():
            col1, col2 = st.columns(2)

            with col1:
                st.info("Original Content")
                st.write(user_input)

            with col2:
                st.info("Summarization")
                summary = llm_pipeline(user_input)
                st.success(summary)
        else:
            st.warning("Please enter some content to summarize.")

if __name__ == '__main__':
    main()
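
# To try the app locally (assuming this file is saved as app.py):
#   streamlit run app.py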