File size: 2,233 Bytes
4142a7d
1457f73
eb51c13
4b1ed8b
4142a7d
2e9079e
de301cb
 
 
 
 
 
 
2e9079e
724a5a1
4b1ed8b
 
 
20be358
 
 
4b1ed8b
2e9079e
4b1ed8b
 
 
 
 
 
eb51c13
2e9079e
de301cb
eb51c13
 
4b1ed8b
 
eb51c13
 
 
20be358
 
 
 
 
17ad421
eb51c13
 
2e9079e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import streamlit as st
from transformers import pipeline
import textwrap
import re

st.title('Hugging Face BERT Summarizer')

# Candidate summarization checkpoints offered in the sidebar.
models = ["sshleifer/distilbart-cnn-12-6", "facebook/bart-large-cnn", "t5-base", "t5-large", "google/pegasus-newsroom"]

# Dropdown model selector
model = st.sidebar.selectbox("Choose a model", models)

uploaded_file = st.file_uploader("Choose a .txt file", type="txt")

# Comma-separated keywords used to pre-filter sentences before summarizing.
keywords = st.text_input("Enter keywords (comma-separated)")

# Target summary length as a percentage of each chunk's word count.
scale_percentage = st.sidebar.slider('Scale %', min_value=1, max_value=100, value=50)


@st.cache_resource
def _load_summarizer(model_name):
    """Build and cache the HF summarization pipeline for *model_name*.

    Cached with st.cache_resource so the model is not re-downloaded and
    re-instantiated on every button press / Streamlit rerun.
    """
    return pipeline('summarization', model=model_name)


def _chunk_by_words(text, words_per_chunk=500):
    """Split *text* into chunks of at most *words_per_chunk* words.

    The original code used textwrap.wrap(text, 500), which wraps by
    CHARACTERS, not words, so chunks were roughly 10x smaller than the
    comment claimed. Splitting on whitespace honors the word budget.
    """
    words = text.split()
    return [' '.join(words[i:i + words_per_chunk])
            for i in range(0, len(words), words_per_chunk)]


if uploaded_file is not None and keywords:
    user_input = uploaded_file.read().decode('utf-8')
    keywords = [keyword.strip() for keyword in keywords.split(",")]

    # Naive sentence splitter: break on whitespace that follows ./?/! and
    # precedes an uppercase letter (lookbehind guards against initials like
    # "A."). Bug fix: '!' was missing from the original character class, so
    # exclamatory sentences were never split apart.
    sentences = re.split(r'(?<=[^A-Z].[.?!]) +(?=[A-Z])', user_input)
    # Keep only sentences mentioning at least one keyword (case-insensitive).
    filtered_sentences = [sentence for sentence in sentences
                          if any(keyword.lower() in sentence.lower() for keyword in keywords)]
    filtered_text = ' '.join(filtered_sentences)

    if not filtered_text:
        # Nothing matched the keywords; summarizing "" would raise inside
        # the pipeline with no useful feedback to the user.
        st.warning("No sentences matched the given keywords.")
    elif st.button('Summarize'):
        summarizer = _load_summarizer(model)
        summary_parts = []

        # Summarize each ~500-word chunk independently.
        for chunk in _chunk_by_words(filtered_text, 500):
            chunk_length = len(chunk.split())
            # Clamp the +/-10% window around the slider value to [1, 100].
            min_length_percentage = max(scale_percentage - 10, 1)
            max_length_percentage = min(scale_percentage + 10, 100)
            # Token budgets derived from the chunk's word count.
            min_length = max(int(chunk_length * min_length_percentage / 100), 1)
            # Guard: integer truncation on tiny chunks could otherwise
            # produce max_length < min_length (e.g. a one-word chunk).
            max_length = max(int(chunk_length * max_length_percentage / 100), min_length + 1)
            summarized = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)
            summary_parts.append(summarized[0]['summary_text'])

        # Join once instead of quadratic += string accumulation.
        st.text_area('Summarized Text', ' '.join(summary_parts) + " ", height=200)