Spaces:

Kawthar12h
/

Text_Summarization

Sleeping

App Files Files Community

Kawthar12h committed on Sep 14, 2024

Commit

85b718b

verified ·

1 Parent(s): 9270041

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -17

app.py CHANGED Viewed

@@ -1,27 +1,65 @@
 import gradio as gr
-from transformers import pipeline
-import torch
-from bs4 import BeautifulSoup
-import requests
 def summarize_article(url, min_len, max_len):
     summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
     try:
         r = requests.get(url)
         soup = BeautifulSoup(r.text, 'html.parser')
-        results = soup.find_all(['h1', 'p'])
         text = [result.text for result in results]
         ARTICLE = ' '.join(text)
         ARTICLE = ARTICLE.replace('\n', '')
         ARTICLE = ARTICLE.replace('.', '.<eos>')
         ARTICLE = ARTICLE.replace('?', '?<eos>')
         ARTICLE = ARTICLE.replace('!', '!<eos>')
         sentences = ARTICLE.split('<eos>')
         current_chunk = 0
         chunks = []
         for sentence in sentences:
             if len(chunks) == current_chunk + 1:
                 if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
@@ -32,25 +70,33 @@ def summarize_article(url, min_len, max_len):
             else:
                 chunks.append(sentence.split(' '))
         for chunk_id in range(len(chunks)):
             chunks[chunk_id] = ' '.join(chunks[chunk_id])
-        res = summarizer(chunks, max_length=max_len, min_length=min_len, do_sample=False)
         summary = ' '.join([summ['summary_text'] for summ in res])
         return summary
-    except Exception as e:  # Handle potential errors during web request or parsing
         return f"Error: {str(e)}"
-with gr.Blocks() as iface:
-    url_input = gr.Textbox(label="Enter the article URL")
-    min_len_slider = gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length")
-    max_len_slider = gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
-    summary_output = gr.Textbox(label="Summary")
-    btn = gr.Button("Summarize")
-    btn.click(fn=summarize_article, inputs=[url_input, min_len_slider, max_len_slider], outputs=summary_output)
-iface.launch()

+pip install transformers
+pip install torch
 import gradio as gr
+from transformers import pipeline # import pipeline to use pre-trained models
+import torch # import PyTorch library, which is commonly used for Deep Learning tasks
+from bs4 import BeautifulSoup # import BeautifulSoup for parsing HTML & XML documents
+import requests # To make HTTP requests to retrieve web content.
 def summarize_article(url, min_len, max_len):
+  #Create summarization pipeline
     summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
     try:
+        # Send an HTTP GET request to the user-supplied URL and retrieve the web page content
         r = requests.get(url)
+        # Create a BeautifulSoup object that parses the HTML so the page's text content can be extracted
         soup = BeautifulSoup(r.text, 'html.parser')
+        # Find all the <h1> (header) and <p> (paragraph) elements in the HTML content
+        results = soup.find_all(['h1','p'])
+        # Extract the text content from each element and store it in a list called text
         text = [result.text for result in results]
+        # joins all the extracted text into a single string, representing the entire article
         ARTICLE = ' '.join(text)
+        # Remove newlines, then append a special token (<eos>) after sentence-ending punctuation. This helps split the article into sentences, which are later grouped into chunks for summarization.
         ARTICLE = ARTICLE.replace('\n', '')
         ARTICLE = ARTICLE.replace('.', '.<eos>')
         ARTICLE = ARTICLE.replace('?', '?<eos>')
         ARTICLE = ARTICLE.replace('!', '!<eos>')
+        # Splits the article into sentences based on the <eos> token and stores them in a list called sentences.
         sentences = ARTICLE.split('<eos>')
+        # Sets the maximum length (in words) for each chunk of text during summarization.
+        max_chunk = 500
+        # Initializes a variable to keep track of the current chunk being processed
         current_chunk = 0
+        # Creates an empty list called chunks to store the individual chunks of text
         chunks = []
+        # For loop iterates through each sentence in the sentences list
+        '''If the length of the current chunk (in terms of words) plus the length of the current sentence (split by spaces) is less than or equal to the max_chunk length:
+        The sentence is added to the current chunk.
+        Otherwise:
+        The current_chunk index is incremented to move to the next chunk.
+        A new chunk is created, and the current sentence becomes the first sentence in this new chunk.
+        The current chunk is appended to the chunks list.
+        '''
         for sentence in sentences:
             if len(chunks) == current_chunk + 1:
                 if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
             else:
                 chunks.append(sentence.split(' '))
+        ''' After processing all sentences, iterate over the chunks
+        to ensure that each chunk is represented as a single string (rather than a list of words).
+        '''
         for chunk_id in range(len(chunks)):
             chunks[chunk_id] = ' '.join(chunks[chunk_id])
+        # Summarize each chunk, bounding each summary's length by the user-selected min_len and max_len
+        res = summarizer(chunks, max_length = max_len, min_length = min_len, do_sample=False)
+        # Extracting the 'summary_text' value from each summary in the res list
         summary = ' '.join([summ['summary_text'] for summ in res])
         return summary
+    # Handle potential errors during web request or parsing
+    except Exception as e:
         return f"Error: {str(e)}"
+# Create Gradio Interface
+interface = gr.Interface(
+    fn=summarize_article,
+    inputs=[
+        gr.Textbox(label="Enter the article URL"),
+        gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length"),
+        gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
+    ],
+    outputs=gr.Textbox(label="Summary")
+)
+interface.launch()