Kawthar12h committed on
Commit
85b718b
·
verified ·
1 Parent(s): 9270041

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -17
app.py CHANGED
@@ -1,27 +1,65 @@
 
 
 
 
 
1
  import gradio as gr
2
- from transformers import pipeline
3
- import torch
4
- from bs4 import BeautifulSoup
5
- import requests
 
 
6
 
7
  def summarize_article(url, min_len, max_len):
 
8
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
9
 
10
  try:
 
11
  r = requests.get(url)
 
 
12
  soup = BeautifulSoup(r.text, 'html.parser')
13
- results = soup.find_all(['h1', 'p'])
 
 
 
 
14
  text = [result.text for result in results]
 
 
15
  ARTICLE = ' '.join(text)
16
 
 
17
  ARTICLE = ARTICLE.replace('\n', '')
18
  ARTICLE = ARTICLE.replace('.', '.<eos>')
19
  ARTICLE = ARTICLE.replace('?', '?<eos>')
20
  ARTICLE = ARTICLE.replace('!', '!<eos>')
21
 
 
22
  sentences = ARTICLE.split('<eos>')
 
 
 
 
 
23
  current_chunk = 0
 
 
24
  chunks = []
 
 
 
 
 
 
 
 
 
 
 
 
25
  for sentence in sentences:
26
  if len(chunks) == current_chunk + 1:
27
  if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
@@ -32,25 +70,33 @@ def summarize_article(url, min_len, max_len):
32
  else:
33
  chunks.append(sentence.split(' '))
34
 
 
 
 
35
  for chunk_id in range(len(chunks)):
36
  chunks[chunk_id] = ' '.join(chunks[chunk_id])
37
 
38
- res = summarizer(chunks, max_length=max_len, min_length=min_len, do_sample=False)
39
-
 
 
40
  summary = ' '.join([summ['summary_text'] for summ in res])
41
  return summary
42
 
43
- except Exception as e: # Handle potential errors during web request or parsing
 
44
  return f"Error: {str(e)}"
45
 
46
- with gr.Blocks() as iface:
47
- url_input = gr.Textbox(label="Enter the article URL")
48
- min_len_slider = gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length")
49
- max_len_slider = gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
50
- summary_output = gr.Textbox(label="Summary")
51
-
52
- btn = gr.Button("Summarize")
53
 
54
- btn.click(fn=summarize_article, inputs=[url_input, min_len_slider, max_len_slider], outputs=summary_output)
 
 
 
 
 
 
 
 
 
55
 
56
- iface.launch()
 
1
+
2
+ pip install transformers
3
+ pip install torch
4
+
5
+
6
  import gradio as gr
7
+ from transformers import pipeline # import pipeline to use pre-trained models
8
+ import torch # import PyTorch library, which is commonly used for Deep Learning tasks
9
+ from bs4 import BeautifulSoup # import BeautifulSoup for parsing HTML & XML documents
10
+ import requests # To make HTTP requests to retrieve web content.
11
+
12
+
13
 
14
  def summarize_article(url, min_len, max_len):
15
+ #Create summarization pipeline
16
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
17
 
18
  try:
19
+ # Send an HTTP GET request to the user-supplied URL and retrieve the web page content
20
  r = requests.get(url)
21
+
22
+ # Create a BeautifulSoup object that parses the HTML so the page's text content can be extracted
23
  soup = BeautifulSoup(r.text, 'html.parser')
24
+
25
+ # Find all the <h1> (header) and <p> (paragraph) elements in the HTML content
26
+ results = soup.find_all(['h1','p'])
27
+
28
+ # Extract the text content from each element and store it in a list called text
29
  text = [result.text for result in results]
30
+
31
+ # joins all the extracted text into a single string, representing the entire article
32
  ARTICLE = ' '.join(text)
33
 
34
+ # Remove newlines, then append a special token (<eos>) after each sentence-ending punctuation mark ('.', '?', '!'). This lets the article be split into sentences, which are later grouped into chunks for summarization.
35
  ARTICLE = ARTICLE.replace('\n', '')
36
  ARTICLE = ARTICLE.replace('.', '.<eos>')
37
  ARTICLE = ARTICLE.replace('?', '?<eos>')
38
  ARTICLE = ARTICLE.replace('!', '!<eos>')
39
 
40
+ # Splits the article into sentences based on the <eos> token and stores them in a list called sentences.
41
  sentences = ARTICLE.split('<eos>')
42
+
43
+ # Sets the maximum length (in words) for each chunk of text during summarization.
44
+ max_chunk = 500
45
+
46
+ # Initializes a variable to keep track of the current chunk being processed
47
  current_chunk = 0
48
+
49
+ # Creates an empty list called chunks to store the individual chunks of text
50
  chunks = []
51
+
52
+ # For loop iterates through each sentence in the sentences list
53
+ '''If the length of the current chunk (in terms of words) plus the length of the current sentence (split by spaces) is less than or equal to the max_chunk length:
54
+ The sentence is added to the current chunk.
55
+
56
+ Otherwise:
57
+
58
+ The current_chunk index is incremented to move to the next chunk.
59
+ A new chunk is created, and the current sentence becomes the first sentence in this new chunk.
60
+
61
+ The current chunk is appended to the chunks list.
62
+ '''
63
  for sentence in sentences:
64
  if len(chunks) == current_chunk + 1:
65
  if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
 
70
  else:
71
  chunks.append(sentence.split(' '))
72
 
73
+ ''' After processing all sentences, the loop iterates through each chunk,
74
+ to ensure that each chunk is represented as a single string (rather than a list of words).
75
+ '''
76
  for chunk_id in range(len(chunks)):
77
  chunks[chunk_id] = ' '.join(chunks[chunk_id])
78
 
79
+ # Summarize every chunk, bounding each summary's length with the user-selected min_len and max_len
80
+ res = summarizer(chunks, max_length = max_len, min_length = min_len, do_sample=False)
81
+
82
+ # Extracting the 'summary_text' value from each summary in the res list
83
  summary = ' '.join([summ['summary_text'] for summ in res])
84
  return summary
85
 
86
+ # Handle potential errors during web request or parsing
87
+ except Exception as e:
88
  return f"Error: {str(e)}"
89
 
 
 
 
 
 
 
 
90
 
91
+ # Create Gradio Interface
92
+ interface = gr.Interface(
93
+ fn=summarize_article,
94
+ inputs=[
95
+ gr.Textbox(label="Enter the article URL"),
96
+ gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length"),
97
+ gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
98
+ ],
99
+ outputs=gr.Textbox(label="Summary")
100
+ )
101
 
102
+ interface.launch()