Kawthar12h committed on
Commit 9270041 · verified · 1 Parent(s): 584e102

Update app.py

Files changed (1):
  app.py +13 -59
app.py CHANGED
@@ -1,65 +1,27 @@
-!pip install transformers
-!pip install torch
-!pip install gradio
-
-
 import gradio as gr
 from transformers import pipeline
 import torch
 from bs4 import BeautifulSoup
 import requests
 
-
-
 def summarize_article(url, min_len, max_len):
-    # Create the summarization pipeline
     summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 
     try:
-        # Send an HTTP GET request to the user-supplied URL and retrieve the web page content
         r = requests.get(url)
-
-        # Create a BeautifulSoup object to parse the HTML and extract the text content of the webpage
         soup = BeautifulSoup(r.text, 'html.parser')
-
-        # Find all the <h1> (header) and <p> (paragraph) elements in the HTML content
-        results = soup.find_all(['h1','p'])
-
-        # Extract the text content from each element and store it in a list called text
+        results = soup.find_all(['h1', 'p'])
         text = [result.text for result in results]
-
-        # Join all the extracted text into a single string representing the entire article
         ARTICLE = ' '.join(text)
 
-        # Replace sentence-ending punctuation with a special token (<eos>). This helps split the article into smaller chunks for summarization.
         ARTICLE = ARTICLE.replace('\n', '')
         ARTICLE = ARTICLE.replace('.', '.<eos>')
         ARTICLE = ARTICLE.replace('?', '?<eos>')
         ARTICLE = ARTICLE.replace('!', '!<eos>')
 
-        # Split the article into sentences based on the <eos> token and store them in a list called sentences
         sentences = ARTICLE.split('<eos>')
-
-        # Set the maximum length (in words) for each chunk of text during summarization
-        max_chunk = 500
-
-        # Initialize a variable to keep track of the current chunk being processed
         current_chunk = 0
-
-        # Create an empty list called chunks to store the individual chunks of text
         chunks = []
-
-        # The for loop iterates through each sentence in the sentences list:
-        '''If the length of the current chunk (in words) plus the length of the
-        current sentence (split on spaces) is at most max_chunk, the sentence is
-        added to the current chunk.
-
-        Otherwise, the current_chunk index is incremented to move to the next
-        chunk, and a new chunk is created with the current sentence as its first
-        sentence.
-
-        The current chunk is appended to the chunks list.
-        '''
         for sentence in sentences:
             if len(chunks) == current_chunk + 1:
                 if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
@@ -70,33 +32,25 @@ def summarize_article(url, min_len, max_len):
             else:
                 chunks.append(sentence.split(' '))
 
-        ''' After processing all sentences, the loop iterates through each chunk
-        to ensure that each chunk is represented as a single string (rather than
-        a list of words).
-        '''
         for chunk_id in range(len(chunks)):
             chunks[chunk_id] = ' '.join(chunks[chunk_id])
 
-        # Apply summarization to the text, with a length of 30-120 words for each chunk
-        res = summarizer(chunks, max_length = max_len, min_length = min_len, do_sample=False)
-
-        # Extract the 'summary_text' value from each summary in the res list
+        res = summarizer(chunks, max_length=max_len, min_length=min_len, do_sample=False)
+
         summary = ' '.join([summ['summary_text'] for summ in res])
         return summary
 
-    # Handle potential errors during web request or parsing
-    except Exception as e:
+    except Exception as e:  # Handle potential errors during web request or parsing
         return f"Error: {str(e)}"
 
-# Create the Gradio interface
-interface = gr.Interface(
-    fn=summarize_article,
-    inputs=[
-        gr.Textbox(label="Enter the article URL"),
-        gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length"),
-        gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
-    ],
-    outputs=gr.Textbox(label="Summary")
-)
+with gr.Blocks() as iface:
+    url_input = gr.Textbox(label="Enter the article URL")
+    min_len_slider = gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length")
+    max_len_slider = gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
+    summary_output = gr.Textbox(label="Summary")
+
+    btn = gr.Button("Summarize")
 
-interface.launch()
+    btn.click(fn=summarize_article, inputs=[url_input, min_len_slider, max_len_slider], outputs=summary_output)
+
+iface.launch()
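
Review note: the retained chunking loop still references max_chunk, but this commit deletes the max_chunk = 500 assignment along with its comment, so summarize_article will raise a NameError on the first request. A minimal sketch of the chunking step with the bound restored (the helper name is illustrative; the 500-word default is the value the previous revision hard-coded):

def chunk_sentences(sentences, max_chunk=500):
    """Greedily group sentences into chunks of at most max_chunk words."""
    chunks = []
    for sentence in sentences:
        words = sentence.split(' ')
        # Add the sentence to the current chunk while it still fits the word budget
        if chunks and len(chunks[-1]) + len(words) <= max_chunk:
            chunks[-1].extend(words)
        else:
            # Otherwise start a new chunk with this sentence
            chunks.append(words)
    return [' '.join(chunk) for chunk in chunks]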
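
Two smaller points the commit leaves unchanged: the summarization pipeline is constructed inside summarize_article, so the facebook/bart-large-cnn weights are reloaded on every button click, and requests.get(url) is called without a timeout, so an unresponsive site can hang the interface. A sketch of both fixes, assuming the same checkpoint as app.py (the helper name and the 10-second timeout are illustrative choices):

import requests
from transformers import pipeline

# Load the model once at startup instead of on every request
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def fetch_article_html(url, timeout=10):
    """Fetch the page, failing fast instead of hanging the Gradio UI."""
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()  # surface HTTP errors so they reach the except block
    return r.text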