Kawthar12h committed on
Commit
c0ea65f
·
verified ·
1 Parent(s): 62dc08f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -0
app.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import torch
4
+ from bs4 import BeautifulSoup
5
+ import requests
6
+
7
+
8
+
9
# Lazily created summarization pipeline, shared by every request so the
# model is loaded once instead of on each call.
_SUMMARIZER = None

# Maximum number of words per chunk handed to the summarizer.
_MAX_CHUNK_WORDS = 500

# Seconds to wait for the article's web server before giving up.
_REQUEST_TIMEOUT = 30


def _get_summarizer():
    """Return the shared summarization pipeline, creating it on first use."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        _SUMMARIZER = pipeline("summarization", model="facebook/bart-large-cnn")
    return _SUMMARIZER


def _split_into_chunks(article, max_chunk=_MAX_CHUNK_WORDS):
    """Group the sentences of *article* into chunks of at most *max_chunk* words.

    Sentences are never split: a sentence is appended to the current chunk
    while the chunk stays within *max_chunk* words, otherwise it starts a new
    chunk. A single sentence longer than *max_chunk* words therefore forms
    an oversized chunk on its own.

    Returns a list of strings; the list is empty when *article* contains no
    words.
    """
    import re

    # Split only after sentence-ending punctuation that is followed by
    # whitespace, so decimals ("3.14") and URLs are not cut apart.
    sentences = re.split(r"(?<=[.!?])\s+", article)

    chunks = []
    current_words = []
    for sentence in sentences:
        # str.split() with no argument also collapses newlines and repeated
        # spaces, so no empty "words" inflate the chunk size.
        words = sentence.split()
        if not words:
            continue
        if current_words and len(current_words) + len(words) > max_chunk:
            chunks.append(" ".join(current_words))
            current_words = []
        current_words.extend(words)

    if current_words:
        chunks.append(" ".join(current_words))
    return chunks


def summarize_article(url, min_len, max_len):
    """Download the page at *url* and return a summary of its text.

    The text of all <h1> and <p> elements is joined, split into chunks of
    whole sentences, and each chunk is summarized separately; the chunk
    summaries are joined with spaces.

    Args:
        url: Address of the article to summarize.
        min_len: Minimum length passed to the summarizer for each chunk.
        max_len: Maximum length passed to the summarizer for each chunk.

    Returns:
        The summary text, or a string starting with "Error: " when the
        input is invalid or fetching, parsing, or summarizing fails.
    """
    # Validate the input first so obviously bad requests never trigger a
    # model load or a network call.
    url = (url or "").strip()
    if not url:
        return "Error: Please enter an article URL."

    try:
        # The UI may hand over floats; the summarizer lengths must be ints.
        min_len, max_len = int(min_len), int(max_len)
    except (TypeError, ValueError):
        return "Error: Minimum and maximum length must be numbers."
    if min_len > max_len:
        return "Error: Minimum length must not be greater than maximum length."

    # Kept outside the try block, as before: a failure to load the model is
    # raised to the caller rather than reported as a per-request error.
    summarizer = _get_summarizer()

    try:
        # Fetch the page; the timeout keeps a dead server from hanging the
        # request, and raise_for_status stops error pages being summarized.
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()

        # Collect the text of every heading and paragraph as the article.
        soup = BeautifulSoup(response.text, 'html.parser')
        elements = soup.find_all(['h1', 'p'])
        article = ' '.join(element.text for element in elements)

        chunks = _split_into_chunks(article)
        if not chunks:
            return "Error: No article text found at the given URL."

        # truncation=True guards against a chunk (e.g. one very long
        # sentence) exceeding the model's input limit.
        results = summarizer(
            chunks,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True,
        )

        # Stitch the per-chunk summaries into one text.
        return ' '.join(result['summary_text'] for result in results)

    # This function is the UI boundary: report any failure from the web
    # request, parsing, or summarization as text instead of crashing.
    except Exception as e:
        return f"Error: {str(e)}"
84
+
85
+
86
# Build the Gradio UI: one URL box and two length sliders feed
# summarize_article, and its return value is shown in a text box.
url_input = gr.Textbox(label="Enter the article URL")
min_length_slider = gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length")
max_length_slider = gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
summary_output = gr.Textbox(label="Summary")

interface = gr.Interface(
    fn=summarize_article,
    inputs=[url_input, min_length_slider, max_length_slider],
    outputs=summary_output,
)

# Start the web app.
interface.launch()