sohail-shaikh-s07
committed on
Update app.py
app.py CHANGED
@@ -12,10 +12,10 @@ try:
 except Exception as e:
     print(f"Error downloading NLTK data: {e}")
 
-
+
 try:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device) #
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)  # I have used BART-Large-CNN; you can use any model you prefer, such as gpt2, t5, etc.
 except Exception as e:
     print(f"Error loading model: {e}")
     summarizer = None
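The model-loading block this hunk touches is self-contained; a minimal runnable sketch (imports and the usage example at the end are added here for illustration, not part of the commit):

```python
# Minimal sketch of the loading block above; assumes transformers and torch are installed.
import torch
from transformers import pipeline

try:
    device = "cuda" if torch.cuda.is_available() else "cpu"  # prefer GPU when present
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
except Exception as e:
    print(f"Error loading model: {e}")
    summarizer = None  # callers must check for None before use

if summarizer is not None:
    # The pipeline returns a list of dicts: [{'summary_text': '...'}]
    result = summarizer("Some long article text. " * 40, max_length=60, min_length=20, do_sample=False)
    print(result[0]["summary_text"])
```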
@@ -39,20 +39,20 @@ def extract_article_text(url):
 
         soup = BeautifulSoup(response.text, 'html.parser')
 
-        # Remove unwanted elements
+        # Remove unwanted elements --- so they don't distract the model into generating a wrong summary
         for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
             tag.decompose()
 
-
+
         article_text = ""
-
-
+
+
         main_content = soup.find('article') or soup.find(class_=['article', 'post-content', 'entry-content', 'content'])
 
         if main_content:
             paragraphs = main_content.find_all('p')
         else:
-
+            # Fallback to all paragraphs if no article container found
             paragraphs = soup.find_all('p')
 
         # Extract text from paragraphs
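The cleanup-and-fallback logic in this hunk can be exercised on its own; a rough sketch, where the requests fetch and the final paragraph join are assumptions filled in around the lines actually shown:

```python
# Rough sketch around the extraction lines above; the fetch and the
# paragraph join are assumed, only the cleanup/fallback mirrors the diff.
import requests
from bs4 import BeautifulSoup

def extract_article_text(url):
    response = requests.get(url, timeout=30)  # assumed fetch, not shown in the hunk
    soup = BeautifulSoup(response.text, "html.parser")

    # Strip scripts, styles, and page chrome so only article prose remains
    for tag in soup(["script", "style", "nav", "header", "footer", "aside"]):
        tag.decompose()

    # Prefer a dedicated article container; fall back to every <p> on the page
    main_content = soup.find("article") or soup.find(
        class_=["article", "post-content", "entry-content", "content"]
    )
    paragraphs = main_content.find_all("p") if main_content else soup.find_all("p")
    return " ".join(p.get_text(strip=True) for p in paragraphs)
```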
@@ -77,13 +77,13 @@ def extract_and_summarize(url):
         if not text:
             return "Could not extract text from the article. Please make sure it's a valid news article."
 
-        # Split text into chunks if it's too long
+        # Split text into chunks if it's too long --- divides the text into 1024-character chunks
         max_chunk_length = 1024
         chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
 
-        # Summarize each chunk
+        # Summarize each chunk --- each small part is summarized individually
         summaries = []
-        for
+        for chunk in chunks:
             if len(chunk.strip()) > 100:  # Only summarize substantial chunks
                 try:
                     summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
@@ -95,10 +95,10 @@ def extract_and_summarize(url):
         if not summaries:
             return "Could not generate summary. Please try a different article."
 
-        # Combine all summaries ---
+        # Combine all summaries --- join the chunk summaries into one complete summary
         final_summary = " ".join(summaries)
 
-        return
+        return final_summary
 
     except Exception as e:
         return f"Error processing article: {str(e)}"
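The two hunks above together form a chunk-then-join strategy. One nuance worth noting: the split is by characters, not tokens, so BART may still truncate chunks that exceed its token limit. A compact sketch of the full flow (the helper name is illustrative, not from the commit):

```python
# Compact sketch of the chunk -> summarize -> join flow from the two hunks above.
# Splitting is by characters, not tokens; over-long chunks are truncated by the model.
def summarize_long_text(text, summarizer, max_chunk_length=1024):
    chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
    summaries = []
    for chunk in chunks:
        if len(chunk.strip()) > 100:  # skip fragments too short to be worth summarizing
            result = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
            summaries.append(result[0]["summary_text"])
    return " ".join(summaries)
```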
@@ -116,9 +116,6 @@ demo = gr.Interface(
     description="""
     This app creates concise summaries of news articles using AI.
     Simply paste a URL of a news article and get a summary!
-
-    Note: Please wait patiently while processing - it may take 30-60 seconds depending on the article length.
-    The model is processing when you see 'Running...' below the output box.
     """,
     examples=[
         ["https://www.bbc.com/sport/football/articles/cvgxmzy86e4o"],
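For context, the description edited here belongs to a standard gr.Interface; a minimal sketch of how the surrounding app is presumably wired (everything outside the quoted description and example URL is an assumption):

```python
# Minimal sketch of the surrounding Gradio app; only the description text and
# the example URL appear in this diff, the other parameters are assumed.
import gradio as gr

demo = gr.Interface(
    fn=extract_and_summarize,                 # the function patched above
    inputs=gr.Textbox(label="Article URL"),   # assumed input component
    outputs=gr.Textbox(label="Summary"),      # assumed output component
    description="""
    This app creates concise summaries of news articles using AI.
    Simply paste a URL of a news article and get a summary!
    """,
    examples=[
        ["https://www.bbc.com/sport/football/articles/cvgxmzy86e4o"],
    ],
)

if __name__ == "__main__":
    demo.launch()
```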