sohail-shaikh-s07
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1 |
import gradio as gr
|
2 |
-
|
|
|
3 |
from transformers import pipeline
|
4 |
import nltk
|
5 |
import torch
|
|
|
6 |
|
7 |
# Download required NLTK data
|
8 |
try:
|
@@ -18,20 +20,62 @@ except Exception as e:
|
|
18 |
print(f"Error loading model: {e}")
|
19 |
summarizer = None
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
def extract_and_summarize(url):
|
22 |
if not url or not url.strip():
|
23 |
return "Please enter a valid URL"
|
24 |
|
|
|
|
|
|
|
25 |
try:
|
26 |
-
#
|
27 |
-
|
28 |
-
article.download()
|
29 |
-
article.parse()
|
30 |
|
31 |
-
# Get the text content
|
32 |
-
text = article.text
|
33 |
if not text:
|
34 |
-
return "Could not extract text from the article"
|
35 |
|
36 |
# Split text into chunks if it's too long
|
37 |
max_chunk_length = 1024
|
@@ -62,10 +106,17 @@ def extract_and_summarize(url):
|
|
62 |
# Create Gradio interface
|
63 |
demo = gr.Interface(
|
64 |
fn=extract_and_summarize,
|
65 |
-
inputs=gr.Textbox(
|
66 |
-
|
|
|
|
|
|
|
|
|
67 |
title="📰 News Article Summarizer",
|
68 |
-
description="
|
|
|
|
|
|
|
69 |
examples=[
|
70 |
["https://www.bbc.com/news/world-us-canada-67841980"],
|
71 |
["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
|
|
|
1 |
import gradio as gr
|
2 |
+
import requests
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
from transformers import pipeline
|
5 |
import nltk
|
6 |
import torch
|
7 |
+
from urllib.parse import urlparse
|
8 |
|
9 |
# Download required NLTK data
|
10 |
try:
|
|
|
20 |
print(f"Error loading model: {e}")
|
21 |
summarizer = None
|
22 |
|
23 |
+
def is_valid_url(url):
    """Return True if *url* is a well-formed http(s) URL.

    Only ``http`` and ``https`` schemes are accepted, matching the
    caller's error message ("Please enter a valid URL starting with
    http:// or https://") — the previous check passed any scheme.
    """
    try:
        parsed = urlparse(url)
    except (TypeError, ValueError, AttributeError):
        # urlparse raises ValueError for some malformed inputs (e.g. bad
        # IPv6 brackets); TypeError/AttributeError cover non-string args.
        # The original used a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit.
        return False
    # Require both a supported scheme and a network location.
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
|
29 |
+
|
30 |
+
def extract_article_text(url):
    """Fetch *url* and extract the article body text using BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the news article to download.

    Returns
    -------
    str
        Space-joined text of substantial paragraphs (> 50 characters
        each); may be an empty string if nothing qualifying is found.

    Raises
    ------
    Exception
        Wraps any network or parsing failure with a descriptive message;
        the original exception is chained as ``__cause__``.
    """
    # A browser-like User-Agent avoids trivial bot blocking by news sites.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Drop boilerplate elements that never contain article prose.
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()

        # Prefer a semantic <article> element or a common content-class
        # container; fall back to every <p> on the page.
        main_content = soup.find('article') or soup.find(
            class_=['article', 'post-content', 'entry-content', 'content']
        )
        paragraphs = main_content.find_all('p') if main_content else soup.find_all('p')

        # Keep only substantial paragraphs (> 50 chars).  get_text() is
        # computed once per paragraph — the original called it twice.
        texts = (p.get_text().strip() for p in paragraphs)
        return ' '.join(t for t in texts if len(t) > 50)

    except Exception as e:
        # ``from e`` preserves the root cause for debugging; the original
        # re-raise discarded the chain.
        raise Exception(f"Error fetching article: {str(e)}") from e
|
65 |
+
|
66 |
def extract_and_summarize(url):
|
67 |
if not url or not url.strip():
|
68 |
return "Please enter a valid URL"
|
69 |
|
70 |
+
if not is_valid_url(url):
|
71 |
+
return "Please enter a valid URL starting with http:// or https://"
|
72 |
+
|
73 |
try:
|
74 |
+
# Extract article text
|
75 |
+
text = extract_article_text(url)
|
|
|
|
|
76 |
|
|
|
|
|
77 |
if not text:
|
78 |
+
return "Could not extract text from the article. Please make sure it's a valid news article."
|
79 |
|
80 |
# Split text into chunks if it's too long
|
81 |
max_chunk_length = 1024
|
|
|
106 |
# Create Gradio interface
|
107 |
demo = gr.Interface(
|
108 |
fn=extract_and_summarize,
|
109 |
+
inputs=gr.Textbox(
|
110 |
+
label="Enter News Article URL",
|
111 |
+
placeholder="https://...",
|
112 |
+
info="Enter a news article URL to get a summary"
|
113 |
+
),
|
114 |
+
outputs=gr.Textbox(label="Summary", lines=5),
|
115 |
title="📰 News Article Summarizer",
|
116 |
+
description="""
|
117 |
+
This app creates concise summaries of news articles using AI.
|
118 |
+
Simply paste a URL of a news article and get a summary!
|
119 |
+
""",
|
120 |
examples=[
|
121 |
["https://www.bbc.com/news/world-us-canada-67841980"],
|
122 |
["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
|