sohail-shaikh-s07 committed
Commit 53ed53b · verified · 1 Parent(s): e521008

Update app.py

Files changed (1)
app.py +14 -11
app.py CHANGED
@@ -6,7 +6,7 @@ import nltk
 import torch
 from urllib.parse import urlparse
 
-# Download required NLTK data
+
 try:
     nltk.download('punkt')
 except Exception as e:
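The commit replaces the "# Download required NLTK data" comment with a blank line but keeps the unconditional nltk.download('punkt') call, which touches the network on every startup. A minimal sketch of a more defensive variant, assuming the standard NLTK cache layout (a suggestion, not part of this commit):

import nltk

# Only fetch the punkt tokenizer when it is not already cached locally;
# nltk.data.find raises LookupError if the resource is missing.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", quiet=True)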
@@ -15,7 +15,7 @@ except Exception as e:
 # Initialize the summarization pipeline
 try:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)  # BART-large-CNN is used here; other summarization models such as T5 or DistilBART can be swapped in for faster processing
 except Exception as e:
     print(f"Error loading model: {e}")
     summarizer = None
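The new inline comment notes that the model is a drop-in choice. For illustration, a hedged sketch that loads a distilled checkpoint instead; sshleifer/distilbart-cnn-12-6 is a smaller BART summarizer on the Hub, but treat the exact model choice as an assumption:

import torch
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
# A distilled checkpoint trades some summary quality for speed; any model
# published for the "summarization" task can be substituted here.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device,
)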
@@ -39,11 +39,11 @@ def extract_article_text(url):
 
         soup = BeautifulSoup(response.text, 'html.parser')
 
-        # Remove unwanted elements
+        # Remove unwanted elements; stripping boilerplate speeds up processing and gives the summarizer cleaner input
         for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
             tag.decompose()
 
-        # Find the main content
+
         article_text = ""
 
         # Look for common article containers
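For context on the cleanup step: decompose() removes a tag and everything inside it from the parse tree in place. A self-contained illustration (the sample HTML is invented):

from bs4 import BeautifulSoup

html = "<article><script>track()</script><p>Story text.</p></article>"
soup = BeautifulSoup(html, "html.parser")

# soup([...]) is shorthand for find_all; decompose() deletes each match in place
for tag in soup(["script", "style"]):
    tag.decompose()

print(soup.get_text())  # -> "Story text."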
@@ -52,7 +52,7 @@ def extract_article_text(url):
         if main_content:
             paragraphs = main_content.find_all('p')
         else:
-            # Fallback to all paragraphs if no article container found
+
             paragraphs = soup.find_all('p')
 
         # Extract text from paragraphs
@@ -77,13 +77,13 @@ def extract_and_summarize(url):
         if not text:
             return "Could not extract text from the article. Please make sure it's a valid news article."
 
-        # Split text into chunks if it's too long
+        # Split text into chunks if it's too long; the article is divided into chunks of 1024 characters each
         max_chunk_length = 1024
         chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
 
         # Summarize each chunk
         summaries = []
-        for chunk in chunks:
+        for i, chunk in enumerate(chunks):
             if len(chunk.strip()) > 100:  # Only summarize substantial chunks
                 try:
                     summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
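One caveat the new comment glosses over: the slicing is character-based, so chunk boundaries can fall mid-sentence. Since the app already downloads NLTK's punkt tokenizer, a sentence-aware splitter is a natural alternative; a sketch, with the helper name chunk_by_sentence being my own:

from nltk.tokenize import sent_tokenize

def chunk_by_sentence(text, max_chunk_length=1024):
    """Group whole sentences into chunks of at most max_chunk_length characters."""
    chunks, current = [], ""
    for sentence in sent_tokenize(text):
        # Start a new chunk when the next sentence would overflow the limit
        if current and len(current) + len(sentence) + 1 > max_chunk_length:
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)
    return chunks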
@@ -95,10 +95,10 @@ def extract_and_summarize(url):
         if not summaries:
             return "Could not generate summary. Please try a different article."
 
-        # Combine all summaries
+        # Combine all summaries into a single paragraph for a clean final result
        final_summary = " ".join(summaries)
 
-        return final_summary
+        return f"Summary:\n\n{final_summary}\n\n(Processing complete - if you want to try another article, just paste a new URL above)"
 
     except Exception as e:
         return f"Error processing article: {str(e)}"
@@ -116,10 +116,13 @@ demo = gr.Interface(
     description="""
     This app creates concise summaries of news articles using AI.
     Simply paste a URL of a news article and get a summary!
+
+    Note: Please be patient while the article is processed - it may take 30-60 seconds depending on the article length.
+    The model is working whenever you see 'Running...' below the output box.
     """,
     examples=[
-        ["https://www.bbc.com/news/world-us-canada-67841980"],
-        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
+        ["https://www.bbc.com/sport/football/articles/cvgxmzy86e4o"],
+        ["https://globalsouthworld.com/article/biden-approves-571-million-in-defense-support-for-taiwan"]
     ],
     theme=gr.themes.Soft()
 )
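The hunk only touches the description and examples; the fn/inputs/outputs wiring sits outside the diff. For orientation, a minimal sketch of how such an Interface is typically assembled - the argument values below are assumptions inferred from the surrounding code, not lines from this commit:

import gradio as gr

demo = gr.Interface(
    fn=extract_and_summarize,                    # assumed handler, defined earlier in app.py
    inputs=gr.Textbox(label="News Article URL"),
    outputs=gr.Textbox(label="Summary"),
    description="This app creates concise summaries of news articles using AI.",
    theme=gr.themes.Soft(),
)

if __name__ == "__main__":
    demo.launch()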
 