sohail-shaikh-s07 committed
Commit 51a065f · verified · 1 Parent(s): a66af68

Update app.py

Files changed (1)
  1. app.py +12 -15
app.py CHANGED
@@ -12,10 +12,10 @@ try:
 except Exception as e:
     print(f"Error downloading NLTK data: {e}")
 
-# Initialize the summarization pipeline
+
 try:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device) # Here I have used the BART-Large-CNN model, you can use other models as well like T5, GPT-2 etc for faster processing and better results
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device) # I have used BART-Large-CNN; you can substitute any model you prefer, e.g. T5 or GPT-2
 except Exception as e:
     print(f"Error loading model: {e}")
     summarizer = None
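
For context, a minimal standalone sketch of what this pipeline setup does once loaded (the sample text is invented for illustration; the model name and generation arguments are the ones used in the hunk above):

    # Minimal sketch: load the same summarization pipeline and run it once.
    import torch
    from transformers import pipeline

    device = "cuda" if torch.cuda.is_available() else "cpu"
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

    sample = (
        "The city council met on Tuesday to debate the new transit plan. "
        "Officials said construction could begin next year if funding is approved."
    )
    print(summarizer(sample, max_length=130, min_length=30, do_sample=False)[0]["summary_text"])
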
@@ -39,20 +39,20 @@ def extract_article_text(url):
 
         soup = BeautifulSoup(response.text, 'html.parser')
 
-        # Remove unwanted elements -- it will help in faster processing and better results for summarization
+        # Remove unwanted elements --- so the model is not distracted by page boilerplate and misled into a wrong summary
         for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
             tag.decompose()
 
-
         article_text = ""
-
-        # Look for common article containers
+
+
         main_content = soup.find('article') or soup.find(class_=['article', 'post-content', 'entry-content', 'content'])
 
         if main_content:
             paragraphs = main_content.find_all('p')
         else:
-
+            # Fallback to all paragraphs if no article container found
             paragraphs = soup.find_all('p')
 
         # Extract text from paragraphs
  # Extract text from paragraphs
@@ -77,13 +77,13 @@ def extract_and_summarize(url):
         if not text:
             return "Could not extract text from the article. Please make sure it's a valid news article."
 
-        # Split text into chunks if it's too long --- it will divide the article into chunks of 1024 tokens
+        # Split text into chunks if it's too long --- the text is divided into chunks of 1024 characters
         max_chunk_length = 1024
         chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
 
-        # Summarize each chunk
+        # Summarize each chunk --- each small part is summarized individually
        summaries = []
-        for i, chunk in enumerate(chunks):
+        for chunk in chunks:
             if len(chunk.strip()) > 100: # Only summarize substantial chunks
                 try:
                     summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
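
The slicing above works on characters, not model tokens. A token-accurate alternative (a sketch under the assumption one wants exact token budgets; not part of this commit) would chunk with the model's own tokenizer:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

    def chunk_by_tokens(text, max_tokens=1000):
        # 1000 rather than 1024 leaves headroom for the special tokens
        # the pipeline adds when it re-encodes each chunk.
        ids = tokenizer.encode(text, add_special_tokens=False)
        return [
            tokenizer.decode(ids[i:i + max_tokens], skip_special_tokens=True)
            for i in range(0, len(ids), max_tokens)
        ]
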
@@ -95,10 +95,10 @@ def extract_and_summarize(url):
         if not summaries:
             return "Could not generate summary. Please try a different article."
 
-        # Combine all summaries --- it will combine all the summaries into a single paragraph to get a clean summary
+        # Combine all summaries --- join the chunk summaries into one complete summary
         final_summary = " ".join(summaries)
 
-        return f"Summary:\n\n{final_summary}\n\n(Processing complete - if you want to try another article, just paste a new URL above)"
+        return final_summary
 
     except Exception as e:
         return f"Error processing article: {str(e)}"
@@ -116,9 +116,6 @@ demo = gr.Interface(
     description="""
     This app creates concise summaries of news articles using AI.
     Simply paste a URL of a news article and get a summary!
-
-    Note: Please wait patiently while processing - it may take 30-60 seconds depending on the article length.
-    The model is processing when you see 'Running...' below the output box.
     """,
     examples=[
         ["https://www.bbc.com/sport/football/articles/cvgxmzy86e4o"],