Spaces:
Runtime error
Runtime error
Delete App_Function_Libraries/Article_Summarization_Lib.py
Browse files
App_Function_Libraries/Article_Summarization_Lib.py
DELETED
@@ -1,246 +0,0 @@
|
|
1 |
-
# Article_Summarization_Lib.py
|
2 |
-
#########################################
|
3 |
-
# Article Summarization Library
|
4 |
-
# This library is used to handle summarization of articles.
|
5 |
-
|
6 |
-
#
|
7 |
-
####
|
8 |
-
#
|
9 |
-
####################
|
10 |
-
# Function List
|
11 |
-
#
|
12 |
-
# 1.
|
13 |
-
#
|
14 |
-
####################
|
15 |
-
#
|
16 |
-
# Import necessary libraries
|
17 |
-
import datetime
|
18 |
-
from datetime import datetime
|
19 |
-
import gradio as gr
|
20 |
-
import json
|
21 |
-
import os
|
22 |
-
import logging
|
23 |
-
import requests
|
24 |
-
# 3rd-Party Imports
|
25 |
-
from tqdm import tqdm
|
26 |
-
|
27 |
-
from App_Function_Libraries.Utils.Utils import sanitize_filename
|
28 |
-
# Local Imports
|
29 |
-
from Article_Extractor_Lib import scrape_article
|
30 |
-
from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \
|
31 |
-
summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm
|
32 |
-
from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \
|
33 |
-
summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, summarize_with_huggingface, \
|
34 |
-
summarize_with_mistral
|
35 |
-
from App_Function_Libraries.DB.DB_Manager import ingest_article_to_db
|
36 |
-
#
|
37 |
-
#######################################################################################################################
|
38 |
-
# Function Definitions
|
39 |
-
#
|
40 |
-
|
41 |
-
def scrape_and_summarize_multiple(urls, custom_prompt_arg, api_name, api_key, keywords, custom_article_titles, system_message=None):
    """Scrape a newline-separated list of URLs and return the extracted articles.

    Note: despite the name, this function only scrapes. The summarization
    parameters (custom_prompt_arg, api_name, api_key, keywords, system_message)
    are accepted for interface compatibility but are not used in this body.

    Args:
        urls: Newline-separated string of URLs to scrape.
        custom_prompt_arg: Unused here (kept for caller compatibility).
        api_name: Unused here (kept for caller compatibility).
        api_key: Unused here (kept for caller compatibility).
        keywords: Unused here (kept for caller compatibility).
        custom_article_titles: Optional newline-separated titles; the i-th
            title, when present, overrides the scraped title of the i-th URL.
        system_message: Unused here (kept for caller compatibility).

    Returns:
        list: Article dicts for every URL whose extraction succeeded.
    """
    urls = [url.strip() for url in urls.split('\n') if url.strip()]
    custom_titles = custom_article_titles.split('\n') if custom_article_titles else []

    results = []
    errors = []

    # Gradio progress bar shown in the UI while URLs are processed.
    progress = gr.Progress()

    for i, url in tqdm(enumerate(urls), total=len(urls), desc="Processing URLs"):
        custom_title = custom_titles[i] if i < len(custom_titles) else None
        try:
            article = scrape_article(url)
            if article and article['extraction_successful']:
                if custom_title:
                    article['title'] = custom_title
                results.append(article)
            else:
                # FIX: a failed extraction was previously dropped silently;
                # record it so the error log accounts for every missing URL.
                errors.append(f"Error processing URL {i + 1} ({url}): extraction failed")
        except Exception as e:
            error_message = f"Error processing URL {i + 1} ({url}): {str(e)}"
            errors.append(error_message)

        # Update progress after each URL, whether it succeeded or not.
        progress((i + 1) / len(urls), desc=f"Processed {i + 1}/{len(urls)} URLs")

    if errors:
        logging.error("\n".join(errors))

    return results
|
70 |
-
|
71 |
-
|
72 |
-
def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title, system_message=None):
    """Scrape a single article, summarize it via the selected API, and ingest it.

    Pipeline: (1) scrape the URL, (2) dump the content to a Results/*.json
    segments file and dispatch to the summarizer matching ``api_name``,
    (3) store the article plus summary in the database.

    Args:
        url: Article URL to scrape.
        custom_prompt_arg: Summarization prompt; falls back to a default.
        api_name: Name of the summarization backend (e.g. 'openai', 'groq');
            falsy means "do not summarize".
        api_key: API key forwarded to the remote summarizers.
        keywords: Keywords stored alongside the article in the DB.
        custom_article_title: Optional title overriding the scraped one.
        system_message: Optional system prompt; falls back to a default.

    Returns:
        str: Human-readable report (title/author/ingestion result/summary/content),
        or an error string when scraping or processing fails.
    """
    try:
        # Step 1: Scrape the article
        article_data = scrape_article(url)
        print(f"Scraped Article Data: {article_data}")  # Debugging statement
        if not article_data:
            return "Failed to scrape the article."

        # Use the custom title if provided, otherwise use the scraped title
        title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled')
        author = article_data.get('author', 'Unknown')
        content = article_data.get('content', '')
        ingestion_date = datetime.now().strftime('%Y-%m-%d')

        print(f"Title: {title}, Author: {author}, Content Length: {len(content)}")  # Debugging statement

        # Custom system prompt for the article
        system_message = system_message or "Act as a professional summarizer and summarize this article."
        # Custom prompt for the article
        article_custom_prompt = custom_prompt_arg or "Act as a professional summarizer and summarize this article."

        # Step 2: Summarize the article
        summary = None
        if api_name:
            logging.debug(f"Article_Summarizer: Summarization being performed by {api_name}")

            # Sanitize filename for saving the JSON file
            sanitized_title = sanitize_filename(title)
            # FIX: ensure the Results directory exists before writing into it.
            os.makedirs("Results", exist_ok=True)
            json_file_path = os.path.join("Results", f"{sanitized_title}_segments.json")

            with open(json_file_path, 'w') as json_file:
                json.dump([{'text': content}], json_file, indent=2)

            # FIXME - Swap out this if/else to use the dedicated function....
            try:
                if api_name.lower() == 'openai':
                    # def summarize_with_openai(api_key, input_data, custom_prompt_arg)
                    summary = summarize_with_openai(api_key, json_file_path, article_custom_prompt, system_message)

                elif api_name.lower() == "anthropic":
                    # def summarize_with_anthropic(api_key, input_data, model, custom_prompt_arg, max_retries=3, retry_delay=5):
                    summary = summarize_with_anthropic(api_key, json_file_path, article_custom_prompt, system_message)
                elif api_name.lower() == "cohere":
                    # def summarize_with_cohere(api_key, input_data, model, custom_prompt_arg)
                    summary = summarize_with_cohere(api_key, json_file_path, article_custom_prompt, system_message)

                elif api_name.lower() == "groq":
                    logging.debug(f"MAIN: Trying to summarize with groq")
                    # def summarize_with_groq(api_key, input_data, model, custom_prompt_arg):
                    summary = summarize_with_groq(api_key, json_file_path, article_custom_prompt, system_message)

                elif api_name.lower() == "openrouter":
                    logging.debug(f"MAIN: Trying to summarize with OpenRouter")
                    # def summarize_with_openrouter(api_key, input_data, custom_prompt_arg):
                    summary = summarize_with_openrouter(api_key, json_file_path, article_custom_prompt, system_message)

                elif api_name.lower() == "deepseek":
                    logging.debug(f"MAIN: Trying to summarize with DeepSeek")
                    # def summarize_with_deepseek(api_key, input_data, custom_prompt_arg):
                    summary = summarize_with_deepseek(api_key, json_file_path, article_custom_prompt, system_message)

                elif api_name.lower() == "mistral":
                    summary = summarize_with_mistral(api_key, json_file_path, article_custom_prompt, system_message)

                elif api_name.lower() == "llama.cpp":
                    logging.debug(f"MAIN: Trying to summarize with Llama.cpp")
                    # def summarize_with_llama(api_url, file_path, token, custom_prompt)
                    summary = summarize_with_llama(json_file_path, article_custom_prompt, system_message)

                elif api_name.lower() == "kobold":
                    logging.debug(f"MAIN: Trying to summarize with Kobold.cpp")
                    # def summarize_with_kobold(input_data, kobold_api_token, custom_prompt_input, api_url):
                    summary = summarize_with_kobold(json_file_path, api_key, article_custom_prompt, system_message)

                elif api_name.lower() == "ooba":
                    # def summarize_with_oobabooga(input_data, api_key, custom_prompt, api_url):
                    summary = summarize_with_oobabooga(json_file_path, api_key, article_custom_prompt, system_message)

                elif api_name.lower() == "tabbyapi":
                    # def summarize_with_tabbyapi(input_data, tabby_model, custom_prompt_input, api_key=None, api_IP):
                    summary = summarize_with_tabbyapi(json_file_path, article_custom_prompt, system_message)

                elif api_name.lower() == "vllm":
                    logging.debug(f"MAIN: Trying to summarize with VLLM")
                    # def summarize_with_vllm(api_key, input_data, custom_prompt_input):
                    summary = summarize_with_vllm(json_file_path, article_custom_prompt, system_message)

                elif api_name.lower() == "local-llm":
                    logging.debug(f"MAIN: Trying to summarize with Local LLM")
                    summary = summarize_with_local_llm(json_file_path, article_custom_prompt, system_message)

                elif api_name.lower() == "huggingface":
                    logging.debug(f"MAIN: Trying to summarize with huggingface")
                    # def summarize_with_huggingface(api_key, input_data, custom_prompt_arg):
                    # FIX: the return value was previously discarded (call had no
                    # assignment), so huggingface summaries were always reported
                    # as "Summary not available".
                    summary = summarize_with_huggingface(api_key, json_file_path, article_custom_prompt, system_message)
                # Add additional API handlers here...
            except requests.exceptions.ConnectionError as e:
                logging.error(f"Connection error while trying to summarize with {api_name}: {str(e)}")

            if summary:
                logging.info(f"Article_Summarizer: Summary generated using {api_name} API")
                save_summary_to_file(summary, json_file_path)
            else:
                summary = "Summary not available"
                logging.warning(f"Failed to generate summary using {api_name} API")

        else:
            summary = "Article Summarization: No API provided for summarization."

        print(f"Summary: {summary}")  # Debugging statement

        # Step 3: Ingest the article into the database
        ingestion_result = ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date,
                                                article_custom_prompt)

        return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nSummary: {summary}\n\nArticle Contents: {content}"
    except Exception as e:
        logging.error(f"Error processing URL {url}: {str(e)}")
        return f"Failed to process URL {url}: {str(e)}"
|
191 |
-
|
192 |
-
|
193 |
-
def scrape_and_no_summarize_then_ingest(url, keywords, custom_article_title):
    """Scrape a single article and ingest it into the database without summarizing.

    Args:
        url: Article URL to scrape.
        keywords: Keywords stored alongside the article in the DB.
        custom_article_title: Optional title overriding the scraped one.

    Returns:
        str: Human-readable report (title/author/ingestion result/content),
        or an error string when scraping or processing fails.
    """
    try:
        # Step 1: Scrape the article
        article_data = scrape_article(url)
        print(f"Scraped Article Data: {article_data}")  # Debugging statement
        if not article_data:
            return "Failed to scrape the article."

        # Use the custom title if provided, otherwise use the scraped title
        title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled')
        author = article_data.get('author', 'Unknown')
        content = article_data.get('content', '')
        ingestion_date = datetime.now().strftime('%Y-%m-%d')

        print(f"Title: {title}, Author: {author}, Content Length: {len(content)}")  # Debugging statement

        # Step 2: Ingest the article into the database.
        # FIX: the other call sites in this module pass arguments in the order
        # (url, title, author, content, keywords, summary, ingestion_date,
        # custom_prompt); the previous call here put ingestion_date in the
        # summary slot. Pass None for summary since nothing was summarized.
        # NOTE(review): confirm against ingest_article_to_db's signature.
        ingestion_result = ingest_article_to_db(url, title, author, content, keywords, None, ingestion_date, None)

        return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nArticle Contents: {content}"
    except Exception as e:
        logging.error(f"Error processing URL {url}: {str(e)}")
        return f"Failed to process URL {url}: {str(e)}"
|
216 |
-
|
217 |
-
|
218 |
-
def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title, system_message=None):
    """Summarize free-form text (optionally) and ingest it into the database.

    Args:
        text: The unstructured text to ingest.
        custom_prompt: Summarization prompt forwarded to the summarizer.
        api_name: Summarization backend; only 'openai' is currently handled.
        api_key: API key forwarded to the summarizer.
        keywords: Keywords stored alongside the text in the DB.
        custom_article_title: Optional title; defaults to "Unstructured Text".
        system_message: Optional system prompt forwarded to the summarizer.

    Returns:
        str: Report containing the title, summary, and ingestion result.
    """
    title = custom_article_title.strip() if custom_article_title else "Unstructured Text"
    author = "Unknown"
    ingestion_date = datetime.now().strftime('%Y-%m-%d')

    # Summarize the unstructured text
    if api_name:
        # FIX: build the path the same way scrape_and_summarize does —
        # sanitize_filename guards against path-hostile characters that a raw
        # space-to-underscore replace would let through — and make sure the
        # Results directory exists before writing.
        os.makedirs("Results", exist_ok=True)
        json_file_path = os.path.join("Results", f"{sanitize_filename(title)}_segments.json")
        with open(json_file_path, 'w') as json_file:
            json.dump([{'text': text}], json_file, indent=2)

        if api_name.lower() == 'openai':
            summary = summarize_with_openai(api_key, json_file_path, custom_prompt, system_message)
        # Add other APIs as needed
        else:
            summary = "Unsupported API."
    else:
        summary = "No API provided for summarization."

    # Ingest the unstructured text into the database
    ingestion_result = ingest_article_to_db('Unstructured Text', title, author, text, keywords, summary, ingestion_date,
                                            custom_prompt)
    return f"Title: {title}\nSummary: {summary}\nIngestion Result: {ingestion_result}"
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
#
|
245 |
-
#
|
246 |
-
#######################################################################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|