broadfield-dev committed on
Commit
ce02056
·
verified ·
1 Parent(s): bc7e9a3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +163 -64
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  import feedparser
3
- from flask import Flask, render_template
4
  from huggingface_hub import HfApi, Repository
5
  from langchain_huggingface import HuggingFaceInferenceClient
6
  from langchain.vectorstores import Chroma
@@ -14,62 +14,74 @@ app = Flask(__name__)
14
 
15
  # Hugging Face setup
16
  HF_API_TOKEN = os.getenv("HF_API_TOKEN", "YOUR_HF_API_TOKEN")
17
- HF_MODEL = "Qwen/Qwen-72B-Instruct" # Qwen-72B model
18
- REPO_ID = "your-username/news-rag-db" # Replace with your HF repo ID
19
  LOCAL_DB_DIR = "chroma_db"
20
  client = HuggingFaceInferenceClient(model=HF_MODEL, api_key=HF_API_TOKEN)
21
 
22
- # RSS feeds to fetch (example list)
23
  RSS_FEEDS = [
24
- "http://rss.cnn.com/rss/cnn_topstories.rss",
25
- "https://feeds.bbci.co.uk/news/rss.xml",
26
- "https://www.npr.org/rss/rss.php?id=1001",
 
 
 
 
 
 
 
27
  ]
28
 
29
- # Embedding model for vectorization
30
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
31
-
32
- # Initialize Chroma DB
33
  vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
34
-
35
- # HfApi for Hugging Face Hub
36
  hf_api = HfApi()
37
 
38
  def fetch_rss_feeds():
39
- """Fetch news articles from RSS feeds."""
40
  articles = []
41
  for feed_url in RSS_FEEDS:
42
  feed = feedparser.parse(feed_url)
43
- for entry in feed.entries[:5]: # Limit to 5 articles per feed for demo
44
  articles.append({
45
  "title": entry.get("title", "No Title"),
46
  "link": entry.get("link", ""),
47
  "description": entry.get("summary", entry.get("description", "No Description")),
48
  "published": entry.get("published", "Unknown Date"),
 
49
  })
50
  return articles
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def summarize_article(text):
53
- """Summarize text using Qwen-72B via InferenceClient."""
54
- prompt = f"Summarize the following text in a concise manner:\n\n{text}"
55
  response = client.generate(prompt, max_new_tokens=100, temperature=0.7)
56
  return response.generated_text.strip()
57
 
58
  def categorize_article(text):
59
- """Categorize text into positive, negative, or neutral using Qwen-72B."""
60
- prompt = f"Classify the sentiment of the following text as positive, negative, or neutral:\n\n{text}"
61
  response = client.generate(prompt, max_new_tokens=10, temperature=0.7)
62
  return response.generated_text.strip()
63
 
64
  def process_and_store_articles(articles):
65
- """Process articles: summarize, categorize, vectorize, and store in RAG DB."""
66
  documents = []
67
  for article in articles:
68
- # Summarize and categorize
69
  summary = summarize_article(article["description"])
70
- category = categorize_article(article["description"])
71
-
72
- # Create document with metadata
73
  doc = Document(
74
  page_content=summary,
75
  metadata={
@@ -77,28 +89,21 @@ def process_and_store_articles(articles):
77
  "link": article["link"],
78
  "original_description": article["description"],
79
  "published": article["published"],
80
- "category": category,
 
81
  }
82
  )
83
  documents.append(doc)
84
-
85
- # Vectorize and store in Chroma DB
86
  vector_db.add_documents(documents)
87
  vector_db.persist()
88
-
89
- # Upload to Hugging Face Hub
90
  upload_to_hf_hub()
91
 
92
  def upload_to_hf_hub():
93
- """Upload the Chroma DB to Hugging Face Hub."""
94
  if os.path.exists(LOCAL_DB_DIR):
95
- # Check if repo exists, create if not
96
  try:
97
  hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True)
98
  except Exception as e:
99
  print(f"Error creating repo: {e}")
100
-
101
- # Upload all files in the DB directory
102
  for root, _, files in os.walk(LOCAL_DB_DIR):
103
  for file in files:
104
  local_path = os.path.join(root, file)
@@ -110,50 +115,149 @@ def upload_to_hf_hub():
110
  repo_type="dataset",
111
  token=HF_API_TOKEN
112
  )
113
- print(f"Database uploaded to Hugging Face Hub: {REPO_ID}")
114
 
115
- @app.route('/')
116
  def index():
117
- """Render the Flask frontend with news articles."""
118
  articles = fetch_rss_feeds()
119
  process_and_store_articles(articles)
120
-
121
- # Retrieve summaries from the vector DB for display
122
  stored_docs = vector_db.similarity_search("news", k=len(articles))
123
- enriched_articles = []
124
- for doc in stored_docs:
125
- enriched_articles.append({
126
  "title": doc.metadata["title"],
127
  "link": doc.metadata["link"],
128
  "summary": doc.page_content,
129
  "category": doc.metadata["category"],
 
130
  "published": doc.metadata["published"],
131
- })
132
-
133
- return render_template("index.html", articles=enriched_articles)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- # HTML template as a string (for simplicity)
 
 
 
 
 
 
 
 
 
 
136
  HTML_TEMPLATE = """
137
  <!DOCTYPE html>
138
- <html>
139
  <head>
140
- <title>News Feed</title>
 
 
141
  <style>
142
- body { font-family: Arial, sans-serif; margin: 20px; }
143
- .article { border-bottom: 1px solid #ccc; padding: 10px; }
144
- .title { font-size: 1.2em; }
145
- .summary { color: #555; }
146
- .category { font-style: italic; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  </style>
148
  </head>
149
  <body>
150
- <h1>Latest News Feed</h1>
151
- {% for article in articles %}
152
- <div class="article">
153
- <div class="title"><a href="{{ article.link }}" target="_blank">{{ article.title }}</a></div>
154
- <div class="summary">{{ article.summary }}</div>
155
- <div class="category">Category: {{ article.category }}</div>
156
- <div>Published: {{ article.published }}</div>
 
 
 
 
 
 
 
 
 
 
157
  </div>
158
  {% endfor %}
159
  </body>
@@ -161,14 +265,9 @@ HTML_TEMPLATE = """
161
  """
162
 
163
  if __name__ == "__main__":
164
- # Save the HTML template to the templates folder
165
  os.makedirs("templates", exist_ok=True)
166
  with open("templates/index.html", "w") as f:
167
  f.write(HTML_TEMPLATE)
168
-
169
- # Clear existing DB for fresh start (optional)
170
  if os.path.exists(LOCAL_DB_DIR):
171
  shutil.rmtree(LOCAL_DB_DIR)
172
-
173
- # Run Flask app
174
- app.run(debug=True, host="0.0.0.0", port=7860)
 
1
  import os
2
  import feedparser
3
+ from flask import Flask, render_template, request
4
  from huggingface_hub import HfApi, Repository
5
  from langchain_huggingface import HuggingFaceInferenceClient
6
  from langchain.vectorstores import Chroma
 
14
 
# --- Feed sources -----------------------------------------------------------
# URLs polled by fetch_rss_feeds(); extend this list to add more sources.
RSS_FEEDS = [
    "https://www.sciencedaily.com/rss/top/science.xml",
    "https://www.horoscope.com/us/horoscopes/general/rss/horoscope-rss.aspx",
    "http://rss.cnn.com/rss/cnn_allpolitics.rss",
    "https://phys.org/rss-feed/physics-news/",
    "https://www.spaceweatherlive.com/en/news/rss",
    "https://weather.com/feeds/rss",
    "https://www.wired.com/feed/rss",
    "https://www.nasa.gov/rss/dyn/breaking_news.rss",
    "https://www.nationalgeographic.com/feed/",
]

# --- Hugging Face settings --------------------------------------------------
# Model used for generation and the Hub repo the vector store is uploaded to.
HF_MODEL = "Qwen/Qwen-72B-Instruct"
REPO_ID = "your-username/news-rag-db"
# Token is read from the environment; the fallback is a literal placeholder.
HF_API_TOKEN = os.getenv("HF_API_TOKEN", "YOUR_HF_API_TOKEN")
# Directory Chroma persists to on local disk.
LOCAL_DB_DIR = "chroma_db"

# --- Shared service objects (created once at import time) -------------------
client = HuggingFaceInferenceClient(model=HF_MODEL, api_key=HF_API_TOKEN)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
hf_api = HfApi()
40
 
def fetch_rss_feeds():
    """Collect recent entries from every URL in ``RSS_FEEDS``.

    Each feed is parsed with ``feedparser`` and at most five entries per
    feed are kept.

    Returns:
        list[dict]: One dict per entry with the keys ``title``, ``link``,
        ``description``, ``published`` and ``category``. Missing entry
        fields fall back to placeholder strings; ``category`` is derived
        from the feed URL via ``categorize_feed``.
    """
    articles = []
    for feed_url in RSS_FEEDS:
        # The category depends only on the feed URL, so resolve it once per
        # feed rather than once per entry.
        category = categorize_feed(feed_url)
        feed = feedparser.parse(feed_url)
        for entry in feed.entries[:5]:  # Limit to 5 per feed
            articles.append({
                "title": entry.get("title", "No Title"),
                "link": entry.get("link", ""),
                # Prefer the entry's summary, then its description.
                "description": entry.get("summary", entry.get("description", "No Description")),
                "published": entry.get("published", "Unknown Date"),
                "category": category,
            })
    return articles
54
 
def categorize_feed(url):
    """Map a feed URL to a display category by keyword matching.

    The URL is lower-cased before matching so that differently-cased hosts
    (e.g. ``WWW.NASA.GOV``) are categorized the same as their lower-case
    form. Rules are evaluated in order and the first match wins.

    Args:
        url: Feed URL string.

    Returns:
        str: The category name, or ``"Cool Stuff"`` when no rule matches.
    """
    # Order matters: "spaceweather" must be tested before the broader
    # "weather" keyword, otherwise space-weather feeds land in Earth Weather.
    rules = (
        (("sciencedaily", "phys.org"), "Science & Physics"),
        (("horoscope",), "Astrology"),
        (("politics",), "Politics"),
        (("spaceweather", "nasa"), "Solar & Space"),
        (("weather",), "Earth Weather"),
    )
    lowered = url.lower()
    for keywords, category in rules:
        if any(keyword in lowered for keyword in keywords):
            return category
    return "Cool Stuff"
69
+
def summarize_article(text):
    """Return a short model-generated summary of ``text``.

    Args:
        text: Article text to summarize.

    Returns:
        str: The stripped generated summary, or ``""`` when ``text`` is
        empty, whitespace-only or ``None`` (no model call is made then).
    """
    # Nothing to summarize: skip the remote call instead of prompting the
    # model with an empty (or literal "None") body.
    if not text or not text.strip():
        return ""
    prompt = f"Summarize the following text concisely:\n\n{text}"
    # NOTE(review): assumes the module-level client exposes generate() and
    # that its result carries a `generated_text` attribute - confirm.
    response = client.generate(prompt, max_new_tokens=100, temperature=0.7)
    return response.generated_text.strip()
74
 
def categorize_article(text):
    """Ask the model to label the sentiment of ``text``.

    Args:
        text: Article text to classify.

    Returns:
        str: The stripped model output for the sentiment prompt, or
        ``"neutral"`` when ``text`` is empty, whitespace-only or ``None``
        (no model call is made then).
    """
    # Empty input carries no sentiment; avoid a remote call whose answer
    # would be arbitrary.
    if not text or not text.strip():
        return "neutral"
    prompt = f"Classify the sentiment as positive, negative, or neutral:\n\n{text}"
    # NOTE(review): the raw generation is returned as-is, so it may contain
    # more than the bare label - confirm against the client's output.
    response = client.generate(prompt, max_new_tokens=10, temperature=0.7)
    return response.generated_text.strip()
79
 
80
  def process_and_store_articles(articles):
 
81
  documents = []
82
  for article in articles:
 
83
  summary = summarize_article(article["description"])
84
+ sentiment = categorize_article(article["description"])
 
 
85
  doc = Document(
86
  page_content=summary,
87
  metadata={
 
89
  "link": article["link"],
90
  "original_description": article["description"],
91
  "published": article["published"],
92
+ "category": article["category"],
93
+ "sentiment": sentiment,
94
  }
95
  )
96
  documents.append(doc)
 
 
97
  vector_db.add_documents(documents)
98
  vector_db.persist()
 
 
99
  upload_to_hf_hub()
100
 
101
  def upload_to_hf_hub():
 
102
  if os.path.exists(LOCAL_DB_DIR):
 
103
  try:
104
  hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True)
105
  except Exception as e:
106
  print(f"Error creating repo: {e}")
 
 
107
  for root, _, files in os.walk(LOCAL_DB_DIR):
108
  for file in files:
109
  local_path = os.path.join(root, file)
 
115
  repo_type="dataset",
116
  token=HF_API_TOKEN
117
  )
118
+ print(f"Database uploaded to: {REPO_ID}")
119
 
def _doc_to_article(doc):
    """Flatten a stored vector-DB document into the dict the template renders."""
    meta = doc.metadata
    return {
        "title": meta["title"],
        "link": meta["link"],
        "summary": doc.page_content,
        "category": meta["category"],
        "sentiment": meta["sentiment"],
        "published": meta["published"],
    }


@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the news page, optionally filtered by a semantic search.

    On every request the feeds are fetched and stored. A POST carrying a
    non-empty ``search`` form field shows the 10 nearest stored documents
    for that query; otherwise the page shows a generic "news" search sized
    to the number of freshly fetched articles.

    Returns:
        The rendered ``index.html`` with ``categorized_articles``, a dict
        mapping category name to a list of article dicts.
    """
    # NOTE(review): ingestion runs on every request, so each page load
    # summarizes and stores all fetched entries again - consider moving it
    # to a scheduled/background job.
    articles = fetch_rss_feeds()
    if articles:
        process_and_store_articles(articles)

    query = request.form.get('search') if request.method == 'POST' else None
    if query:
        # Run only the user's search; the default listing would be discarded.
        docs = vector_db.similarity_search(query, k=10)
    elif articles:
        docs = vector_db.similarity_search("news", k=len(articles))
    else:
        # Nothing was fetched, so k would be 0; show an empty page instead.
        docs = []

    # Organize by category, preserving the search result order within each.
    categorized_articles = {}
    for article in map(_doc_to_article, docs):
        categorized_articles.setdefault(article["category"], []).append(article)

    return render_template("index.html", categorized_articles=categorized_articles)
162
+
163
+ # Updated HTML template
164
  HTML_TEMPLATE = """
165
  <!DOCTYPE html>
166
+ <html lang="en">
167
  <head>
168
+ <meta charset="UTF-8">
169
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
170
+ <title>News Feed Hub</title>
171
  <style>
172
+ body {
173
+ font-family: 'Arial', sans-serif;
174
+ margin: 0;
175
+ padding: 20px;
176
+ background-color: #f4f4f9;
177
+ color: #333;
178
+ }
179
+ h1 {
180
+ text-align: center;
181
+ color: #2c3e50;
182
+ }
183
+ .search-container {
184
+ text-align: center;
185
+ margin: 20px 0;
186
+ }
187
+ .search-bar {
188
+ width: 50%;
189
+ padding: 12px;
190
+ font-size: 16px;
191
+ border: 2px solid #3498db;
192
+ border-radius: 25px;
193
+ box-shadow: 0 2px 5px rgba(0,0,0,0.1);
194
+ outline: none;
195
+ transition: border-color 0.3s;
196
+ }
197
+ .search-bar:focus {
198
+ border-color: #2980b9;
199
+ }
200
+ .category-section {
201
+ margin: 30px 0;
202
+ }
203
+ .category-title {
204
+ background-color: #3498db;
205
+ color: white;
206
+ padding: 10px;
207
+ border-radius: 5px;
208
+ font-size: 1.4em;
209
+ }
210
+ .article {
211
+ background-color: white;
212
+ padding: 15px;
213
+ margin: 10px 0;
214
+ border-radius: 8px;
215
+ box-shadow: 0 2px 5px rgba(0,0,0,0.1);
216
+ transition: transform 0.2s;
217
+ }
218
+ .article:hover {
219
+ transform: translateY(-3px);
220
+ }
221
+ .title a {
222
+ font-size: 1.2em;
223
+ color: #2c3e50;
224
+ text-decoration: none;
225
+ }
226
+ .title a:hover {
227
+ color: #3498db;
228
+ }
229
+ .summary {
230
+ color: #555;
231
+ margin: 5px 0;
232
+ }
233
+ .sentiment {
234
+ font-style: italic;
235
+ color: #7f8c8d;
236
+ }
237
+ .published {
238
+ font-size: 0.9em;
239
+ color: #95a5a6;
240
+ }
241
  </style>
242
  </head>
243
  <body>
244
+ <h1>News Feed Hub</h1>
245
+ <div class="search-container">
246
+ <form method="POST">
247
+ <input type="text" name="search" class="search-bar" placeholder="Search news semantically...">
248
+ </form>
249
+ </div>
250
+ {% for category, articles in categorized_articles.items() %}
251
+ <div class="category-section">
252
+ <div class="category-title">{{ category }}</div>
253
+ {% for article in articles %}
254
+ <div class="article">
255
+ <div class="title"><a href="{{ article.link }}" target="_blank">{{ article.title }}</a></div>
256
+ <div class="summary">{{ article.summary }}</div>
257
+ <div class="sentiment">Sentiment: {{ article.sentiment }}</div>
258
+ <div class="published">Published: {{ article.published }}</div>
259
+ </div>
260
+ {% endfor %}
261
  </div>
262
  {% endfor %}
263
  </body>
 
265
  """
266
 
if __name__ == "__main__":
    # Write the inline template to templates/index.html, the name that
    # index() passes to render_template. The page declares charset UTF-8,
    # so write it as UTF-8 rather than the platform's default encoding.
    os.makedirs("templates", exist_ok=True)
    with open("templates/index.html", "w", encoding="utf-8") as f:
        f.write(HTML_TEMPLATE)

    # Start each launch from an empty local vector store.
    # NOTE(review): vector_db is already constructed on LOCAL_DB_DIR at
    # import time; confirm that removing the directory afterwards leaves
    # the store in a usable state.
    if os.path.exists(LOCAL_DB_DIR):
        shutil.rmtree(LOCAL_DB_DIR)

    # NOTE(review): the previous revision listened on 7860; confirm that
    # 7560 is intentional for the hosting environment.
    app.run(host="0.0.0.0", port=7560)