broadfield-dev committed on
Commit
935c631
·
verified ·
1 Parent(s): 6fa35e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -17
app.py CHANGED
@@ -34,23 +34,29 @@ def load_feeds_in_background():
34
 
35
  @app.route('/')
36
  def index():
37
- global loading_complete
38
- loading_complete = False # Reset on each load
39
 
40
- # Ensure Chroma DB is downloaded from Hugging Face Hub on first load
41
- if not os.path.exists("chroma_db"):
 
 
 
42
  logger.info("Downloading Chroma DB from Hugging Face Hub...")
43
  download_from_hf_hub()
44
-
45
- # Start background feed loading
46
- threading.Thread(target=load_feeds_in_background, daemon=True).start()
 
 
 
 
47
 
48
  try:
49
  # Retrieve all articles from Chroma DB
50
  all_docs = vector_db.get(include=['documents', 'metadatas'])
51
  if not all_docs.get('metadatas'):
52
  logger.info("No articles in DB yet")
53
- return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
54
 
55
  # Process and categorize articles, getting only 10 most recent per category with strict deduplication
56
  enriched_articles = []
@@ -61,15 +67,12 @@ def index():
61
  title = meta.get("title", "No Title").strip()
62
  link = meta.get("link", "").strip()
63
  published = meta.get("published", "Unknown Date").strip()
64
- # Use a more robust key including trimmed fields to prevent duplicates
65
  key = f"{title}|{link}|{published}"
66
  if key not in seen_keys:
67
  seen_keys.add(key)
68
- # Try to parse published date, fallback to string sorting
69
  try:
70
  published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
71
  except (ValueError, TypeError):
72
- # Fallback to a very old date for sorting if parsing fails
73
  published = "1970-01-01T00:00:00"
74
  enriched_articles.append({
75
  "title": title,
@@ -80,21 +83,17 @@ def index():
80
  "image": meta.get("image", "svg"),
81
  })
82
 
83
- # Sort by published date (handle both datetime and string)
84
  enriched_articles.sort(key=lambda x: x["published"], reverse=True)
85
 
86
- # Group by category and limit to 10 most recent per category with final deduplication
87
  categorized_articles = {}
88
  for article in enriched_articles:
89
  cat = article["category"]
90
  if cat not in categorized_articles:
91
  categorized_articles[cat] = []
92
- # Add only if not already in the category list (extra deduplication)
93
  key = f"{article['title']}|{article['link']}|{article['published']}"
94
  if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
95
  categorized_articles[cat].append(article)
96
 
97
- # Limit to 10 most recent per category and sort again for safety
98
  for cat in categorized_articles:
99
  unique_articles = []
100
  seen_cat_keys = set()
@@ -109,10 +108,10 @@ def index():
109
  return render_template("index.html",
110
  categorized_articles=categorized_articles,
111
  has_articles=True,
112
- loading=True)
113
  except Exception as e:
114
  logger.error(f"Error retrieving articles: {e}")
115
- return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
116
 
117
  @app.route('/search', methods=['POST'])
118
  def search():
 
34
 
35
  @app.route('/')
36
  def index():
37
+ global loading_complete, last_update_time
 
38
 
39
+ # Check if the database needs to be loaded (first time or empty)
40
+ db_exists = os.path.exists("chroma_db") and vector_db.get().get('documents')
41
+ if not db_exists:
42
+ # First load: DB doesn't exist or is empty
43
+ loading_complete = False
44
  logger.info("Downloading Chroma DB from Hugging Face Hub...")
45
  download_from_hf_hub()
46
+ threading.Thread(target=load_feeds_in_background, daemon=True).start()
47
+ elif not loading_complete:
48
+ # Background loading is still in progress from a previous request
49
+ pass # Let it continue, spinner will show
50
+ else:
51
+ # DB exists and loading is complete, no spinner needed
52
+ loading_complete = True
53
 
54
  try:
55
  # Retrieve all articles from Chroma DB
56
  all_docs = vector_db.get(include=['documents', 'metadatas'])
57
  if not all_docs.get('metadatas'):
58
  logger.info("No articles in DB yet")
59
+ return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
60
 
61
  # Process and categorize articles, getting only 10 most recent per category with strict deduplication
62
  enriched_articles = []
 
67
  title = meta.get("title", "No Title").strip()
68
  link = meta.get("link", "").strip()
69
  published = meta.get("published", "Unknown Date").strip()
 
70
  key = f"{title}|{link}|{published}"
71
  if key not in seen_keys:
72
  seen_keys.add(key)
 
73
  try:
74
  published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
75
  except (ValueError, TypeError):
 
76
  published = "1970-01-01T00:00:00"
77
  enriched_articles.append({
78
  "title": title,
 
83
  "image": meta.get("image", "svg"),
84
  })
85
 
 
86
  enriched_articles.sort(key=lambda x: x["published"], reverse=True)
87
 
 
88
  categorized_articles = {}
89
  for article in enriched_articles:
90
  cat = article["category"]
91
  if cat not in categorized_articles:
92
  categorized_articles[cat] = []
 
93
  key = f"{article['title']}|{article['link']}|{article['published']}"
94
  if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
95
  categorized_articles[cat].append(article)
96
 
 
97
  for cat in categorized_articles:
98
  unique_articles = []
99
  seen_cat_keys = set()
 
108
  return render_template("index.html",
109
  categorized_articles=categorized_articles,
110
  has_articles=True,
111
+ loading=not loading_complete)
112
  except Exception as e:
113
  logger.error(f"Error retrieving articles: {e}")
114
+ return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
115
 
116
  @app.route('/search', methods=['POST'])
117
  def search():