broadfield-dev committed on
Commit
3a7387c
·
verified ·
1 Parent(s): dd6d866

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +174 -0
app.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import feedparser
3
+ from flask import Flask, render_template
4
+ from huggingface_hub import HfApi, Repository
5
+ from langchain_huggingface import HuggingFaceInferenceClient
6
+ from langchain.vectorstores import Chroma
7
+ from langchain.embeddings import HuggingFaceEmbeddings
8
+ from langchain.docstore.document import Document
9
+ import requests
10
+ import shutil
11
+
12
# --- Web application -------------------------------------------------------
app = Flask(__name__)

# --- Hugging Face configuration --------------------------------------------
# The token is read from the environment; the fallback is only a placeholder.
HF_API_TOKEN = os.getenv("HF_API_TOKEN", "YOUR_HF_API_TOKEN")
HF_MODEL = "Qwen/Qwen-72B-Instruct"  # model id handed to the inference client
REPO_ID = "your-username/news-rag-db"  # replace with your own Hub repo id
LOCAL_DB_DIR = "chroma_db"  # directory passed to Chroma as persist_directory
# NOTE(review): confirm that langchain_huggingface really exports
# HuggingFaceInferenceClient with this constructor signature.
client = HuggingFaceInferenceClient(model=HF_MODEL, api_key=HF_API_TOKEN)

# --- News sources (example list) -------------------------------------------
RSS_FEEDS = [
    "http://rss.cnn.com/rss/cnn_topstories.rss",
    "https://feeds.bbci.co.uk/news/rss.xml",
    "https://www.npr.org/rss/rss.php?id=1001",
]

# --- Vector store ----------------------------------------------------------
# Sentence-transformer embeddings back the Chroma collection stored on disk.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
vector_db = Chroma(
    persist_directory=LOCAL_DB_DIR,
    embedding_function=embedding_model,
)

# --- Hugging Face Hub client -----------------------------------------------
hf_api = HfApi()
37
+
38
def fetch_rss_feeds():
    """Collect the first five entries of every feed listed in ``RSS_FEEDS``.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``description``
        and ``published``. Fields missing from a feed entry are replaced by
        default strings.
    """
    per_feed_limit = 5  # keep the demo small
    collected = []
    for url in RSS_FEEDS:
        parsed = feedparser.parse(url)
        collected.extend(
            {
                "title": item.get("title", "No Title"),
                "link": item.get("link", ""),
                # Prefer the entry summary, then its description.
                "description": item.get(
                    "summary", item.get("description", "No Description")
                ),
                "published": item.get("published", "Unknown Date"),
            }
            for item in parsed.entries[:per_feed_limit]
        )
    return collected
51
+
52
def summarize_article(text):
    """Ask the module-level ``client`` for a concise summary of *text*.

    Returns:
        The ``generated_text`` of the response with surrounding whitespace
        stripped.
    """
    generation = client.generate(
        f"Summarize the following text in a concise manner:\n\n{text}",
        max_new_tokens=100,
        temperature=0.7,
    )
    summary = generation.generated_text
    return summary.strip()
57
+
58
def categorize_article(text):
    """Ask the module-level ``client`` to label the sentiment of *text*.

    The prompt requests one of positive, negative or neutral; the raw model
    output is returned without validation.

    Returns:
        The ``generated_text`` of the response with surrounding whitespace
        stripped.
    """
    generation = client.generate(
        "Classify the sentiment of the following text as positive, "
        f"negative, or neutral:\n\n{text}",
        max_new_tokens=10,
        temperature=0.7,
    )
    label = generation.generated_text
    return label.strip()
63
+
64
def process_and_store_articles(articles):
    """Summarize, categorize, vectorize and persist a batch of articles.

    Each article is summarized and categorized from its ``description``; the
    summary becomes the document text and the remaining fields are stored as
    metadata. The documents are added to ``vector_db``, persisted, and the
    database directory is uploaded via ``upload_to_hf_hub``.

    Args:
        articles: Iterable of dicts with the keys ``title``, ``link``,
            ``description`` and ``published``.

    Returns:
        None. When *articles* yields nothing, the function returns without
        touching the vector store or the Hub.
    """
    documents = []
    for article in articles:
        description = article["description"]

        # Summarize first, then categorize, both from the raw description.
        summary = summarize_article(description)
        category = categorize_article(description)

        documents.append(
            Document(
                page_content=summary,
                metadata={
                    "title": article["title"],
                    "link": article["link"],
                    "original_description": description,
                    "published": article["published"],
                    "category": category,
                },
            )
        )

    # Nothing to store: avoid handing an empty batch to the vector store and
    # avoid a pointless re-upload of an unchanged database.
    if not documents:
        return

    # Vectorize and store in Chroma DB
    vector_db.add_documents(documents)
    vector_db.persist()

    # Upload to Hugging Face Hub
    upload_to_hf_hub()
91
+
92
def upload_to_hf_hub():
    """Upload every file under ``LOCAL_DB_DIR`` to the ``REPO_ID`` dataset repo.

    Does nothing when ``LOCAL_DB_DIR`` does not exist. Repository creation is
    best-effort: a failure is printed and the upload is still attempted.
    Files keep their path relative to ``LOCAL_DB_DIR`` inside the repo.
    """
    if not os.path.exists(LOCAL_DB_DIR):
        return

    # Create the repo if needed, using the same token as the uploads so both
    # calls act with the same credentials.
    try:
        hf_api.create_repo(
            repo_id=REPO_ID,
            repo_type="dataset",
            exist_ok=True,
            token=HF_API_TOKEN,
        )
    except Exception as e:  # deliberate best-effort; the upload reports real failures
        print(f"Error creating repo: {e}")

    # Upload all files in the DB directory
    for root, _, files in os.walk(LOCAL_DB_DIR):
        for file in files:
            local_path = os.path.join(root, file)
            # Repo paths always use forward slashes, whatever the local OS uses.
            remote_path = os.path.relpath(local_path, LOCAL_DB_DIR).replace(
                os.sep, "/"
            )
            hf_api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=remote_path,
                repo_id=REPO_ID,
                repo_type="dataset",
                token=HF_API_TOKEN,
            )
    print(f"Database uploaded to Hugging Face Hub: {REPO_ID}")
114
+
115
@app.route('/')
def index():
    """Fetch, process and render the news feed.

    On every request the RSS feeds are fetched, the articles are processed
    and stored, and as many documents as were fetched are read back from
    ``vector_db`` for display.

    Returns:
        The rendered ``index.html`` template. When no articles could be
        fetched, an empty feed is rendered instead of querying the vector
        store for zero results.
    """
    articles = fetch_rss_feeds()
    if not articles:
        # k=len(articles) would be 0, which is not a valid result count.
        return render_template("index.html", articles=[])

    process_and_store_articles(articles)

    # Retrieve summaries from the vector DB for display
    stored_docs = vector_db.similarity_search("news", k=len(articles))
    enriched_articles = [
        {
            "title": doc.metadata["title"],
            "link": doc.metadata["link"],
            "summary": doc.page_content,
            "category": doc.metadata["category"],
            "published": doc.metadata["published"],
        }
        for doc in stored_docs
    ]

    return render_template("index.html", articles=enriched_articles)
134
+
135
# HTML template as a string (for simplicity); it is written to
# templates/index.html when the module is run as a script.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <title>News Feed</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        .article { border-bottom: 1px solid #ccc; padding: 10px; }
        .title { font-size: 1.2em; }
        .summary { color: #555; }
        .category { font-style: italic; }
    </style>
</head>
<body>
    <h1>Latest News Feed</h1>
    {% for article in articles %}
    <div class="article">
        <div class="title"><a href="{{ article.link }}" target="_blank" rel="noopener noreferrer">{{ article.title }}</a></div>
        <div class="summary">{{ article.summary }}</div>
        <div class="category">Category: {{ article.category }}</div>
        <div>Published: {{ article.published }}</div>
    </div>
    {% endfor %}
</body>
</html>
"""

if __name__ == "__main__":
    # Save the HTML template to the templates folder; an explicit encoding
    # keeps the output independent of the platform default.
    os.makedirs("templates", exist_ok=True)
    with open("templates/index.html", "w", encoding="utf-8") as f:
        f.write(HTML_TEMPLATE)

    # Clear existing DB for fresh start (optional)
    # NOTE(review): vector_db is constructed at import time, before this
    # cleanup runs - confirm that removing the directory underneath an
    # already-created Chroma instance is intended.
    if os.path.exists(LOCAL_DB_DIR):
        shutil.rmtree(LOCAL_DB_DIR)

    # Run Flask app. The server listens on all interfaces, so the interactive
    # debugger must not be on by default; opt in with FLASK_DEBUG=1.
    debug_enabled = os.getenv("FLASK_DEBUG", "0") == "1"
    app.run(debug=debug_enabled, host="0.0.0.0", port=5000)