Spaces:
Running
Running
File size: 6,165 Bytes
3a7387c 10d9c15 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
import os
import feedparser
from flask import Flask, render_template
from huggingface_hub import HfApi, Repository
from langchain_huggingface import HuggingFaceInferenceClient
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
import requests
import shutil
# Flask app setup
app = Flask(__name__)
# Hugging Face setup
# Token comes from the environment; the literal placeholder is only a
# fallback and will not authenticate against the Hub.
HF_API_TOKEN = os.getenv("HF_API_TOKEN", "YOUR_HF_API_TOKEN")
HF_MODEL = "Qwen/Qwen-72B-Instruct" # Qwen-72B model
REPO_ID = "your-username/news-rag-db" # Replace with your HF repo ID
# Local directory where the Chroma vector store persists its files.
LOCAL_DB_DIR = "chroma_db"
# NOTE(review): `HuggingFaceInferenceClient` is not a class exported by
# current `langchain_huggingface` releases — confirm the intended client
# (likely `huggingface_hub.InferenceClient` or
# `langchain_huggingface.HuggingFaceEndpoint`).
client = HuggingFaceInferenceClient(model=HF_MODEL, api_key=HF_API_TOKEN)
# RSS feeds to fetch (example list)
RSS_FEEDS = [
    "http://rss.cnn.com/rss/cnn_topstories.rss",
    "https://feeds.bbci.co.uk/news/rss.xml",
    "https://www.npr.org/rss/rss.php?id=1001",
]
# Embedding model for vectorization
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Initialize Chroma DB (persists to LOCAL_DB_DIR on disk)
vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
# HfApi for Hugging Face Hub
hf_api = HfApi()
def fetch_rss_feeds(feed_urls=None):
    """Fetch recent news articles from a list of RSS feed URLs.

    Args:
        feed_urls: Optional iterable of RSS feed URLs. Defaults to the
            module-level RSS_FEEDS list (backward compatible with the
            original zero-argument call).

    Returns:
        A list of dicts with keys ``title``, ``link``, ``description``
        and ``published``. A feed that fails to fetch/parse is skipped
        with a logged message instead of aborting the whole run.
    """
    if feed_urls is None:
        feed_urls = RSS_FEEDS
    articles = []
    for feed_url in feed_urls:
        try:
            feed = feedparser.parse(feed_url)
        except Exception as e:
            # Network or parser failure: skip this feed, keep the rest.
            print(f"Failed to fetch feed {feed_url}: {e}")
            continue
        # Limit to 5 articles per feed to keep downstream LLM calls bounded.
        for entry in feed.entries[:5]:
            articles.append({
                "title": entry.get("title", "No Title"),
                "link": entry.get("link", ""),
                "description": entry.get("summary", entry.get("description", "No Description")),
                "published": entry.get("published", "Unknown Date"),
            })
    return articles
def summarize_article(text):
    """Return a concise LLM-generated summary of *text* (Qwen-72B)."""
    # Single-shot prompt; response text is stripped of surrounding whitespace.
    summarization_prompt = f"Summarize the following text in a concise manner:\n\n{text}"
    result = client.generate(summarization_prompt, max_new_tokens=100, temperature=0.7)
    return result.generated_text.strip()
def categorize_article(text):
    """Return the LLM's sentiment label (positive/negative/neutral) for *text*."""
    # Mirrors summarize_article but with a classification prompt and a
    # short generation budget, since only a single label is expected.
    sentiment_prompt = f"Classify the sentiment of the following text as positive, negative, or neutral:\n\n{text}"
    result = client.generate(sentiment_prompt, max_new_tokens=10, temperature=0.7)
    return result.generated_text.strip()
def process_and_store_articles(articles):
    """Summarize, categorize, vectorize and persist articles into the RAG DB.

    For each article the description is summarized and sentiment-classified
    via the LLM, wrapped in a Document (summary as page content, the rest
    as metadata), stored in the Chroma vector DB, persisted to disk, and the
    DB directory is synced to the Hugging Face Hub.

    Args:
        articles: List of dicts as produced by ``fetch_rss_feeds``.
    """
    documents = []
    for article in articles:
        # One LLM call each for the summary and the sentiment category.
        summary = summarize_article(article["description"])
        category = categorize_article(article["description"])
        documents.append(Document(
            page_content=summary,
            metadata={
                "title": article["title"],
                "link": article["link"],
                "original_description": article["description"],
                "published": article["published"],
                "category": category,
            },
        ))
    if not documents:
        # Chroma rejects empty batches, and there is nothing to persist
        # or upload — bail out early.
        return
    # Vectorize and store in Chroma DB
    vector_db.add_documents(documents)
    vector_db.persist()
    # Upload to Hugging Face Hub
    upload_to_hf_hub()
def upload_to_hf_hub():
    """Mirror the local Chroma DB directory into a Hugging Face dataset repo.

    Creates the dataset repo if needed (idempotent via ``exist_ok=True``),
    then uploads every file under LOCAL_DB_DIR, preserving relative paths.
    No-op when the local DB directory does not exist. A failure on one file
    is logged and does not abort the remaining uploads.
    """
    if not os.path.exists(LOCAL_DB_DIR):
        return
    # Check if repo exists, create if not
    try:
        # Pass the token explicitly, consistent with upload_file below
        # (the original relied on ambient credentials here only).
        hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset",
                           exist_ok=True, token=HF_API_TOKEN)
    except Exception as e:
        print(f"Error creating repo: {e}")
    # Upload all files in the DB directory
    for root, _, files in os.walk(LOCAL_DB_DIR):
        for file in files:
            local_path = os.path.join(root, file)
            remote_path = os.path.relpath(local_path, LOCAL_DB_DIR)
            try:
                hf_api.upload_file(
                    path_or_fileobj=local_path,
                    path_in_repo=remote_path,
                    repo_id=REPO_ID,
                    repo_type="dataset",
                    token=HF_API_TOKEN
                )
            except Exception as e:
                # Keep going so one failed file doesn't abort the sync.
                print(f"Error uploading {remote_path}: {e}")
    print(f"Database uploaded to Hugging Face Hub: {REPO_ID}")
@app.route('/')
def index():
    """Render the news page: fetch, process/store, then display articles.

    NOTE(review): this route re-fetches and re-stores articles on every
    page load, so the vector DB accumulates duplicates over time — confirm
    whether deduplication is intended upstream.
    """
    articles = fetch_rss_feeds()
    if not articles:
        # No feeds reachable: render an empty page rather than calling
        # similarity_search with k=0, which the vector store rejects.
        return render_template("index.html", articles=[])
    process_and_store_articles(articles)
    # Retrieve summaries from the vector DB for display
    stored_docs = vector_db.similarity_search("news", k=len(articles))
    enriched_articles = [
        {
            "title": doc.metadata["title"],
            "link": doc.metadata["link"],
            "summary": doc.page_content,
            "category": doc.metadata["category"],
            "published": doc.metadata["published"],
        }
        for doc in stored_docs
    ]
    return render_template("index.html", articles=enriched_articles)
# HTML template as a string (for simplicity). Written to
# templates/index.html at startup so Flask's render_template can find it.
# Expects an `articles` list of dicts with title/link/summary/category/
# published keys, as built by the index() route.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
<title>News Feed</title>
<style>
body { font-family: Arial, sans-serif; margin: 20px; }
.article { border-bottom: 1px solid #ccc; padding: 10px; }
.title { font-size: 1.2em; }
.summary { color: #555; }
.category { font-style: italic; }
</style>
</head>
<body>
<h1>Latest News Feed</h1>
{% for article in articles %}
<div class="article">
<div class="title"><a href="{{ article.link }}" target="_blank">{{ article.title }}</a></div>
<div class="summary">{{ article.summary }}</div>
<div class="category">Category: {{ article.category }}</div>
<div>Published: {{ article.published }}</div>
</div>
{% endfor %}
</body>
</html>
"""
if __name__ == "__main__":
    # Save the HTML template to the templates folder so Flask's default
    # loader can serve it via render_template("index.html", ...).
    os.makedirs("templates", exist_ok=True)
    with open("templates/index.html", "w") as f:
        f.write(HTML_TEMPLATE)
    # Clear existing DB for fresh start (optional)
    # NOTE(review): vector_db was already initialized above against this
    # directory, so deleting it here leaves the live Chroma handle pointing
    # at a removed path — confirm the store recreates it cleanly on the
    # next write.
    if os.path.exists(LOCAL_DB_DIR):
        shutil.rmtree(LOCAL_DB_DIR)
    # Run Flask app
    # NOTE(review): debug=True combined with host="0.0.0.0" exposes the
    # Werkzeug interactive debugger to the network — disable debug in any
    # shared deployment.
    app.run(debug=True, host="0.0.0.0", port=7860)