File size: 6,165 Bytes
3a7387c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10d9c15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import os
import feedparser
from flask import Flask, render_template
from huggingface_hub import HfApi, Repository
from langchain_huggingface import HuggingFaceInferenceClient
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
import requests
import shutil

# Flask application object; the route below is registered on it.
app = Flask(__name__)

# Hugging Face configuration.
# The token is read from the HF_API_TOKEN environment variable; when the
# variable is unset the literal placeholder string is used instead.
# NOTE(review): the placeholder is presumably not a valid credential, so
# authenticated calls would be expected to fail without the env var — confirm.
HF_API_TOKEN = os.getenv("HF_API_TOKEN", "YOUR_HF_API_TOKEN")
HF_MODEL = "Qwen/Qwen-72B-Instruct"  # Model id handed to the inference client
REPO_ID = "your-username/news-rag-db"  # Replace with your HF repo ID
LOCAL_DB_DIR = "chroma_db"  # Directory Chroma persists to; also what gets uploaded
# Text-generation client used by summarize_article / categorize_article.
# NOTE(review): verify that `HuggingFaceInferenceClient` is actually exported
# by the installed langchain_huggingface version and that it offers the
# `generate(...)` / `.generated_text` interface used below — TODO confirm.
client = HuggingFaceInferenceClient(model=HF_MODEL, api_key=HF_API_TOKEN)

# RSS feeds to fetch (example list); fetch_rss_feeds iterates these in order.
RSS_FEEDS = [
    "http://rss.cnn.com/rss/cnn_topstories.rss",
    "https://feeds.bbci.co.uk/news/rss.xml",
    "https://www.npr.org/rss/rss.php?id=1001",
]

# Embedding model used by the vector store to embed documents and queries.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Chroma vector store persisted under LOCAL_DB_DIR. It is created at import
# time, i.e. before the `__main__` block at the bottom of the file runs.
vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)

# Hugging Face Hub API handle used by upload_to_hf_hub. It is constructed
# without a token, so callers pass credentials per call where they need them.
hf_api = HfApi()

def fetch_rss_feeds():
    """Collect the leading entries of every feed in ``RSS_FEEDS``.

    Each feed is parsed with ``feedparser`` and at most its first five
    entries are kept (demo limit). Every entry is flattened to a dict with
    the keys ``title``, ``link``, ``description`` and ``published``; fields
    missing from the entry fall back to placeholder strings.

    Returns:
        A list of article dicts, in feed order and then entry order.
    """
    collected = []
    for url in RSS_FEEDS:
        parsed = feedparser.parse(url)
        collected.extend(
            {
                "title": item.get("title", "No Title"),
                "link": item.get("link", ""),
                # Prefer "summary"; fall back to "description", then a placeholder.
                "description": item.get("summary", item.get("description", "No Description")),
                "published": item.get("published", "Unknown Date"),
            }
            for item in parsed.entries[:5]
        )
    return collected

def summarize_article(text):
    """Return a model-written summary of ``text``.

    The text is appended to a fixed summarization instruction and sent to the
    module-level ``client`` with a 100-new-token cap and temperature 0.7.

    Returns:
        The generated text with surrounding whitespace stripped.
    """
    instruction = "Summarize the following text in a concise manner:"
    completion = client.generate(
        f"{instruction}\n\n{text}",
        max_new_tokens=100,
        temperature=0.7,
    )
    generated = completion.generated_text
    return generated.strip()

def categorize_article(text):
    """Ask the model for the sentiment of ``text``.

    The prompt requests one of positive, negative or neutral. The reply is
    capped at 10 new tokens and returned as-is apart from whitespace
    stripping; nothing here checks that it is exactly one of those labels.

    Returns:
        The stripped generated text.
    """
    question = "Classify the sentiment of the following text as positive, negative, or neutral:"
    reply = client.generate(
        f"{question}\n\n{text}",
        max_new_tokens=10,
        temperature=0.7,
    )
    label = reply.generated_text
    return label.strip()

def process_and_store_articles(articles):
    """Summarize, classify, embed and store articles, then sync the DB to the Hub.

    Args:
        articles: Iterable of dicts with the keys ``title``, ``link``,
            ``description`` and ``published`` (the shape produced by
            ``fetch_rss_feeds``).

    For each article the description is summarized and sentiment-classified;
    the summary becomes the document text and the remaining fields are kept
    as metadata. If no documents were built, the function returns without
    touching the vector store or the Hub.
    """
    documents = []
    for article in articles:
        description = article["description"]
        # Both model calls run on the original description, summary first.
        summary = summarize_article(description)
        category = categorize_article(description)

        documents.append(
            Document(
                page_content=summary,
                metadata={
                    "title": article["title"],
                    "link": article["link"],
                    "original_description": description,
                    "published": article["published"],
                    "category": category,
                },
            )
        )

    # Nothing new to store: skip the empty vector-store write (which the
    # backend may reject) and the pointless re-upload of an unchanged DB.
    if not documents:
        return

    # Vectorize and store in Chroma DB
    vector_db.add_documents(documents)
    vector_db.persist()

    # Upload to Hugging Face Hub
    upload_to_hf_hub()

def upload_to_hf_hub():
    """Upload the local Chroma DB directory to the Hugging Face Hub.

    Does nothing when ``LOCAL_DB_DIR`` is not an existing directory.
    Otherwise it makes sure the dataset repo ``REPO_ID`` exists and pushes
    the directory's contents to the repo root, preserving relative paths.

    Repo creation is best-effort: a failure is printed and the upload is
    still attempted. Errors from the upload itself propagate to the caller.
    """
    if not os.path.isdir(LOCAL_DB_DIR):
        return

    # Create the repo if needed. Pass the same token the upload uses, because
    # `hf_api` was constructed without credentials.
    try:
        hf_api.create_repo(
            repo_id=REPO_ID,
            repo_type="dataset",
            exist_ok=True,
            token=HF_API_TOKEN,
        )
    except Exception as e:
        print(f"Error creating repo: {e}")

    # Push the whole directory in one commit instead of one commit per file;
    # the Hub client also takes care of building POSIX-style repo paths.
    hf_api.upload_folder(
        folder_path=LOCAL_DB_DIR,
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_API_TOKEN,
    )
    print(f"Database uploaded to Hugging Face Hub: {REPO_ID}")

@app.route('/')
def index():
    """Render the news page.

    On every request this fetches the RSS feeds, runs the fetched articles
    through ``process_and_store_articles`` and then queries the vector store
    for as many documents as were fetched, rendering them with
    ``index.html``. When no articles could be fetched, an empty page is
    rendered and the processing and query steps are skipped.

    NOTE(review): because the whole pipeline runs per request, repeated page
    loads add the same articles to the store again — consider moving
    ingestion out of the request path.
    """
    articles = fetch_rss_feeds()
    if not articles:
        # Avoid a similarity search with k=0, which the store cannot serve.
        return render_template("index.html", articles=[])

    process_and_store_articles(articles)

    # Retrieve summaries from the vector DB for display
    stored_docs = vector_db.similarity_search("news", k=len(articles))
    enriched_articles = [
        {
            "title": doc.metadata["title"],
            "link": doc.metadata["link"],
            "summary": doc.page_content,
            "category": doc.metadata["category"],
            "published": doc.metadata["published"],
        }
        for doc in stored_docs
    ]

    return render_template("index.html", articles=enriched_articles)

# Jinja template for the news page, kept inline as a string for simplicity.
# The `__main__` block writes it to templates/index.html, which is the file
# `render_template("index.html", ...)` loads. It expects an `articles` list
# whose items expose link, title, summary, category and published.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <title>News Feed</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        .article { border-bottom: 1px solid #ccc; padding: 10px; }
        .title { font-size: 1.2em; }
        .summary { color: #555; }
        .category { font-style: italic; }
    </style>
</head>
<body>
    <h1>Latest News Feed</h1>
    {% for article in articles %}
    <div class="article">
        <div class="title"><a href="{{ article.link }}" target="_blank">{{ article.title }}</a></div>
        <div class="summary">{{ article.summary }}</div>
        <div class="category">Category: {{ article.category }}</div>
        <div>Published: {{ article.published }}</div>
    </div>
    {% endfor %}
</body>
</html>
"""

if __name__ == "__main__":
    # Write the inline template to templates/index.html so that
    # render_template("index.html", ...) can find it.
    os.makedirs("templates", exist_ok=True)
    with open("templates/index.html", "w", encoding="utf-8") as f:
        f.write(HTML_TEMPLATE)

    # Clear existing DB for fresh start (optional)
    # NOTE(review): `vector_db` was already created against this directory at
    # import time; removing the directory underneath it may leave the store in
    # an inconsistent state — confirm, or clear before constructing the store.
    if os.path.exists(LOCAL_DB_DIR):
        shutil.rmtree(LOCAL_DB_DIR)

    # The server listens on all interfaces, so the interactive debugger must
    # not be on by default (it allows code execution from the browser).
    # Opt in explicitly for local development with FLASK_DEBUG=1.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes"}

    # Run Flask app
    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)