broadfield-dev committed on
Commit 36572bc · verified · 1 Parent(s): 6680594

Update rss_processor.py

Files changed (1)
  1. rss_processor.py +5 -33
rss_processor.py CHANGED
@@ -1,6 +1,6 @@
 import os
 import feedparser
-from huggingface_hub import HfApi, InferenceClient, login
+from huggingface_hub import HfApi, login
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
@@ -13,13 +13,12 @@ logger = logging.getLogger(__name__)
 
 # Hugging Face setup
 HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
-HF_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct" # Updated to your specified model
 REPO_ID = "broadfield-dev/news-rag-db"
 LOCAL_DB_DIR = "chroma_db"
 
-# Explicitly login to Hugging Face Hub
+# Explicitly login to Hugging Face Hub (no InferenceClient needed anymore)
 login(token=HF_API_TOKEN)
-client = InferenceClient(model=HF_MODEL, token=HF_API_TOKEN)
+hf_api = HfApi()
 
 # RSS feeds
 RSS_FEEDS = [
@@ -67,7 +66,6 @@ RSS_FEEDS = [
 # Embedding model and vector DB
 embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
-hf_api = HfApi()
 
 def fetch_rss_feeds():
     articles = []
@@ -107,38 +105,18 @@ def categorize_feed(url):
     else:
         return "Cool Stuff"
 
-def summarize_article(text):
-    prompt = f"Summarize the following text concisely:\n\n{text}"
-    try:
-        response = client.text_generation(prompt, max_new_tokens=100, temperature=0.7)
-        return response.strip()
-    except Exception as e:
-        logger.error(f"Error summarizing article: {e}")
-        return "Summary unavailable"
-
-def categorize_article(text):
-    prompt = f"Classify the sentiment as positive, negative, or neutral:\n\n{text}"
-    try:
-        response = client.text_generation(prompt, max_new_tokens=10, temperature=0.7)
-        return response.strip()
-    except Exception as e:
-        logger.error(f"Error categorizing article: {e}")
-        return "Neutral"
-
 def process_and_store_articles(articles):
     documents = []
     for article in articles:
         try:
-            sentiment = categorize_article(article["description"]) # Still categorize for initial display
             doc = Document(
-                page_content=article["description"], # Store original description without summarization
+                page_content=article["description"],
                 metadata={
                     "title": article["title"],
                     "link": article["link"],
                     "original_description": article["description"],
                     "published": article["published"],
                     "category": article["category"],
-                    "sentiment": sentiment,
                     "image": article["image"],
                 }
             )
@@ -167,10 +145,4 @@ def upload_to_hf_hub():
                 path_or_fileobj=local_path,
                 path_in_repo=remote_path,
                 repo_id=REPO_ID,
-                repo_type="dataset",
-                token=HF_API_TOKEN
-            )
-            logger.info(f"Uploaded {file} to {REPO_ID}")
-        except Exception as e:
-            logger.error(f"Error uploading file {file}: {e}")
-    logger.info(f"Database uploaded to: {REPO_ID}")
+                repo
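
For context, a minimal sketch of the slimmed-down Hub setup and upload path after this commit. The final added line of the diff is truncated ("+ repo"), so the single-line repo_type/token continuation below is an assumption, as is the os.walk loop over LOCAL_DB_DIR (only the upload_file() arguments appear in the diff).

# Sketch of the module after this commit (assumed shape, not the verbatim file)
import os
from huggingface_hub import HfApi, login

HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
REPO_ID = "broadfield-dev/news-rag-db"
LOCAL_DB_DIR = "chroma_db"

# Explicitly login to Hugging Face Hub (no InferenceClient needed anymore)
login(token=HF_API_TOKEN)
hf_api = HfApi()

def upload_to_hf_hub():
    # Push every file of the persisted Chroma directory to the dataset repo.
    # The directory walk is an assumed reconstruction of the unchanged part of the function.
    for root, _, files in os.walk(LOCAL_DB_DIR):
        for file in files:
            local_path = os.path.join(root, file)
            remote_path = os.path.relpath(local_path, LOCAL_DB_DIR)
            hf_api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=remote_path,
                repo_id=REPO_ID,
                repo_type="dataset",  # assumed continuation of the truncated "+ repo" line
                token=HF_API_TOKEN,
            )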