ragV98 committed on
Commit 8e17b80 · 1 Parent(s): e9cce51

updated ingestion

Files changed (1)
  1. pipeline/news_ingest.py +37 -13
pipeline/news_ingest.py CHANGED
@@ -1,6 +1,8 @@
 import sys
 import os
 import json
+from typing import List, Dict
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 
 from components.indexers.news_indexer import get_or_build_index
@@ -8,6 +10,7 @@ from components.fetchers.google_search import fetch_google_news
 from components.fetchers.scraper import scrape_url
 from llama_index.core.settings import Settings
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.schema import Document
 
 # ✅ Set up local embedding model
 Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
@@ -24,40 +27,60 @@ QUERIES = [
 # ✅ Paths
 INDEX_DIR = "storage/index"
 DATA_DIR = "data/news"
-RAW_FILE = os.path.join(DATA_DIR, "news.txt")
+RAW_JSON = os.path.join(DATA_DIR, "news.jsonl")
 
-def write_articles_to_file(articles, file_path):
+def write_articles_jsonl(articles: List[Dict], file_path: str):
     os.makedirs(os.path.dirname(file_path), exist_ok=True)
     with open(file_path, "w", encoding="utf-8") as f:
         for article in articles:
-            f.write(article.strip() + "\n\n")
+            f.write(json.dumps(article, ensure_ascii=False) + "\n")
+
+def build_documents(data: List[Dict]) -> List[Document]:
+    return [
+        Document(
+            text=entry["content"],
+            metadata={
+                "title": entry["title"],
+                "url": entry["url"],
+                "topic": entry["topic"],
+                "source": entry["source"]
+            }
+        )
+        for entry in data
+    ]
 
 if __name__ == "__main__":
     if not API_KEY or not CSE_ID:
         raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")
 
-    print("🌍 Fetching news URLs from Google...")
+    print("\U0001F30D Fetching news URLs from Google...")
 
     all_articles = []
 
     for query in QUERIES:
-        print(f"🔍 Searching for: {query}")
+        print(f"\U0001F50D Searching for: {query}")
         try:
             results = fetch_google_news(query, API_KEY, CSE_ID, num_results=10)
             print(f" → Found {len(results)} links for '{query}'.")
 
             for item in results:
                 url = item.get("link", "").strip()
-                if not url:
+                title = item.get("title", "").strip()
+                source = item.get("displayLink", "").strip()
+                if not url or not title:
                     continue
 
-                print(f"🌐 Scraping: {url}")
+                print(f"\U0001F310 Scraping: {url}")
                 article_text = scrape_url(url)
 
                 if article_text:
-                    tagged_text = f"[{query.upper()}]\n{article_text}"
-                    print("Adding text to vector", tagged_text)
-                    all_articles.append(tagged_text)
+                    all_articles.append({
+                        "topic": query,
+                        "title": title,
+                        "url": url,
+                        "source": source,
+                        "content": article_text
+                    })
                 else:
                     print(f"⚠️ Skipped: {url}")
 
@@ -67,10 +90,11 @@ if __name__ == "__main__":
     if not all_articles:
         print("⚠️ No content scraped. Exiting.")
     else:
-        print(f"📝 Writing {len(all_articles)} articles to {RAW_FILE}...")
-        write_articles_to_file(all_articles, RAW_FILE)
+        print(f"📝 Writing {len(all_articles)} articles to {RAW_JSON}...")
+        write_articles_jsonl(all_articles, RAW_JSON)
 
         print("🧠 Building index...")
-        get_or_build_index(DATA_DIR)
+        documents = build_documents(all_articles)
+        get_or_build_index(documents)
 
         print(f"✅ Indexed and stored at: {INDEX_DIR}")
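The rewritten ingestion step persists each scraped article as one JSON object per line in data/news/news.jsonl, carrying topic, title, url, source, and content, instead of appending tagged plain text to news.txt. As a rough sketch of how a downstream consumer could read those records back (the loader below and the sample values in its comment are illustrative, not part of this commit):

import json
from typing import Dict, List

def load_articles_jsonl(file_path: str) -> List[Dict]:
    # Read back the records produced by write_articles_jsonl: one JSON object per line.
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

# Each record mirrors what the scraping loop appends to all_articles, e.g.
# {"topic": "<query>", "title": "...", "url": "https://...", "source": "news.example.com", "content": "<scraped text>"}
articles = load_articles_jsonl("data/news/news.jsonl")
print(f"Loaded {len(articles)} stored articles")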
 
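The indexing call also changes from get_or_build_index(DATA_DIR) to get_or_build_index(documents), so components/indexers/news_indexer.py is now expected to receive the llama_index Document objects produced by build_documents rather than a directory path. That module is not touched by this commit, so the following is only a minimal sketch of a compatible signature, assuming the index is persisted under storage/index as the final log line suggests:

import os
from typing import List

from llama_index.core import StorageContext, VectorStoreIndex, load_index_from_storage
from llama_index.core.schema import Document

INDEX_DIR = "storage/index"  # assumed persist location, matching the final message in news_ingest.py

def get_or_build_index(documents: List[Document]):
    # Hypothetical body: reuse a previously persisted index when one exists,
    # otherwise embed the freshly built documents and persist the new index.
    if os.path.isdir(INDEX_DIR) and os.listdir(INDEX_DIR):
        storage_context = StorageContext.from_defaults(persist_dir=INDEX_DIR)
        return load_index_from_storage(storage_context)
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=INDEX_DIR)
    return index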