updated ingestion
pipeline/news_ingest.py  CHANGED  +37 -13
@@ -1,6 +1,8 @@
 import sys
 import os
 import json
+from typing import List, Dict
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

 from components.indexers.news_indexer import get_or_build_index
@@ -8,6 +10,7 @@ from components.fetchers.google_search import fetch_google_news
 from components.fetchers.scraper import scrape_url
 from llama_index.core.settings import Settings
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.schema import Document

 # ✅ Set up local embedding model
 Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
@@ -24,40 +27,60 @@ QUERIES = [
 # ✅ Paths
 INDEX_DIR = "storage/index"
 DATA_DIR = "data/news"
-…
+RAW_JSON = os.path.join(DATA_DIR, "news.jsonl")

-def …
+def write_articles_jsonl(articles: List[Dict], file_path: str):
     os.makedirs(os.path.dirname(file_path), exist_ok=True)
     with open(file_path, "w", encoding="utf-8") as f:
         for article in articles:
-            f.write(…
+            f.write(json.dumps(article, ensure_ascii=False) + "\n")
+
+def build_documents(data: List[Dict]) -> List[Document]:
+    return [
+        Document(
+            text=entry["content"],
+            metadata={
+                "title": entry["title"],
+                "url": entry["url"],
+                "topic": entry["topic"],
+                "source": entry["source"]
+            }
+        )
+        for entry in data
+    ]

 if __name__ == "__main__":
     if not API_KEY or not CSE_ID:
         raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")

-    print("…
+    print("\U0001F30D Fetching news URLs from Google...")

     all_articles = []

     for query in QUERIES:
-        print(f"…
+        print(f"\U0001F50D Searching for: {query}")
         try:
             results = fetch_google_news(query, API_KEY, CSE_ID, num_results=10)
             print(f" ✅ Found {len(results)} links for '{query}'.")

             for item in results:
                 url = item.get("link", "").strip()
-…
+                title = item.get("title", "").strip()
+                source = item.get("displayLink", "").strip()
+                if not url or not title:
                     continue

-                print(f"…
+                print(f"\U0001F310 Scraping: {url}")
                 article_text = scrape_url(url)

                 if article_text:
-…
-…
-…
+                    all_articles.append({
+                        "topic": query,
+                        "title": title,
+                        "url": url,
+                        "source": source,
+                        "content": article_text
+                    })
                 else:
                     print(f"⚠️ Skipped: {url}")

@@ -67,10 +90,11 @@ if __name__ == "__main__":
     if not all_articles:
         print("⚠️ No content scraped. Exiting.")
     else:
-        print(f"📝 Writing {len(all_articles)} articles to {…
-…
+        print(f"📝 Writing {len(all_articles)} articles to {RAW_JSON}...")
+        write_articles_jsonl(all_articles, RAW_JSON)

         print("🧠 Building index...")
-…
+        documents = build_documents(all_articles)
+        get_or_build_index(documents)

         print(f"✅ Indexed and stored at: {INDEX_DIR}")
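
For reference, here is a minimal, hypothetical sketch (not part of this commit) of how the data/news/news.jsonl file written by write_articles_jsonl could be loaded back and re-indexed without re-fetching or re-scraping. It assumes the sketch lives next to pipeline/news_ingest.py, that each line carries the keys written above (topic, title, url, source, content), and that get_or_build_index accepts a list of Document objects, as it is called in this diff; load_articles_jsonl is an illustrative helper, not an existing repo function.

import json
import os
import sys
from typing import Dict, List

# Same repo-root path hack as pipeline/news_ingest.py so `components` resolves.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from components.indexers.news_indexer import get_or_build_index
from llama_index.core.schema import Document
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Same local embedding model the ingestion script configures.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/paraphrase-MiniLM-L3-v2"
)


def load_articles_jsonl(file_path: str) -> List[Dict]:
    # Mirror of write_articles_jsonl: one JSON-encoded article per line.
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


if __name__ == "__main__":
    articles = load_articles_jsonl("data/news/news.jsonl")
    documents = [
        Document(
            text=entry["content"],
            metadata={
                "title": entry["title"],
                "url": entry["url"],
                "topic": entry["topic"],
                "source": entry["source"],
            },
        )
        for entry in articles
    ]
    get_or_build_index(documents)
    print(f"Re-indexed {len(documents)} stored articles.")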