Eurosmart's picture
.
2700879
from CrawDag.saving.DataLake import DataLake
from CrawDag.models import News
from pymongo import MongoClient
from bson.objectid import ObjectId
import os
from pymongo.server_api import ServerApi
from dotenv import load_dotenv
import logging
class MongoDataLake(DataLake):
    """DataLake implementation backed by the ``news`` collection of a MongoDB database.

    Connection settings are read from the environment variables
    ``MONGO_INITDB_ROOT_USERNAME``, ``MONGO_INITDB_ROOT_PASSWORD``,
    ``MONGO_HOST`` and ``MONGO_DATABASE``.
    """

    def __init__(self) -> None:
        """Open the MongoDB connection and keep a handle to the configured database."""
        self.database = self.__connect()

    def __connect(self):
        """Build the connection URI from the environment and return the database handle.

        Returns:
            The database object obtained from ``MongoClient.get_database`` for
            the name given in ``MONGO_DATABASE``.

        Raises:
            RuntimeError: If any of the required environment variables is
                unset or empty.
        """
        # Local import keeps this block self-contained without touching the
        # file-level import section.
        from urllib.parse import quote_plus

        required = (
            "MONGO_INITDB_ROOT_USERNAME",
            "MONGO_INITDB_ROOT_PASSWORD",
            "MONGO_HOST",
            "MONGO_DATABASE",
        )
        settings = {name: os.getenv(name) for name in required}
        missing = [name for name, value in settings.items() if not value]
        if missing:
            # Fail fast: otherwise the literal string "None" would end up in
            # the URI and surface later as an obscure connection error.
            raise RuntimeError(
                "Missing MongoDB environment variable(s): " + ", ".join(missing)
            )

        # Credentials must never be hard-coded here; they come from the
        # environment only. They are percent-escaped because characters such
        # as '@', ':' or '/' would otherwise corrupt the URI.
        uri = "mongodb+srv://{}:{}@{}/?retryWrites=true&w=majority&appName=Cluster0".format(
            quote_plus(settings["MONGO_INITDB_ROOT_USERNAME"]),
            quote_plus(settings["MONGO_INITDB_ROOT_PASSWORD"]),
            settings["MONGO_HOST"],
        )
        client = MongoClient(uri, server_api=ServerApi('1'))
        return client.get_database(settings["MONGO_DATABASE"])

    def save(self, listNews: list[News]) -> list[str]:
        """Insert or refresh each news item and return the stored document ids.

        An item is matched against existing documents by its ``topic`` and
        ``title``. A matching document is rewritten only when its stored
        ``content`` differs from the incoming one; otherwise it is left as is.
        Items without a match are inserted.

        Args:
            listNews: News items to persist.

        Returns:
            One stringified ``_id`` per input item, in input order.
        """
        newsCollection = self.database.get_collection('news')
        newsListIds = []
        for new in listNews:
            existing = newsCollection.find_one({'topic': new.topic, 'title': new.title})
            if existing is None:
                result = newsCollection.insert_one(new.to_json())
                newsListIds.append(str(result.inserted_id))
                continue

            # .get() tolerates stored documents that lack a 'content' field;
            # such documents are simply treated as outdated and refreshed.
            if new.content != existing.get('content'):
                newsCollection.update_one({'_id': existing['_id']}, {'$set': new.to_json()})
            newsListIds.append(str(existing['_id']))
        return newsListIds

    def delete(self, listNewsId: list[str]) -> None:
        """Delete the news documents whose ids are listed.

        All ids are converted to ``ObjectId`` before anything is removed, so a
        malformed id raises (from ``ObjectId``) without a partial deletion.

        Args:
            listNewsId: Stringified ``_id`` values of the documents to remove.
        """
        if not listNewsId:
            return
        objectIds = [ObjectId(newsId) for newsId in listNewsId]
        newsCollection = self.database.get_collection('news')
        # A single round-trip instead of one delete_one call per id.
        newsCollection.delete_many({'_id': {'$in': objectIds}})