import os

from bson.objectid import ObjectId
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.server_api import ServerApi

from CrawDag.models import News
from CrawDag.saving.DataLake import DataLake


class MongoDataLake(DataLake):
    """DataLake implementation backed by a MongoDB cluster."""

    def __init__(self) -> None:
        self.database = self.__connect()

    def __connect(self):
        # Load connection settings from a local .env file (if present) before reading them.
        load_dotenv()
        uri = (
            "mongodb+srv://{}:{}@{}/?retryWrites=true&w=majority&appName=Cluster0".format(
                os.getenv("MONGO_INITDB_ROOT_USERNAME"),
                os.getenv("MONGO_INITDB_ROOT_PASSWORD"),
                os.getenv("MONGO_HOST"),
            )
        )
        client = MongoClient(uri, server_api=ServerApi('1'))
        return client.get_database(os.getenv("MONGO_DATABASE"))

    def save(self, listNews: list[News]) -> list[str]:
        """Upsert each news item and return the ids of documents that were inserted or updated."""
        newsCollection = self.database.get_collection('news')
        newsListIds = []
        for news in listNews:
            # A (topic, title) pair identifies an article; rewrite it only when the content changed.
            existing = newsCollection.find_one({'topic': news.topic, 'title': news.title})
            if existing:
                if news.content != existing['content']:
                    newsCollection.update_one({'_id': existing['_id']}, {'$set': news.to_json()})
                    newsListIds.append(str(existing['_id']))
            else:
                result = newsCollection.insert_one(news.to_json())
                newsListIds.append(str(result.inserted_id))

        return newsListIds
    
    def delete(self, listNewsId: list[str]) -> None:
        """Remove the documents whose string ids are listed in listNewsId."""
        newsCollection = self.database.get_collection('news')
        for newsId in listNewsId:
            newsCollection.delete_one({'_id': ObjectId(newsId)})
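

# Illustrative usage sketch, not part of the original module: it assumes the
# MONGO_* environment variables read in __connect are set (for example via a
# .env file) and that News exposes topic/title/content plus to_json(). The
# keyword-style News(...) construction below is hypothetical; the real
# constructor lives in CrawDag.models.
if __name__ == "__main__":
    lake = MongoDataLake()
    sample = News(topic="technology", title="Example headline", content="Example body")  # hypothetical constructor call
    savedIds = lake.save([sample])       # insert (or update) the sample article
    print(savedIds)
    lake.delete(savedIds)                # clean up the documents created above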