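"""Scrape chapter page images from colamanga.com with Selenium.

The chapter page lazy-loads its images as blob: URLs, so the scraper scrolls to the
bottom of the page, nudges each page image into view until it reports loaded, and then
recovers the image bytes from the Chrome DevTools performance log via
Network.getResponseBody.
"""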
from bs4 import BeautifulSoup
from pprint import pprint
from ..utils import SeleniumScraper
from selenium.webdriver.common.by import By
from core.settings import BASE_DIR
from PIL import Image

from backend.module.utils import date_utils

import json, time, threading, os, uuid, sqlite3, io, base64, sys

MAX_TIMEOUT = 10

# Module-level Selenium session, created lazily on the first call to scrap().
scraper = None



def __scrollToBottom(driver: object = None):
    """Scroll down until the chapter list stops growing and its lazy-load marker has been consumed."""
    if not driver: raise ValueError("The 'driver' argument is required.")

    # Give up if the page is still growing after 60 seconds.
    timeout = date_utils.utc_time().add(60, 'second').get()

    previous_height = 0
    scrolledY = 0
    while True:
        if date_utils.utc_time().get() >= timeout: raise Exception("[Get Chapter] Finding latest element timed out!")
        # Scroll further down the page to trigger lazy loading.
        driver.execute_script(f"window.scrollBy(0, {scrolledY});")

        current_height = driver.execute_script("return document.documentElement.scrollHeight")

        if current_height > previous_height:
            previous_height = current_height
        else:
            # The page stopped growing: stop once the last child of the manga list
            # is no longer the '__cad.read_periodical();' loader script.
            parent_div = driver.find_element(By.CLASS_NAME, "mh_mangalist")
            child_elements = parent_div.find_elements(By.XPATH, "./*")
            if child_elements[-1].get_attribute('text') != '__cad.read_periodical();': break
        scrolledY += 50


def scrap(comic_id: str = "", chapter_id: str = "", output_dir: str = ""):
    """Download every page image of a chapter into 'output_dir' as numbered PNG files."""
    if not comic_id: raise ValueError("The 'comic_id' parameter is required.")
    if not chapter_id: raise ValueError("The 'chapter_id' parameter is required.")
    if not output_dir: raise ValueError("The 'output_dir' parameter is required.")
    global scraper

    try:
        url = f"https://www.colamanga.com/{chapter_id}"

        # Reuse a single Selenium session across calls.
        if not scraper: scraper = SeleniumScraper()
        driver = scraper.driver()
        driver.get(url)

        __scrollToBottom(driver=driver)

        # Each page of the chapter is a 'mh_comicpic' element inside the '#mangalist' container.
        parent_element = driver.find_element(By.ID, "mangalist")
        child_list = parent_element.find_elements(By.CLASS_NAME, "mh_comicpic")
        
        # Scroll each page image into view and wait (up to 5 seconds per image) for it
        # to finish loading, collecting its blob: URL in page order.
        blob_list = []
        for child in child_list:
            timeout = date_utils.utc_time().add(5, 'second').get()
            while True:
                if date_utils.utc_time().get() > timeout: break
                image_element = child.find_element(By.TAG_NAME, "img")
                driver.execute_script("arguments[0].scrollIntoView({ behavior: 'smooth', block: 'center' });", image_element)

                url = image_element.get_attribute("src")
                if url:
                    is_image_loaded = driver.execute_script("return arguments[0].complete", image_element)
                    if is_image_loaded:
                        blob_list.append(url)
                        break

        # The images are served as blob: URLs, so their bytes are recovered from the
        # Chrome DevTools performance log instead of being downloaded again over HTTP.
        def process_browser_log_entry(entry):
            return json.loads(entry['message'])['message']

        browser_log = driver.get_log('performance')
        events = [process_browser_log_entry(entry) for entry in browser_log]
        events = [event for event in events if 'Network.response' in event['method']]

        # Match each blob: image response from the network log back to its position in
        # 'blob_list' and save it as '<page index>.png' inside 'output_dir'.
        for e in events:
            if e.get("params").get("type") == "Image":
                url = e.get("params").get("response").get("url")
                if url.split(":")[0] == "blob":
                    request_id = e["params"]["requestId"]
                    response = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
                    img = Image.open(io.BytesIO(base64.decodebytes(bytes(response.get("body"), "utf-8"))))

                    os.makedirs(output_dir, exist_ok=True)
                    img.save(os.path.join(output_dir, f"{blob_list.index(url)}.png"))

        return {"status":"success"}
    except Exception as e: 
        exc_type, exc_obj, exc_tb = sys.exc_info()
        line_number = exc_tb.tb_lineno
        print(f"Error on line {line_number}: {e}")
        raise Exception(e) 
    finally: pass

if __name__ == "__main__":
    # Example run; the values below are placeholders: 'scrap' expects a comic id, the
    # chapter's URL path on colamanga.com, and a directory to write the PNG pages into.
    DATA = scrap(comic_id="manga-gu881388", chapter_id="manga-gu881388", output_dir="./output")

    # pprint(DATA)