File size: 3,101 Bytes
947c08e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cdc95a6
947c08e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from bs4 import BeautifulSoup
from pprint import pprint
from ..utils import SeleniumScraper
from core.settings import BASE_DIR
import os, threading, uuid, time, sqlite3,sys

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from backend.module.utils import date_utils


scraper = None

def scrap(id:int=1):
    """Scrape manga details from a colamanga.com manga page.

    Args:
        id: The colamanga manga id used to build the page URL
            (https://www.colamanga.com/{id}/). Must be truthy.

    Returns:
        dict: Scraped fields:
            - "id": the input id
            - "cover": local proxy URL for the cover image
            - "title", "status", "author", "updated": page metadata strings
            - "category": list[str] of category names
            - "synopsis": synopsis as raw inner HTML
            - "chapters": list of {"idx": int, "title": str, "id": str}
              in page order

    Raises:
        ValueError: If `id` is falsy.
        Exception: Any scraping/Selenium failure is logged (with the failing
            line number) and re-raised.
    """
    if not id:
        raise ValueError("The 'id' parameter is required.")
    global scraper

    try:
        url = f"https://www.colamanga.com/{id}/"

        # Lazily create one shared Selenium session and reuse it across calls.
        if not scraper:
            scraper = SeleniumScraper()
        driver = scraper.driver()
        driver.get(url)

        DATA = {"id": id}

        # Cover image: the second-to-last path segment of the lazy-load URL
        # ("data-original") is the cover id consumed by our proxy endpoint.
        cover_url = driver.find_element(By.CLASS_NAME, "fed-list-pics").get_attribute("data-original")
        cover_id = cover_url.split("/")[-2]
        DATA["cover"] = f"/api/web_scrap/get_cover/colamanga/{id}/{cover_id}/"

        # Basic info block: title plus a fixed-order <li> grid.
        content_info_element = driver.find_element(By.CLASS_NAME, "fed-deta-content")
        DATA["title"] = content_info_element.find_element(By.TAG_NAME, "h1").text
        li_info_elements = content_info_element.find_element(By.TAG_NAME, "ul").find_elements(By.CLASS_NAME, "fed-col-md6")

        DATA["status"] = li_info_elements[0].find_element(By.TAG_NAME, "a").text
        DATA["author"] = li_info_elements[1].find_element(By.TAG_NAME, "a").text
        DATA["updated"] = li_info_elements[2].find_element(By.TAG_NAME, "a").text

        # Categories: one <a> per category in the 5th info cell.
        DATA["category"] = [a.text for a in li_info_elements[4].find_elements(By.TAG_NAME, "a")]

        DATA["synopsis"] = driver.find_element(By.CLASS_NAME, "fed-tabs-boxs").find_element(By.CSS_SELECTOR, "p.fed-text-muted").get_attribute('innerHTML')

        # Chapter list: grab the <ul> innerHTML once and parse it with
        # BeautifulSoup instead of making one Selenium round-trip per <li>.
        ul_element = BeautifulSoup(
            driver.find_element(By.CLASS_NAME, "all_data_list").find_element(By.TAG_NAME, "ul").get_attribute('innerHTML'),
            'html.parser',
        )

        chapter_array = []
        for li in ul_element.find_all('li'):
            a_element = li.find('a')
            href = a_element.get('href')
            chapter_array.append({
                # href looks like ".../<idx>.html" -> numeric chapter index
                "idx": int(href.split("/")[-1].split(".")[0]),
                "title": a_element.get('title'),
                "id": href.lstrip("/"),
            })

        DATA["chapters"] = chapter_array

        return DATA
    except Exception as e:
        # Log the failing line for debugging, then re-raise the ORIGINAL
        # exception (bare `raise`) rather than wrapping it in a new
        # Exception, preserving its type and traceback for callers.
        exc_tb = sys.exc_info()[2]
        print(f"Error on line {exc_tb.tb_lineno}: {e}")
        raise

if __name__ == "__main__":
    # BUG FIX: scrap() only accepts `id`; the previous call passed
    # nonexistent `page`/`search` keyword arguments, which raised a
    # TypeError before any scraping happened.
    DATA = scrap(id=1)
    pprint(DATA)