from bs4 import BeautifulSoup
from pprint import pprint
import sys

from selenium.webdriver.common.by import By

from ..utils import SeleniumScraper

# Module-level singleton so the Selenium browser is launched once and reused
# across calls instead of being restarted for every request.
scraper = None


def scrap(id: int = 1) -> dict:
    if not id:
        raise ValueError("The 'id' parameter is required.")
    global scraper
    try:
        url = f"https://www.colamanga.com/{id}/"
        if not scraper:
            scraper = SeleniumScraper()
        driver = scraper.driver()
        driver.get(url)

        DATA = {"id": id}

        # Cover: the image URL's second-to-last path segment is the cover id,
        # which is routed back through the local proxy endpoint.
        cover_url = driver.find_element(By.CLASS_NAME, "fed-list-pics").get_attribute("data-original")
        cover_id = cover_url.split("/")[-2]
        DATA["cover"] = f"/api/web_scrap/get_cover/colamanga/{id}/{cover_id}/"

        # Basic info: the title heading plus the info cells (status, author,
        # last update, and, in the fifth cell, the category links).
        content_info_element = driver.find_element(By.CLASS_NAME, "fed-deta-content")
        DATA["title"] = content_info_element.find_element(By.TAG_NAME, "h1").text
        li_info_elements = content_info_element.find_element(By.TAG_NAME, "ul").find_elements(By.CLASS_NAME, "fed-col-md6")
        DATA["status"] = li_info_elements[0].find_element(By.TAG_NAME, "a").text
        DATA["author"] = li_info_elements[1].find_element(By.TAG_NAME, "a").text
        DATA["updated"] = li_info_elements[2].find_element(By.TAG_NAME, "a").text
        DATA["category"] = [a.text for a in li_info_elements[4].find_elements(By.TAG_NAME, "a")]
        DATA["synopsis"] = driver.find_element(By.CLASS_NAME, "fed-tabs-boxs").find_element(By.CSS_SELECTOR, "p.fed-text-muted").get_attribute("innerHTML")

        # Chapter list: parse the <ul> once with BeautifulSoup rather than
        # issuing one Selenium round-trip per <li>.
        ul_element = BeautifulSoup(
            driver.find_element(By.CLASS_NAME, "all_data_list").find_element(By.TAG_NAME, "ul").get_attribute("innerHTML"),
            "html.parser",
        )
        chapter_array = []
        for li in ul_element.find_all("li"):
            a_element = li.find("a")
            href = a_element.get("href")
            chapter_array.append({
                # The href's last path segment ("<n>.html") carries the
                # numeric chapter index.
                "idx": int(href.split("/")[-1].split(".")[0]),
                "title": a_element.get("title"),
                "id": href.lstrip("/"),
            })
        DATA["chapters"] = chapter_array

        return DATA
    except Exception as e:
        line_number = sys.exc_info()[2].tb_lineno
        print(f"Error on line {line_number}: {e}")
        # Re-raise the original exception so callers keep its type and traceback.
        raise


if __name__ == "__main__":
    DATA = scrap(id=1)
    pprint(DATA)
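
# Illustrative only: a sketch of the dict scrap() returns when all the
# selectors above resolve. Every value below is a hypothetical placeholder
# derived from the keys assigned in the function, not real scraped data.
#
# {
#     "id": 1,
#     "cover": "/api/web_scrap/get_cover/colamanga/1/<cover_id>/",
#     "title": "<manga title>",
#     "status": "<serialization status>",
#     "author": "<author name>",
#     "updated": "<last update date>",
#     "category": ["<category>", ...],
#     "synopsis": "<synopsis inner HTML>",
#     "chapters": [
#         {"idx": 1, "title": "<chapter title>", "id": "<chapter href without leading slash>"},
#         ...
#     ],
# }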