class SCRAP_DATA:
    """Download media behind one or more URLs into the scrap directory.

    ``urls`` may be a single URL string or an iterable of URL strings.
    Falsy entries (``""``, ``None``) are skipped, and an empty/``None``
    ``urls`` yields no downloads.

    NOTE(review): ``scrap_dir``, ``time``, ``LOGGER`` and ``format_exc`` are
    not defined in this block; presumably they come from
    ``from Powers import *`` -- confirm.
    """

    def __init__(self, urls: "List[str] | str") -> None:
        """Store the URL(s) and make sure the target directory exists.

        Args:
            urls: A single URL or a list of URLs to download later.
        """
        self.urls = urls
        self.path = scrap_dir
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(self.path, exist_ok=True)

    def get_images(self) -> list:
        """Download every URL as a ``.jpg`` file.

        Returns:
            Paths of the files that were written successfully.

        Raises:
            Whatever ``httpx.get`` raises on a failed request; only errors
            while writing the file are caught and logged.
        """
        return self._download_all("img", "jpg")

    def get_videos(self) -> list:
        """Download every URL as a ``.mp4`` file.

        Returns:
            Paths of the files that were written successfully.

        Raises:
            Whatever ``httpx.get`` raises on a failed request; only errors
            while writing the file are caught and logged.
        """
        return self._download_all("vid", "mp4")

    def _pending_urls(self) -> list:
        """Normalise ``self.urls`` into a list of non-empty URLs."""
        urls = self.urls
        if not urls:
            return []
        if isinstance(urls, str):
            return [urls]
        return [url for url in urls if url]

    def _download_all(self, prefix: str, extension: str) -> list:
        """Fetch each pending URL and save it as ``<path><prefix>_<ts>.<ext>``."""
        saved = []
        for url in self._pending_urls():
            # Deliberately outside the try: request failures propagate to
            # the caller, exactly as before.
            requested = httpx.get(url)
            try:
                stamp = str(time()).replace(".", "_")
                # The directory and file name are concatenated directly;
                # assumes scrap_dir ends with a path separator -- TODO confirm.
                name = f"{self.path}{prefix}_{stamp}.{extension}"
                with open(name, "wb") as f:
                    f.write(requested.content)
                saved.append(name)
            except Exception as e:
                # Best effort: a failed write skips this URL, not the batch.
                LOGGER.error(e)
                LOGGER.error(format_exc())
            finally:
                requested.close()
        return saved
selenium driver""" # def __init__(self) -> None: # self.BIN = CHROME_BIN # self.CHROME_DRIVER = CHROME_DRIVER # def initialize_driver(self): # if not self.BIN: # LOGGER.error( # "ChromeBinaryErr: No binary path found! Install Chromium or Google Chrome.") # return ( # None, # "ChromeBinaryErr: No binary path found! Install Chromium or Google Chrome.", # ) # try: # options = Options() # options.binary_location = self.BIN # options.add_argument("--disable-dev-shm-usage") # options.add_argument("--ignore-certificate-errors") # options.add_argument("--disable-gpu") # options.add_argument("--headless=new") # options.add_argument("--test-type") # options.add_argument("--no-sandbox") # service = Service(self.CHROME_DRIVER) # driver = webdriver.Chrome(options, service) # return driver, None # except Exception as e: # LOGGER.error(f"ChromeDriverErr: {e}") # return None, f"ChromeDriverErr: {e}" # def driver_close(self, driver: webdriver.Chrome): # driver.close() # driver.quit() # class INSTAGRAM(DRIVER): # """Class to scrap data from instagram""" # def __init__(self, url: str) -> None: # self.url = url # self.article = "article._aa6a" # self.ul_class = "_acay" # self.image_class = "x5yr21d" # self.video_class = "x1lliihq" # self.next_button = "button._afxw" # self.return_dict = {"image": [], "video": []} # super().__init__() # def is_correct_link(self): # return bool((re.compile(r"^https?://(?:www\.)?instagram\.com/")).match(self.url)) # def get_all(self): # driver, error = self.initialize_driver() # if not driver: # return error # driver.get(self.url) # wait = WebDriverWait(driver, 30) # if "reel" in self.url: # element = wait.until( # presence_of_element_located((By.TAG_NAME, "video"))) # reels = element.get_attribute("src") # self.driver_close(driver) # self.return_dict.get("video").append(reels) # return self.return_dict # elif bool((re.compile(r"^https?://(?:www\.)?instagram\.com/p/")).match(self.url)): # image_links = [] # video_links = [] # try: # element = 
class INSTAGRAM:
    """Validate an Instagram link and fetch its media through a downloader API.

    NOTE(review): ``LOGGER`` and ``format_exc`` are not defined in this
    block; presumably they come from ``from Powers import *`` -- confirm.
    """

    # Compiled once instead of on every is_correct_url() call.
    _URL_PATTERN = re.compile(r"^https?://(?:www\.)?instagram\.com/")
    _API_ENDPOINT = "https://api.qewertyy.dev/downloaders/instagram"
    # Timeout (seconds) used for the single retry after a read timeout.
    _RETRY_TIMEOUT = 10

    def __init__(self, url):
        """Remember the link to validate / download."""
        self.url = url

    def is_correct_url(self):
        """Return True when ``self.url`` starts with an instagram.com URL.

        Non-string values (e.g. ``None``) are reported as invalid instead of
        raising ``TypeError``.
        """
        if not isinstance(self.url, str):
            return False
        return bool(self._URL_PATTERN.match(self.url))

    def _request(self, **kwargs):
        """POST to the downloader API with the link as a query parameter."""
        # params= lets httpx percent-encode the link, so '?' and '&' inside
        # it no longer corrupt the API query string.
        return httpx.post(self._API_ENDPOINT, params={"url": self.url}, **kwargs)

    def get_media(self):
        """Ask the downloader API for the media behind ``self.url``.

        The first attempt uses httpx's default timeout; after a read timeout
        the request is retried once with a longer timeout.

        Returns:
            The decoded JSON response on success. On failure, a dict
            ``{"code": 69, "message": <str>}`` describing the problem.
        """
        try:
            try:
                return self._request().json()
            except httpx.ReadTimeout:
                pass  # fall through to the single retry below

            try:
                timeout = httpx.Timeout(self._RETRY_TIMEOUT)
                return self._request(timeout=timeout).json()
            except httpx.ReadTimeout:
                return {"code": 69, "message": "Please retry after few seconds"}
        except Exception as e:
            LOGGER.error(e)
            LOGGER.error(format_exc())
            # Stringified so "message" is always text, matching the timeout
            # branch above.
            return {"code": 69, "message": str(e)}