Spaces:
Sleeping
Sleeping
import os
import re
import time
from typing import List, Union

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import \
    presence_of_element_located
from selenium.webdriver.support.wait import WebDriverWait

from Powers import *
class SCRAP_DATA:
    """Download scraped media URLs to files under ``./scrapped/``.

    ``urls`` may be a single URL string or a list of URL strings.  Falsy
    entries (``None``, ``""``) are skipped.  Downloads are best-effort: a
    URL that fails to download or save is logged through ``LOGGER`` and
    skipped, and the remaining URLs are still processed.
    """

    # Seconds to wait for the server to connect / send data before giving up,
    # so an unresponsive host cannot hang the caller forever.
    REQUEST_TIMEOUT = 60

    def __init__(self, urls: Union[List[str], str]) -> None:
        self.urls = urls
        self.path = "./scrapped/"
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(self.path, exist_ok=True)

    def _download_all(self, prefix: str, extension: str) -> list:
        """Save every URL in ``self.urls`` and return the saved file paths.

        Files are named ``<prefix>_<unix timestamp>.<extension>`` inside
        ``self.path``.
        """
        if isinstance(self.urls, str):
            targets = [self.urls]
        else:
            targets = self.urls or []

        saved = []
        for url in targets:
            if not url:
                continue
            name = self.path + f"{prefix}_{time.time()}.{extension}"
            try:
                # The context manager releases the connection on success
                # and on failure alike.
                with requests.get(url, timeout=self.REQUEST_TIMEOUT) as response:
                    # Do not store an HTTP error page as if it were media.
                    response.raise_for_status()
                    with open(name, "wb") as f:
                        f.write(response.content)
            except Exception as e:
                # Deliberately broad: one bad URL must not abort the batch.
                LOGGER.error(e)
                continue
            saved.append(name)
        return saved

    def get_images(self) -> list:
        """Download ``self.urls`` as ``img_<timestamp>.jpg`` files.

        Returns:
            Paths of the files that were written successfully.
        """
        return self._download_all("img", "jpg")

    def get_videos(self) -> list:
        """Download ``self.urls`` as ``vid_<timestamp>.mp4`` files.

        Returns:
            Paths of the files that were written successfully.
        """
        return self._download_all("vid", "mp4")
class DRIVER:
    """Build and tear down a headless Selenium Chrome driver.

    The browser binary and chromedriver paths are read from the module-level
    names ``CHROME_BIN`` and ``CHROME_DRIVER`` (not defined in this file;
    presumably supplied by the wildcard ``Powers`` import -- confirm).
    """

    def __init__(self) -> None:
        self.BIN = CHROME_BIN
        self.CHROME_DRIVER = CHROME_DRIVER

    def initialize_driver(self):
        """Start a headless Chrome session.

        Returns:
            A ``(driver, error)`` tuple: ``(driver, None)`` on success, or
            ``(None, message)`` when no browser binary is configured or the
            driver fails to start.  Failures are also logged via ``LOGGER``.
        """
        if not self.BIN:
            message = "ChromeBinaryErr: No binary path found! Install Chromium or Google Chrome."
            LOGGER.error(message)
            return None, message

        try:
            options = Options()
            options.binary_location = self.BIN
            for flag in (
                "--disable-dev-shm-usage",
                "--ignore-certificate-errors",
                "--disable-gpu",
                "--headless=new",
                "--test-type",
                "--no-sandbox",
            ):
                options.add_argument(flag)
            service = Service(self.CHROME_DRIVER)
            # Keyword arguments: the positional order of webdriver.Chrome()
            # differs between Selenium 4 releases, so passing
            # (options, service) positionally can bind to the wrong
            # parameters on older 4.x versions.
            driver = webdriver.Chrome(options=options, service=service)
            return driver, None
        except Exception as e:
            LOGGER.error(f"ChromeDriverErr: {e}")
            return None, f"ChromeDriverErr: {e}"

    def driver_close(self, driver: "webdriver.Chrome"):
        """Close the current window and end the browser session.

        ``quit()`` always runs, even if ``close()`` raises (for example when
        the window is already gone), so the browser process is not leaked.
        An error raised by ``close()`` still propagates afterwards.
        """
        try:
            driver.close()
        finally:
            driver.quit()
class INSTAGRAM(DRIVER):
    """Scrape image and video source URLs from an Instagram post or reel.

    The CSS class names and selectors set in ``__init__`` are hard-coded
    against Instagram's page markup; they are used only as element locators
    here.
    """

    # Compiled once at class creation instead of on every call.
    _LINK_RE = re.compile(r"^https?://(?:www\.)?instagram\.com/")
    _POST_RE = re.compile(r"^https?://(?:www\.)?instagram\.com/p/")

    def __init__(self, url: str) -> None:
        self.url = url
        self.article = "article._aa6a"
        self.ul_class = "_acay"
        self.image_class = "x5yr21d"
        self.video_class = "x1lliihq"
        self.next_button = "button._afxw"
        self.return_dict = {"image": [], "video": []}
        super().__init__()

    def is_correct_link(self):
        """Return True when ``self.url`` starts with an instagram.com address."""
        return bool(self._LINK_RE.match(self.url))

    def get_all(self):
        """Load ``self.url`` in a browser and collect its media URLs.

        Returns:
            * the error message string from ``initialize_driver`` when the
              browser could not be started;
            * ``self.return_dict`` (``{"image": [...], "video": [...]}``)
              for reel and ``/p/`` post URLs -- entries are de-duplicated in
              first-seen order and empty ``src`` values are dropped;
            * ``{}`` for any other URL.

        The browser is always shut down before returning, including when a
        wait times out or the URL is not a supported kind.
        """
        driver, error = self.initialize_driver()
        if not driver:
            return error

        try:
            driver.get(self.url)
            wait = WebDriverWait(driver, 30)

            if "reel" in self.url:
                video = wait.until(
                    presence_of_element_located((By.TAG_NAME, "video")))
                src = video.get_attribute("src")
                if src:
                    self.return_dict["video"].append(src)
                return self.return_dict

            if self._POST_RE.match(self.url):
                image_links, video_links = self._scrape_post(driver, wait)
                # dict.fromkeys de-duplicates while keeping page order.
                video_links = [u for u in dict.fromkeys(video_links) if u]
                video_set = set(video_links)
                # A URL found under both locators counts as a video only.
                # Filtering (rather than list.remove) cannot raise when a
                # video URL was never in the image list.
                image_links = [
                    u for u in dict.fromkeys(image_links)
                    if u and u not in video_set
                ]
                self.return_dict["image"].extend(image_links)
                self.return_dict["video"].extend(video_links)
                return self.return_dict

            return {}
        finally:
            self.driver_close(driver)

    def _scrape_post(self, driver, wait):
        """Return ``(image_srcs, video_srcs)`` found on a ``/p/`` post page.

        Tries the multi-media carousel first; if that fails for any reason,
        falls back to reading a single ``img`` (or, failing that, ``video``)
        from the post's article element.  Results may contain duplicates and
        ``None`` values; the caller cleans them up.
        """
        image_links = []
        video_links = []
        try:
            carousel = wait.until(presence_of_element_located(
                (By.CLASS_NAME, self.ul_class)))
            while True:
                for class_name, bucket in (
                    (self.image_class, image_links),
                    (self.video_class, video_links),
                ):
                    for node in carousel.find_elements(By.CLASS_NAME, class_name):
                        bucket.append(node.get_attribute("src"))
                try:
                    driver.find_element(
                        By.CSS_SELECTOR, self.next_button).click()
                except Exception:
                    # No clickable "next" button: no more media in the post.
                    break
        except Exception:
            # Carousel not found (or it broke mid-way): treat the post as a
            # single-media post.
            article = wait.until(presence_of_element_located(
                (By.CSS_SELECTOR, self.article)))
            try:
                media = article.find_element(By.TAG_NAME, "img")
                image_links.append(media.get_attribute("src"))
            except Exception:
                media = article.find_element(By.TAG_NAME, "video")
                video_links.append(media.get_attribute("src"))
        return image_links, video_links