# Spaces: Runtime error
import os
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from sqlalchemy import create_engine
from webdriver_manager.chrome import ChromeDriverManager
class AdScraper:
    """Scrape ads from Facebook Ad Library pages and store them in MySQL.

    Intended call order: ``scrape_ads()`` -> ``process_data()`` ->
    ``upload_data(df)``.
    """

    # Keys of every record appended by ``parse_ads`` (also the output columns).
    COLUMNS = ('Company name', 'Ad Text', 'Ad status', 'Artwork Link')

    # Environment variable that overrides the database URL in ``upload_data``.
    DB_URL_ENV_VAR = 'ADS_DB_URL'

    # SECURITY NOTE(review): credentials are hard-coded in source. The value is
    # kept only as a backward-compatible fallback; rotate this password and
    # supply the URL through ``db_url`` or the environment variable instead.
    DEFAULT_DB_URL = 'mysql+pymysql://root:Binu1997#$@localhost/research_db'

    # Class attribute value used to locate the ad-status <span>.
    # NOTE(review): these look like generated CSS class names and will
    # presumably break whenever the page markup changes -- verify regularly.
    _STATUS_CLASS = (
        'x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj '
        'xeuugli x1i64zmx'
    )

    def __init__(self):
        # Selenium driver; created by ``setup_driver``.
        self.driver = None
        # One dict per parsed ad, with the keys listed in ``COLUMNS``.
        self.ad_details = []
        # Display name -> Ad Library URL listing that page's active ads.
        self.companies_urls = {
            "Airtel India": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=147351511955143&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
            "Celcom Malaysia": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=103384636066809&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
            "Vodafone UK": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=67884984384&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
            "T-mobile Polska": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=166466416745074&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
        }

    def setup_driver(self):
        """Start a headless Chrome session and store it in ``self.driver``."""
        options = webdriver.ChromeOptions()
        # The ``options.headless`` setter was deprecated and later removed in
        # Selenium 4; passing the browser flag works on every release.
        options.add_argument("--headless=new")
        self.driver = webdriver.Chrome(
            service=ChromeService(ChromeDriverManager().install()),
            options=options,
        )

    def scroll_page(self, pause=6, max_scrolls=None):
        """Scroll to the bottom repeatedly until the page stops growing.

        Args:
            pause: Seconds to wait after each scroll before re-measuring.
            max_scrolls: Optional upper bound on the number of scrolls;
                ``None`` (the default) scrolls until the height is stable.
        """
        height_script = "return document.body.scrollHeight"
        last_height = self.driver.execute_script(height_script)
        scrolls = 0
        while max_scrolls is None or scrolls < max_scrolls:
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(pause)
            new_height = self.driver.execute_script(height_script)
            if new_height == last_height:
                # Height unchanged after scrolling: nothing more was loaded.
                break
            last_height = new_height
            scrolls += 1

    def parse_ads(self, soup, company):
        """Extract ad records from parsed HTML into ``self.ad_details``.

        Args:
            soup: Parsed page supporting ``find_all``/``find``.
            company: Name stored under ``'Company name'`` for each record.

        Missing fields are recorded as the string ``'N/A'``.
        """
        for ad in soup.find_all('div', class_='xh8yej3'):
            text_node = ad.find('div', class_='x6ikm8r x10wlt62')
            status_node = ad.find('span', class_=self._STATUS_CLASS)
            img_tag = ad.find('img', class_='x1ll5gia x19kjcj4 xh8yej3')
            video_tag = ad.find('video', class_='x1lliihq x5yr21d xh8yej3')

            # Prefer the image source, then the video source. ``.get`` avoids
            # a KeyError when a matched tag carries no ``src`` attribute.
            artwork_link = 'N/A'
            for tag in (img_tag, video_tag):
                src = tag.get('src') if tag is not None else None
                if src:
                    artwork_link = src
                    break

            self.ad_details.append({
                'Company name': company,
                'Ad Text': text_node.text if text_node is not None else 'N/A',
                'Ad status': (
                    status_node.text if status_node is not None else 'N/A'
                ),
                'Artwork Link': artwork_link,
            })

    def scrape_ads(self):
        """Load every URL in ``companies_urls``, scroll it, and parse its ads.

        The browser is always shut down, even when a page fails; the
        exception itself still propagates to the caller.
        """
        self.setup_driver()
        try:
            for company, url in self.companies_urls.items():
                self.driver.get(url)
                self.scroll_page()
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                self.parse_ads(soup, company)
        finally:
            if self.driver is not None:
                self.driver.quit()
                self.driver = None

    def process_data(self) -> pd.DataFrame:
        """Return the collected ads as a cleaned, de-duplicated DataFrame.

        * ``'N/A'`` placeholders become NaN.
        * Rows with no ad text, status, or artwork are dropped.
        * Rows sharing an artwork link are reduced to the first occurrence;
          rows without artwork are only dropped when the whole row repeats.

        Returns:
            A DataFrame with the columns in ``COLUMNS`` (empty if nothing was
            scraped).
        """
        columns = list(self.COLUMNS)
        df = pd.DataFrame(self.ad_details, columns=columns)
        if df.empty:
            # Nothing scraped: keep the expected columns instead of failing.
            return df

        df = df.replace('N/A', np.nan)
        # 'Company name' is always set, so only the scraped fields can tell
        # whether a matched element was actually an ad.
        df = df.dropna(how='all', subset=columns[1:])

        has_artwork = df['Artwork Link'].notna()
        # NaN compares equal when de-duplicating, so a plain de-dup on the
        # link would collapse every artwork-less ad into one row.
        dup_by_link = has_artwork & df.duplicated(subset=['Artwork Link'])
        dup_by_row = ~has_artwork & df.duplicated()
        return df[~(dup_by_link | dup_by_row)]

    def upload_data(self, df, db_url=None):
        """Write ``df`` to the ``ads_table`` table, replacing existing data.

        Args:
            df: DataFrame to upload.
            db_url: SQLAlchemy database URL. When omitted, the
                ``ADS_DB_URL`` environment variable is used, falling back to
                ``DEFAULT_DB_URL``.
        """
        url = (
            db_url
            or os.environ.get(self.DB_URL_ENV_VAR)
            or self.DEFAULT_DB_URL
        )
        engine = create_engine(url)
        try:
            df.to_sql('ads_table', engine, if_exists='replace', index=False)
        finally:
            # Release pooled connections whether or not the upload succeeded.
            engine.dispose()
        print("Data uploaded successfully!")
if __name__ == "__main__":
    # Script entry point: scrape, clean, show, then persist the results.
    ad_scraper = AdScraper()
    ad_scraper.scrape_ads()

    cleaned_ads = ad_scraper.process_data()
    print("\nDataFrame with duplicates removed:\n", cleaned_ads)

    ad_scraper.upload_data(cleaned_ads)