"""Scrape active ads for a fixed set of telecom pages from the Facebook Ads
Library, de-duplicate them, and upload the result to a MySQL table.

Pipeline: Selenium (headless Chrome) renders and scrolls each Ads Library
page -> BeautifulSoup parses the rendered HTML -> pandas cleans/dedupes ->
SQLAlchemy writes to MySQL.
"""

import os
import time
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from sqlalchemy import create_engine
from webdriver_manager.chrome import ChromeDriverManager


class AdScraper:
    """Scrapes ad text, status and artwork links from Facebook Ads Library pages."""

    # Seconds to wait after each scroll so lazily-loaded ads can render.
    SCROLL_PAUSE_SECONDS = 6

    def __init__(self):
        self.driver = None          # Selenium WebDriver, created lazily by setup_driver()
        self.ad_details = []        # list of dicts, one per scraped ad
        # Ads Library URLs keyed by a human-readable company name.
        self.companies_urls = {
            "Airtel India": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=147351511955143&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
            "Celcom Malaysia": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=103384636066809&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
            "Vodafone UK": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=67884984384&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
            "T-mobile Polska": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=166466416745074&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
        }

    def setup_driver(self):
        """Create a headless Chrome WebDriver and store it on self.driver."""
        options = webdriver.ChromeOptions()
        # Selenium 4 removed the `Options.headless` attribute; the supported
        # way to run headless is the command-line switch.
        options.add_argument("--headless=new")
        self.driver = webdriver.Chrome(
            service=ChromeService(ChromeDriverManager().install()),
            options=options,
        )

    def scroll_page(self):
        """Scroll to the bottom repeatedly until the page height stops growing.

        The Ads Library lazy-loads ads on scroll, so this forces every ad for
        the page to be rendered before the HTML is captured.
        """
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(self.SCROLL_PAUSE_SECONDS)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    def parse_ads(self, soup, company):
        """Extract ad text, status and artwork link from rendered HTML.

        Appends one dict per ad to self.ad_details; missing fields become 'N/A'.

        NOTE: the CSS class names below are Facebook's obfuscated, build-specific
        classes — they break whenever Facebook redeploys and need re-checking.
        """
        for ad in soup.find_all('div', class_='xh8yej3'):
            # Look each element up once instead of once for the existence test
            # and again for the attribute access.
            text_div = ad.find('div', class_='x6ikm8r x10wlt62')
            status_span = ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx')
            img_tag = ad.find('img', class_='x1ll5gia x19kjcj4 xh8yej3')
            video_tag = ad.find('video', class_='x1lliihq x5yr21d xh8yej3')

            if img_tag:
                artwork_link = img_tag['src']
            elif video_tag:
                artwork_link = video_tag['src']
            else:
                artwork_link = 'N/A'

            self.ad_details.append({
                'Company name': company,
                'Ad Text': text_div.text if text_div else 'N/A',
                'Ad status': status_span.text if status_span else 'N/A',
                'Artwork Link': artwork_link,
            })

    def scrape_ads(self):
        """Visit every company URL, scroll it fully, and parse its ads.

        The driver is always quit, even if a page load or parse raises, so a
        failed run does not leak a headless Chrome process.
        """
        self.setup_driver()
        try:
            for company, url in self.companies_urls.items():
                self.driver.get(url)
                self.scroll_page()
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                self.parse_ads(soup, company)
        finally:
            self.driver.quit()

    def process_data(self):
        """Return scraped ads as a cleaned DataFrame.

        'N/A' placeholders become NaN, all-empty rows are dropped, and ads are
        de-duplicated on the artwork link (the same creative can appear under
        several ad variants).
        """
        df = pd.DataFrame(self.ad_details)
        df.replace('N/A', np.nan, inplace=True)
        df.dropna(how='all', inplace=True)
        df.drop_duplicates(subset=['Artwork Link'], inplace=True)
        return df

    def upload_data(self, df):
        """Replace the `ads_table` table in MySQL with the given DataFrame.

        The connection URL can be overridden via the RESEARCH_DB_URL
        environment variable; otherwise the built-in default is used.
        """
        # '#' starts a URL fragment and '$' is unsafe, so the raw password
        # would be mangled by SQLAlchemy's URL parser — percent-encode it.
        # NOTE(security): credentials are hardcoded as a fallback; prefer
        # setting RESEARCH_DB_URL in the environment.
        default_url = f"mysql+pymysql://root:{quote_plus('Binu1997#$')}@localhost/research_db"
        engine = create_engine(os.environ.get("RESEARCH_DB_URL", default_url))
        df.to_sql('ads_table', engine, if_exists='replace', index=False)
        print("Data uploaded successfully!")


if __name__ == "__main__":
    scraper = AdScraper()
    scraper.scrape_ads()
    data = scraper.process_data()
    print("\nDataFrame with duplicates removed:\n", data)
    scraper.upload_data(data)