# BinuraYasodya's picture
# Upload 4 files
# 6c34694 verified
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
class AdScraper:
    """Scrape active ads from the Facebook Ad Library for a fixed set of
    telecom companies, clean the results into a pandas DataFrame, and
    upload them to a MySQL table."""

    def __init__(self):
        # Selenium driver; created lazily by setup_driver(), None until then.
        self.driver = None
        # One dict per scraped ad, accumulated across all companies.
        self.ad_details = []
        # Ad Library search URLs keyed by company display name.
        self.companies_urls = {
            "Airtel India": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=147351511955143&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
            "Celcom Malaysia": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=103384636066809&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
            "Vodafone UK": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=67884984384&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
            "T-mobile Polska": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=166466416745074&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all"
        }

    def setup_driver(self):
        """Start a headless Chrome driver managed by webdriver_manager."""
        options = webdriver.ChromeOptions()
        # FIX: `options.headless = True` was deprecated in Selenium 4.8 and
        # removed later; headless mode must be requested via a Chrome argument.
        options.add_argument("--headless=new")
        self.driver = webdriver.Chrome(
            service=ChromeService(ChromeDriverManager().install()),
            options=options,
        )

    def scroll_page(self):
        """Scroll to the bottom repeatedly until the page height stops
        growing, so all lazily-loaded ads are rendered."""
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(6)  # give newly loaded ads time to render
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    def parse_ads(self, soup, company):
        """Extract ad text, status, and artwork URL for every ad card in
        *soup* and append a record per ad to self.ad_details.

        NOTE(review): the CSS class names below are Facebook's auto-generated
        ones and change frequently — expect to have to update them.
        """
        for ad in soup.find_all('div', class_='xh8yej3'):
            # Hoist each find() so the tree is queried once per field,
            # not twice as in the original.
            text_tag = ad.find('div', class_='x6ikm8r x10wlt62')
            ad_text = text_tag.text if text_tag else 'N/A'
            status_tag = ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx')
            ad_status = status_tag.text if status_tag else 'N/A'
            img_tag = ad.find('img', class_='x1ll5gia x19kjcj4 xh8yej3')
            video_tag = ad.find('video', class_='x1lliihq x5yr21d xh8yej3')
            # Prefer a still image, fall back to a video source.
            if img_tag:
                artwork_link = img_tag['src']
            elif video_tag:
                artwork_link = video_tag['src']
            else:
                artwork_link = 'N/A'
            self.ad_details.append({'Company name': company, 'Ad Text': ad_text,
                                    'Ad status': ad_status, 'Artwork Link': artwork_link})

    def scrape_ads(self):
        """Visit every company URL, scroll to load all ads, and parse them.

        FIX: the driver is now always quit via try/finally — the original
        leaked the browser process if any page load or scroll raised.
        """
        self.setup_driver()
        try:
            for company, url in self.companies_urls.items():
                self.driver.get(url)
                self.scroll_page()
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                self.parse_ads(soup, company)
        finally:
            self.driver.quit()

    def process_data(self):
        """Clean collected ads: normalise 'N/A' to NaN, drop all-missing
        rows, and de-duplicate by artwork URL.

        Returns:
            pandas.DataFrame: the cleaned ad records.
        """
        # FIX: guard the empty-scrape case — drop_duplicates(subset=...) on a
        # column-less DataFrame raises KeyError.
        if not self.ad_details:
            return pd.DataFrame(columns=['Company name', 'Ad Text', 'Ad status', 'Artwork Link'])
        df = pd.DataFrame(self.ad_details)
        df.replace('N/A', np.nan, inplace=True)
        df.dropna(how='all', inplace=True)
        # NOTE: drop_duplicates treats NaN links as equal, so every ad
        # without artwork collapses to a single row — intentional per original.
        df.drop_duplicates(subset=['Artwork Link'], inplace=True)
        return df

    def upload_data(self, df):
        """Replace the `ads_table` MySQL table with *df*.

        SECURITY NOTE(review): the DB password is hard-coded in the URL;
        it should be read from an environment variable or config file.
        """
        engine = create_engine('mysql+pymysql://root:Binu1997#$@localhost/research_db')
        df.to_sql('ads_table', engine, if_exists='replace', index=False)
        print("Data uploaded successfully!")
def main():
    """Run the full pipeline: scrape, clean, print, and upload."""
    scraper = AdScraper()
    scraper.scrape_ads()
    cleaned = scraper.process_data()
    print("\nDataFrame with duplicates removed:\n", cleaned)
    scraper.upload_data(cleaned)


if __name__ == "__main__":
    main()