In [10]:
from selenium import webdriver 
from selenium.webdriver.common.by import By 
from selenium.webdriver.chrome.service import Service as ChromeService 
from webdriver_manager.chrome import ChromeDriverManager
import time
from bs4 import BeautifulSoup 
import pandas as pd

# instantiate options 
options = webdriver.ChromeOptions() 

# run browser in headless mode 
options.headless = True 

# instantiate driver 
driver = webdriver.Chrome(service=ChromeService( 
    ChromeDriverManager().install()), options=options) 

# List of companies and their URLs
companies_urls = {
    "Airtel India": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=147351511955143&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
    "Celcom Malaysia": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=103384636066809&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
    "Vodafone UK": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=67884984384&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
    "T-mobile Polska": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=166466416745074&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all"
}

ad_details = []

for company, url in companies_urls.items():
    # Load the page
    driver.get(url)

    # Scroll to the bottom of the page
    last_height = driver.execute_script("return document.body.scrollHeight")
    
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(6)  # Wait for new content to load
        
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Parse the page content
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find advertisement details
    ads = soup.find_all('div', class_='xh8yej3')

    for ad in ads:
        # Extract the ad text
        ad_text = ad.find('div', class_='x6ikm8r x10wlt62').text if ad.find('div', class_='x6ikm8r x10wlt62') else 'N/A'

        # Extract company name
        company_name = company

        # Extract status
        ad_status = ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx').text if ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx') else 'N/A'

        # Extract image or video link
        img_tag = ad.find('img', class_='x1ll5gia x19kjcj4 xh8yej3')
        video_tag = ad.find('video', class_='x1lliihq x5yr21d xh8yej3')

        if img_tag:
            artwork_link = img_tag['src']
        elif video_tag:
            artwork_link = video_tag['src']
        else:
            artwork_link = 'N/A'

        ad_details.append({'Company name': company_name, 'Ad Text': ad_text, 'Ad status': ad_status, 'Artwork Link': artwork_link})

# Close the WebDriver
driver.quit()

# Convert to pandas DataFrame
df = pd.DataFrame(ad_details)
print(df)


         Company name                                            Ad Text  \
0        Airtel India                                                N/A   
1        Airtel India                                                N/A   
2        Airtel India                                                N/A   
3        Airtel India                                                N/A   
4        Airtel India                                                N/A   
...               ...                                                ...   
2663  T-mobile Polska                                                N/A   
2664  T-mobile Polska                                                N/A   
2665  T-mobile Polska  Jak to siƒô dzieje: li≈õcie spadajƒÖ, a Internetu...   
2666  T-mobile Polska                                                N/A   
2667  T-mobile Polska                                                N/A   

     Ad status                                       Artwork Link  
0          N/A  

In [11]:
df.shape

(2668, 4)

In [12]:
import numpy as np
# Replace 'N/A' strings with np.nan
df.replace('N/A', np.nan, inplace=True)

In [13]:
# Remove rows where all values are NaN
df_all_null_removed = df.dropna(how='all')

In [14]:
duplicates = df_all_null_removed.duplicated()

In [15]:
# Filter duplicate rows
duplicate_rows = df_all_null_removed[df_all_null_removed.duplicated()]

print("\nDuplicate rows only:\n", duplicate_rows)


Duplicate rows only:
          Company name Ad Text Ad status Artwork Link
1        Airtel India     NaN       NaN          NaN
2        Airtel India     NaN       NaN          NaN
3        Airtel India     NaN       NaN          NaN
4        Airtel India     NaN       NaN          NaN
5        Airtel India     NaN       NaN          NaN
...               ...     ...       ...          ...
2662  T-mobile Polska     NaN    Active          NaN
2663  T-mobile Polska     NaN       NaN          NaN
2664  T-mobile Polska     NaN       NaN          NaN
2666  T-mobile Polska     NaN       NaN          NaN
2667  T-mobile Polska     NaN       NaN          NaN

[2059 rows x 4 columns]


In [16]:
# Remove duplicate rows
df_no_duplicates = df_all_null_removed.drop_duplicates(subset=['Artwork Link'])

print("\nDataFrame with duplicates removed:\n", df_no_duplicates)


DataFrame with duplicates removed:
          Company name                                            Ad Text  \
0        Airtel India                                                NaN   
59       Airtel India  True Stories of Kerala by Airtel0:00 / 0:15AD....   
60       Airtel India  True Stories of Kerala by Airtel0:00 / 0:15AD....   
78       Airtel India  Sometimes there's no right & wrong. This isn't...   
96       Airtel India  Why‚Ä¶‚Ä¶. not switch, when you get so many benefi...   
...               ...                                                ...   
2634  T-mobile Polska                               T-MOBILE.PLGet Offer   
2640  T-mobile Polska  Telefon, aplikacja i akcja! üé¨ Zapraszamy do Ci...   
2646  T-mobile Polska  Z Magenta Moments ≈ºycie smakuje podw√≥jnie. üíó W...   
2652  T-mobile Polska  Najlepszy moment na zakupy? üõçÔ∏è Ten, kiedy sƒÖ n...   
2660  T-mobile Polska  Jak to siƒô dzieje: li≈õcie spadajƒÖ, a Internetu...   

     Ad status               

In [17]:
from sqlalchemy import create_engine

# Create MySQL engine
engine = create_engine('mysql+pymysql://root:Binu1997#$@localhost/research_db')

In [18]:
# Upload DataFrame to SQL
df_no_duplicates.to_sql('ads_table', engine, if_exists='replace', index=False)

print("Data uploaded successfully!")

Data uploaded successfully!
