{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Facebook Ad Library scraper\n",
    "\n",
    "Loads the Ad Library page of each configured company in headless Chrome, scrolls until no more content is appended, extracts ad text, status and artwork link with BeautifulSoup, cleans the rows with pandas and writes them to the `ads_table` table of a MySQL database.\n",
    "\n",
    "Database settings are read from the environment variables `RESEARCH_DB_USER`, `RESEARCH_DB_PASSWORD`, `RESEARCH_DB_HOST` and `RESEARCH_DB_NAME`; if the password is not set you are prompted for it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import getpass\n",
    "import os\n",
    "import time\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.chrome.service import Service as ChromeService\n",
    "from selenium.webdriver.common.by import By  # imported by the original notebook; not used below\n",
    "from sqlalchemy import create_engine\n",
    "from sqlalchemy.engine import URL\n",
    "from webdriver_manager.chrome import ChromeDriverManager"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ad Library listing URL; only the page id differs between companies.\n",
    "AD_LIBRARY_URL = (\n",
    "    'https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL'\n",
    "    '&view_all_page_id={page_id}'\n",
    "    '&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped'\n",
    "    '&search_type=page&media_type=all'\n",
    ")\n",
    "\n",
    "# Company label stored with every row -> Facebook page id.\n",
    "PAGE_IDS = {\n",
    "    'Airtel India': '147351511955143',\n",
    "    'Celcom Malaysia': '103384636066809',\n",
    "    'Vodafone UK': '67884984384',\n",
    "    'T-mobile Polska': '166466416745074',\n",
    "}\n",
    "\n",
    "# List of companies and their URLs\n",
    "companies_urls = {\n",
    "    company: AD_LIBRARY_URL.format(page_id=page_id)\n",
    "    for company, page_id in PAGE_IDS.items()\n",
    "}\n",
    "\n",
    "# Seconds to wait after each scroll so newly requested content can load.\n",
    "SCROLL_PAUSE_SECONDS = 6\n",
    "\n",
    "# CSS class strings used to locate elements in the page source.\n",
    "# NOTE(review): these look auto-generated and may stop matching if the page markup changes.\n",
    "AD_CONTAINER_CLASS = 'xh8yej3'\n",
    "AD_TEXT_CLASS = 'x6ikm8r x10wlt62'\n",
    "AD_STATUS_CLASS = 'x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx'\n",
    "IMAGE_CLASS = 'x1ll5gia x19kjcj4 xh8yej3'\n",
    "VIDEO_CLASS = 'x1lliihq x5yr21d xh8yej3'\n",
    "\n",
    "# Placeholder written when a field cannot be found in an ad container.\n",
    "MISSING = 'N/A'\n",
    "\n",
    "DETAIL_COLUMNS = ['Ad Text', 'Ad status', 'Artwork Link']\n",
    "COLUMNS = ['Company name'] + DETAIL_COLUMNS\n",
    "\n",
    "TABLE_NAME = 'ads_table'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scraping helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_driver():\n",
    "    \"\"\"Start a headless Chrome WebDriver; the driver binary comes from webdriver_manager.\"\"\"\n",
    "    options = webdriver.ChromeOptions()\n",
    "    # `options.headless = True` is deprecated (and removed in newer Selenium 4\n",
    "    # releases, where the assignment silently does nothing), so pass the Chrome flag.\n",
    "    options.add_argument('--headless=new')\n",
    "    return webdriver.Chrome(\n",
    "        service=ChromeService(ChromeDriverManager().install()),\n",
    "        options=options,\n",
    "    )\n",
    "\n",
    "\n",
    "def scroll_to_bottom(driver, pause=SCROLL_PAUSE_SECONDS):\n",
    "    \"\"\"Scroll to the bottom repeatedly until the document height stops changing.\n",
    "\n",
    "    Sleeps `pause` seconds after every scroll before re-reading the height.\n",
    "    \"\"\"\n",
    "    last_height = driver.execute_script('return document.body.scrollHeight')\n",
    "    while True:\n",
    "        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')\n",
    "        time.sleep(pause)  # wait for new content to load\n",
    "        new_height = driver.execute_script('return document.body.scrollHeight')\n",
    "        if new_height == last_height:\n",
    "            break\n",
    "        last_height = new_height\n",
    "\n",
    "\n",
    "def text_or_missing(parent, tag_name, css_class):\n",
    "    \"\"\"Return the text of the first matching descendant of `parent`, or MISSING if there is none.\"\"\"\n",
    "    node = parent.find(tag_name, class_=css_class)\n",
    "    return node.text if node is not None else MISSING\n",
    "\n",
    "\n",
    "def artwork_link(ad):\n",
    "    \"\"\"Return the image `src` if available, otherwise the video `src`, otherwise MISSING.\"\"\"\n",
    "    candidates = (\n",
    "        ad.find('img', class_=IMAGE_CLASS),\n",
    "        ad.find('video', class_=VIDEO_CLASS),\n",
    "    )\n",
    "    for tag in candidates:\n",
    "        # .get() instead of tag['src'] so a tag without a src attribute does not raise KeyError\n",
    "        if tag is not None and tag.get('src'):\n",
    "            return tag['src']\n",
    "    return MISSING\n",
    "\n",
    "\n",
    "def extract_ads(page_source, company):\n",
    "    \"\"\"Parse `page_source` and return one row dict per element matching AD_CONTAINER_CLASS.\n",
    "\n",
    "    Every row is labelled with `company`; fields that are not found are set to MISSING.\n",
    "    \"\"\"\n",
    "    soup = BeautifulSoup(page_source, 'html.parser')\n",
    "    rows = []\n",
    "    for ad in soup.find_all('div', class_=AD_CONTAINER_CLASS):\n",
    "        rows.append({\n",
    "            'Company name': company,\n",
    "            'Ad Text': text_or_missing(ad, 'div', AD_TEXT_CLASS),\n",
    "            'Ad status': text_or_missing(ad, 'span', AD_STATUS_CLASS),\n",
    "            'Artwork Link': artwork_link(ad),\n",
    "        })\n",
    "    return rows"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scrape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ad_details = []\n",
    "\n",
    "driver = build_driver()\n",
    "try:\n",
    "    for company, url in companies_urls.items():\n",
    "        driver.get(url)\n",
    "        scroll_to_bottom(driver)\n",
    "        ad_details.extend(extract_ads(driver.page_source, company))\n",
    "finally:\n",
    "    # Always close the browser, even when loading or parsing a page fails.\n",
    "    driver.quit()\n",
    "\n",
    "# Passing the column list keeps the expected columns even when nothing was scraped.\n",
    "df = pd.DataFrame(ad_details, columns=COLUMNS)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace 'N/A' strings with np.nan in a new frame so the raw scrape result stays intact.\n",
    "df_clean = df.replace(MISSING, np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove rows where every scraped value is NaN. 'Company name' is always filled in,\n",
    "# so it has to be left out of the check or no row would ever be dropped.\n",
    "df_all_null_removed = df_clean.dropna(how='all', subset=DETAIL_COLUMNS)\n",
    "df_all_null_removed.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows that repeat an earlier row in every column\n",
    "duplicates = df_all_null_removed.duplicated()\n",
    "duplicate_rows = df_all_null_removed[duplicates]\n",
    "duplicate_rows.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the first row for each artwork link.\n",
    "# NOTE(review): NaN links count as equal to each other here, so at most one row\n",
    "# without artwork is kept; confirm that this is intended.\n",
    "df_no_duplicates = df_all_null_removed.drop_duplicates(subset=['Artwork Link'])\n",
    "df_no_duplicates.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Upload to MySQL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the connection URL from the environment instead of hard-coding credentials.\n",
    "# URL.create takes the password as a separate field, so special characters need no escaping.\n",
    "db_url = URL.create(\n",
    "    drivername='mysql+pymysql',\n",
    "    username=os.environ.get('RESEARCH_DB_USER', 'root'),\n",
    "    password=os.environ.get('RESEARCH_DB_PASSWORD') or getpass.getpass('MySQL password: '),\n",
    "    host=os.environ.get('RESEARCH_DB_HOST', 'localhost'),\n",
    "    database=os.environ.get('RESEARCH_DB_NAME', 'research_db'),\n",
    ")\n",
    "\n",
    "# Create MySQL engine\n",
    "engine = create_engine(db_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload DataFrame to SQL, replacing the table if it already exists\n",
    "df_no_duplicates.to_sql(TABLE_NAME, engine, if_exists='replace', index=False)\n",
    "\n",
    "print('Data uploaded successfully!')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}