{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         Company name                                            Ad Text  \\\n",
      "0        Airtel India                                                N/A   \n",
      "1        Airtel India                                                N/A   \n",
      "2        Airtel India                                                N/A   \n",
      "3        Airtel India                                                N/A   \n",
      "4        Airtel India                                                N/A   \n",
      "...               ...                                                ...   \n",
      "2663  T-mobile Polska                                                N/A   \n",
      "2664  T-mobile Polska                                                N/A   \n",
      "2665  T-mobile Polska  Jak to się dzieje: liście spadają, a Internetu...   \n",
      "2666  T-mobile Polska                                                N/A   \n",
      "2667  T-mobile Polska                                                N/A   \n",
      "\n",
      "     Ad status                                       Artwork Link  \n",
      "0          N/A                                                N/A  \n",
      "1          N/A                                                N/A  \n",
      "2          N/A                                                N/A  \n",
      "3          N/A                                                N/A  \n",
      "4          N/A                                                N/A  \n",
      "...        ...                                                ...  \n",
      "2663       N/A                                                N/A  \n",
      "2664       N/A                                                N/A  \n",
      "2665       N/A  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "2666       N/A                                                N/A  \n",
      "2667       N/A                                                N/A  \n",
      "\n",
      "[2668 rows x 4 columns]\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver \n",
    "from selenium.webdriver.common.by import By \n",
    "from selenium.webdriver.chrome.service import Service as ChromeService \n",
    "from webdriver_manager.chrome import ChromeDriverManager\n",
    "import time\n",
    "from bs4 import BeautifulSoup \n",
    "import pandas as pd\n",
    "\n",
    "# instantiate options \n",
    "options = webdriver.ChromeOptions() \n",
    "\n",
    "# run browser in headless mode \n",
    "options.headless = True \n",
    "\n",
    "# instantiate driver \n",
    "driver = webdriver.Chrome(service=ChromeService( \n",
    "    ChromeDriverManager().install()), options=options) \n",
    "\n",
    "# List of companies and their URLs\n",
    "companies_urls = {\n",
    "    \"Airtel India\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=147351511955143&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\",\n",
    "    \"Celcom Malaysia\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=103384636066809&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\",\n",
    "    \"Vodafone UK\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=67884984384&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\",\n",
    "    \"T-mobile Polska\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=166466416745074&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\"\n",
    "}\n",
    "\n",
    "ad_details = []\n",
    "\n",
    "for company, url in companies_urls.items():\n",
    "    # Load the page\n",
    "    driver.get(url)\n",
    "\n",
    "    # Scroll to the bottom of the page\n",
    "    last_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
    "    \n",
    "    while True:\n",
    "        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
    "        time.sleep(6)  # Wait for new content to load\n",
    "        \n",
    "        new_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
    "        if new_height == last_height:\n",
    "            break\n",
    "        last_height = new_height\n",
    "\n",
    "    # Parse the page content\n",
    "    soup = BeautifulSoup(driver.page_source, 'html.parser')\n",
    "\n",
    "    # Find advertisement details\n",
    "    ads = soup.find_all('div', class_='xh8yej3')\n",
    "\n",
    "    for ad in ads:\n",
    "        # Extract the ad text\n",
    "        ad_text = ad.find('div', class_='x6ikm8r x10wlt62').text if ad.find('div', class_='x6ikm8r x10wlt62') else 'N/A'\n",
    "\n",
    "        # Extract company name\n",
    "        company_name = company\n",
    "\n",
    "        # Extract status\n",
    "        ad_status = ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx').text if ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx') else 'N/A'\n",
    "\n",
    "        # Extract image or video link\n",
    "        img_tag = ad.find('img', class_='x1ll5gia x19kjcj4 xh8yej3')\n",
    "        video_tag = ad.find('video', class_='x1lliihq x5yr21d xh8yej3')\n",
    "\n",
    "        if img_tag:\n",
    "            artwork_link = img_tag['src']\n",
    "        elif video_tag:\n",
    "            artwork_link = video_tag['src']\n",
    "        else:\n",
    "            artwork_link = 'N/A'\n",
    "\n",
    "        ad_details.append({'Company name': company_name, 'Ad Text': ad_text, 'Ad status': ad_status, 'Artwork Link': artwork_link})\n",
    "\n",
    "# Close the WebDriver\n",
    "driver.quit()\n",
    "\n",
    "# Convert to pandas DataFrame\n",
    "df = pd.DataFrame(ad_details)\n",
    "print(df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2668, 4)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "# Replace 'N/A' strings with np.nan\n",
    "df.replace('N/A', np.nan, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove rows where all values are NaN\n",
    "df_all_null_removed = df.dropna(how='all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "duplicates = df_all_null_removed.duplicated()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Duplicate rows only:\n",
      "          Company name Ad Text Ad status Artwork Link\n",
      "1        Airtel India     NaN       NaN          NaN\n",
      "2        Airtel India     NaN       NaN          NaN\n",
      "3        Airtel India     NaN       NaN          NaN\n",
      "4        Airtel India     NaN       NaN          NaN\n",
      "5        Airtel India     NaN       NaN          NaN\n",
      "...               ...     ...       ...          ...\n",
      "2662  T-mobile Polska     NaN    Active          NaN\n",
      "2663  T-mobile Polska     NaN       NaN          NaN\n",
      "2664  T-mobile Polska     NaN       NaN          NaN\n",
      "2666  T-mobile Polska     NaN       NaN          NaN\n",
      "2667  T-mobile Polska     NaN       NaN          NaN\n",
      "\n",
      "[2059 rows x 4 columns]\n"
     ]
    }
   ],
   "source": [
    "# Filter duplicate rows\n",
    "duplicate_rows = df_all_null_removed[df_all_null_removed.duplicated()]\n",
    "\n",
    "print(\"\\nDuplicate rows only:\\n\", duplicate_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "DataFrame with duplicates removed:\n",
      "          Company name                                            Ad Text  \\\n",
      "0        Airtel India                                                NaN   \n",
      "59       Airtel India  True Stories of Kerala by Airtel0:00 / 0:15AD....   \n",
      "60       Airtel India  True Stories of Kerala by Airtel0:00 / 0:15AD....   \n",
      "78       Airtel India  Sometimes there's no right & wrong. This isn't...   \n",
      "96       Airtel India  Why……. not switch, when you get so many benefi...   \n",
      "...               ...                                                ...   \n",
      "2634  T-mobile Polska                               T-MOBILE.PLGet Offer   \n",
      "2640  T-mobile Polska  Telefon, aplikacja i akcja! 🎬 Zapraszamy do Ci...   \n",
      "2646  T-mobile Polska  Z Magenta Moments życie smakuje podwójnie. 💗 W...   \n",
      "2652  T-mobile Polska  Najlepszy moment na zakupy? 🛍️ Ten, kiedy są n...   \n",
      "2660  T-mobile Polska  Jak to się dzieje: liście spadają, a Internetu...   \n",
      "\n",
      "     Ad status                                       Artwork Link  \n",
      "0          NaN                                                NaN  \n",
      "59      Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "60      Active  https://video.fcmb1-2.fna.fbcdn.net/v/t42.1790...  \n",
      "78      Active  https://video.fcmb1-2.fna.fbcdn.net/v/t42.1790...  \n",
      "96      Active  https://video.fcmb1-2.fna.fbcdn.net/v/t42.1790...  \n",
      "...        ...                                                ...  \n",
      "2634    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "2640    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "2646    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "2652    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "2660    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "\n",
      "[263 rows x 4 columns]\n"
     ]
    }
   ],
   "source": [
    "# Remove duplicate rows\n",
    "df_no_duplicates = df_all_null_removed.drop_duplicates(subset=['Artwork Link'])\n",
    "\n",
    "print(\"\\nDataFrame with duplicates removed:\\n\", df_no_duplicates)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sqlalchemy import create_engine\n",
    "\n",
    "# Create MySQL engine\n",
    "engine = create_engine('mysql+pymysql://root:Binu1997#$@localhost/research_db')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data uploaded successfully!\n"
     ]
    }
   ],
   "source": [
    "# Upload DataFrame to SQL\n",
    "df_no_duplicates.to_sql('ads_table', engine, if_exists='replace', index=False)\n",
    "\n",
    "print(\"Data uploaded successfully!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}