Spaces:

BinuraYasodya
/

Facebook-ad-search

Runtime error

App Files Files Community

BinuraYasodya commited on Jul 23, 2024

Commit

6c34694

verified ·

1 Parent(s): 7c51e14

Upload 4 files

Browse files

add css and test files as well

Files changed (4) hide show

main.py +73 -0
requirements.txt +11 -0
styles.css +26 -0
test5.ipynb +346 -0

main.py ADDED Viewed

	@@ -0,0 +1,73 @@

+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service as ChromeService
+from webdriver_manager.chrome import ChromeDriverManager
+import time
+from bs4 import BeautifulSoup
+import pandas as pd
+import numpy as np
+from sqlalchemy import create_engine
+class AdScraper:
+    def __init__(self):
+        self.driver = None
+        self.ad_details = []
+        self.companies_urls = {
+            "Airtel India": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=147351511955143&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
+            "Celcom Malaysia": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=103384636066809&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
+            "Vodafone UK": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=67884984384&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
+            "T-mobile Polska": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=166466416745074&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all"
+        }
+    def setup_driver(self):
+        options = webdriver.ChromeOptions()
+        options.headless = True
+        self.driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
+    def scroll_page(self):
+        last_height = self.driver.execute_script("return document.body.scrollHeight")
+        while True:
+            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            time.sleep(6)
+            new_height = self.driver.execute_script("return document.body.scrollHeight")
+            if new_height == last_height:
+                break
+            last_height = new_height
+    def parse_ads(self, soup, company):
+        ads = soup.find_all('div', class_='xh8yej3')
+        for ad in ads:
+            ad_text = ad.find('div', class_='x6ikm8r x10wlt62').text if ad.find('div', class_='x6ikm8r x10wlt62') else 'N/A'
+            ad_status = ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx').text if ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx') else 'N/A'
+            img_tag = ad.find('img', class_='x1ll5gia x19kjcj4 xh8yej3')
+            video_tag = ad.find('video', class_='x1lliihq x5yr21d xh8yej3')
+            artwork_link = img_tag['src'] if img_tag else video_tag['src'] if video_tag else 'N/A'
+            self.ad_details.append({'Company name': company, 'Ad Text': ad_text, 'Ad status': ad_status, 'Artwork Link': artwork_link})
+    def scrape_ads(self):
+        self.setup_driver()
+        for company, url in self.companies_urls.items():
+            self.driver.get(url)
+            self.scroll_page()
+            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
+            self.parse_ads(soup, company)
+        self.driver.quit()
+    def process_data(self):
+        df = pd.DataFrame(self.ad_details)
+        df.replace('N/A', np.nan, inplace=True)
+        df.dropna(how='all', inplace=True)
+        df.drop_duplicates(subset=['Artwork Link'], inplace=True)
+        return df
+    def upload_data(self, df):
+        engine = create_engine('mysql+pymysql://root:Binu1997#$@localhost/research_db')
+        df.to_sql('ads_table', engine, if_exists='replace', index=False)
+        print("Data uploaded successfully!")
+if __name__ == "__main__":
+    scraper = AdScraper()
+    scraper.scrape_ads()
+    data = scraper.process_data()
+    print("\nDataFrame with duplicates removed:\n", data)
+    scraper.upload_data(data)

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+requests
+beautifulsoup4
+pandas
+openpyxl
+lxml
+xlwt
+selenium
+sqlalchemy
+pymysql
+cryptography
+streamlit

styles.css ADDED Viewed

	@@ -0,0 +1,26 @@

+/* Main title: large, centred, bold orange text. */
+.main-title {
+    font-size: 50px;
+    color: #FFA500;
+    text-align: center;
+    font-weight: 700;
+}
+/* Section heading: medium-sized, semi-bold tomato-red text. */
+.heading {
+    font-size: 25px;
+    color: #FF6347;
+    font-weight: 600;
+}
+/* Summary text: body-sized, medium-weight teal text. */
+.summary-text {
+    font-size: 18px;
+    color: #008080;
+    font-weight: 500;
+}
+/* Company selector: body-sized, semi-bold orange-red text. */
+.company-select {
+    font-size: 18px;
+    color: #FF4500;
+    font-weight: 600;
+}
+/* Data table wrapper: light grey padded panel with rounded corners. */
+.dataframe-style {
+    background-color: #F5F5F5;
+    border-radius: 10px;
+    padding: 10px;
+}

test5.ipynb ADDED Viewed

	@@ -0,0 +1,346 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "         Company name                                            Ad Text  \\\n",
+      "0        Airtel India                                                N/A   \n",
+      "1        Airtel India                                                N/A   \n",
+      "2        Airtel India                                                N/A   \n",
+      "3        Airtel India                                                N/A   \n",
+      "4        Airtel India                                                N/A   \n",
+      "...               ...                                                ...   \n",
+      "2663  T-mobile Polska                                                N/A   \n",
+      "2664  T-mobile Polska                                                N/A   \n",
+      "2665  T-mobile Polska  Jak to się dzieje: liście spadają, a Internetu...   \n",
+      "2666  T-mobile Polska                                                N/A   \n",
+      "2667  T-mobile Polska                                                N/A   \n",
+      "\n",
+      "     Ad status                                       Artwork Link  \n",
+      "0          N/A                                                N/A  \n",
+      "1          N/A                                                N/A  \n",
+      "2          N/A                                                N/A  \n",
+      "3          N/A                                                N/A  \n",
+      "4          N/A                                                N/A  \n",
+      "...        ...                                                ...  \n",
+      "2663       N/A                                                N/A  \n",
+      "2664       N/A                                                N/A  \n",
+      "2665       N/A  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
+      "2666       N/A                                                N/A  \n",
+      "2667       N/A                                                N/A  \n",
+      "\n",
+      "[2668 rows x 4 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from selenium import webdriver \n",
+    "from selenium.webdriver.common.by import By \n",
+    "from selenium.webdriver.chrome.service import Service as ChromeService \n",
+    "from webdriver_manager.chrome import ChromeDriverManager\n",
+    "import time\n",
+    "from bs4 import BeautifulSoup \n",
+    "import pandas as pd\n",
+    "\n",
+    "# instantiate options \n",
+    "options = webdriver.ChromeOptions() \n",
+    "\n",
+    "# run browser in headless mode \n",
+    "options.headless = True \n",
+    "\n",
+    "# instantiate driver \n",
+    "driver = webdriver.Chrome(service=ChromeService( \n",
+    "    ChromeDriverManager().install()), options=options) \n",
+    "\n",
+    "# List of companies and their URLs\n",
+    "companies_urls = {\n",
+    "    \"Airtel India\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=147351511955143&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\",\n",
+    "    \"Celcom Malaysia\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=103384636066809&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\",\n",
+    "    \"Vodafone UK\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=67884984384&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\",\n",
+    "    \"T-mobile Polska\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=166466416745074&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\"\n",
+    "}\n",
+    "\n",
+    "ad_details = []\n",
+    "\n",
+    "for company, url in companies_urls.items():\n",
+    "    # Load the page\n",
+    "    driver.get(url)\n",
+    "\n",
+    "    # Scroll to the bottom of the page\n",
+    "    last_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
+    "    \n",
+    "    while True:\n",
+    "        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
+    "        time.sleep(6)  # Wait for new content to load\n",
+    "        \n",
+    "        new_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
+    "        if new_height == last_height:\n",
+    "            break\n",
+    "        last_height = new_height\n",
+    "\n",
+    "    # Parse the page content\n",
+    "    soup = BeautifulSoup(driver.page_source, 'html.parser')\n",
+    "\n",
+    "    # Find advertisement details\n",
+    "    ads = soup.find_all('div', class_='xh8yej3')\n",
+    "\n",
+    "    for ad in ads:\n",
+    "        # Extract the ad text\n",
+    "        ad_text = ad.find('div', class_='x6ikm8r x10wlt62').text if ad.find('div', class_='x6ikm8r x10wlt62') else 'N/A'\n",
+    "\n",
+    "        # Extract company name\n",
+    "        company_name = company\n",
+    "\n",
+    "        # Extract status\n",
+    "        ad_status = ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx').text if ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx') else 'N/A'\n",
+    "\n",
+    "        # Extract image or video link\n",
+    "        img_tag = ad.find('img', class_='x1ll5gia x19kjcj4 xh8yej3')\n",
+    "        video_tag = ad.find('video', class_='x1lliihq x5yr21d xh8yej3')\n",
+    "\n",
+    "        if img_tag:\n",
+    "            artwork_link = img_tag['src']\n",
+    "        elif video_tag:\n",
+    "            artwork_link = video_tag['src']\n",
+    "        else:\n",
+    "            artwork_link = 'N/A'\n",
+    "\n",
+    "        ad_details.append({'Company name': company_name, 'Ad Text': ad_text, 'Ad status': ad_status, 'Artwork Link': artwork_link})\n",
+    "\n",
+    "# Close the WebDriver\n",
+    "driver.quit()\n",
+    "\n",
+    "# Convert to pandas DataFrame\n",
+    "df = pd.DataFrame(ad_details)\n",
+    "print(df)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(2668, 4)"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "# Replace 'N/A' strings with np.nan\n",
+    "df.replace('N/A', np.nan, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove rows where all values are NaN\n",
+    "df_all_null_removed = df.dropna(how='all')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "duplicates = df_all_null_removed.duplicated()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Duplicate rows only:\n",
+      "          Company name Ad Text Ad status Artwork Link\n",
+      "1        Airtel India     NaN       NaN          NaN\n",
+      "2        Airtel India     NaN       NaN          NaN\n",
+      "3        Airtel India     NaN       NaN          NaN\n",
+      "4        Airtel India     NaN       NaN          NaN\n",
+      "5        Airtel India     NaN       NaN          NaN\n",
+      "...               ...     ...       ...          ...\n",
+      "2662  T-mobile Polska     NaN    Active          NaN\n",
+      "2663  T-mobile Polska     NaN       NaN          NaN\n",
+      "2664  T-mobile Polska     NaN       NaN          NaN\n",
+      "2666  T-mobile Polska     NaN       NaN          NaN\n",
+      "2667  T-mobile Polska     NaN       NaN          NaN\n",
+      "\n",
+      "[2059 rows x 4 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Filter duplicate rows\n",
+    "duplicate_rows = df_all_null_removed[df_all_null_removed.duplicated()]\n",
+    "\n",
+    "print(\"\\nDuplicate rows only:\\n\", duplicate_rows)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "DataFrame with duplicates removed:\n",
+      "          Company name                                            Ad Text  \\\n",
+      "0        Airtel India                                                NaN   \n",
+      "59       Airtel India  True Stories of Kerala by Airtel0:00 / 0:15AD....   \n",
+      "60       Airtel India  True Stories of Kerala by Airtel0:00 / 0:15AD....   \n",
+      "78       Airtel India  Sometimes there's no right & wrong. This isn't...   \n",
+      "96       Airtel India  Why……. not switch, when you get so many benefi...   \n",
+      "...               ...                                                ...   \n",
+      "2634  T-mobile Polska                               T-MOBILE.PLGet Offer   \n",
+      "2640  T-mobile Polska  Telefon, aplikacja i akcja! 🎬 Zapraszamy do Ci...   \n",
+      "2646  T-mobile Polska  Z Magenta Moments życie smakuje podwójnie. 💗 W...   \n",
+      "2652  T-mobile Polska  Najlepszy moment na zakupy? 🛍️ Ten, kiedy są n...   \n",
+      "2660  T-mobile Polska  Jak to się dzieje: liście spadają, a Internetu...   \n",
+      "\n",
+      "     Ad status                                       Artwork Link  \n",
+      "0          NaN                                                NaN  \n",
+      "59      Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
+      "60      Active  https://video.fcmb1-2.fna.fbcdn.net/v/t42.1790...  \n",
+      "78      Active  https://video.fcmb1-2.fna.fbcdn.net/v/t42.1790...  \n",
+      "96      Active  https://video.fcmb1-2.fna.fbcdn.net/v/t42.1790...  \n",
+      "...        ...                                                ...  \n",
+      "2634    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
+      "2640    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
+      "2646    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
+      "2652    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
+      "2660    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
+      "\n",
+      "[263 rows x 4 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Remove duplicate rows\n",
+    "df_no_duplicates = df_all_null_removed.drop_duplicates(subset=['Artwork Link'])\n",
+    "\n",
+    "print(\"\\nDataFrame with duplicates removed:\\n\", df_no_duplicates)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sqlalchemy import create_engine\n",
+    "\n",
+    "# Create MySQL engine\n",
+    "engine = create_engine('mysql+pymysql://root:Binu1997#$@localhost/research_db')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data uploaded successfully!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Upload DataFrame to SQL\n",
+    "df_no_duplicates.to_sql('ads_table', engine, if_exists='replace', index=False)\n",
+    "\n",
+    "print(\"Data uploaded successfully!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "myenv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}