File size: 12,784 Bytes
6c34694
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         Company name                                            Ad Text  \\\n",
      "0        Airtel India                                                N/A   \n",
      "1        Airtel India                                                N/A   \n",
      "2        Airtel India                                                N/A   \n",
      "3        Airtel India                                                N/A   \n",
      "4        Airtel India                                                N/A   \n",
      "...               ...                                                ...   \n",
      "2663  T-mobile Polska                                                N/A   \n",
      "2664  T-mobile Polska                                                N/A   \n",
      "2665  T-mobile Polska  Jak to się dzieje: liście spadają, a Internetu...   \n",
      "2666  T-mobile Polska                                                N/A   \n",
      "2667  T-mobile Polska                                                N/A   \n",
      "\n",
      "     Ad status                                       Artwork Link  \n",
      "0          N/A                                                N/A  \n",
      "1          N/A                                                N/A  \n",
      "2          N/A                                                N/A  \n",
      "3          N/A                                                N/A  \n",
      "4          N/A                                                N/A  \n",
      "...        ...                                                ...  \n",
      "2663       N/A                                                N/A  \n",
      "2664       N/A                                                N/A  \n",
      "2665       N/A  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "2666       N/A                                                N/A  \n",
      "2667       N/A                                                N/A  \n",
      "\n",
      "[2668 rows x 4 columns]\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver \n",
    "from selenium.webdriver.common.by import By \n",
    "from selenium.webdriver.chrome.service import Service as ChromeService \n",
    "from webdriver_manager.chrome import ChromeDriverManager\n",
    "import time\n",
    "from bs4 import BeautifulSoup \n",
    "import pandas as pd\n",
    "\n",
    "# instantiate options \n",
    "options = webdriver.ChromeOptions() \n",
    "\n",
    "# run browser in headless mode \n",
    "options.headless = True \n",
    "\n",
    "# instantiate driver \n",
    "driver = webdriver.Chrome(service=ChromeService( \n",
    "    ChromeDriverManager().install()), options=options) \n",
    "\n",
    "# List of companies and their URLs\n",
    "companies_urls = {\n",
    "    \"Airtel India\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=147351511955143&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\",\n",
    "    \"Celcom Malaysia\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=103384636066809&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\",\n",
    "    \"Vodafone UK\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=67884984384&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\",\n",
    "    \"T-mobile Polska\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=166466416745074&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\"\n",
    "}\n",
    "\n",
    "ad_details = []\n",
    "\n",
    "for company, url in companies_urls.items():\n",
    "    # Load the page\n",
    "    driver.get(url)\n",
    "\n",
    "    # Scroll to the bottom of the page\n",
    "    last_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
    "    \n",
    "    while True:\n",
    "        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
    "        time.sleep(6)  # Wait for new content to load\n",
    "        \n",
    "        new_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
    "        if new_height == last_height:\n",
    "            break\n",
    "        last_height = new_height\n",
    "\n",
    "    # Parse the page content\n",
    "    soup = BeautifulSoup(driver.page_source, 'html.parser')\n",
    "\n",
    "    # Find advertisement details\n",
    "    ads = soup.find_all('div', class_='xh8yej3')\n",
    "\n",
    "    for ad in ads:\n",
    "        # Extract the ad text\n",
    "        ad_text = ad.find('div', class_='x6ikm8r x10wlt62').text if ad.find('div', class_='x6ikm8r x10wlt62') else 'N/A'\n",
    "\n",
    "        # Extract company name\n",
    "        company_name = company\n",
    "\n",
    "        # Extract status\n",
    "        ad_status = ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx').text if ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx') else 'N/A'\n",
    "\n",
    "        # Extract image or video link\n",
    "        img_tag = ad.find('img', class_='x1ll5gia x19kjcj4 xh8yej3')\n",
    "        video_tag = ad.find('video', class_='x1lliihq x5yr21d xh8yej3')\n",
    "\n",
    "        if img_tag:\n",
    "            artwork_link = img_tag['src']\n",
    "        elif video_tag:\n",
    "            artwork_link = video_tag['src']\n",
    "        else:\n",
    "            artwork_link = 'N/A'\n",
    "\n",
    "        ad_details.append({'Company name': company_name, 'Ad Text': ad_text, 'Ad status': ad_status, 'Artwork Link': artwork_link})\n",
    "\n",
    "# Close the WebDriver\n",
    "driver.quit()\n",
    "\n",
    "# Convert to pandas DataFrame\n",
    "df = pd.DataFrame(ad_details)\n",
    "print(df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2668, 4)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "# Replace 'N/A' strings with np.nan\n",
    "df.replace('N/A', np.nan, inplace=True)"
   ]
  },
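  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop rows in which every column is NaN. 'Company name' is always filled in by the scraper,\n",
    "# so as written this removes nothing; pass subset=[...] to dropna to ignore that column.\n",
    "df_all_null_removed = df.dropna(how='all')"
   ]
  },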
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "duplicates = df_all_null_removed.duplicated()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Duplicate rows only:\n",
      "          Company name Ad Text Ad status Artwork Link\n",
      "1        Airtel India     NaN       NaN          NaN\n",
      "2        Airtel India     NaN       NaN          NaN\n",
      "3        Airtel India     NaN       NaN          NaN\n",
      "4        Airtel India     NaN       NaN          NaN\n",
      "5        Airtel India     NaN       NaN          NaN\n",
      "...               ...     ...       ...          ...\n",
      "2662  T-mobile Polska     NaN    Active          NaN\n",
      "2663  T-mobile Polska     NaN       NaN          NaN\n",
      "2664  T-mobile Polska     NaN       NaN          NaN\n",
      "2666  T-mobile Polska     NaN       NaN          NaN\n",
      "2667  T-mobile Polska     NaN       NaN          NaN\n",
      "\n",
      "[2059 rows x 4 columns]\n"
     ]
    }
   ],
   "source": [
    "# Select the rows that exactly repeat an earlier row (first occurrences are not included)\n",
    "duplicate_rows = df_all_null_removed[df_all_null_removed.duplicated()]\n",
    "\n",
    "print(\"\\nDuplicate rows only:\\n\", duplicate_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "DataFrame with duplicates removed:\n",
      "          Company name                                            Ad Text  \\\n",
      "0        Airtel India                                                NaN   \n",
      "59       Airtel India  True Stories of Kerala by Airtel0:00 / 0:15AD....   \n",
      "60       Airtel India  True Stories of Kerala by Airtel0:00 / 0:15AD....   \n",
      "78       Airtel India  Sometimes there's no right & wrong. This isn't...   \n",
      "96       Airtel India  Why……. not switch, when you get so many benefi...   \n",
      "...               ...                                                ...   \n",
      "2634  T-mobile Polska                               T-MOBILE.PLGet Offer   \n",
      "2640  T-mobile Polska  Telefon, aplikacja i akcja! 🎬 Zapraszamy do Ci...   \n",
      "2646  T-mobile Polska  Z Magenta Moments życie smakuje podwójnie. 💗 W...   \n",
      "2652  T-mobile Polska  Najlepszy moment na zakupy? 🛍️ Ten, kiedy są n...   \n",
      "2660  T-mobile Polska  Jak to się dzieje: liście spadają, a Internetu...   \n",
      "\n",
      "     Ad status                                       Artwork Link  \n",
      "0          NaN                                                NaN  \n",
      "59      Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "60      Active  https://video.fcmb1-2.fna.fbcdn.net/v/t42.1790...  \n",
      "78      Active  https://video.fcmb1-2.fna.fbcdn.net/v/t42.1790...  \n",
      "96      Active  https://video.fcmb1-2.fna.fbcdn.net/v/t42.1790...  \n",
      "...        ...                                                ...  \n",
      "2634    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "2640    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "2646    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "2652    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "2660    Active  https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3...  \n",
      "\n",
      "[263 rows x 4 columns]\n"
     ]
    }
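   ],
   "source": [
    "# Keep only the first row for each distinct 'Artwork Link'; rows with a missing link\n",
    "# are treated as one group, so a single NaN-link row survives\n",
    "df_no_duplicates = df_all_null_removed.drop_duplicates(subset=['Artwork Link'])\n",
    "\n",
    "print(\"\\nDataFrame with duplicates removed:\\n\", df_no_duplicates)"
   ]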
  },
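  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sqlalchemy import create_engine\n",
    "\n",
    "# Create the SQLAlchemy engine for the local MySQL database 'research_db' (PyMySQL driver).\n",
    "# NOTE: the credentials are hard-coded in the URL below; load them from an environment\n",
    "# variable or a config file before sharing or committing this notebook.\n",
    "engine = create_engine('mysql+pymysql://root:Binu1997#$@localhost/research_db')"
   ]
  },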
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data uploaded successfully!\n"
     ]
    }
   ],
   "source": [
    "# Write the de-duplicated DataFrame to the 'ads_table' table, replacing the table if it already exists\n",
    "df_no_duplicates.to_sql('ads_table', engine, if_exists='replace', index=False)\n",
    "\n",
    "print(\"Data uploaded successfully!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}