BinuraYasodya committed on
Commit
6c34694
·
verified ·
1 Parent(s): 7c51e14

Upload 4 files

Browse files

add css and test files as well

Files changed (4) hide show
  1. main.py +73 -0
  2. requirements.txt +11 -0
  3. styles.css +26 -0
  4. test5.ipynb +346 -0
main.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from selenium import webdriver
2
+ from selenium.webdriver.common.by import By
3
+ from selenium.webdriver.chrome.service import Service as ChromeService
4
+ from webdriver_manager.chrome import ChromeDriverManager
5
+ import time
6
+ from bs4 import BeautifulSoup
7
+ import pandas as pd
8
+ import numpy as np
9
+ from sqlalchemy import create_engine
10
+
11
class AdScraper:
    """Collect ad records from Facebook Ad Library pages and store them in SQL.

    The intended call order is ``scrape_ads()`` -> ``process_data()`` ->
    ``upload_data()``.  Scraped records accumulate in ``self.ad_details`` as
    dictionaries keyed by the names listed in ``COLUMNS``.
    """

    # Keys of every record appended by ``parse_ads`` (also the DataFrame columns).
    COLUMNS = ['Company name', 'Ad Text', 'Ad status', 'Artwork Link']

    # NOTE(review): this connection string embeds a password in source control.
    # It is kept only as a backward-compatible fallback; prefer passing
    # ``db_url`` or setting the ADS_DB_URL environment variable, and rotate
    # this credential.
    DEFAULT_DB_URL = 'mysql+pymysql://root:Binu1997#$@localhost/research_db'

    def __init__(self):
        # Selenium driver; created lazily by ``setup_driver``.
        self.driver = None
        # One dict per matched ad container, filled by ``parse_ads``.
        self.ad_details = []
        # Display name -> Ad Library listing URL for each company to scrape.
        self.companies_urls = {
            "Airtel India": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=147351511955143&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
            "Celcom Malaysia": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=103384636066809&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
            "Vodafone UK": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=67884984384&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
            "T-mobile Polska": "https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=166466416745074&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all",
        }

    def setup_driver(self):
        """Start a headless Chrome session and store it on ``self.driver``."""
        options = webdriver.ChromeOptions()
        # The ``options.headless`` setter was deprecated and then removed in
        # Selenium 4; on current releases assigning it is silently ignored and
        # a visible browser window opens.  The command-line flag works on both.
        options.add_argument("--headless=new")
        self.driver = webdriver.Chrome(
            service=ChromeService(ChromeDriverManager().install()),
            options=options,
        )

    def scroll_page(self, pause=6, max_scrolls=None):
        """Scroll to the bottom repeatedly until the page stops growing.

        Args:
            pause: Seconds to wait after each scroll so new content can load.
            max_scrolls: Optional upper bound on the number of scrolls that
                produced growth; ``None`` (the default) keeps scrolling until
                the page height stops changing.
        """
        height_script = "return document.body.scrollHeight"
        last_height = self.driver.execute_script(height_script)
        scrolls = 0
        while max_scrolls is None or scrolls < max_scrolls:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause)
            new_height = self.driver.execute_script(height_script)
            if new_height == last_height:
                # Nothing new was appended, so the bottom has been reached.
                break
            last_height = new_height
            scrolls += 1

    def parse_ads(self, soup, company):
        """Append one record to ``self.ad_details`` per matched container.

        Args:
            soup: Parsed page exposing ``find_all``; each match must expose
                ``find`` (the caller passes a ``BeautifulSoup`` document).
            company: Value stored under ``'Company name'`` in every record.

        Fields whose element is not found are recorded as the string ``'N/A'``.
        """
        for ad in soup.find_all('div', class_='xh8yej3'):
            # Look each element up once and reuse the result.
            text_tag = ad.find('div', class_='x6ikm8r x10wlt62')
            status_tag = ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx')
            img_tag = ad.find('img', class_='x1ll5gia x19kjcj4 xh8yej3')
            video_tag = ad.find('video', class_='x1lliihq x5yr21d xh8yej3')

            # An image takes precedence over a video, as before.
            media_tag = img_tag if img_tag else video_tag
            # ``.get`` avoids a KeyError when the tag carries no ``src``.
            artwork_link = media_tag.get('src', 'N/A') if media_tag else 'N/A'

            self.ad_details.append({
                'Company name': company,
                'Ad Text': text_tag.text if text_tag else 'N/A',
                'Ad status': status_tag.text if status_tag else 'N/A',
                'Artwork Link': artwork_link,
            })

    def scrape_ads(self):
        """Visit every URL in ``companies_urls`` and parse the loaded page."""
        self.setup_driver()
        try:
            for company, url in self.companies_urls.items():
                self.driver.get(url)
                self.scroll_page()
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                self.parse_ads(soup, company)
        finally:
            # Always release the browser, even when a page fails to load/parse.
            self.driver.quit()

    def process_data(self):
        """Return the scraped records as a cleaned ``pandas.DataFrame``.

        ``'N/A'`` placeholders become NaN, rows in which none of the scraped
        fields were found are dropped, and rows are de-duplicated on
        ``'Artwork Link'`` (first occurrence kept; rows without a link count
        as duplicates of each other).

        Returns:
            A DataFrame with the columns in ``COLUMNS``; empty, but with those
            columns, when nothing was scraped.
        """
        # Explicit columns keep the later steps valid for an empty scrape.
        df = pd.DataFrame(self.ad_details, columns=self.COLUMNS)
        df = df.replace('N/A', np.nan)
        # 'Company name' is always populated, so it must be excluded from the
        # emptiness check or no row would ever be dropped.
        scraped_fields = [name for name in self.COLUMNS if name != 'Company name']
        df = df.dropna(how='all', subset=scraped_fields)
        df = df.drop_duplicates(subset=['Artwork Link'])
        return df

    def upload_data(self, df, db_url=None):
        """Write ``df`` to the ``ads_table`` SQL table, replacing its contents.

        Args:
            df: DataFrame to upload.
            db_url: Optional SQLAlchemy connection URL.  When omitted, the
                ``ADS_DB_URL`` environment variable is used if set, otherwise
                ``DEFAULT_DB_URL``.
        """
        import os  # local import so this block does not depend on file-level edits

        url = db_url or os.environ.get('ADS_DB_URL') or self.DEFAULT_DB_URL
        engine = create_engine(url)
        try:
            df.to_sql('ads_table', engine, if_exists='replace', index=False)
        finally:
            # Close pooled connections whether or not the upload succeeded.
            engine.dispose()
        print("Data uploaded successfully!")
67
+
68
if __name__ == "__main__":
    # Run the pipeline end to end: scrape, clean, display, then persist.
    ad_scraper = AdScraper()
    ad_scraper.scrape_ads()
    cleaned_ads = ad_scraper.process_data()
    print("\nDataFrame with duplicates removed:\n", cleaned_ads)
    ad_scraper.upload_data(cleaned_ads)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ requests
2
+ beautifulsoup4
3
+ pandas
4
+ openpyxl
5
+ lxml
6
+ xlwt
7
+ selenium
8
+ sqlalchemy
9
+ pymysql
10
+ cryptography
11
+ streamlit
styles.css ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .main-title {
2
+ font-size: 50px;
3
+ color: #FFA500;
4
+ text-align: center;
5
+ font-weight: 700;
6
+ }
7
+ .heading {
8
+ font-size: 25px;
9
+ color: #FF6347;
10
+ font-weight: 600;
11
+ }
12
+ .summary-text {
13
+ font-size: 18px;
14
+ color: #008080;
15
+ font-weight: 500;
16
+ }
17
+ .company-select {
18
+ font-size: 18px;
19
+ color: #FF4500;
20
+ font-weight: 600;
21
+ }
22
+ .dataframe-style {
23
+ background-color: #F5F5F5;
24
+ border-radius: 10px;
25
+ padding: 10px;
26
+ }
test5.ipynb ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 10,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ " Company name Ad Text \\\n",
13
+ "0 Airtel India N/A \n",
14
+ "1 Airtel India N/A \n",
15
+ "2 Airtel India N/A \n",
16
+ "3 Airtel India N/A \n",
17
+ "4 Airtel India N/A \n",
18
+ "... ... ... \n",
19
+ "2663 T-mobile Polska N/A \n",
20
+ "2664 T-mobile Polska N/A \n",
21
+ "2665 T-mobile Polska Jak to się dzieje: liście spadają, a Internetu... \n",
22
+ "2666 T-mobile Polska N/A \n",
23
+ "2667 T-mobile Polska N/A \n",
24
+ "\n",
25
+ " Ad status Artwork Link \n",
26
+ "0 N/A N/A \n",
27
+ "1 N/A N/A \n",
28
+ "2 N/A N/A \n",
29
+ "3 N/A N/A \n",
30
+ "4 N/A N/A \n",
31
+ "... ... ... \n",
32
+ "2663 N/A N/A \n",
33
+ "2664 N/A N/A \n",
34
+ "2665 N/A https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3... \n",
35
+ "2666 N/A N/A \n",
36
+ "2667 N/A N/A \n",
37
+ "\n",
38
+ "[2668 rows x 4 columns]\n"
39
+ ]
40
+ }
41
+ ],
42
+ "source": [
43
+ "from selenium import webdriver \n",
44
+ "from selenium.webdriver.common.by import By \n",
45
+ "from selenium.webdriver.chrome.service import Service as ChromeService \n",
46
+ "from webdriver_manager.chrome import ChromeDriverManager\n",
47
+ "import time\n",
48
+ "from bs4 import BeautifulSoup \n",
49
+ "import pandas as pd\n",
50
+ "\n",
51
+ "# instantiate options \n",
52
+ "options = webdriver.ChromeOptions() \n",
53
+ "\n",
54
+ "# run browser in headless mode \n",
55
+ "options.headless = True \n",
56
+ "\n",
57
+ "# instantiate driver \n",
58
+ "driver = webdriver.Chrome(service=ChromeService( \n",
59
+ " ChromeDriverManager().install()), options=options) \n",
60
+ "\n",
61
+ "# List of companies and their URLs\n",
62
+ "companies_urls = {\n",
63
+ " \"Airtel India\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=147351511955143&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\",\n",
64
+ " \"Celcom Malaysia\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=103384636066809&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\",\n",
65
+ " \"Vodafone UK\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=67884984384&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\",\n",
66
+ " \"T-mobile Polska\": \"https://www.facebook.com/ads/library/?active_status=active&ad_type=all&country=ALL&view_all_page_id=166466416745074&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=page&media_type=all\"\n",
67
+ "}\n",
68
+ "\n",
69
+ "ad_details = []\n",
70
+ "\n",
71
+ "for company, url in companies_urls.items():\n",
72
+ " # Load the page\n",
73
+ " driver.get(url)\n",
74
+ "\n",
75
+ " # Scroll to the bottom of the page\n",
76
+ " last_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
77
+ " \n",
78
+ " while True:\n",
79
+ " driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
80
+ " time.sleep(6) # Wait for new content to load\n",
81
+ " \n",
82
+ " new_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
83
+ " if new_height == last_height:\n",
84
+ " break\n",
85
+ " last_height = new_height\n",
86
+ "\n",
87
+ " # Parse the page content\n",
88
+ " soup = BeautifulSoup(driver.page_source, 'html.parser')\n",
89
+ "\n",
90
+ " # Find advertisement details\n",
91
+ " ads = soup.find_all('div', class_='xh8yej3')\n",
92
+ "\n",
93
+ " for ad in ads:\n",
94
+ " # Extract the ad text\n",
95
+ " ad_text = ad.find('div', class_='x6ikm8r x10wlt62').text if ad.find('div', class_='x6ikm8r x10wlt62') else 'N/A'\n",
96
+ "\n",
97
+ " # Extract company name\n",
98
+ " company_name = company\n",
99
+ "\n",
100
+ " # Extract status\n",
101
+ " ad_status = ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx').text if ad.find('span', class_='x8t9es0 xw23nyj xo1l8bm x63nzvj x108nfp6 xq9mrsl x1h4wwuj xeuugli x1i64zmx') else 'N/A'\n",
102
+ "\n",
103
+ " # Extract image or video link\n",
104
+ " img_tag = ad.find('img', class_='x1ll5gia x19kjcj4 xh8yej3')\n",
105
+ " video_tag = ad.find('video', class_='x1lliihq x5yr21d xh8yej3')\n",
106
+ "\n",
107
+ " if img_tag:\n",
108
+ " artwork_link = img_tag['src']\n",
109
+ " elif video_tag:\n",
110
+ " artwork_link = video_tag['src']\n",
111
+ " else:\n",
112
+ " artwork_link = 'N/A'\n",
113
+ "\n",
114
+ " ad_details.append({'Company name': company_name, 'Ad Text': ad_text, 'Ad status': ad_status, 'Artwork Link': artwork_link})\n",
115
+ "\n",
116
+ "# Close the WebDriver\n",
117
+ "driver.quit()\n",
118
+ "\n",
119
+ "# Convert to pandas DataFrame\n",
120
+ "df = pd.DataFrame(ad_details)\n",
121
+ "print(df)\n"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": 11,
127
+ "metadata": {},
128
+ "outputs": [
129
+ {
130
+ "data": {
131
+ "text/plain": [
132
+ "(2668, 4)"
133
+ ]
134
+ },
135
+ "execution_count": 11,
136
+ "metadata": {},
137
+ "output_type": "execute_result"
138
+ }
139
+ ],
140
+ "source": [
141
+ "df.shape"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": 12,
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
150
+ "import numpy as np\n",
151
+ "# Replace 'N/A' strings with np.nan\n",
152
+ "df.replace('N/A', np.nan, inplace=True)"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 13,
158
+ "metadata": {},
159
+ "outputs": [],
160
+ "source": [
161
+ "# Remove rows where all values are NaN\n",
162
+ "df_all_null_removed = df.dropna(how='all')"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": 14,
168
+ "metadata": {},
169
+ "outputs": [],
170
+ "source": [
171
+ "duplicates = df_all_null_removed.duplicated()"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 15,
177
+ "metadata": {},
178
+ "outputs": [
179
+ {
180
+ "name": "stdout",
181
+ "output_type": "stream",
182
+ "text": [
183
+ "\n",
184
+ "Duplicate rows only:\n",
185
+ " Company name Ad Text Ad status Artwork Link\n",
186
+ "1 Airtel India NaN NaN NaN\n",
187
+ "2 Airtel India NaN NaN NaN\n",
188
+ "3 Airtel India NaN NaN NaN\n",
189
+ "4 Airtel India NaN NaN NaN\n",
190
+ "5 Airtel India NaN NaN NaN\n",
191
+ "... ... ... ... ...\n",
192
+ "2662 T-mobile Polska NaN Active NaN\n",
193
+ "2663 T-mobile Polska NaN NaN NaN\n",
194
+ "2664 T-mobile Polska NaN NaN NaN\n",
195
+ "2666 T-mobile Polska NaN NaN NaN\n",
196
+ "2667 T-mobile Polska NaN NaN NaN\n",
197
+ "\n",
198
+ "[2059 rows x 4 columns]\n"
199
+ ]
200
+ }
201
+ ],
202
+ "source": [
203
+ "# Filter duplicate rows\n",
204
+ "duplicate_rows = df_all_null_removed[df_all_null_removed.duplicated()]\n",
205
+ "\n",
206
+ "print(\"\\nDuplicate rows only:\\n\", duplicate_rows)"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": 16,
212
+ "metadata": {},
213
+ "outputs": [
214
+ {
215
+ "name": "stdout",
216
+ "output_type": "stream",
217
+ "text": [
218
+ "\n",
219
+ "DataFrame with duplicates removed:\n",
220
+ " Company name Ad Text \\\n",
221
+ "0 Airtel India NaN \n",
222
+ "59 Airtel India True Stories of Kerala by Airtel0:00 / 0:15AD.... \n",
223
+ "60 Airtel India True Stories of Kerala by Airtel0:00 / 0:15AD.... \n",
224
+ "78 Airtel India Sometimes there's no right & wrong. This isn't... \n",
225
+ "96 Airtel India Why……. not switch, when you get so many benefi... \n",
226
+ "... ... ... \n",
227
+ "2634 T-mobile Polska T-MOBILE.PLGet Offer \n",
228
+ "2640 T-mobile Polska Telefon, aplikacja i akcja! 🎬 Zapraszamy do Ci... \n",
229
+ "2646 T-mobile Polska Z Magenta Moments życie smakuje podwójnie. 💗 W... \n",
230
+ "2652 T-mobile Polska Najlepszy moment na zakupy? 🛍️ Ten, kiedy są n... \n",
231
+ "2660 T-mobile Polska Jak to się dzieje: liście spadają, a Internetu... \n",
232
+ "\n",
233
+ " Ad status Artwork Link \n",
234
+ "0 NaN NaN \n",
235
+ "59 Active https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3... \n",
236
+ "60 Active https://video.fcmb1-2.fna.fbcdn.net/v/t42.1790... \n",
237
+ "78 Active https://video.fcmb1-2.fna.fbcdn.net/v/t42.1790... \n",
238
+ "96 Active https://video.fcmb1-2.fna.fbcdn.net/v/t42.1790... \n",
239
+ "... ... ... \n",
240
+ "2634 Active https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3... \n",
241
+ "2640 Active https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3... \n",
242
+ "2646 Active https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3... \n",
243
+ "2652 Active https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3... \n",
244
+ "2660 Active https://scontent.fcmb1-2.fna.fbcdn.net/v/t39.3... \n",
245
+ "\n",
246
+ "[263 rows x 4 columns]\n"
247
+ ]
248
+ }
249
+ ],
250
+ "source": [
251
+ "# Remove duplicate rows\n",
252
+ "df_no_duplicates = df_all_null_removed.drop_duplicates(subset=['Artwork Link'])\n",
253
+ "\n",
254
+ "print(\"\\nDataFrame with duplicates removed:\\n\", df_no_duplicates)"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": 17,
260
+ "metadata": {},
261
+ "outputs": [],
262
+ "source": [
263
+ "from sqlalchemy import create_engine\n",
264
+ "\n",
265
+ "# Create MySQL engine\n",
266
+ "engine = create_engine('mysql+pymysql://root:Binu1997#$@localhost/research_db')"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": 18,
272
+ "metadata": {},
273
+ "outputs": [
274
+ {
275
+ "name": "stdout",
276
+ "output_type": "stream",
277
+ "text": [
278
+ "Data uploaded successfully!\n"
279
+ ]
280
+ }
281
+ ],
282
+ "source": [
283
+ "# Upload DataFrame to SQL\n",
284
+ "df_no_duplicates.to_sql('ads_table', engine, if_exists='replace', index=False)\n",
285
+ "\n",
286
+ "print(\"Data uploaded successfully!\")"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "execution_count": null,
292
+ "metadata": {},
293
+ "outputs": [],
294
+ "source": []
295
+ },
296
+ {
297
+ "cell_type": "code",
298
+ "execution_count": null,
299
+ "metadata": {},
300
+ "outputs": [],
301
+ "source": []
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "execution_count": null,
306
+ "metadata": {},
307
+ "outputs": [],
308
+ "source": []
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": null,
313
+ "metadata": {},
314
+ "outputs": [],
315
+ "source": []
316
+ },
317
+ {
318
+ "cell_type": "code",
319
+ "execution_count": null,
320
+ "metadata": {},
321
+ "outputs": [],
322
+ "source": []
323
+ }
324
+ ],
325
+ "metadata": {
326
+ "kernelspec": {
327
+ "display_name": "myenv",
328
+ "language": "python",
329
+ "name": "python3"
330
+ },
331
+ "language_info": {
332
+ "codemirror_mode": {
333
+ "name": "ipython",
334
+ "version": 3
335
+ },
336
+ "file_extension": ".py",
337
+ "mimetype": "text/x-python",
338
+ "name": "python",
339
+ "nbconvert_exporter": "python",
340
+ "pygments_lexer": "ipython3",
341
+ "version": "3.9.0"
342
+ }
343
+ },
344
+ "nbformat": 4,
345
+ "nbformat_minor": 2
346
+ }