Commit 1582538 (verified) · Parent(s): dfc708a
gopiashokan committed

Upload app.py

Files changed (1):
  1. app.py +186 -148
app.py CHANGED
@@ -12,6 +12,7 @@ from langchain.chat_models import ChatOpenAI
 from langchain.chains.question_answering import load_qa_chain
 from selenium import webdriver
 from selenium.webdriver.common.by import By
+from selenium.common.exceptions import NoSuchElementException
 import warnings
 warnings.filterwarnings('ignore')
 
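The new import matters for the rewritten scraper below: Selenium's find_element raises selenium.common.exceptions.NoSuchElementException when nothing matches the selector, and scrap_job_description catches it to retry the "Show more" click instead of aborting the whole scrape. A minimal sketch of the pattern (the URL and selector here are placeholders, not taken from this commit):

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.common.exceptions import NoSuchElementException

    driver = webdriver.Chrome()
    driver.get('https://example.com')  # placeholder URL
    try:
        # Click a button that may not exist on the page
        driver.find_element(by=By.CSS_SELECTOR, value='button.maybe-missing').click()
    except NoSuchElementException:
        pass  # element absent: carry on instead of crashing
    driver.quit()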
@@ -121,165 +122,238 @@ class resume_analyzer:
         return response
 
 
-class linkedin_scrap:
+class linkedin_scraper:
 
-    def linkedin_open_scrolldown(driver, user_job_title):
+    def webdriver_setup():
+
+        options = webdriver.ChromeOptions()
+        options.add_argument('--headless')
+        options.add_argument('--no-sandbox')
+        options.add_argument('--disable-dev-shm-usage')
+
+        driver = webdriver.Chrome(options=options)
+        driver.maximize_window()
+        return driver
+
+
+    def get_userinput():
+
+        add_vertical_space(2)
+        with st.form(key='linkedin_scrap'):
+
+            add_vertical_space(1)
+            col1, col2 = st.columns([0.7, 0.3], gap='medium')
+            with col1:
+                job_title = st.text_input(label='Job Title')
+                job_title = job_title.split()
+            with col2:
+                job_count = st.number_input(label='Job Count', min_value=1, value=1, step=1)
+
+            # Submit Button
+            add_vertical_space(1)
+            submit = st.form_submit_button(label='Submit')
+            add_vertical_space(1)
+
+        return job_title, job_count, submit
+
+
+    def build_url(job_title):
 
         b = []
-        for i in user_job_title:
+        for i in job_title:
             x = i.split()
             y = '%20'.join(x)
             b.append(y)
-        job_title = '%2C%20'.join(b)
 
+        job_title = '%2C%20'.join(b)
         link = f"https://in.linkedin.com/jobs/search?keywords={job_title}&location=India&locationId=&geoId=102713980&f_TPR=r604800&position=1&pageNum=0"
 
+        return link
+
+
+    def link_open_scrolldown(driver, link, job_count):
+
+        # Open the Link in LinkedIn
         driver.get(link)
         driver.implicitly_wait(10)
 
-        for i in range(0,3):
+        # Scroll Down the Page
+        for i in range(0, job_count):
             driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-            time.sleep(5)
+            driver.implicitly_wait(5)
+
+            # Click on the See More Jobs Button if Present
             try:
                 x = driver.find_element(by=By.CSS_SELECTOR, value="button[aria-label='See more jobs']").click()
-                time.sleep(3)
+                driver.implicitly_wait(5)
             except:
                 pass
 
 
-    def company_name(driver):
-
-        company = driver.find_elements(by=By.CSS_SELECTOR, value='h4[class="base-search-card__subtitle"]')
-
-        company_name = []
-
-        for i in company:
-            company_name.append(i.text)
-
-        return company_name
-
-
-    def company_location(driver):
-
-        location = driver.find_elements(by=By.CSS_SELECTOR, value='span[class="job-search-card__location"]')
-
-        company_location = []
-
-        for i in location:
-            company_location.append(i.text)
-
-        return company_location
-
-
-    def job_title(driver):
-
+    def job_title_filter(scrap_job_title, user_job_title_input):
+
+        # Convert the User Job Titles to Lower Case and Split into Words
+        user_input = []
+        for i in [i.lower() for i in user_job_title_input]:
+            user_input.extend(i.split())
+
+        # Convert the Scraped Job Title to Lower Case and Split into Words
+        scrap_title = [i.lower() for i in scrap_job_title.split()]
+
+        # Identify the Words Present in Both Lists
+        matched_words = list(set(user_input).intersection(set(scrap_title)))
+
+        # Keep the Job Title if More Than 1 Word Matches, Else Return NaN
+        return scrap_job_title if len(matched_words) > 1 else np.nan
+
+
+    def scrap_company_data(driver, job_title_input, job_count):
+
+        # Scraping the Company Data
+        company = driver.find_elements(by=By.CSS_SELECTOR, value='h4[class="base-search-card__subtitle"]')
+        company_name = [i.text for i in company]
+
+        location = driver.find_elements(by=By.CSS_SELECTOR, value='span[class="job-search-card__location"]')
+        company_location = [i.text for i in location]
+
         title = driver.find_elements(by=By.CSS_SELECTOR, value='h3[class="base-search-card__title"]')
-
-        job_title = []
-
-        for i in title:
-            job_title.append(i.text)
-
-        return job_title
-
-
-    def job_url(driver):
-
+        job_title = [i.text for i in title]
+
         url = driver.find_elements(by=By.XPATH, value='//a[contains(@href, "/jobs/")]')
-
-        url_list = [i.get_attribute('href') for i in url]
-
-        job_url = []
-
-        for url in url_list:
-            job_url.append(url)
-
-        return job_url
-
-
-    def job_title_filter(x, user_job_title):
-
-        s = [i.lower() for i in user_job_title]
-        suggestion = []
-        for i in s:
-            suggestion.extend(i.split())
-
-        s = x.split()
-        a = [i.lower() for i in s]
-
-        intersection = list(set(suggestion).intersection(set(a)))
-        return x if len(intersection) > 1 else np.nan
-
-
-    def get_description(driver, link):
-
-        driver.get(link)
-        time.sleep(3)
-
-        driver.find_element(by=By.CSS_SELECTOR,
-                            value='button[data-tracking-control-name="public_jobs_show-more-html-btn"]').click()
-        time.sleep(2)
-
-        description = driver.find_elements(by=By.CSS_SELECTOR,
-                                           value='div[class="show-more-less-html__markup relative overflow-hidden"]')
-        driver.implicitly_wait(4)
-
-        for j in description:
-            return j.text
-
-
-    def data_scrap(driver, user_job_title):
-
-        # combine the all data to single dataframe
-        df = pd.DataFrame(linkedin_scrap.company_name(driver), columns=['Company Name'])
-        df['Job Title'] = pd.DataFrame(linkedin_scrap.job_title(driver))
-        df['Location'] = pd.DataFrame(linkedin_scrap.company_location(driver))
-        df['Website URL'] = pd.DataFrame(linkedin_scrap.job_url(driver))
-
-        # job title filter based on user input
-        df['Job Title'] = df['Job Title'].apply(lambda x: linkedin_scrap.job_title_filter(x, user_job_title))
-        df = df.dropna()
-        df.reset_index(drop=True, inplace=True)
-        df = df.iloc[:10, :]
-
-        # make a list after filter
+        website_url = [i.get_attribute('href') for i in url]
+
+        # Combine All the Data into a Single Dataframe
+        df = pd.DataFrame(company_name, columns=['Company Name'])
+        df['Job Title'] = pd.DataFrame(job_title)
+        df['Location'] = pd.DataFrame(company_location)
+        df['Website URL'] = pd.DataFrame(website_url)
+
+        # Filter the Job Titles Based on the User Input
+        df['Job Title'] = df['Job Title'].apply(lambda x: linkedin_scraper.job_title_filter(x, job_title_input))
+
+        # Drop Null Values and Reset the Index
+        df = df.dropna()
+        df.reset_index(drop=True, inplace=True)
+
+        # Keep Only the Requested Number of Jobs
+        df = df.iloc[:job_count, :]
+
+        return df
+
+
+    def scrap_job_description(driver, df):
+
+        # Get the URLs into a List
         website_url = df['Website URL'].tolist()
 
-        # add job description in df
+        # Scrap the Job Description
         job_description = []
 
         for i in range(0, len(website_url)):
-            link = website_url[i]
-            data = linkedin_scrap.get_description(driver, link)
-            if data is not None and len(data.strip()) > 0:
+            # Open the URL
+            driver.get(website_url[i])
+            driver.implicitly_wait(10)
+            time.sleep(1)
+
+            try:
+                # Click on the Show More Button
+                driver.find_element(by=By.CSS_SELECTOR, value='button[data-tracking-control-name="public_jobs_show-more-html-btn"]').click()
+                driver.implicitly_wait(10)
+                time.sleep(1)
+
+            except NoSuchElementException:
+                # Reload the URL and Retry the Click Once
+                driver.get(website_url[i])
+                driver.implicitly_wait(10)
+                time.sleep(1)
+
+                # Click on the Show More Button
+                driver.find_element(by=By.CSS_SELECTOR, value='button[data-tracking-control-name="public_jobs_show-more-html-btn"]').click()
+                driver.implicitly_wait(10)
+                time.sleep(1)
+
+            # Get the Job Description (guard against an empty result list)
+            description = driver.find_elements(by=By.CSS_SELECTOR, value='div[class="show-more-less-html__markup relative overflow-hidden"]')
+            driver.implicitly_wait(10)
+            texts = [i.text for i in description]
+            data = texts[0] if texts else ''
+
+            if len(data.strip()) > 0:
                 job_description.append(data)
             else:
                 job_description.append('Description Not Available')
 
+        # Add the Job Descriptions to the Dataframe
         df['Job Description'] = pd.DataFrame(job_description, columns=['Description'])
         df = df.dropna()
         df.reset_index(drop=True, inplace=True)
         return df
 
 
-    def main(user_job_title):
-
-        options = webdriver.ChromeOptions()
-        options.add_argument('--headless')
-        options.add_argument('--no-sandbox')
-        options.add_argument('--disable-dev-shm-usage')
-
-        driver = webdriver.Chrome(options=options)
-        driver.maximize_window()
-
-        linkedin_scrap.linkedin_open_scrolldown(driver, user_job_title)
-
-        final_df = linkedin_scrap.data_scrap(driver, user_job_title)
-        driver.quit()
-
-        return final_df
+    def display_data_userinterface(df_final):
+
+        # Display the Data in the User Interface
+        add_vertical_space(1)
+        for i in range(0, len(df_final)):
+
+            st.write(f"Company Name : {df_final.iloc[i,0]}")
+            st.write(f"Job Title : {df_final.iloc[i,1]}")
+            st.write(f"Location : {df_final.iloc[i,2]}")
+            st.write(f"Website URL : {df_final.iloc[i,3]}")
+
+            with st.expander(label='Job Description'):
+                st.write(df_final.iloc[i, 4])
+            add_vertical_space(3)
+
+
+    def main():
+
+        # Initially Set Driver to None so the Finally Block Can Check It
+        driver = None
+
+        try:
+            job_title_input, job_count, submit = linkedin_scraper.get_userinput()
+            add_vertical_space(2)
+
+            if submit:
+                if job_title_input != []:
+
+                    with st.spinner('Webdriver Setup Initializing...'):
+                        driver = linkedin_scraper.webdriver_setup()
+
+                    with st.spinner('Build URL and Open Link...'):
+
+                        # Build the URL Based on the User Job Title Input
+                        link = linkedin_scraper.build_url(job_title_input)
+
+                        # Open the Link in LinkedIn and Scroll Down the Page
+                        linkedin_scraper.link_open_scrolldown(driver, link, job_count)
+
+                    with st.spinner('Scraping Company Data...'):
+                        df = linkedin_scraper.scrap_company_data(driver, job_title_input, job_count)
+
+                    with st.spinner('Scraping Job Description Data...'):
+                        df_final = linkedin_scraper.scrap_job_description(driver, df)
+
+                    # Display the Data in the User Interface
+                    linkedin_scraper.display_data_userinterface(df_final)
+
+                # If the User Clicks Submit and the Job Title is Empty
+                elif job_title_input == []:
+                    st.markdown(f'<h5 style="text-align: center;color: orange;">Job Title is Empty</h5>',
+                                unsafe_allow_html=True)
+
+        except Exception as e:
+            add_vertical_space(2)
+            st.markdown(f'<h5 style="text-align: center;color: orange;">{e}</h5>', unsafe_allow_html=True)
+
+        finally:
+            if driver:
+                driver.quit()
 
 
+# Streamlit Configuration Setup
 streamlit_config()
 add_vertical_space(1)
 
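The two pure-Python helpers introduced above, build_url and job_title_filter, can be exercised without a browser or Streamlit. A standalone sketch of the same logic, with a trimmed search URL and illustrative sample titles (neither is taken from the commit):

    import numpy as np

    # Same encoding as linkedin_scraper.build_url: '%20' joins the words of
    # each title, '%2C%20' (", ") joins multiple titles.
    def build_url(job_title):
        b = ['%20'.join(i.split()) for i in job_title]
        return f"https://in.linkedin.com/jobs/search?keywords={'%2C%20'.join(b)}&location=India"

    # Same rule as linkedin_scraper.job_title_filter: keep a scraped title
    # only if it shares more than one word with the user's input.
    def job_title_filter(scrap_job_title, user_job_title_input):
        user_input = []
        for i in [i.lower() for i in user_job_title_input]:
            user_input.extend(i.split())
        scrap_title = [i.lower() for i in scrap_job_title.split()]
        return scrap_job_title if len(set(user_input) & set(scrap_title)) > 1 else np.nan

    print(build_url(['Data Scientist']))
    # https://in.linkedin.com/jobs/search?keywords=Data%20Scientist&location=India
    print(job_title_filter('Senior Data Scientist', ['Data Scientist']))  # kept: 2 shared words
    print(job_title_filter('Data Engineer', ['Data Scientist']))          # nan: 1 shared word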
@@ -310,9 +384,8 @@ if option == 'Summary':
             st.write(result_summary)
 
     except Exception as e:
-        col1, col2 = st.columns(2)
-        with col1:
-            st.warning(e)
+        add_vertical_space(2)
+        st.markdown(f'<h5 style="text-align: center;color: orange;">{e}</h5>', unsafe_allow_html=True)
 
 
 elif option == 'Strength':
@@ -337,9 +410,8 @@ elif option == 'Strength':
             st.write(result_strength)
 
     except Exception as e:
-        col1, col2 = st.columns(2)
-        with col1:
-            st.warning(e)
+        add_vertical_space(2)
+        st.markdown(f'<h5 style="text-align: center;color: orange;">{e}</h5>', unsafe_allow_html=True)
 
 
 elif option == 'Weakness':
@@ -364,9 +436,8 @@ elif option == 'Weakness':
             st.write(result_weakness)
 
     except Exception as e:
-        col1, col2 = st.columns(2)
-        with col1:
-            st.warning(e)
+        add_vertical_space(2)
+        st.markdown(f'<h5 style="text-align: center;color: orange;">{e}</h5>', unsafe_allow_html=True)
 
 
 elif option == 'Job Titles':
@@ -390,47 +461,14 @@ elif option == 'Job Titles':
             st.write(result_suggestion)
 
     except Exception as e:
-        col1, col2 = st.columns(2)
-        with col1:
-            st.warning(e)
+        add_vertical_space(2)
+        st.markdown(f'<h5 style="text-align: center;color: orange;">{e}</h5>', unsafe_allow_html=True)
 
 
 elif option == 'Linkedin Jobs':
 
-    try:
-        # get user input of job title
-        user_input_job_title = st.text_input(label='Enter Job Titles (with comma separated):')
-        submit = st.button('Submit')
-
-        if submit and len(user_input_job_title) > 0:
-
-            user_job_title = user_input_job_title.split(',')
-
-            df = linkedin_scrap.main(user_job_title)
-
-            l = len(df['Company Name'])
-            for i in range(0, l):
-                st.write(f"Company Name : {df.iloc[i,0]}")
-                st.write(f"Job Title : {df.iloc[i,1]}")
-                st.write(f"Location : {df.iloc[i,2]}")
-                st.write(f"Website URL : {df.iloc[i,3]}")
-                with st.expander(label='Job Desription'):
-                    st.write(df.iloc[i, 4])
-                st.write('')
-                st.write('')
-
-        elif submit and len(user_input_job_title) == 0:
-            col1, col2 = st.columns(2)
-            with col1:
-                st.info('Please Enter the Job Titles')
-
-    except:
-        st.write('')
-        st.info("This feature is currently not working in the deployed Streamlit application due to a 'selenium.common.exceptions.WebDriverException' error.")
-        st.write('')
-
-        st.write(
-            "Please use the local Streamlit application for a smooth experience: [http://localhost:8501](http://localhost:8501)")
+    add_vertical_space(2)
+    linkedin_scraper.main()
 
 
 elif option == 'Exit':
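For reference, the new class can also be driven from a plain script, skipping the Streamlit widgets (get_userinput, display_data_userinterface). This is a hypothetical sketch, assuming app is importable and its module-level Streamlit calls are tolerable in the environment; the function names and arguments follow the commit:

    from app import linkedin_scraper

    job_title_input = ['Data', 'Scientist']  # get_userinput() returns whitespace-split words
    driver = linkedin_scraper.webdriver_setup()
    try:
        link = linkedin_scraper.build_url(job_title_input)
        linkedin_scraper.link_open_scrolldown(driver, link, 2)
        df = linkedin_scraper.scrap_company_data(driver, job_title_input, 2)
        df = linkedin_scraper.scrap_job_description(driver, df)
        print(df[['Company Name', 'Job Title', 'Location']])
    finally:
        driver.quit()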
 