Niharmahesh committed
Commit 7d0b74c · verified · 1 Parent(s): 2673151

Update pages/test.py

Files changed (1): pages/test.py (+117 -48)
pages/test.py CHANGED
@@ -19,67 +19,67 @@ HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
-import dask.dataframe as dd
-from distributed import Client
-
 @st.cache_data(ttl=3600)
 def load_and_concat_data():
     api = HfApi()
     dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
     feather_files = [file for file in dataset_files if file.endswith('.feather')]
 
-    # Create a Dask client for local parallelization
-    client = Client()
-
-    # Use Dask to read feather files in parallel
-    ddf = dd.from_delayed([
-        dd.from_delayed(client.submit(feather.read_feather, api.hf_hub_download(
-            repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
-            filename=file,
-            repo_type="dataset",
-            token=HF_TOKEN
-        )))
-        for file in feather_files
-    ])
-
-    # Perform operations on the Dask DataFrame
+    all_data = []
+    for file in feather_files:
+        try:
+            file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
+            df = feather.read_feather(file_content)
+            all_data.append(df)
+        except Exception:
+            pass  # Silently skip files that can't be processed
+
+    if not all_data:
+        return pd.DataFrame()
+
+    concatenated_df = pd.concat(all_data, ignore_index=True)
+
     columns_to_keep = [
         'site', 'job_url', 'title', 'company', 'location',
         'job_type', 'date_posted', 'is_remote', 'company_url', 'description'
     ]
-    ddf = ddf[columns_to_keep]
-    ddf['date_posted'] = dd.to_datetime(ddf['date_posted'], errors='coerce')
-    ddf = ddf.dropna(subset=['date_posted'])
-    ddf = ddf[ddf['date_posted'].dt.year == 2024]
-    ddf['title'] = ddf['title'].str.lower()
-    ddf['company'] = ddf['company'].str.lower()
-
+    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
+    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
+
+    # Drop duplicates and rows with NaT in date_posted
+    filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
+    # Keep only postings from 2024
+    filtered_df = filtered_df[filtered_df['date_posted'].dt.year == 2024]
+    # Convert titles and company names to lowercase
+    filtered_df['title'] = filtered_df['title'].str.lower()
+    filtered_df['company'] = filtered_df['company'].str.lower()
+
+    # Function to clean the location
    def clean_location(location):
         if pd.isna(location):
-            return location
+            return location  # Return NaN as is
+        # Convert to lowercase
         location = location.lower()
+        # Remove ', us' or ', usa' from the end using regex
         location = re.sub(r',\s*(us|usa)$', '', location)
         return location
 
-    ddf['location'] = ddf['location'].map(clean_location)
-
-    # Compute the final result
-    filtered_df = ddf.compute()
+    # Clean the location in place
+    filtered_df['location'] = filtered_df['location'].apply(clean_location)
+    # Drop duplicate records
     filtered_df = filtered_df.drop_duplicates()
-
+
     return filtered_df
 
-
-def clean_description(text):
-    if not isinstance(text, str):
-        return ''  # Return an empty string for non-string inputs
-    # Remove newline characters and asterisks
-    cleaned_text = re.sub(r'[\n\r\*]', ' ', text)
-    # Remove extra spaces
-    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
+def remove_special_chars(text):
+    if pd.isna(text):
+        return text
+    # Remove special characters and markdown formatting
+    cleaned_text = re.sub(r'[*\n\-_]', ' ', text)
+    # Remove extra whitespace
+    cleaned_text = ' '.join(cleaned_text.split())
     return cleaned_text
 
-
 @st.cache_resource
 def load_models():
     return {
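The hunk above swaps the Dask-based parallel read (and the dask/distributed imports removed at the top of the hunk) for a plain sequential pandas loop. A minimal sketch of the imports the rewritten loader still appears to rely on; these presumably already sit above line 19 of pages/test.py, which this diff does not show:

# Assumed file-header imports for the rewritten load_and_concat_data()
# (not part of this diff; listed here for reference only).
import re

import pandas as pd
import streamlit as st
from huggingface_hub import HfApi
from pyarrow import feather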
@@ -95,6 +95,82 @@ def generate_embeddings(text, models):
         'paraphrase': models['paraphrase'].encode(text)
     }
 
+def calculate_similarities(job_embeddings, resume_embedding):
+    similarities = []
+    for job_embedding in job_embeddings:
+        job_emb = np.array(job_embedding).reshape(1, -1)
+        res_emb = resume_embedding.reshape(1, -1)
+        cosine_sim = cosine_similarity(job_emb, res_emb)[0][0]
+        pearson_corr = pearsonr(job_embedding, resume_embedding)[0]
+        euclidean_dist = euclidean(job_embedding, resume_embedding)
+        similarities.append({
+            'cosine': cosine_sim,
+            'pearson': pearson_corr,
+            'euclidean': euclidean_dist
+        })
+    return similarities
+
+def get_top_matches(df, n=50):
+    top_matches = pd.DataFrame()
+    for model_name in ['minilm', 'mpnet', 'paraphrase']:
+        for metric in ['cosine', 'pearson', 'euclidean']:
+            col_name = f'{model_name}_{metric}'
+            ascending = metric == 'euclidean'
+            top_n = df.nsmallest(n, col_name) if ascending else df.nlargest(n, col_name)
+            top_n['model'] = model_name
+            top_n['metric'] = metric
+            top_matches = pd.concat([top_matches, top_n])
+    return top_matches.drop_duplicates().head(150)
+
+@st.cache_data
+def evaluate_with_groq(resume_text, job_description_text, client):
+    prompt = f"""
+    Resume: {resume_text}
+    Job Description: {job_description_text}
+    Based on the above information, rate the match quality on a scale of 0-100 and provide reasoning.
+    Return your response in the following JSON format:
+    {{ "score": <integer between 0 and 100>, "reasoning": "<your explanation>" }}
+    """
+    response = client.chat.completions.create(
+        messages=[
+            {"role": "user", "content": prompt}
+        ],
+        model="mixtral-8x7b-32768",
+        max_tokens=200,
+    )
+    return json.loads(response.choices[0].message.content)
+
+def display_data_explorer(df):
+    st.subheader("Data Explorer")
+    items_per_page = 15
+    num_pages = math.ceil(len(df) / items_per_page)
+    col1, col2, col3 = st.columns([1, 3, 1])
+    with col2:
+        page = st.number_input("Page", min_value=1, max_value=num_pages, value=1)
+    start_idx = (page - 1) * items_per_page
+    end_idx = start_idx + items_per_page
+    page_df = df.iloc[start_idx:end_idx]
+
+    def make_clickable(url, text):
+        return f'<a href="{url}" target="_blank" style="color: #4e79a7;">{text}</a>'
+
+    page_df['job_url'] = page_df.apply(lambda row: make_clickable(row['job_url'], 'Link'), axis=1)
+    page_df['company_url'] = page_df.apply(lambda row: make_clickable(row['company_url'], row['company']), axis=1)
+
+    display_columns = ['title', 'company_url', 'location', 'job_type', 'date_posted', 'job_url', 'groq_score', 'groq_reasoning']
+    st.write(page_df[display_columns].to_html(escape=False, index=False), unsafe_allow_html=True)
+
+    col1, col2, col3 = st.columns([1, 3, 1])
+    with col2:
+        st.write(f"Page {page} of {num_pages}")
+
+def read_file_content(uploaded_file):
+    if uploaded_file.type == "application/pdf":
+        pdf_reader = io.BytesIO(uploaded_file.getvalue())
+        return extract_text(pdf_reader)
+    else:
+        return uploaded_file.getvalue().decode("utf-8", errors="ignore")
+
 def main():
     st.title("Resume-Job Matcher")
 
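The helpers added above also use names this hunk does not define — np, cosine_similarity, pearsonr, euclidean, math, json, io, and extract_text — which are presumably imported elsewhere in pages/test.py. A small standalone check of the three similarity metrics on toy vectors, under that assumption about the underlying libraries:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from scipy.spatial.distance import euclidean

# Toy 384-dimensional vectors (e.g. MiniLM sentence embeddings are 384-d).
job_embedding = np.random.rand(384)
resume_embedding = np.random.rand(384)

cosine_sim = cosine_similarity(job_embedding.reshape(1, -1), resume_embedding.reshape(1, -1))[0][0]
pearson_corr = pearsonr(job_embedding, resume_embedding)[0]
euclidean_dist = euclidean(job_embedding, resume_embedding)

# Cosine and Pearson are "higher is better"; Euclidean is a distance, which is
# why get_top_matches() ranks it with nsmallest() instead of nlargest().
print(cosine_sim, pearson_corr, euclidean_dist)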
@@ -114,36 +190,30 @@ def main():
 
     # Clean description and create embeddings
     models = load_models()
-    df_filtered['cleaned_description'] = df_filtered['description'].apply(clean_description)
+    df_filtered['cleaned_description'] = df_filtered['description'].apply(remove_special_chars)
 
     for model_name in ['minilm', 'mpnet', 'paraphrase']:
         df_filtered[f'embeddings_{model_name}'] = df_filtered['cleaned_description'].apply(lambda x: models[model_name].encode(x))
 
-    # Rest of the code remains the same
     uploaded_file = st.file_uploader("Upload your resume", type=["txt", "pdf"], key="resume_uploader")
     if uploaded_file is not None:
         try:
-            # Read and clean the resume
             resume_text = read_file_content(uploaded_file)
             cleaned_resume = remove_special_chars(resume_text)
             st.subheader("Parsed Resume")
             st.text(cleaned_resume)
 
-            # Generate embeddings for resume
             resume_embeddings = generate_embeddings(cleaned_resume, models)
 
-            # Calculate similarities
             for model_name in ['minilm', 'mpnet', 'paraphrase']:
                 similarities = calculate_similarities(df_filtered[f'embeddings_{model_name}'].tolist(), resume_embeddings[model_name])
                 for metric in ['cosine', 'pearson', 'euclidean']:
                     df_filtered[f'{model_name}_{metric}'] = [s[metric] for s in similarities]
 
-            # Get top 150 matches
             top_matches = get_top_matches(df_filtered, 50)
             st.subheader("Top 150 Matches (Before Groq Evaluation)")
             st.dataframe(top_matches[['title', 'company', 'location', 'model', 'metric']])
 
-            # Groq evaluation (if API key is provided)
             groq_api_key = st.text_input("Enter your Groq API Key", type="password")
             if groq_api_key:
                 client = groq.Groq(api_key=groq_api_key)
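The hunk above switches the description cleaning in main() from clean_description() (deleted in the first hunk) to the new remove_special_chars(), which additionally strips hyphens and underscores and passes NaN through unchanged instead of returning ''. A quick worked example (standalone copy of the new helper; the pd.isna guard is omitted here for brevity):

import re

def remove_special_chars(text):
    # Same cleaning as the helper added in this commit: replace *, newlines,
    # hyphens and underscores with spaces, then collapse the whitespace.
    cleaned_text = re.sub(r'[*\n\-_]', ' ', text)
    return ' '.join(cleaned_text.split())

print(remove_special_chars("**Senior Data Engineer**\n- Python\n- SQL"))
# -> Senior Data Engineer Python SQL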
@@ -155,7 +225,6 @@ def main():
                     top_matches.at[row.Index, 'groq_reasoning'] = groq_result['reasoning']
                     progress_bar.progress((i + 1) / len(top_matches))
 
-                # Sort by Groq score and take top 100
                 top_100_matches = top_matches.nlargest(100, 'groq_score')
                 st.subheader("Top 100 Matches After Groq Evaluation")
                 display_data_explorer(top_100_matches)
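One caveat worth noting about evaluate_with_groq() added earlier: it passes the raw model reply straight to json.loads, which raises if the model wraps the JSON in extra prose. A hedged sketch of a guard a caller could put around that parse — safe_parse_groq_reply is a hypothetical helper, not part of this commit, and its fallback keys mirror the score/reasoning fields consumed in main():

import json

def safe_parse_groq_reply(raw_reply):
    # Hypothetical guard: try a strict parse, then the first {...} block,
    # then a neutral fallback so one malformed reply doesn't abort the loop.
    if not isinstance(raw_reply, str):
        return {"score": 0, "reasoning": "Empty or non-text model response."}
    try:
        return json.loads(raw_reply)
    except json.JSONDecodeError:
        start, end = raw_reply.find('{'), raw_reply.rfind('}')
        if start != -1 and end > start:
            try:
                return json.loads(raw_reply[start:end + 1])
            except json.JSONDecodeError:
                pass
        return {"score": 0, "reasoning": "Could not parse model response."}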
 