Update pages/test.py

pages/test.py (CHANGED, +117 -48)
@@ -19,67 +19,67 @@ HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"

-import dask.dataframe as dd
-from distributed import Client
-
 @st.cache_data(ttl=3600)
 def load_and_concat_data():
     api = HfApi()
     dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
     feather_files = [file for file in dataset_files if file.endswith('.feather')]

-    [old lines 31-40 not shown in the rendered diff]
-    )
-    [old lines 42-44 not shown in the rendered diff]
-    # Perform operations on the Dask DataFrame
+    all_data = []
+    for file in feather_files:
+        try:
+            file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
+            df = feather.read_feather(file_content)
+            all_data.append(df)
+        except Exception:
+            pass  # Silently skip files that can't be processed
+
+    if not all_data:
+        return pd.DataFrame()
+
+    concatenated_df = pd.concat(all_data, ignore_index=True)
+
     columns_to_keep = [
         'site', 'job_url', 'title', 'company', 'location',
         'job_type', 'date_posted', 'is_remote', 'company_url', 'description'
     ]
-    [old lines 50-56 not shown in the rendered diff]
+    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
+    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
+
+    # Drop duplicates and rows with NaT in date_posted
+    filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
+    #filtering based on data in 2024
+    filtered_df = filtered_df[filtered_df['date_posted'].dt.year==2024]
+    # Convert titles and company name to lowercase
+    filtered_df['title'] = filtered_df['title'].str.lower()
+    filtered_df['company'] = filtered_df['company'].str.lower()
+
+    # Function to clean the location
     def clean_location(location):
         if pd.isna(location):
-            return location
+            return location  # Return NaN as is
+        # Convert to lowercase
         location = location.lower()
+        # Remove ', us' or ', usa' from the end using regex
         location = re.sub(r',\s*(us|usa)$', '', location)
         return location

-    [old lines 64-65 not shown in the rendered diff]
-    #
-    filtered_df = ddf.compute()
+    # Clean the location in place
+    filtered_df['location'] = filtered_df['location'].apply(clean_location)
+    #added new line to drop duplicate records
     filtered_df = filtered_df.drop_duplicates()
-
+
     return filtered_df

-    [old lines 72-78 not shown in the rendered diff]
-    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
+def remove_special_chars(text):
+    if pd.isna(text):
+        return text
+    # Remove special characters and markdown formatting
+    cleaned_text = re.sub(r'[*\n\-_]', ' ', text)
+    # Remove extra whitespace
+    cleaned_text = ' '.join(cleaned_text.split())
     return cleaned_text

-
 @st.cache_resource
 def load_models():
     return {
@@ -95,6 +95,82 @@ def generate_embeddings(text, models):
         'paraphrase': models['paraphrase'].encode(text)
     }

+def calculate_similarities(job_embeddings, resume_embedding):
+    similarities = []
+    for job_embedding in job_embeddings:
+        job_emb = np.array(job_embedding).reshape(1, -1)
+        res_emb = resume_embedding.reshape(1, -1)
+        cosine_sim = cosine_similarity(job_emb, res_emb)[0][0]
+        pearson_corr = pearsonr(job_embedding, resume_embedding)[0]
+        euclidean_dist = euclidean(job_embedding, resume_embedding)
+        similarities.append({
+            'cosine': cosine_sim,
+            'pearson': pearson_corr,
+            'euclidean': euclidean_dist
+        })
+    return similarities
+
+def get_top_matches(df, n=50):
+    top_matches = pd.DataFrame()
+    for model_name in ['minilm', 'mpnet', 'paraphrase']:
+        for metric in ['cosine', 'pearson', 'euclidean']:
+            col_name = f'{model_name}_{metric}'
+            ascending = metric == 'euclidean'
+            top_n = df.nsmallest(n, col_name) if ascending else df.nlargest(n, col_name)
+            top_n['model'] = model_name
+            top_n['metric'] = metric
+            top_matches = pd.concat([top_matches, top_n])
+    return top_matches.drop_duplicates().head(150)
+
+@st.cache_data
+def evaluate_with_groq(resume_text, job_description_text, client):
+    prompt = f"""
+    Resume: {resume_text}
+    Job Description: {job_description_text}
+    Based on the above information, rate the match quality on a scale of 0-100 and provide reasoning.
+    Return your response in the following JSON format:
+    {{ "score": <integer between 0 and 100>, "reasoning": "<your explanation>" }}
+    """
+    response = client.chat.completions.create(
+        messages=[
+            {"role": "user", "content": prompt}
+        ],
+        model="mixtral-8x7b-32768",
+        max_tokens=200,
+    )
+    return json.loads(response.choices[0].message.content)
+
+def display_data_explorer(df):
+    st.subheader("Data Explorer")
+    items_per_page = 15
+    num_pages = math.ceil(len(df) / items_per_page)
+    col1, col2, col3 = st.columns([1, 3, 1])
+    with col2:
+        page = st.number_input("Page", min_value=1, max_value=num_pages, value=1)
+    start_idx = (page - 1) * items_per_page
+    end_idx = start_idx + items_per_page
+    page_df = df.iloc[start_idx:end_idx]
+
+    def make_clickable(url, text):
+        return f'<a href="{url}" target="_blank" style="color: #4e79a7;">{text}</a>'
+
+    page_df['job_url'] = page_df.apply(lambda row: make_clickable(row['job_url'], 'Link'), axis=1)
+    page_df['company_url'] = page_df.apply(lambda row: make_clickable(row['company_url'], row['company']), axis=1)
+
+    display_columns = ['title', 'company_url', 'location', 'job_type', 'date_posted', 'job_url', 'groq_score', 'groq_reasoning']
+    st.write(page_df[display_columns].to_html(escape=False, index=False), unsafe_allow_html=True)
+
+    col1, col2, col3 = st.columns([1, 3, 1])
+    with col2:
+        st.write(f"Page {page} of {num_pages}")
+
+def read_file_content(uploaded_file):
+    if uploaded_file.type == "application/pdf":
+        pdf_reader = io.BytesIO(uploaded_file.getvalue())
+        return extract_text(pdf_reader)
+    else:
+        return uploaded_file.getvalue().decode("utf-8", errors="ignore")
+
 def main():
     st.title("Resume-Job Matcher")

@@ -114,36 +190,30 @@ def main():

     # Clean description and create embeddings
     models = load_models()
-    df_filtered['cleaned_description'] = df_filtered['description'].apply(
+    df_filtered['cleaned_description'] = df_filtered['description'].apply(remove_special_chars)

     for model_name in ['minilm', 'mpnet', 'paraphrase']:
         df_filtered[f'embeddings_{model_name}'] = df_filtered['cleaned_description'].apply(lambda x: models[model_name].encode(x))

-    # Rest of the code remains the same
     uploaded_file = st.file_uploader("Upload your resume", type=["txt", "pdf"], key="resume_uploader")
     if uploaded_file is not None:
         try:
-            # Read and clean the resume
             resume_text = read_file_content(uploaded_file)
             cleaned_resume = remove_special_chars(resume_text)
             st.subheader("Parsed Resume")
             st.text(cleaned_resume)

-            # Generate embeddings for resume
             resume_embeddings = generate_embeddings(cleaned_resume, models)

-            # Calculate similarities
             for model_name in ['minilm', 'mpnet', 'paraphrase']:
                 similarities = calculate_similarities(df_filtered[f'embeddings_{model_name}'].tolist(), resume_embeddings[model_name])
                 for metric in ['cosine', 'pearson', 'euclidean']:
                     df_filtered[f'{model_name}_{metric}'] = [s[metric] for s in similarities]

-            # Get top 150 matches
             top_matches = get_top_matches(df_filtered, 50)
             st.subheader("Top 150 Matches (Before Groq Evaluation)")
             st.dataframe(top_matches[['title', 'company', 'location', 'model', 'metric']])

-            # Groq evaluation (if API key is provided)
             groq_api_key = st.text_input("Enter your Groq API Key", type="password")
             if groq_api_key:
                 client = groq.Groq(api_key=groq_api_key)
@@ -155,7 +225,6 @@ def main():
                     top_matches.at[row.Index, 'groq_reasoning'] = groq_result['reasoning']
                     progress_bar.progress((i + 1) / len(top_matches))

-                # Sort by Groq score and take top 100
                 top_100_matches = top_matches.nlargest(100, 'groq_score')
                 st.subheader("Top 100 Matches After Groq Evaluation")
                 display_data_explorer(top_100_matches)
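
The hunks start at line 19, so the import block at the top of pages/test.py is not part of this diff. As a reading aid only, here is a minimal sketch of the imports the changed code appears to rely on; the module sources are inferred from the calls visible above (HfApi, feather.read_feather, cosine_similarity, pearsonr, euclidean, extract_text, groq.Groq, np, pd, math, json, io, re) and are assumptions, not taken from the commit.

# Inferred import sketch -- not part of the commit; adjust to match the actual file.
import io
import json
import math
import re

import groq
import numpy as np
import pandas as pd
import streamlit as st
from huggingface_hub import HfApi                        # list_repo_files() / hf_hub_download()
from pdfminer.high_level import extract_text             # read_file_content() passes a BytesIO PDF stream
from pyarrow import feather                              # feather.read_feather() on the downloaded .feather files
from scipy.spatial.distance import euclidean
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

# load_models() is only partially shown; it presumably returns sentence-embedding models
# (e.g. sentence_transformers.SentenceTransformer instances) keyed 'minilm', 'mpnet', 'paraphrase'.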