Niharmahesh committed
Commit 7d0b74c · verified · 1 Parent(s): 2673151

Update pages/test.py

Files changed (1): pages/test.py (+117 -48)
pages/test.py CHANGED
@@ -19,67 +19,67 @@ HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
-import dask.dataframe as dd
-from distributed import Client
-
 @st.cache_data(ttl=3600)
 def load_and_concat_data():
     api = HfApi()
     dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
     feather_files = [file for file in dataset_files if file.endswith('.feather')]
 
-    # Create a Dask client for local parallelization
-    client = Client()
-
-    # Use Dask to read feather files in parallel
-    ddf = dd.from_delayed([
-        dd.from_delayed(client.submit(feather.read_feather, api.hf_hub_download(
-            repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
-            filename=file,
-            repo_type="dataset",
-            token=HF_TOKEN
-        )))
-        for file in feather_files
-    ])
-
-    # Perform operations on the Dask DataFrame
+    all_data = []
+    for file in feather_files:
+        try:
+            file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
+            df = feather.read_feather(file_content)
+            all_data.append(df)
+        except Exception:
+            pass  # Silently skip files that can't be processed
+
+    if not all_data:
+        return pd.DataFrame()
+
+    concatenated_df = pd.concat(all_data, ignore_index=True)
+
     columns_to_keep = [
         'site', 'job_url', 'title', 'company', 'location',
         'job_type', 'date_posted', 'is_remote', 'company_url', 'description'
     ]
-    ddf = ddf[columns_to_keep]
-    ddf['date_posted'] = dd.to_datetime(ddf['date_posted'], errors='coerce')
-    ddf = ddf.dropna(subset=['date_posted'])
-    ddf = ddf[ddf['date_posted'].dt.year == 2024]
-    ddf['title'] = ddf['title'].str.lower()
-    ddf['company'] = ddf['company'].str.lower()
-
+    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
+    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
+
+    # Drop duplicates and rows with NaT in date_posted
+    filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
+    # Keep only postings from 2024
+    filtered_df = filtered_df[filtered_df['date_posted'].dt.year == 2024]
+    # Convert titles and company names to lowercase
+    filtered_df['title'] = filtered_df['title'].str.lower()
+    filtered_df['company'] = filtered_df['company'].str.lower()
+
+    # Function to clean the location
    def clean_location(location):
         if pd.isna(location):
-            return location
+            return location  # Return NaN as is
+        # Convert to lowercase
         location = location.lower()
+        # Remove ', us' or ', usa' from the end using regex
         location = re.sub(r',\s*(us|usa)$', '', location)
         return location
 
-    ddf['location'] = ddf['location'].map(clean_location)
-
-    # Compute the final result
-    filtered_df = ddf.compute()
+    # Clean the location in place
+    filtered_df['location'] = filtered_df['location'].apply(clean_location)
+    # Drop duplicate records
     filtered_df = filtered_df.drop_duplicates()
-
+
     return filtered_df
 
-
-def clean_description(text):
-    if not isinstance(text, str):
-        return ''  # Return an empty string for non-string inputs
-    # Remove newline characters and asterisks
-    cleaned_text = re.sub(r'[\n\r\*]', ' ', text)
-    # Remove extra spaces
-    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
+def remove_special_chars(text):
+    if pd.isna(text):
+        return text
+    # Remove special characters and markdown formatting
+    cleaned_text = re.sub(r'[*\n\-_]', ' ', text)
+    # Remove extra whitespace
+    cleaned_text = ' '.join(cleaned_text.split())
     return cleaned_text
 
-
 @st.cache_resource
 def load_models():
     return {
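The hunk above swaps the Dask-based parallel read (and the dask/distributed imports removed at the top of the hunk) for a plain sequential pandas loop. A minimal sketch of the imports the rewritten loader still appears to rely on; these presumably already sit above line 19 of pages/test.py, which this diff does not show:

# Assumed file-header imports for the rewritten load_and_concat_data()
# (not part of this diff; listed here for reference only).
import re

import pandas as pd
import streamlit as st
from huggingface_hub import HfApi
from pyarrow import feather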
@@ -95,6 +95,82 @@ def generate_embeddings(text, models):
         'paraphrase': models['paraphrase'].encode(text)
     }
 
+def calculate_similarities(job_embeddings, resume_embedding):
+    similarities = []
+    for job_embedding in job_embeddings:
+        job_emb = np.array(job_embedding).reshape(1, -1)
+        res_emb = resume_embedding.reshape(1, -1)
+        cosine_sim = cosine_similarity(job_emb, res_emb)[0][0]
+        pearson_corr = pearsonr(job_embedding, resume_embedding)[0]
+        euclidean_dist = euclidean(job_embedding, resume_embedding)
+        similarities.append({
+            'cosine': cosine_sim,
+            'pearson': pearson_corr,
+            'euclidean': euclidean_dist
+        })
+    return similarities
+
+def get_top_matches(df, n=50):
+    top_matches = pd.DataFrame()
+    for model_name in ['minilm', 'mpnet', 'paraphrase']:
+        for metric in ['cosine', 'pearson', 'euclidean']:
+            col_name = f'{model_name}_{metric}'
+            ascending = metric == 'euclidean'
+            top_n = df.nsmallest(n, col_name) if ascending else df.nlargest(n, col_name)
+            top_n['model'] = model_name
+            top_n['metric'] = metric
+            top_matches = pd.concat([top_matches, top_n])
+    return top_matches.drop_duplicates().head(150)
+
+@st.cache_data
+def evaluate_with_groq(resume_text, job_description_text, client):
+    prompt = f"""
+    Resume: {resume_text}
+    Job Description: {job_description_text}
+    Based on the above information, rate the match quality on a scale of 0-100 and provide reasoning.
+    Return your response in the following JSON format:
+    {{ "score": <integer between 0 and 100>, "reasoning": "<your explanation>" }}
+    """
+    response = client.chat.completions.create(
+        messages=[
+            {"role": "user", "content": prompt}
+        ],
+        model="mixtral-8x7b-32768",
+        max_tokens=200,
+    )
+    return json.loads(response.choices[0].message.content)
+
+def display_data_explorer(df):
+    st.subheader("Data Explorer")
+    items_per_page = 15
+    num_pages = math.ceil(len(df) / items_per_page)
+    col1, col2, col3 = st.columns([1, 3, 1])
+    with col2:
+        page = st.number_input("Page", min_value=1, max_value=num_pages, value=1)
+    start_idx = (page - 1) * items_per_page
+    end_idx = start_idx + items_per_page
+    page_df = df.iloc[start_idx:end_idx]
+
+    def make_clickable(url, text):
+        return f'<a href="{url}" target="_blank" style="color: #4e79a7;">{text}</a>'
+
+    page_df['job_url'] = page_df.apply(lambda row: make_clickable(row['job_url'], 'Link'), axis=1)
+    page_df['company_url'] = page_df.apply(lambda row: make_clickable(row['company_url'], row['company']), axis=1)
+
+    display_columns = ['title', 'company_url', 'location', 'job_type', 'date_posted', 'job_url', 'groq_score', 'groq_reasoning']
+    st.write(page_df[display_columns].to_html(escape=False, index=False), unsafe_allow_html=True)
+
+    col1, col2, col3 = st.columns([1, 3, 1])
+    with col2:
+        st.write(f"Page {page} of {num_pages}")
+
+def read_file_content(uploaded_file):
+    if uploaded_file.type == "application/pdf":
+        pdf_reader = io.BytesIO(uploaded_file.getvalue())
+        return extract_text(pdf_reader)
+    else:
+        return uploaded_file.getvalue().decode("utf-8", errors="ignore")
+
 def main():
     st.title("Resume-Job Matcher")
 
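The helpers added above also use names this hunk does not define — np, cosine_similarity, pearsonr, euclidean, math, json, io, and extract_text — which are presumably imported elsewhere in pages/test.py. A small standalone check of the three similarity metrics on toy vectors, under that assumption about the underlying libraries:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from scipy.spatial.distance import euclidean

# Toy 384-dimensional vectors (e.g. MiniLM sentence embeddings are 384-d).
job_embedding = np.random.rand(384)
resume_embedding = np.random.rand(384)

cosine_sim = cosine_similarity(job_embedding.reshape(1, -1), resume_embedding.reshape(1, -1))[0][0]
pearson_corr = pearsonr(job_embedding, resume_embedding)[0]
euclidean_dist = euclidean(job_embedding, resume_embedding)

# Cosine and Pearson are "higher is better"; Euclidean is a distance, which is
# why get_top_matches() ranks it with nsmallest() instead of nlargest().
print(cosine_sim, pearson_corr, euclidean_dist)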
@@ -114,36 +190,30 @@ def main():
 
     # Clean description and create embeddings
     models = load_models()
-    df_filtered['cleaned_description'] = df_filtered['description'].apply(clean_description)
+    df_filtered['cleaned_description'] = df_filtered['description'].apply(remove_special_chars)
 
     for model_name in ['minilm', 'mpnet', 'paraphrase']:
         df_filtered[f'embeddings_{model_name}'] = df_filtered['cleaned_description'].apply(lambda x: models[model_name].encode(x))
 
-    # Rest of the code remains the same
     uploaded_file = st.file_uploader("Upload your resume", type=["txt", "pdf"], key="resume_uploader")
     if uploaded_file is not None:
         try:
-            # Read and clean the resume
             resume_text = read_file_content(uploaded_file)
             cleaned_resume = remove_special_chars(resume_text)
             st.subheader("Parsed Resume")
             st.text(cleaned_resume)
 
-            # Generate embeddings for resume
             resume_embeddings = generate_embeddings(cleaned_resume, models)
 
-            # Calculate similarities
             for model_name in ['minilm', 'mpnet', 'paraphrase']:
                 similarities = calculate_similarities(df_filtered[f'embeddings_{model_name}'].tolist(), resume_embeddings[model_name])
                 for metric in ['cosine', 'pearson', 'euclidean']:
                     df_filtered[f'{model_name}_{metric}'] = [s[metric] for s in similarities]
 
-            # Get top 150 matches
             top_matches = get_top_matches(df_filtered, 50)
             st.subheader("Top 150 Matches (Before Groq Evaluation)")
             st.dataframe(top_matches[['title', 'company', 'location', 'model', 'metric']])
 
-            # Groq evaluation (if API key is provided)
             groq_api_key = st.text_input("Enter your Groq API Key", type="password")
             if groq_api_key:
                 client = groq.Groq(api_key=groq_api_key)
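The hunk above switches the description cleaning in main() from clean_description() (deleted in the first hunk) to the new remove_special_chars(), which additionally strips hyphens and underscores and passes NaN through unchanged instead of returning ''. A quick worked example (standalone copy of the new helper; the pd.isna guard is omitted here for brevity):

import re

def remove_special_chars(text):
    # Same cleaning as the helper added in this commit: replace *, newlines,
    # hyphens and underscores with spaces, then collapse the whitespace.
    cleaned_text = re.sub(r'[*\n\-_]', ' ', text)
    return ' '.join(cleaned_text.split())

print(remove_special_chars("**Senior Data Engineer**\n- Python\n- SQL"))
# -> Senior Data Engineer Python SQL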
@@ -155,7 +225,6 @@ def main():
                     top_matches.at[row.Index, 'groq_reasoning'] = groq_result['reasoning']
                     progress_bar.progress((i + 1) / len(top_matches))
 
-                # Sort by Groq score and take top 100
                 top_100_matches = top_matches.nlargest(100, 'groq_score')
                 st.subheader("Top 100 Matches After Groq Evaluation")
                 display_data_explorer(top_100_matches)
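One caveat worth noting about evaluate_with_groq() added earlier: it passes the raw model reply straight to json.loads, which raises if the model wraps the JSON in extra prose. A hedged sketch of a guard a caller could put around that parse — safe_parse_groq_reply is a hypothetical helper, not part of this commit, and its fallback keys mirror the score/reasoning fields consumed in main():

import json

def safe_parse_groq_reply(raw_reply):
    # Hypothetical guard: try a strict parse, then the first {...} block,
    # then a neutral fallback so one malformed reply doesn't abort the loop.
    if not isinstance(raw_reply, str):
        return {"score": 0, "reasoning": "Empty or non-text model response."}
    try:
        return json.loads(raw_reply)
    except json.JSONDecodeError:
        start, end = raw_reply.find('{'), raw_reply.rfind('}')
        if start != -1 and end > start:
            try:
                return json.loads(raw_reply[start:end + 1])
            except json.JSONDecodeError:
                pass
        return {"score": 0, "reasoning": "Could not parse model response."}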
 