bsiddhharth commited on
Commit
1e97cbb
·
0 Parent(s):

Initial commit with app.py, cv_question.py, cv_short.py, extraction.py

Browse files
Files changed (7) hide show
  1. .gitignore +14 -0
  2. app.log +0 -0
  3. app.py +71 -0
  4. cv_question.py +130 -0
  5. cv_short.py +317 -0
  6. extraction.py +138 -0
  7. requirements.txt +18 -0
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore virtual environment
2
+ venv/
3
+
4
+ # Ignore environment files
5
+ .env
6
+
7
+ # Ignore Python compiled files
8
+ *.pyc
9
+ __pycache__/
10
+
11
+ # Ignore specific file (like extraction.pydantic)
12
+ extraction_pydantic.py
13
+ cv_quest.py
14
+ logger.py
app.log ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import streamlit as st
3
+ import cv_question
4
+ import cv_short
5
+ from logger import setup_logger
6
+
7
+ # def initialize_session_state():
8
+ # """Initialize all session state variables with default values."""
9
+ # session_vars = {
10
+ # 'jd_text': "",
11
+ # 'min_years': 0,
12
+ # 'required_skills_list': [],
13
+ # 'uploaded_files': [],
14
+ # 'results': [],
15
+ # 'generated_questions': None,
16
+ # 'current_candidate_index': 0,
17
+ # 'processed_cvs': {}, # Store processed CV data
18
+ # 'analysis_complete': False
19
+ # }
20
+
21
+ # for var, default_value in session_vars.items():
22
+ # if var not in st.session_state:
23
+ # st.session_state[var] = default_value
24
+
25
def clear_session_state():
    """Remove every key from Streamlit's session state, resetting the app."""
    # Snapshot the keys first: deleting entries while iterating the live
    # view would raise a RuntimeError.
    for key in tuple(st.session_state.keys()):
        st.session_state.pop(key)
30
+
31
def main():
    """App entry point: sidebar navigation and dispatch to the two pages."""
    app_logger = setup_logger('app_logger', 'app.log')

    # Sidebar chrome.
    st.sidebar.title("Navigation")
    app_logger.info("Sidebar navigation displayed")

    # One-click full reset of all cached state.
    if st.sidebar.button("Reset All Data"):
        clear_session_state()
        st.sidebar.success("All data has been reset!")
        app_logger.info("Session state reset")

    page = st.sidebar.radio("Go to", ["CV Shortlisting", "Interview Questions"])
    app_logger.info(f"Page selected: {page}")

    # Route to the selected page; surface any failure to both log and UI.
    try:
        if page == "CV Shortlisting":
            app_logger.info("Navigating to CV Shortlisting")
            cv_short.create_cv_shortlisting_page()
        elif page == "Interview Questions":
            app_logger.info("Navigating to Interview Questions")
            cv_question.create_interview_questions_page()
    except Exception as e:
        app_logger.error(f"Error occurred: {e}")
        st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()
cv_question.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
import os
import tempfile
import json
from extraction import extract_cv_data, process_file, display_candidates_info  # importing from your extraction.py

# Read the Groq API key once.  Only mirror it back into os.environ when it
# is actually set: os.environ values must be str, so the previous
# unconditional `os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")`
# crashed at import time with an opaque TypeError whenever the key was
# missing from the environment.
groq_api_key = os.getenv("GROQ_API_KEY")
if groq_api_key:
    os.environ['GROQ_API_KEY'] = groq_api_key
12
+
13
class InterviewQuestionGenerator:
    """Generates CV-specific technical interview questions via a Groq LLM."""

    def __init__(self):
        # Chat model used for question generation.
        self.llm = ChatGroq(
            groq_api_key=groq_api_key,
            model_name="llama3-8b-8192",
            temperature=0.7,
            max_tokens=4096
        )

        # System template; {cv_text} and {skills} are interpolated directly
        # into this message when the chain is invoked.
        self.question_template = """
        Based on the following CV excerpt, generate 5 specific basic technical interview questions
        that are directly related to the candidate's experience and skills. Make sure the
        questions test both their claimed knowledge and problem-solving abilities.

        CV Excerpt:
        {cv_text}

        Skills Mentioned:
        {skills}

        Return the questions in the following text format:

        (bold)
        Question 1:\n

        - Technical_question: "Your question here" \n

        - Follow_up_question: "Deep dive question here" \n

        - What_to_listen_for: "Key points to listen for here" \n

        \n\n
        Question 2:

        - Technical_question: "Your question here" \n

        - Follow_up_question: "Deep dive question here" \n

        - What_to_listen_for: "Key points to listen for here" \n



        Make sure to follow this format exactly, with the correct structure and labels for each question.


        (Repeat for all 5 questions)

        Be sure to make each question clear and actionable, and align it with the skills mentioned in the CV.
        """

        # FIX: the original human turn repeated "{cv_text}\n{skills}" even
        # though both are already interpolated into the system message above,
        # sending the whole CV to the model twice and doubling token usage.
        self.question_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", self.question_template),
                ("human", "Generate the interview questions now.")
            ]
        )

    def generate_questions(self, cv_text: str, skills: str) -> str:
        """Run the prompt through the LLM and return the raw model response.

        Args:
            cv_text: full plain text of the candidate's CV.
            skills: comma-separated skill list extracted from the CV.

        Returns:
            The LLM response message (callers read ``.content``).
        """
        runnable = self.question_prompt | self.llm  # Runnable pipeline instead of LLMChain
        questions = runnable.invoke({
            "cv_text": cv_text,
            "skills": skills
        })
        return questions
81
+
82
+
83
def create_interview_questions_page():
    """Streamlit page: upload a CV, extract candidate data, generate questions."""
    # Session-state defaults (first visit only).
    for key in ('uploaded_file', 'cv_text', 'candidates_list', 'generated_questions'):
        if key not in st.session_state:
            st.session_state[key] = None

    st.title("Interview Question Generator")

    # File uploader
    uploaded_file = st.file_uploader("Upload a CV", type=['pdf', 'txt'])

    # A new upload invalidates everything derived from the previous file.
    if uploaded_file is not None and (st.session_state.uploaded_file is None or
                                      uploaded_file.name != st.session_state.uploaded_file.name):
        st.session_state.uploaded_file = uploaded_file
        st.session_state.cv_text = None
        st.session_state.candidates_list = None
        st.session_state.generated_questions = None

    if st.session_state.uploaded_file is not None:
        # Only process the file once per upload.
        if st.session_state.cv_text is None:
            st.session_state.cv_text = process_file(st.session_state.uploaded_file)
            st.session_state.candidates_list = extract_cv_data(st.session_state.cv_text)

        if st.session_state.candidates_list:
            display_candidates_info(st.session_state.candidates_list)

            # Generate questions once per upload.
            if st.session_state.generated_questions is None:
                candidate = st.session_state.candidates_list[0]
                generator = InterviewQuestionGenerator()
                # FIX: candidate.skills may be None when extraction finds no
                # skills; joining None raised a TypeError before.
                questions = generator.generate_questions(
                    cv_text=st.session_state.cv_text,
                    skills=", ".join(candidate.skills or [])
                )
                st.session_state.generated_questions = questions.content

            st.subheader("Recommended Interview Questions")
            st.markdown(st.session_state.generated_questions)
        else:
            # FIX: indexing candidates_list[0] on an empty extraction result
            # previously raised an unhandled IndexError.
            st.warning("No candidate information could be extracted from the uploaded CV.")
cv_short.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
3
+ import extraction as extr # extraction.py
4
+ import streamlit as st
5
+ import pandas as pd
6
+
7
+ # Configure logging
8
+ logging.basicConfig(level=logging.DEBUG , format='%(asctime)s - %(levelname)s - %(message)s')
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class CVAnalyzer:
    """Loads CV documents, extracts structured info, and scores candidates
    against job-description requirements."""

    # Class-held logger so the methods are self-contained.
    _log = logging.getLogger(__name__)

    def __init__(self):
        self._log.info("Initializing CVAnalyzer")
        # Groq LLM is configured centrally in extraction.py.
        self.llm = extr.initialize_llm()
        self._log.info(" LLM initialized")

    def load_document(self, file_path: str) -> str:
        """Load a PDF or plain-text document and return its concatenated text."""
        self._log.info(f"Loading document from file: {file_path}")
        if file_path.endswith('.pdf'):
            loader = PDFPlumberLoader(file_path)
        else:
            loader = TextLoader(file_path)
        documents = loader.load()
        self._log.info(f"Document loaded from {file_path}")
        return " ".join(doc.page_content for doc in documents)

    def extract_cv_info(self, cv_text: str) -> "list[extr.cv]":
        """Extract structured candidate records (extraction.cv) from CV text."""
        self._log.info("Extracting CV information")
        extracted_data = extr.extract_cv_data(cv_text)
        self._log.info(f"Extracted {len(extracted_data)} candidate(s) from CV")
        return extracted_data

    def calculate_match_score(self, cv_info: dict, jd_requirements: dict) -> dict:
        """Score a candidate dict against the job requirements.

        Args:
            cv_info: candidate attributes (e.g. ``candidate.__dict__``) —
                may contain ``skills`` (list or None) and ``years_of_exp``.
            jd_requirements: expects ``required_skills`` (list) and
                ``min_years_experience`` (number).

        Returns:
            Dict with ``skills_match``, ``experience_match`` and
            ``overall_score`` (weighted: skills 0.5, experience 0.3).
        """
        self._log.info(f"Calculating match score for CV: {cv_info.get('name', 'Unknown')}")

        score_components = {
            "skills_match": 0,
            "experience_match": 0,
            "overall_score": 0
        }

        # Skills: fraction of required skills present (case-insensitive).
        # FIX: guard against an empty required_skills list (previously a
        # ZeroDivisionError) and a None skills value (previously TypeError).
        if cv_info.get("skills") and jd_requirements.get("required_skills"):
            cv_skills = {skill.lower() for skill in cv_info["skills"]}
            required_skills = {skill.lower() for skill in jd_requirements["required_skills"]}
            score_components["skills_match"] = len(cv_skills & required_skills) / len(required_skills)

        # Experience: full credit at/above the minimum, else proportional.
        # FIX: skip when years_of_exp is None (comparison previously crashed).
        if cv_info.get("years_of_exp") is not None and "min_years_experience" in jd_requirements:
            min_years = jd_requirements["min_years_experience"]
            if cv_info["years_of_exp"] >= min_years:
                score_components["experience_match"] = 1.0
            else:
                score_components["experience_match"] = cv_info["years_of_exp"] / min_years

        # Weighted average (weights intentionally sum to 0.8, matching the
        # original scoring scheme).
        weights = {"skills_match": 0.5, "experience_match": 0.3}
        score_components["overall_score"] = sum(
            score * weights[component]
            for component, score in score_components.items()
            if component != "overall_score"
        )

        self._log.debug(f"Match score for {cv_info.get('name', 'Unknown')}: {score_components['overall_score']:.2%}")
        return score_components
86
+
87
+
88
+
89
+ # def create_cv_shortlisting_page():
90
+ # logger.info("Starting CV shortlisting system")
91
+
92
+ # st.title("CV Shortlisting System")
93
+
94
+ # # Reset analysis state when starting new analysis
95
+ # if 'analysis_started' not in st.session_state:
96
+ # st.session_state.analysis_started = False
97
+
98
+ # # Job Description Input
99
+ # st.header("Job Description")
100
+ # jd_text = st.text_area("Enter the job description",
101
+ # value=st.session_state.jd_text if 'jd_text' in st.session_state else "")
102
+
103
+ # if jd_text:
104
+ # st.session_state.jd_text = jd_text
105
+
106
+ # # Job Requirements Input
107
+ # st.header("Job Requirements")
108
+ # min_years = st.number_input("Minimum years of experience",
109
+ # min_value=0,
110
+ # value=st.session_state.min_years if 'min_years' in st.session_state else 3)
111
+
112
+ # required_skills = st.text_input("Required skills (comma-separated)",
113
+ # value=','.join(st.session_state.required_skills_list) if 'required_skills_list' in st.session_state else "")
114
+
115
+ # required_skills_list = [skill.strip() for skill in required_skills.split(",") if skill.strip()]
116
+
117
+ # # Update session state
118
+ # st.session_state.required_skills_list = required_skills_list
119
+ # st.session_state.min_years = min_years
120
+
121
+ # # CV Upload
122
+ # st.header("Upload CVs")
123
+ # uploaded_files = st.file_uploader("Choose CV files",
124
+ # accept_multiple_files=True,
125
+ # type=['pdf', 'txt'],
126
+ # key="cv_upload")
127
+
128
+ # if uploaded_files:
129
+ # st.session_state.uploaded_files = uploaded_files
130
+ # st.session_state.analysis_started = True
131
+
132
+ # # Analysis Button
133
+ # if st.button("Analyze CVs") and uploaded_files and jd_text:
134
+ # st.session_state.results = [] # Reset results
135
+ # st.session_state.processed_cvs = {} # Reset processed CVs
136
+
137
+ # with st.spinner('Analyzing CVs...'):
138
+ # try:
139
+ # analyzer = CVAnalyzer()
140
+
141
+ # # Prepare job requirements
142
+ # job_requirements = {
143
+ # "min_years_experience": st.session_state.min_years,
144
+ # "required_skills": st.session_state.required_skills_list
145
+ # }
146
+
147
+ # # Process each CV
148
+ # for uploaded_file in uploaded_files:
149
+ # cv_text = extr.process_file(uploaded_file)
150
+
151
+ # try:
152
+ # # Extract CV information
153
+ # candidates = analyzer.extract_cv_info(cv_text)
154
+
155
+ # for idx, candidate in enumerate(candidates):
156
+ # # Calculate match scores
157
+ # match_scores = analyzer.calculate_match_score(
158
+ # candidate.__dict__,
159
+ # job_requirements
160
+ # )
161
+
162
+ # # Store results
163
+ # result = {
164
+ # "Name": candidate.name or "Unknown",
165
+ # "Experience (Years)": candidate.years_of_exp or 0,
166
+ # "Skills": ", ".join(candidate.skills) if candidate.skills else "None",
167
+ # "Certifications": ", ".join(candidate.certifications) if candidate.certifications else "None",
168
+ # "Skills Match": f"{match_scores['skills_match']:.2%}",
169
+ # "Experience Match": f"{match_scores['experience_match']:.2%}",
170
+ # "Overall Score": f"{match_scores['overall_score']:.2%}"
171
+ # }
172
+
173
+ # st.session_state.results.append(result)
174
+
175
+ # # Store processed CV data for interview questions
176
+ # st.session_state.processed_cvs[f"{candidate.name}_{idx}"] = {
177
+ # "cv_text": cv_text,
178
+ # "candidate": candidate,
179
+ # "match_scores": match_scores
180
+ # }
181
+
182
+ # except Exception as e:
183
+ # logger.error(f"Error processing CV: {str(e)}")
184
+ # st.error(f"Error processing CV: {str(e)}")
185
+
186
+ # # Mark analysis as complete
187
+ # st.session_state.analysis_complete = True
188
+
189
+ # # Display results
190
+ # if st.session_state.results:
191
+ # df = pd.DataFrame(st.session_state.results)
192
+ # df = df.sort_values("Overall Score", ascending=False)
193
+ # st.dataframe(df)
194
+
195
+ # # Save top candidates
196
+ # st.session_state.top_candidates = df.head()
197
+ # else:
198
+ # logger.warning("No valid candidates found")
199
+ # st.warning("No valid candidates found in the uploaded CVs")
200
+
201
+ # except Exception as e:
202
+ # logger.error(f"Analysis error: {str(e)}")
203
+ # st.error(f"An error occurred during analysis: {str(e)}")
204
+ # st.session_state.analysis_complete = False
205
+
206
+ # # Display analysis status
207
+ # if st.session_state.get('analysis_complete', False):
208
+ # st.success("CV analysis complete! You can now proceed to generate interview questions.")
209
+
210
+
211
def create_cv_shortlisting_page():
    """Streamlit page: collect JD + requirements, analyze uploaded CVs, rank them."""
    logger.info("Starting CV shortlisting system")

    # Session-state defaults (first visit only).
    defaults = {
        'jd_text': "",
        'min_years': 3,
        'required_skills_list': [],
        'uploaded_files': None,
        'results': [],
        'analysis_complete': False,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value

    st.title("CV Shortlisting System")

    # Job Description Input
    st.header("Job Description")
    jd_text = st.text_area("Enter the job description", value=st.session_state.jd_text)
    if jd_text:
        st.session_state.jd_text = jd_text

    # Job Requirements Input
    st.header("Job Requirements")
    min_years = st.number_input("Minimum years of experience",
                                min_value=0,
                                value=st.session_state.min_years,
                                )
    required_skills = st.text_input("Required skills (comma-separated)",
                                    value=','.join(st.session_state.required_skills_list) if st.session_state.required_skills_list else "")
    required_skills_list = [skill.strip() for skill in required_skills.split(",") if skill.strip()]

    if required_skills_list:
        st.session_state.required_skills_list = required_skills_list
    # FIX: always persist the number input; the previous `if min_years:`
    # check silently discarded a legitimate value of 0.
    st.session_state.min_years = min_years

    # CV Upload
    st.header("Upload CVs")
    uploaded_files = st.file_uploader("Choose CV files",
                                      accept_multiple_files=True,
                                      type=['pdf', 'txt'],
                                      key="unique_cv_upload")
    if uploaded_files:
        st.session_state.uploaded_files = uploaded_files

    if st.button("Analyze CVs") and uploaded_files and jd_text:
        logger.info("Analyzing uploaded CVs")
        with st.spinner('Analyzing CVs...'):
            analyzer = CVAnalyzer()

            job_requirements = {
                "min_years_experience": st.session_state.min_years,
                "required_skills": st.session_state.required_skills_list
            }

            results = []
            st.session_state.results = []  # reset for a fresh analysis

            # Process each CV; one bad file must not abort the whole batch.
            for uploaded_file in uploaded_files:
                cv_text = extr.process_file(uploaded_file)
                try:
                    candidates = analyzer.extract_cv_info(cv_text)
                    for candidate in candidates:
                        match_scores = analyzer.calculate_match_score(
                            candidate.__dict__,
                            job_requirements
                        )
                        result = {
                            "Name": candidate.name or "Unknown",
                            "Experience (Years)": candidate.years_of_exp or 0,
                            "Skills": ", ".join(candidate.skills) if candidate.skills else "None",
                            "Certifications": ", ".join(candidate.certifications) if candidate.certifications else "None",
                            "Skills Match": f"{match_scores['skills_match']:.2%}",
                            "Experience Match": f"{match_scores['experience_match']:.2%}",
                            "Overall Score": f"{match_scores['overall_score']:.2%}"
                        }
                        results.append(result)
                        st.session_state.results.append(result)
                except Exception as e:
                    logger.error(f"Error processing CV: {str(e)}")

            logger.info(f"Displaying analyzed results for {len(results)} candidate(s)")

            if st.session_state.results:
                df = pd.DataFrame(st.session_state.results)
                # FIX: rank on the numeric value of the score.  Sorting the
                # formatted "xx.xx%" strings lexicographically mis-ordered
                # results (e.g. "9.00%" ranked above "85.00%").
                df = df.sort_values(
                    "Overall Score",
                    ascending=False,
                    key=lambda s: s.str.rstrip('%').astype(float),
                )
                st.dataframe(df)
                st.session_state.analysis_complete = True
            else:
                logger.warning("No valid candidates found in uploaded CVs")
                st.error("No valid results found from CV analysis")
                st.session_state.analysis_complete = False
extraction.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional
3
+ from pydantic import BaseModel, Field
4
+ from langchain.prompts import ChatPromptTemplate
5
+ from langchain_groq import ChatGroq
6
+ import os
7
+ import tempfile
8
+ import streamlit as st
9
+ from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
10
+
11
+
12
+ # logging
13
+ logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # Defining the CV structure using Pydantic for structured output
17
class cv(BaseModel):
    """Structured record for a single candidate parsed from a CV.

    All fields default to None so the extractor can return partial data.
    """
    name: Optional[str] = Field(default=None, description="Name of candidate")
    skills: Optional[list[str]] = Field(default=None, description="Skills of candidate")
    certifications: Optional[list[str]] = Field(default=None, description="Certificates of candidate")
    years_of_exp: Optional[int] = Field(default=None, description="Years of experience")
22
+
23
+ # Defining the data structure that contains a list of CVs
24
class data(BaseModel):
    """Container for every candidate extracted from one document."""
    candidates: list[cv]
26
+
27
def create_prompt_template() -> ChatPromptTemplate:
    """Build the chat prompt that instructs the LLM how to extract CV fields."""
    logger.info("Creating the prompt template for CV extraction")

    # Adjacent string literals concatenate into one system instruction.
    system_instruction = (
        "You are an expert extraction algorithm. Your job is to extract the following specific information from the given text:"
        "- Name of the candidate"
        "- Skills"
        "- Certifications (Look for terms such as 'Certified,' 'Certification,' 'Certificate')"
        "- years_of_exp (Extract only the number of years. If an approximation is given (e.g., '5+ years'), return the lower bound (e.g., '5').)"
        "If you cannot find the value for a specific attribute, return null for that attribute's value."
        "The 'years of experience' can be mentioned in various formats (e.g., '5+ years', '5 years', 'since 2010'). "
        "Extract it accurately, even if it's mentioned in different contexts like a professional summary or work experience. "
        "If multiple jobs are listed, you can calculate the experience from the work history."
        "Certifications are usually found under headers like 'Certifications,' 'Professional Certificates,' or similar. They might include phrases like 'AWS Certified Developer,' 'MongoDB Developer Associate,' etc."
    )
    return ChatPromptTemplate.from_messages(
        [
            ("system", system_instruction),
            ("human", "{text}"),
        ]
    )
50
+
51
def initialize_llm() -> "ChatGroq":
    """Initialize the Groq chat model.

    Returns:
        A configured ChatGroq instance (llama-3.1-70b-versatile, temp 0.6).

    Raises:
        ValueError: if the GROQ_API_KEY environment variable is not set.
    """
    log = logging.getLogger(__name__)
    log.info("Initializing LLM")

    # FIX: read the key BEFORE touching os.environ.  The original
    # `os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")` raised an
    # opaque TypeError (environ values must be str) when the key was
    # missing, so the intended ValueError below was never reached.
    groq_api_key = os.getenv("GROQ_API_KEY")
    if not groq_api_key:
        log.error("GROQ_API_KEY is not set")
        raise ValueError("GROQ_API_KEY environment variable is missing.")

    os.environ['GROQ_API_KEY'] = groq_api_key
    return ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.1-70b-versatile", temperature=0.6)
65
+
66
+
67
def extract_cv_data(text: str) -> list[cv]:
    """Run the structured-extraction chain over *text* and return the candidates."""
    logger.info("Extracting CV data from text")

    # Prompt piped into the LLM with a `data`-schema structured output.
    chain = create_prompt_template() | initialize_llm().with_structured_output(schema=data)
    response = chain.invoke({"text": text})

    logger.info(f"Extracted {len(response.candidates)} candidate(s) from the text")
    return response.candidates
82
+
83
def process_file(uploaded_files) -> str:
    """Persist the uploaded file to a temp path, load it, and return its text."""
    logger.info(f"Processing file: {uploaded_files.name}")

    # Streamlit uploads live in memory; the document loaders need a real path.
    suffix = os.path.splitext(uploaded_files.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(uploaded_files.getvalue())
        tmp_path = tmp_file.name
    try:
        if tmp_path.endswith('.pdf'):
            loader = PDFPlumberLoader(tmp_path)
            logger.info(f"Loaded PDF file: {tmp_path}")
        else:
            loader = TextLoader(tmp_path)
            logger.info(f"Loaded text file: {tmp_path}")

        pages = loader.load()
        text_content = " ".join(doc.page_content for doc in pages)
        logger.info(f"Extracted text from file: {uploaded_files.name}")
        return text_content
    finally:
        # Always remove the temp copy, even if loading failed.
        logger.info(f"Deleting temporary file: {tmp_path}")
        os.unlink(tmp_path)
109
+
110
def display_candidates_info(candidates_list: list[cv]):
    """Render the extracted candidates as a Streamlit table.

    Args:
        candidates_list: list of `cv` records produced by extract_cv_data.
    """
    logger.info(f"Displaying information for {len(candidates_list)} candidate(s)")
    logger.debug(f"Candidate list: {candidates_list}")

    # FIX: local renamed from `data`, which shadowed the module-level
    # pydantic `data` class.
    rows = []
    for candidate in candidates_list:
        rows.append({
            "Name": candidate.name,
            "Skills": ", ".join(candidate.skills) if candidate.skills else 'None',
            "Certifications": ", ".join(candidate.certifications) if candidate.certifications else 'None',
            # FIX: `is not None` so a legitimate 0 years displays as 0
            # instead of 'None' (0 is falsy).
            "Years of Experience": candidate.years_of_exp if candidate.years_of_exp is not None else 'None'
        })

    st.write("### Candidates Information")
    st.table(rows)
    logger.debug("Displayed candidates' information in table")
129
+ # print(candidates_list)
130
+
131
+ # Try this to see the working of extraction
132
+ # Streamlit file uploader and extraction logic
133
+ # uploaded_files = st.file_uploader(" Upload the CV: ", type=['pdf', 'txt'],key="unique_cv_upload")
134
+ # if uploaded_files is not None:
135
+ # text = process_file(uploaded_files)
136
+ # # text = ep.text
137
+ # candidates_list = extract_cv_data(text)
138
+ # display_candidates_info(candidates_list)
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ python-dotenv
3
+ ipykernel
4
+ langchain-community
5
+ streamlit
6
+ pypdf
7
+ pymupdf
8
+ langchain-text-splitters
9
+ langchain-openai
10
+ chromadb
11
+ sentence_transformers
12
+ langchain_huggingface
13
+ faiss-cpu
14
+ langchain_chroma
15
+ openai
16
+ langchain-groq
17
+ pdfplumber
18
+ prettytable