bsiddhharth commited on
Commit
1e97cbb
·
0 Parent(s):

Initial commit with app.py, cv_question.py, cv_short.py, extraction.py

Browse files
Files changed (7) hide show
  1. .gitignore +14 -0
  2. app.log +0 -0
  3. app.py +71 -0
  4. cv_question.py +130 -0
  5. cv_short.py +317 -0
  6. extraction.py +138 -0
  7. requirements.txt +18 -0
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore virtual environment
2
+ venv/
3
+
4
+ # Ignore environment files
5
+ .env
6
+
7
+ # Ignore Python compiled files
8
+ *.pyc
9
+ __pycache__/
10
+
11
+ # Ignore specific file (like extraction.pydantic)
12
+ extraction_pydantic.py
13
+ cv_quest.py
14
+ logger.py
app.log ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import streamlit as st
3
+ import cv_question
4
+ import cv_short
5
+ from logger import setup_logger
6
+
7
+ # def initialize_session_state():
8
+ # """Initialize all session state variables with default values."""
9
+ # session_vars = {
10
+ # 'jd_text': "",
11
+ # 'min_years': 0,
12
+ # 'required_skills_list': [],
13
+ # 'uploaded_files': [],
14
+ # 'results': [],
15
+ # 'generated_questions': None,
16
+ # 'current_candidate_index': 0,
17
+ # 'processed_cvs': {}, # Store processed CV data
18
+ # 'analysis_complete': False
19
+ # }
20
+
21
+ # for var, default_value in session_vars.items():
22
+ # if var not in st.session_state:
23
+ # st.session_state[var] = default_value
24
+
25
def clear_session_state():
    """Remove every key from Streamlit's session state, resetting the app."""
    # Snapshot the keys first: deleting entries while iterating the live
    # view would raise a RuntimeError.
    for key in tuple(st.session_state.keys()):
        st.session_state.pop(key)
30
+
31
def main():
    """App entry point: sidebar navigation and dispatch to the two pages."""
    app_logger = setup_logger('app_logger', 'app.log')

    # Sidebar chrome.
    st.sidebar.title("Navigation")
    app_logger.info("Sidebar navigation displayed")

    # One-click full reset of all cached state.
    if st.sidebar.button("Reset All Data"):
        clear_session_state()
        st.sidebar.success("All data has been reset!")
        app_logger.info("Session state reset")

    page = st.sidebar.radio("Go to", ["CV Shortlisting", "Interview Questions"])
    app_logger.info(f"Page selected: {page}")

    # Route to the selected page; surface any failure to both log and UI.
    try:
        if page == "CV Shortlisting":
            app_logger.info("Navigating to CV Shortlisting")
            cv_short.create_cv_shortlisting_page()
        elif page == "Interview Questions":
            app_logger.info("Navigating to Interview Questions")
            cv_question.create_interview_questions_page()
    except Exception as e:
        app_logger.error(f"Error occurred: {e}")
        st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()
cv_question.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
import os
import tempfile
import json
from extraction import extract_cv_data, process_file, display_candidates_info  # importing from your extraction.py

# Read the Groq API key once.  Only mirror it back into os.environ when it
# is actually set: os.environ values must be str, so the previous
# unconditional `os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")`
# crashed at import time with an opaque TypeError whenever the key was
# missing from the environment.
groq_api_key = os.getenv("GROQ_API_KEY")
if groq_api_key:
    os.environ['GROQ_API_KEY'] = groq_api_key
12
+
13
class InterviewQuestionGenerator:
    """Generates CV-specific technical interview questions via a Groq LLM."""

    def __init__(self):
        # Chat model used for question generation.
        self.llm = ChatGroq(
            groq_api_key=groq_api_key,
            model_name="llama3-8b-8192",
            temperature=0.7,
            max_tokens=4096
        )

        # System template; {cv_text} and {skills} are interpolated directly
        # into this message when the chain is invoked.
        self.question_template = """
        Based on the following CV excerpt, generate 5 specific basic technical interview questions
        that are directly related to the candidate's experience and skills. Make sure the
        questions test both their claimed knowledge and problem-solving abilities.

        CV Excerpt:
        {cv_text}

        Skills Mentioned:
        {skills}

        Return the questions in the following text format:

        (bold)
        Question 1:\n

        - Technical_question: "Your question here" \n

        - Follow_up_question: "Deep dive question here" \n

        - What_to_listen_for: "Key points to listen for here" \n

        \n\n
        Question 2:

        - Technical_question: "Your question here" \n

        - Follow_up_question: "Deep dive question here" \n

        - What_to_listen_for: "Key points to listen for here" \n



        Make sure to follow this format exactly, with the correct structure and labels for each question.


        (Repeat for all 5 questions)

        Be sure to make each question clear and actionable, and align it with the skills mentioned in the CV.
        """

        # FIX: the original human turn repeated "{cv_text}\n{skills}" even
        # though both are already interpolated into the system message above,
        # sending the whole CV to the model twice and doubling token usage.
        self.question_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", self.question_template),
                ("human", "Generate the interview questions now.")
            ]
        )

    def generate_questions(self, cv_text: str, skills: str) -> str:
        """Run the prompt through the LLM and return the raw model response.

        Args:
            cv_text: full plain text of the candidate's CV.
            skills: comma-separated skill list extracted from the CV.

        Returns:
            The LLM response message (callers read ``.content``).
        """
        runnable = self.question_prompt | self.llm  # Runnable pipeline instead of LLMChain
        questions = runnable.invoke({
            "cv_text": cv_text,
            "skills": skills
        })
        return questions
81
+
82
+
83
def create_interview_questions_page():
    """Streamlit page: upload a CV, extract candidate data, generate questions."""
    # Session-state defaults (first visit only).
    for key in ('uploaded_file', 'cv_text', 'candidates_list', 'generated_questions'):
        if key not in st.session_state:
            st.session_state[key] = None

    st.title("Interview Question Generator")

    # File uploader
    uploaded_file = st.file_uploader("Upload a CV", type=['pdf', 'txt'])

    # A new upload invalidates everything derived from the previous file.
    if uploaded_file is not None and (st.session_state.uploaded_file is None or
                                      uploaded_file.name != st.session_state.uploaded_file.name):
        st.session_state.uploaded_file = uploaded_file
        st.session_state.cv_text = None
        st.session_state.candidates_list = None
        st.session_state.generated_questions = None

    if st.session_state.uploaded_file is not None:
        # Only process the file once per upload.
        if st.session_state.cv_text is None:
            st.session_state.cv_text = process_file(st.session_state.uploaded_file)
            st.session_state.candidates_list = extract_cv_data(st.session_state.cv_text)

        if st.session_state.candidates_list:
            display_candidates_info(st.session_state.candidates_list)

            # Generate questions once per upload.
            if st.session_state.generated_questions is None:
                candidate = st.session_state.candidates_list[0]
                generator = InterviewQuestionGenerator()
                # FIX: candidate.skills may be None when extraction finds no
                # skills; joining None raised a TypeError before.
                questions = generator.generate_questions(
                    cv_text=st.session_state.cv_text,
                    skills=", ".join(candidate.skills or [])
                )
                st.session_state.generated_questions = questions.content

            st.subheader("Recommended Interview Questions")
            st.markdown(st.session_state.generated_questions)
        else:
            # FIX: indexing candidates_list[0] on an empty extraction result
            # previously raised an unhandled IndexError.
            st.warning("No candidate information could be extracted from the uploaded CV.")
cv_short.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
3
+ import extraction as extr # extraction.py
4
+ import streamlit as st
5
+ import pandas as pd
6
+
7
+ # Configure logging
8
+ logging.basicConfig(level=logging.DEBUG , format='%(asctime)s - %(levelname)s - %(message)s')
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class CVAnalyzer:
    """Loads CV documents, extracts structured info, and scores candidates
    against job-description requirements."""

    # Class-held logger so the methods are self-contained.
    _log = logging.getLogger(__name__)

    def __init__(self):
        self._log.info("Initializing CVAnalyzer")
        # Groq LLM is configured centrally in extraction.py.
        self.llm = extr.initialize_llm()
        self._log.info(" LLM initialized")

    def load_document(self, file_path: str) -> str:
        """Load a PDF or plain-text document and return its concatenated text."""
        self._log.info(f"Loading document from file: {file_path}")
        if file_path.endswith('.pdf'):
            loader = PDFPlumberLoader(file_path)
        else:
            loader = TextLoader(file_path)
        documents = loader.load()
        self._log.info(f"Document loaded from {file_path}")
        return " ".join(doc.page_content for doc in documents)

    def extract_cv_info(self, cv_text: str) -> "list[extr.cv]":
        """Extract structured candidate records (extraction.cv) from CV text."""
        self._log.info("Extracting CV information")
        extracted_data = extr.extract_cv_data(cv_text)
        self._log.info(f"Extracted {len(extracted_data)} candidate(s) from CV")
        return extracted_data

    def calculate_match_score(self, cv_info: dict, jd_requirements: dict) -> dict:
        """Score a candidate dict against the job requirements.

        Args:
            cv_info: candidate attributes (e.g. ``candidate.__dict__``) —
                may contain ``skills`` (list or None) and ``years_of_exp``.
            jd_requirements: expects ``required_skills`` (list) and
                ``min_years_experience`` (number).

        Returns:
            Dict with ``skills_match``, ``experience_match`` and
            ``overall_score`` (weighted: skills 0.5, experience 0.3).
        """
        self._log.info(f"Calculating match score for CV: {cv_info.get('name', 'Unknown')}")

        score_components = {
            "skills_match": 0,
            "experience_match": 0,
            "overall_score": 0
        }

        # Skills: fraction of required skills present (case-insensitive).
        # FIX: guard against an empty required_skills list (previously a
        # ZeroDivisionError) and a None skills value (previously TypeError).
        if cv_info.get("skills") and jd_requirements.get("required_skills"):
            cv_skills = {skill.lower() for skill in cv_info["skills"]}
            required_skills = {skill.lower() for skill in jd_requirements["required_skills"]}
            score_components["skills_match"] = len(cv_skills & required_skills) / len(required_skills)

        # Experience: full credit at/above the minimum, else proportional.
        # FIX: skip when years_of_exp is None (comparison previously crashed).
        if cv_info.get("years_of_exp") is not None and "min_years_experience" in jd_requirements:
            min_years = jd_requirements["min_years_experience"]
            if cv_info["years_of_exp"] >= min_years:
                score_components["experience_match"] = 1.0
            else:
                score_components["experience_match"] = cv_info["years_of_exp"] / min_years

        # Weighted average (weights intentionally sum to 0.8, matching the
        # original scoring scheme).
        weights = {"skills_match": 0.5, "experience_match": 0.3}
        score_components["overall_score"] = sum(
            score * weights[component]
            for component, score in score_components.items()
            if component != "overall_score"
        )

        self._log.debug(f"Match score for {cv_info.get('name', 'Unknown')}: {score_components['overall_score']:.2%}")
        return score_components
86
+
87
+
88
+
89
+ # def create_cv_shortlisting_page():
90
+ # logger.info("Starting CV shortlisting system")
91
+
92
+ # st.title("CV Shortlisting System")
93
+
94
+ # # Reset analysis state when starting new analysis
95
+ # if 'analysis_started' not in st.session_state:
96
+ # st.session_state.analysis_started = False
97
+
98
+ # # Job Description Input
99
+ # st.header("Job Description")
100
+ # jd_text = st.text_area("Enter the job description",
101
+ # value=st.session_state.jd_text if 'jd_text' in st.session_state else "")
102
+
103
+ # if jd_text:
104
+ # st.session_state.jd_text = jd_text
105
+
106
+ # # Job Requirements Input
107
+ # st.header("Job Requirements")
108
+ # min_years = st.number_input("Minimum years of experience",
109
+ # min_value=0,
110
+ # value=st.session_state.min_years if 'min_years' in st.session_state else 3)
111
+
112
+ # required_skills = st.text_input("Required skills (comma-separated)",
113
+ # value=','.join(st.session_state.required_skills_list) if 'required_skills_list' in st.session_state else "")
114
+
115
+ # required_skills_list = [skill.strip() for skill in required_skills.split(",") if skill.strip()]
116
+
117
+ # # Update session state
118
+ # st.session_state.required_skills_list = required_skills_list
119
+ # st.session_state.min_years = min_years
120
+
121
+ # # CV Upload
122
+ # st.header("Upload CVs")
123
+ # uploaded_files = st.file_uploader("Choose CV files",
124
+ # accept_multiple_files=True,
125
+ # type=['pdf', 'txt'],
126
+ # key="cv_upload")
127
+
128
+ # if uploaded_files:
129
+ # st.session_state.uploaded_files = uploaded_files
130
+ # st.session_state.analysis_started = True
131
+
132
+ # # Analysis Button
133
+ # if st.button("Analyze CVs") and uploaded_files and jd_text:
134
+ # st.session_state.results = [] # Reset results
135
+ # st.session_state.processed_cvs = {} # Reset processed CVs
136
+
137
+ # with st.spinner('Analyzing CVs...'):
138
+ # try:
139
+ # analyzer = CVAnalyzer()
140
+
141
+ # # Prepare job requirements
142
+ # job_requirements = {
143
+ # "min_years_experience": st.session_state.min_years,
144
+ # "required_skills": st.session_state.required_skills_list
145
+ # }
146
+
147
+ # # Process each CV
148
+ # for uploaded_file in uploaded_files:
149
+ # cv_text = extr.process_file(uploaded_file)
150
+
151
+ # try:
152
+ # # Extract CV information
153
+ # candidates = analyzer.extract_cv_info(cv_text)
154
+
155
+ # for idx, candidate in enumerate(candidates):
156
+ # # Calculate match scores
157
+ # match_scores = analyzer.calculate_match_score(
158
+ # candidate.__dict__,
159
+ # job_requirements
160
+ # )
161
+
162
+ # # Store results
163
+ # result = {
164
+ # "Name": candidate.name or "Unknown",
165
+ # "Experience (Years)": candidate.years_of_exp or 0,
166
+ # "Skills": ", ".join(candidate.skills) if candidate.skills else "None",
167
+ # "Certifications": ", ".join(candidate.certifications) if candidate.certifications else "None",
168
+ # "Skills Match": f"{match_scores['skills_match']:.2%}",
169
+ # "Experience Match": f"{match_scores['experience_match']:.2%}",
170
+ # "Overall Score": f"{match_scores['overall_score']:.2%}"
171
+ # }
172
+
173
+ # st.session_state.results.append(result)
174
+
175
+ # # Store processed CV data for interview questions
176
+ # st.session_state.processed_cvs[f"{candidate.name}_{idx}"] = {
177
+ # "cv_text": cv_text,
178
+ # "candidate": candidate,
179
+ # "match_scores": match_scores
180
+ # }
181
+
182
+ # except Exception as e:
183
+ # logger.error(f"Error processing CV: {str(e)}")
184
+ # st.error(f"Error processing CV: {str(e)}")
185
+
186
+ # # Mark analysis as complete
187
+ # st.session_state.analysis_complete = True
188
+
189
+ # # Display results
190
+ # if st.session_state.results:
191
+ # df = pd.DataFrame(st.session_state.results)
192
+ # df = df.sort_values("Overall Score", ascending=False)
193
+ # st.dataframe(df)
194
+
195
+ # # Save top candidates
196
+ # st.session_state.top_candidates = df.head()
197
+ # else:
198
+ # logger.warning("No valid candidates found")
199
+ # st.warning("No valid candidates found in the uploaded CVs")
200
+
201
+ # except Exception as e:
202
+ # logger.error(f"Analysis error: {str(e)}")
203
+ # st.error(f"An error occurred during analysis: {str(e)}")
204
+ # st.session_state.analysis_complete = False
205
+
206
+ # # Display analysis status
207
+ # if st.session_state.get('analysis_complete', False):
208
+ # st.success("CV analysis complete! You can now proceed to generate interview questions.")
209
+
210
+
211
def create_cv_shortlisting_page():
    """Streamlit page: collect JD + requirements, analyze uploaded CVs, rank them."""
    logger.info("Starting CV shortlisting system")

    # Session-state defaults (first visit only).
    defaults = {
        'jd_text': "",
        'min_years': 3,
        'required_skills_list': [],
        'uploaded_files': None,
        'results': [],
        'analysis_complete': False,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value

    st.title("CV Shortlisting System")

    # Job Description Input
    st.header("Job Description")
    jd_text = st.text_area("Enter the job description", value=st.session_state.jd_text)
    if jd_text:
        st.session_state.jd_text = jd_text

    # Job Requirements Input
    st.header("Job Requirements")
    min_years = st.number_input("Minimum years of experience",
                                min_value=0,
                                value=st.session_state.min_years,
                                )
    required_skills = st.text_input("Required skills (comma-separated)",
                                    value=','.join(st.session_state.required_skills_list) if st.session_state.required_skills_list else "")
    required_skills_list = [skill.strip() for skill in required_skills.split(",") if skill.strip()]

    if required_skills_list:
        st.session_state.required_skills_list = required_skills_list
    # FIX: always persist the number input; the previous `if min_years:`
    # check silently discarded a legitimate value of 0.
    st.session_state.min_years = min_years

    # CV Upload
    st.header("Upload CVs")
    uploaded_files = st.file_uploader("Choose CV files",
                                      accept_multiple_files=True,
                                      type=['pdf', 'txt'],
                                      key="unique_cv_upload")
    if uploaded_files:
        st.session_state.uploaded_files = uploaded_files

    if st.button("Analyze CVs") and uploaded_files and jd_text:
        logger.info("Analyzing uploaded CVs")
        with st.spinner('Analyzing CVs...'):
            analyzer = CVAnalyzer()

            job_requirements = {
                "min_years_experience": st.session_state.min_years,
                "required_skills": st.session_state.required_skills_list
            }

            results = []
            st.session_state.results = []  # reset for a fresh analysis

            # Process each CV; one bad file must not abort the whole batch.
            for uploaded_file in uploaded_files:
                cv_text = extr.process_file(uploaded_file)
                try:
                    candidates = analyzer.extract_cv_info(cv_text)
                    for candidate in candidates:
                        match_scores = analyzer.calculate_match_score(
                            candidate.__dict__,
                            job_requirements
                        )
                        result = {
                            "Name": candidate.name or "Unknown",
                            "Experience (Years)": candidate.years_of_exp or 0,
                            "Skills": ", ".join(candidate.skills) if candidate.skills else "None",
                            "Certifications": ", ".join(candidate.certifications) if candidate.certifications else "None",
                            "Skills Match": f"{match_scores['skills_match']:.2%}",
                            "Experience Match": f"{match_scores['experience_match']:.2%}",
                            "Overall Score": f"{match_scores['overall_score']:.2%}"
                        }
                        results.append(result)
                        st.session_state.results.append(result)
                except Exception as e:
                    logger.error(f"Error processing CV: {str(e)}")

            logger.info(f"Displaying analyzed results for {len(results)} candidate(s)")

            if st.session_state.results:
                df = pd.DataFrame(st.session_state.results)
                # FIX: rank on the numeric value of the score.  Sorting the
                # formatted "xx.xx%" strings lexicographically mis-ordered
                # results (e.g. "9.00%" ranked above "85.00%").
                df = df.sort_values(
                    "Overall Score",
                    ascending=False,
                    key=lambda s: s.str.rstrip('%').astype(float),
                )
                st.dataframe(df)
                st.session_state.analysis_complete = True
            else:
                logger.warning("No valid candidates found in uploaded CVs")
                st.error("No valid results found from CV analysis")
                st.session_state.analysis_complete = False
extraction.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional
3
+ from pydantic import BaseModel, Field
4
+ from langchain.prompts import ChatPromptTemplate
5
+ from langchain_groq import ChatGroq
6
+ import os
7
+ import tempfile
8
+ import streamlit as st
9
+ from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
10
+
11
+
12
+ # logging
13
+ logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # Defining the CV structure using Pydantic for structured output
17
class cv(BaseModel):
    """Structured record for a single candidate parsed from a CV.

    All fields default to None so the extractor can return partial data.
    """
    name: Optional[str] = Field(default=None, description="Name of candidate")
    skills: Optional[list[str]] = Field(default=None, description="Skills of candidate")
    certifications: Optional[list[str]] = Field(default=None, description="Certificates of candidate")
    years_of_exp: Optional[int] = Field(default=None, description="Years of experience")
22
+
23
+ # Defining the data structure that contains a list of CVs
24
class data(BaseModel):
    """Container for every candidate extracted from one document."""
    candidates: list[cv]
26
+
27
def create_prompt_template() -> ChatPromptTemplate:
    """Build the chat prompt that instructs the LLM how to extract CV fields."""
    logger.info("Creating the prompt template for CV extraction")

    # Adjacent string literals concatenate into one system instruction.
    system_instruction = (
        "You are an expert extraction algorithm. Your job is to extract the following specific information from the given text:"
        "- Name of the candidate"
        "- Skills"
        "- Certifications (Look for terms such as 'Certified,' 'Certification,' 'Certificate')"
        "- years_of_exp (Extract only the number of years. If an approximation is given (e.g., '5+ years'), return the lower bound (e.g., '5').)"
        "If you cannot find the value for a specific attribute, return null for that attribute's value."
        "The 'years of experience' can be mentioned in various formats (e.g., '5+ years', '5 years', 'since 2010'). "
        "Extract it accurately, even if it's mentioned in different contexts like a professional summary or work experience. "
        "If multiple jobs are listed, you can calculate the experience from the work history."
        "Certifications are usually found under headers like 'Certifications,' 'Professional Certificates,' or similar. They might include phrases like 'AWS Certified Developer,' 'MongoDB Developer Associate,' etc."
    )
    return ChatPromptTemplate.from_messages(
        [
            ("system", system_instruction),
            ("human", "{text}"),
        ]
    )
50
+
51
def initialize_llm() -> "ChatGroq":
    """Initialize the Groq chat model.

    Returns:
        A configured ChatGroq instance (llama-3.1-70b-versatile, temp 0.6).

    Raises:
        ValueError: if the GROQ_API_KEY environment variable is not set.
    """
    log = logging.getLogger(__name__)
    log.info("Initializing LLM")

    # FIX: read the key BEFORE touching os.environ.  The original
    # `os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")` raised an
    # opaque TypeError (environ values must be str) when the key was
    # missing, so the intended ValueError below was never reached.
    groq_api_key = os.getenv("GROQ_API_KEY")
    if not groq_api_key:
        log.error("GROQ_API_KEY is not set")
        raise ValueError("GROQ_API_KEY environment variable is missing.")

    os.environ['GROQ_API_KEY'] = groq_api_key
    return ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.1-70b-versatile", temperature=0.6)
65
+
66
+
67
def extract_cv_data(text: str) -> list[cv]:
    """Run the structured-extraction chain over *text* and return the candidates."""
    logger.info("Extracting CV data from text")

    # Prompt piped into the LLM with a `data`-schema structured output.
    chain = create_prompt_template() | initialize_llm().with_structured_output(schema=data)
    response = chain.invoke({"text": text})

    logger.info(f"Extracted {len(response.candidates)} candidate(s) from the text")
    return response.candidates
82
+
83
def process_file(uploaded_files) -> str:
    """Persist the uploaded file to a temp path, load it, and return its text."""
    logger.info(f"Processing file: {uploaded_files.name}")

    # Streamlit uploads live in memory; the document loaders need a real path.
    suffix = os.path.splitext(uploaded_files.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(uploaded_files.getvalue())
        tmp_path = tmp_file.name
    try:
        if tmp_path.endswith('.pdf'):
            loader = PDFPlumberLoader(tmp_path)
            logger.info(f"Loaded PDF file: {tmp_path}")
        else:
            loader = TextLoader(tmp_path)
            logger.info(f"Loaded text file: {tmp_path}")

        pages = loader.load()
        text_content = " ".join(doc.page_content for doc in pages)
        logger.info(f"Extracted text from file: {uploaded_files.name}")
        return text_content
    finally:
        # Always remove the temp copy, even if loading failed.
        logger.info(f"Deleting temporary file: {tmp_path}")
        os.unlink(tmp_path)
109
+
110
def display_candidates_info(candidates_list: list[cv]):
    """Render the extracted candidates as a Streamlit table.

    Args:
        candidates_list: list of `cv` records produced by extract_cv_data.
    """
    logger.info(f"Displaying information for {len(candidates_list)} candidate(s)")
    logger.debug(f"Candidate list: {candidates_list}")

    # FIX: local renamed from `data`, which shadowed the module-level
    # pydantic `data` class.
    rows = []
    for candidate in candidates_list:
        rows.append({
            "Name": candidate.name,
            "Skills": ", ".join(candidate.skills) if candidate.skills else 'None',
            "Certifications": ", ".join(candidate.certifications) if candidate.certifications else 'None',
            # FIX: `is not None` so a legitimate 0 years displays as 0
            # instead of 'None' (0 is falsy).
            "Years of Experience": candidate.years_of_exp if candidate.years_of_exp is not None else 'None'
        })

    st.write("### Candidates Information")
    st.table(rows)
    logger.debug("Displayed candidates' information in table")
129
+ # print(candidates_list)
130
+
131
+ # Try this to see the working of extraction
132
+ # Streamlit file uploader and extraction logic
133
+ # uploaded_files = st.file_uploader(" Upload the CV: ", type=['pdf', 'txt'],key="unique_cv_upload")
134
+ # if uploaded_files is not None:
135
+ # text = process_file(uploaded_files)
136
+ # # text = ep.text
137
+ # candidates_list = extract_cv_data(text)
138
+ # display_candidates_info(candidates_list)
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ python-dotenv
3
+ ipykernel
4
+ langchain-community
5
+ streamlit
6
+ pypdf
7
+ pymupdf
8
+ langchain-text-splitters
9
+ langchain-openai
10
+ chromadb
11
+ sentence_transformers
12
+ langchain_huggingface
13
+ faiss-cpu
14
+ langchain_chroma
15
+ openai
16
+ langchain-groq
17
+ pdfplumber
18
+ prettytable