Nassiraaa committed
Commit 0715b4a · verified · 1 Parent(s): 7829a8b

Upload 5 files

final-app.py ADDED
@@ -0,0 +1,44 @@
+ import streamlit as st
+ from cv_analyzer import analyze_cv
+
+ st.set_page_config(page_title="CV Analyzer", layout="wide")
+
+ st.title('CV Analyzer')
+
+ uploaded_file = st.file_uploader("Choose a CV file", type=['pdf', 'docx', 'txt'])
+
+ if uploaded_file is not None:
+     file_content = uploaded_file.read()
+     with st.spinner('Analyzing CV...'):
+         result = analyze_cv(file_content)
+
+     if "error" in result:
+         st.error(result["error"])
+     else:
+         st.header("Personal Information")
+         st.json(result["personal_info"])
+
+         st.header("Spelling and Grammar")
+         st.write(f"Score: {result['spelling_grammar_score']:.2f} / 100")
+
+         st.header("Detected Sections")
+         st.write(result["detected_sections"])
+
+         st.header("Section Detection Score")
+         st.write(f"Score: {result['section_detection_score']}")
+
+         st.header("Content Quality Analysis")
+         for section, evaluation in result['content_analysis'].items():
+             st.subheader(section.capitalize())
+             st.json(evaluation)
+
+         st.header("Overall Content Quality Score")
+         st.write(f"Score: {result['overall_score']:.2f} / 10")
+
+ if __name__ == "__main__":
+     st.sidebar.title("About")
+     st.sidebar.info(
+         "This CV Analyzer detects sections, extracts personal information, "
+         "checks spelling and grammar, analyzes content quality, "
+         "and provides a detailed evaluation of the CV."
+     )
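A minimal smoke-test sketch for exercising the analyzer outside Streamlit. It assumes the uploaded modules are saved under the names the imports expect (cv_analyzer.py, cv_prompt.py, openai_utils.py, ocr_extractor.py rather than the final-*.py upload names) and that a local sample_cv.pdf exists; both are assumptions, not part of this commit.

# Hypothetical smoke test; module names and sample_cv.pdf are assumptions.
from cv_analyzer import analyze_cv

with open("sample_cv.pdf", "rb") as f:
    result = analyze_cv(f.read())

if "error" in result:
    print("Analysis failed:", result["error"])
else:
    print("Overall score:", result["overall_score"])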
final-cv-analyzer.py ADDED
@@ -0,0 +1,88 @@
+ import json
+ import logging
+ import tempfile
+ import os
+ from cv_prompt import (
+     ResumeQualityEvaluation,
+     get_personal_info_prompt,
+     get_spelling_grammar_prompt,
+     get_section_detection_prompt,
+     get_content_quality_prompt,
+     calculate_section_detection_score,
+     calculate_overall_score
+ )
+ from openai_utils import get_ai_response
+ from ocr_extractor import process_file
+ from langchain.output_parsers import PydanticOutputParser
+
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ def analyze_cv(file_content):
+     try:
+         # Save the file content temporarily
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+             temp_file.write(file_content)
+             temp_file_path = temp_file.name
+
+         extracted_text = process_file(temp_file_path, 'ocr_weights.json')
+         logging.info("Text extracted successfully")
+
+         # Personal Information Extraction
+         personal_info_prompt = get_personal_info_prompt(extracted_text)
+         personal_info_response = get_ai_response([{"role": "user", "content": personal_info_prompt}])
+         if personal_info_response is None:
+             return {"error": "Failed to get AI response for personal information"}
+         personal_info = json.loads(personal_info_response)
+
+         # Spelling and Grammar Check
+         spelling_grammar_prompt = get_spelling_grammar_prompt(extracted_text)
+         spelling_grammar_response = get_ai_response([{"role": "user", "content": spelling_grammar_prompt}])
+         if spelling_grammar_response is None:
+             return {"error": "Failed to get AI response for spelling and grammar"}
+         spelling_grammar_result = json.loads(spelling_grammar_response)
+         error_percentage = spelling_grammar_result.get('error_percentage', 100)
+         spelling_grammar_score = 100 - error_percentage  # Convert error percentage to a score
+
+         # Section Detection
+         sections_prompt = get_section_detection_prompt(extracted_text)
+         sections_response = get_ai_response([{"role": "user", "content": sections_prompt}])
+         if sections_response is None:
+             return {"error": "Failed to get AI response for sections"}
+
+         sections_data = json.loads(sections_response)
+         detected_sections = sections_data.get('present_sections', [])
+         section_detection_score = calculate_section_detection_score(detected_sections)
+         logging.info(f"Detected sections: {detected_sections}")
+         logging.info(f"Section detection score: {section_detection_score}")
+
+         # Content Quality Analysis
+         quality_prompt = get_content_quality_prompt(extracted_text)
+         quality_response = get_ai_response([{"role": "user", "content": quality_prompt}])
+
+         if quality_response is None:
+             return {"error": "Failed to get AI response for content quality"}
+
+         parser = PydanticOutputParser(pydantic_object=ResumeQualityEvaluation)
+         evaluation_result = parser.parse(quality_response)
+
+         overall_score = calculate_overall_score(evaluation_result)
+
+         logging.info("All analyses completed")
+         logging.info(f"Overall score: {overall_score}")
+
+         return {
+             "extracted_text": extracted_text,
+             "personal_info": personal_info,
+             "spelling_grammar_score": spelling_grammar_score,
+             "detected_sections": detected_sections,
+             "section_detection_score": section_detection_score,
+             "content_analysis": evaluation_result.dict(),
+             "overall_score": overall_score
+         }
+     except Exception as e:
+         logging.error(f"Error in CV analysis: {str(e)}", exc_info=True)
+         return {"error": str(e)}
+     finally:
+         # Clean up the temporary file
+         if 'temp_file_path' in locals():
+             os.unlink(temp_file_path)
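Note that analyze_cv always writes the upload to a temporary file with a .pdf suffix, while the Streamlit app also accepts .docx and .txt uploads. A hedged sketch of a suffix-preserving helper follows; the function name and signature are hypothetical, not part of this commit, and analyze_cv would need the original filename passed in to use it.

import os
import tempfile

def save_upload(file_content: bytes, original_name: str) -> str:
    # Keep the uploaded file's extension so downstream extraction can branch on it.
    suffix = os.path.splitext(original_name)[1] or ".pdf"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(file_content)
        return tmp.name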
final-cv-prompt.py ADDED
@@ -0,0 +1,127 @@
+ import json
+ from typing import List
+ from langchain_core.pydantic_v1 import BaseModel, Field
+ from langchain.output_parsers import PydanticOutputParser
+ from langchain_core.prompts import PromptTemplate
+
+ def load_json_file(filename):
+     try:
+         with open(filename, 'r', encoding='utf-8') as f:
+             return json.load(f)
+     except json.JSONDecodeError as e:
+         print(f"Error decoding JSON in {filename}: {e}")
+         return None
+
+ cv_structure = load_json_file('cv_structure.json')
+ cv_sections = load_json_file('cv_sections.json')
+
+ class EducationElement(BaseModel):
+     degree_present: bool = Field(description="Whether the degree is present")
+     year_present: bool = Field(description="Whether the year is present")
+     institution_present: bool = Field(description="Whether the institution is present")
+     score: float = Field(description="Score for this education element", ge=0, le=10)
+
+ class Education(BaseModel):
+     overall_score: float = Field(description="Overall score for the education section", ge=0, le=10)
+     elements: List[EducationElement] = Field(description="List of education elements")
+
+ class WorkExperienceElement(BaseModel):
+     job_title_present: bool = Field(description="Whether the job title is present")
+     company_present: bool = Field(description="Whether the company name is present")
+     dates_present: bool = Field(description="Whether the start and end dates are present")
+     technologies_present: bool = Field(description="Whether the used technologies are present")
+     responsibilities_present: bool = Field(description="Whether responsibilities are present")
+     achievements_present: bool = Field(description="Whether achievements are present")
+     responsibilities_quality: float = Field(description="Quality of responsibilities description", ge=0, le=10)
+     achievements_quality: float = Field(description="Quality of achievements description", ge=0, le=10)
+     score: float = Field(description="Score for this work experience element", ge=0, le=10)
+
+ class WorkExperience(BaseModel):
+     overall_score: float = Field(description="Overall score for the work experience section", ge=0, le=10)
+     elements: List[WorkExperienceElement] = Field(description="List of work experience elements")
+
+ class Profile(BaseModel):
+     overall_score: float = Field(description="Overall score for the profile section", ge=0, le=10)
+     brief_overview_present: bool = Field(description="Whether a brief overview is present")
+     career_goals_present: bool = Field(description="Whether career goals are present")
+     objective_present: bool = Field(description="Whether an objective is present")
+
+ class ResumeQualityEvaluation(BaseModel):
+     education: Education = Field(description="Evaluation of the education section")
+     work_experience: WorkExperience = Field(description="Evaluation of the work experience section")
+     profile: Profile = Field(description="Evaluation of the profile section")
+
+ def get_personal_info_prompt(text):
+     return f"""<s>[INST]Extract the personal information from the following CV text. The text may be in any language. Respond with a JSON object in the format {{"name": "extracted name", "email": "extracted email", "phone": "extracted phone number", "location": "extracted location"}}. If you can't find any of the information, set the value to null.
+
+ CV text:
+ {text}[/INST]"""
+
+ def get_spelling_grammar_prompt(text):
+     return f"""<s>[INST]Analyze the following text for spelling and grammar errors. The text may be in any language. Do not correct the errors, just count them. Calculate the percentage of errors.
+
+ Text to analyze:
+ {text}
+
+ Respond with a JSON object containing the key 'error_percentage' with the calculated percentage (0-100) of errors.[/INST]"""
+
+ def get_section_detection_prompt(text):
+     if cv_sections is None:
+         return None
+     sections_list = ", ".join(cv_sections['sections'].keys())
+     return f"""<s>[INST] Analyze this CV text and identify which of the following sections are present: {sections_list}.
+ A section is considered present if its content is identifiable, even without an explicit title.
+ Consider synonyms and alternative phrasings for section titles.
+
+ Sections to look for:
+ {sections_list}
+
+ CV text:
+ {text}
+
+ Respond with a JSON object with a key "present_sections" containing an array of the identified sections.
+ Only include sections that are actually present in the CV. [/INST]"""
+
+ def get_content_quality_prompt(text):
+     parser = PydanticOutputParser(pydantic_object=ResumeQualityEvaluation)
+
+     prompt = PromptTemplate(
+         template="""<s>[INST]Evaluate the quality of the following resume sections:
+
+ {resume}
+
+ Provide a detailed evaluation following this format:
+ {format_instructions}
+
+ For each section, evaluate the presence of required elements.
+ For the Work Experience section, also evaluate the quality of the Responsibilities and Achievements descriptions on a scale of 0-10.
+ Provide an overall score for each section on a scale of 0-10 based on the presence of elements and their quality where applicable.[/INST]""",
+         input_variables=["resume"],
+         partial_variables={"format_instructions": parser.get_format_instructions()}
+     )
+
+     return prompt.format(resume=text)
+
+ def calculate_section_detection_score(detected_sections):
+     total_score = 0
+     for section in detected_sections:
+         if section in cv_sections['sections']:
+             total_score += cv_sections['sections'][section]
+     return total_score
+
+ def calculate_overall_score(evaluation: ResumeQualityEvaluation) -> float:
+     education_weight = 0.3
+     work_experience_weight = 0.5
+     profile_weight = 0.2
+
+     overall_score = (
+         evaluation.education.overall_score * education_weight +
+         evaluation.work_experience.overall_score * work_experience_weight +
+         evaluation.profile.overall_score * profile_weight
+     )
+
+     return round(overall_score, 2)
+
+ __all__ = ['ResumeQualityEvaluation', 'get_personal_info_prompt', 'get_spelling_grammar_prompt',
+            'get_section_detection_prompt', 'get_content_quality_prompt',
+            'calculate_section_detection_score', 'calculate_overall_score']
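cv_structure.json and cv_sections.json are not included in this commit. From the way cv_sections is used, its 'sections' key is assumed to map section names to numeric weights that calculate_section_detection_score sums. A minimal sketch of generating such a file follows; the section names and weights below are illustrative assumptions only.

import json

# Illustrative content only; the real cv_sections.json ships separately with the app.
cv_sections_example = {
    "sections": {
        "profile": 20,
        "work_experience": 40,
        "education": 25,
        "skills": 15,
    }
}

with open("cv_sections.json", "w", encoding="utf-8") as f:
    json.dump(cv_sections_example, f, indent=2)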
final-ocr-extractor.py ADDED
@@ -0,0 +1,52 @@
+ import sys
+ import importlib
+ from PIL import Image
+ import boto3
+ import os
+ from doctr.io import DocumentFile
+ from doctr.models import ocr_predictor
+ import easyocr
+ from shapely.geometry import Polygon
+ from paddleocr import PaddleOCR
+ import langid
+ import json
+ import PyPDF2
+
+ # Check if python-bidi is installed
+ if importlib.util.find_spec("bidi") is None:
+     print("Error: python-bidi is not installed. Please install it using pip install python-bidi")
+     sys.exit(1)
+
+ # Initialize OCR models
+ def load_models(language):
+     doctr_model = ocr_predictor(pretrained=True)
+     easyocr_reader = easyocr.Reader([language])
+     paddleocr_reader = PaddleOCR(use_angle_cls=True, lang=language)
+     return doctr_model, easyocr_reader, paddleocr_reader
+
+ # AWS Textract client
+ textract_client = boto3.client('textract', region_name='us-west-2')
+
+ def extract_text_aws(image_bytes):
+     try:
+         response = textract_client.detect_document_text(Document={'Bytes': image_bytes})
+         return [(item['Text'], item['Geometry']['BoundingBox'], item['Confidence'])
+                 for item in response['Blocks'] if item['BlockType'] == 'WORD']
+     except Exception as e:
+         print(f"Error in AWS Textract: {str(e)}")
+         return []
+
+ def extract_text_doctr(image_path, doctr_model):
+     try:
+         doc = DocumentFile.from_images(image_path)
+         result = doctr_model(doc)
+         return [(word.value, word.geometry, word.confidence)
+                 for block in result.pages[0].blocks for line in block.lines for word in line.words]
+     except Exception as e:
+         print(f"Error in Doctr OCR: {str(e)}")
+         return []
+
+ def extract_text_easyocr(image_path, easyocr_reader):
+     try:
+         result = easyocr_reader.readtext(image_path)
+         # EasyOCR returns (bounding_box, text, confidence); reorder to (text, geometry, confidence)
+         return [(detection[1], detection[0], detection[2]) for detection in result]
+     except Exception as e:
+         print(f"Error in EasyOCR: {str(e)}")
+         return []
final-openai-utils.py ADDED
@@ -0,0 +1,22 @@
+ import os
+ from openai import OpenAI
+
+ # OpenAI configuration
+ client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+ OPENAI_MODEL = "gpt-3.5-turbo"
+
+ def get_ai_response(messages):
+     """
+     Get a response from the AI model using the OpenAI client.
+     :param messages: List of message dictionaries as expected by the OpenAI API
+     :return: The content of the AI's response, or None if the request fails
+     """
+     try:
+         response = client.chat.completions.create(
+             model=OPENAI_MODEL,
+             messages=messages
+         )
+         return response.choices[0].message.content
+     except Exception as e:
+         print(f"Error getting AI response: {str(e)}")
+         return None
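A minimal usage sketch for this helper, assuming OPENAI_API_KEY is exported before import and that the module is saved as openai_utils.py (both assumptions); the prompt text is illustrative only.

# Hypothetical direct call; callers must handle the None returned on failure.
from openai_utils import get_ai_response

reply = get_ai_response([{"role": "user", "content": "Reply with the single word: ready"}])
if reply is None:
    print("Request failed; check OPENAI_API_KEY and network access.")
else:
    print(reply)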