Upload 5 files

- final-app.py +44 -0
- final-cv-analyzer.py +88 -0
- final-cv-prompt.py +127 -0
- final-ocr-extractor.py +52 -0
- final-openai-utils.py +22 -0
final-app.py
ADDED
@@ -0,0 +1,44 @@
import streamlit as st
from cv_analyzer import analyze_cv

st.set_page_config(page_title="CV Analyzer", layout="wide")

st.title('CV Analyzer')

uploaded_file = st.file_uploader("Choose a CV file", type=['pdf', 'docx', 'txt'])

if uploaded_file is not None:
    file_content = uploaded_file.read()
    with st.spinner('Analyzing CV...'):
        result = analyze_cv(file_content)

    if "error" in result:
        st.error(result["error"])
    else:
        st.header("Personal Information")
        st.json(result["personal_info"])

        st.header("Spelling and Grammar")
        st.write(f"Score: {result['spelling_grammar_score']:.2f} / 100")

        st.header("Detected Sections")
        st.write(result["detected_sections"])

        st.header("Section Detection Score")
        st.write(f"Score: {result['section_detection_score']}")

        st.header("Content Quality Analysis")
        for section, evaluation in result['content_analysis'].items():
            st.subheader(section.capitalize())
            st.json(evaluation)

        st.header("Overall Content Quality Score")
        st.write(f"Score: {result['overall_score']:.2f} / 10")

if __name__ == "__main__":
    st.sidebar.title("About")
    st.sidebar.info(
        "This CV Analyzer detects sections, extracts personal information, "
        "checks spelling and grammar, analyzes content quality, "
        "and provides a detailed evaluation of the CV."
    )
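For a quick check outside the Streamlit UI, a minimal sketch (it assumes the uploaded modules are saved under the names they import, such as cv_analyzer.py rather than final-cv-analyzer.py, and that sample_cv.pdf is a placeholder path):

# Hypothetical smoke test: feed analyze_cv raw file bytes and inspect the result.
from cv_analyzer import analyze_cv

with open("sample_cv.pdf", "rb") as f:  # placeholder path, not part of this upload
    result = analyze_cv(f.read())

if "error" in result:
    print("Analysis failed:", result["error"])
else:
    print("Overall content quality score:", result["overall_score"])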
final-cv-analyzer.py
ADDED
@@ -0,0 +1,88 @@
import json
import logging
import tempfile
import os
from cv_prompt import (
    ResumeQualityEvaluation,
    get_personal_info_prompt,
    get_spelling_grammar_prompt,
    get_section_detection_prompt,
    get_content_quality_prompt,
    calculate_section_detection_score,
    calculate_overall_score
)
from openai_utils import get_ai_response
from ocr_extractor import process_file
from langchain.output_parsers import PydanticOutputParser

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def analyze_cv(file_content):
    try:
        # Save the file content temporarily
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_file.write(file_content)
            temp_file_path = temp_file.name

        extracted_text = process_file(temp_file_path, 'ocr_weights.json')
        logging.info("Text extracted successfully")

        # Personal Information Extraction
        personal_info_prompt = get_personal_info_prompt(extracted_text)
        personal_info_response = get_ai_response([{"role": "user", "content": personal_info_prompt}])
        if personal_info_response is None:
            return {"error": "Failed to get AI response for personal information"}
        personal_info = json.loads(personal_info_response)

        # Spelling and Grammar Check
        spelling_grammar_prompt = get_spelling_grammar_prompt(extracted_text)
        spelling_grammar_response = get_ai_response([{"role": "user", "content": spelling_grammar_prompt}])
        if spelling_grammar_response is None:
            return {"error": "Failed to get AI response for spelling and grammar"}
        spelling_grammar_result = json.loads(spelling_grammar_response)
        error_percentage = spelling_grammar_result.get('error_percentage', 100)
        spelling_grammar_score = 100 - error_percentage  # Convert error percentage to a score

        # Section Detection
        sections_prompt = get_section_detection_prompt(extracted_text)
        sections_response = get_ai_response([{"role": "user", "content": sections_prompt}])
        if sections_response is None:
            return {"error": "Failed to get AI response for sections"}

        sections_data = json.loads(sections_response)
        detected_sections = sections_data.get('present_sections', [])
        section_detection_score = calculate_section_detection_score(detected_sections)
        logging.info(f"Detected sections: {detected_sections}")
        logging.info(f"Section detection score: {section_detection_score}")

        # Content Quality Analysis
        quality_prompt = get_content_quality_prompt(extracted_text)
        quality_response = get_ai_response([{"role": "user", "content": quality_prompt}])

        if quality_response is None:
            return {"error": "Failed to get AI response for content quality"}

        parser = PydanticOutputParser(pydantic_object=ResumeQualityEvaluation)
        evaluation_result = parser.parse(quality_response)

        overall_score = calculate_overall_score(evaluation_result)

        logging.info("All analyses completed")
        logging.info(f"Overall score: {overall_score}")

        return {
            "extracted_text": extracted_text,
            "personal_info": personal_info,
            "spelling_grammar_score": spelling_grammar_score,
            "detected_sections": detected_sections,
            "section_detection_score": section_detection_score,
            "content_analysis": evaluation_result.dict(),
            "overall_score": overall_score
        }
    except Exception as e:
        logging.error(f"Error in CV analysis: {str(e)}", exc_info=True)
        return {"error": str(e)}
    finally:
        # Clean up the temporary file
        if 'temp_file_path' in locals():
            os.unlink(temp_file_path)
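Because analyze_cv parses the model's replies directly with json.loads, the prompts in cv_prompt below pin down the exact shapes it expects. For the personal-information step, for instance, a well-formed reply deserializes to something like this (values are illustrative placeholders only):

# Illustrative shape of personal_info after json.loads(personal_info_response);
# per get_personal_info_prompt, any field the model cannot find comes back as null.
personal_info = {
    "name": "Jane Doe",
    "email": "jane.doe@example.com",
    "phone": "+1 555 0100",
    "location": "Berlin, Germany"
}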
final-cv-prompt.py
ADDED
@@ -0,0 +1,127 @@
import json
from typing import List
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate

def load_json_file(filename):
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in {filename}: {e}")
        return None

cv_structure = load_json_file('cv_structure.json')
cv_sections = load_json_file('cv_sections.json')

class EducationElement(BaseModel):
    degree_present: bool = Field(description="Whether the degree is present")
    year_present: bool = Field(description="Whether the year is present")
    institution_present: bool = Field(description="Whether the institution is present")
    score: float = Field(description="Score for this education element", ge=0, le=10)

class Education(BaseModel):
    overall_score: float = Field(description="Overall score for the education section", ge=0, le=10)
    elements: List[EducationElement] = Field(description="List of education elements")

class WorkExperienceElement(BaseModel):
    job_title_present: bool = Field(description="Whether the job title is present")
    company_present: bool = Field(description="Whether the company name is present")
    dates_present: bool = Field(description="Whether the start and end dates are present")
    technologies_present: bool = Field(description="Whether the used technologies are present")
    responsibilities_present: bool = Field(description="Whether responsibilities are present")
    achievements_present: bool = Field(description="Whether achievements are present")
    responsibilities_quality: float = Field(description="Quality of responsibilities description", ge=0, le=10)
    achievements_quality: float = Field(description="Quality of achievements description", ge=0, le=10)
    score: float = Field(description="Score for this work experience element", ge=0, le=10)

class WorkExperience(BaseModel):
    overall_score: float = Field(description="Overall score for the work experience section", ge=0, le=10)
    elements: List[WorkExperienceElement] = Field(description="List of work experience elements")

class Profile(BaseModel):
    overall_score: float = Field(description="Overall score for the profile section", ge=0, le=10)
    brief_overview_present: bool = Field(description="Whether a brief overview is present")
    career_goals_present: bool = Field(description="Whether career goals are present")
    objective_present: bool = Field(description="Whether an objective is present")

class ResumeQualityEvaluation(BaseModel):
    education: Education = Field(description="Evaluation of the education section")
    work_experience: WorkExperience = Field(description="Evaluation of the work experience section")
    profile: Profile = Field(description="Evaluation of the profile section")

def get_personal_info_prompt(text):
    return f"""<s>[INST]Extract the personal information from the following CV text. The text may be in any language. Respond with a JSON object in the format {{"name": "extracted name", "email": "extracted email", "phone": "extracted phone number", "location": "extracted location"}}. If you can't find any of the information, set the value to null.

CV text:
{text}[/INST]"""

def get_spelling_grammar_prompt(text):
    return f"""<s>[INST]Analyze the following text for spelling and grammar errors. The text may be in any language. Do not correct the errors, just count them. Calculate the percentage of errors.

Text to analyze:
{text}

Respond with a JSON object containing the key 'error_percentage' with the calculated percentage (0-100) of errors.[/INST]"""

def get_section_detection_prompt(text):
    if cv_sections is None:
        return None
    sections_list = ", ".join(cv_sections['sections'].keys())
    return f"""<s>[INST] Analyze this CV text and identify which of the following sections are present: {sections_list}.
A section is considered present if its content is identifiable, even without an explicit title.
Consider synonyms and alternative phrasings for section titles.

Sections to look for:
{sections_list}

CV text:
{text}

Respond with a JSON object with a key "present_sections" containing an array of the identified sections.
Only include sections that are actually present in the CV. [/INST]"""

def get_content_quality_prompt(text):
    parser = PydanticOutputParser(pydantic_object=ResumeQualityEvaluation)

    prompt = PromptTemplate(
        template="""<s>[INST]Evaluate the quality of the following resume sections:

{resume}

Provide a detailed evaluation following this format:
{format_instructions}

For each section, evaluate the presence of required elements.
For the Work Experience section, also evaluate the quality of the Responsibilities and Achievements descriptions on a scale of 0-10.
Provide an overall score for each section on a scale of 0-10 based on the presence of elements and their quality where applicable.[/INST]""",
        input_variables=["resume"],
        partial_variables={"format_instructions": parser.get_format_instructions()}
    )

    return prompt.format(resume=text)

def calculate_section_detection_score(detected_sections):
    total_score = 0
    for section in detected_sections:
        if section in cv_sections['sections']:
            total_score += cv_sections['sections'][section]
    return total_score

def calculate_overall_score(evaluation: ResumeQualityEvaluation) -> float:
    education_weight = 0.3
    work_experience_weight = 0.5
    profile_weight = 0.2

    overall_score = (
        evaluation.education.overall_score * education_weight +
        evaluation.work_experience.overall_score * work_experience_weight +
        evaluation.profile.overall_score * profile_weight
    )

    return round(overall_score, 2)

__all__ = ['ResumeQualityEvaluation', 'get_personal_info_prompt', 'get_spelling_grammar_prompt',
           'get_section_detection_prompt', 'get_content_quality_prompt',
           'calculate_section_detection_score', 'calculate_overall_score']
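cv_structure.json and cv_sections.json are not part of this upload. From the way the module reads them, cv_sections needs a "sections" object mapping each section name to the points it contributes in calculate_section_detection_score. A hypothetical example, with made-up names and weights:

# Hypothetical cv_sections.json matching what get_section_detection_prompt and
# calculate_section_detection_score expect; these section names and point values
# are illustrative, not the ones this Space actually ships.
import json

example_cv_sections = {
    "sections": {
        "profile": 15,
        "work_experience": 30,
        "education": 25,
        "skills": 20,
        "languages": 10
    }
}

with open("cv_sections.json", "w", encoding="utf-8") as f:
    json.dump(example_cv_sections, f, indent=2)

For the overall score, the fixed weights in calculate_overall_score mean that section scores of, say, education 8.0, work experience 6.0, and profile 7.0 combine as 8.0 * 0.3 + 6.0 * 0.5 + 7.0 * 0.2 = 6.8.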
final-ocr-extractor.py
ADDED
@@ -0,0 +1,52 @@
import sys
import importlib.util
from PIL import Image
import boto3
import os
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import easyocr
from shapely.geometry import Polygon
from paddleocr import PaddleOCR
import langid
import json
import PyPDF2

# Check if python-bidi is installed
if importlib.util.find_spec("bidi") is None:
    print("Error: python-bidi is not installed. Please install it using pip install python-bidi")
    sys.exit(1)

# Initialize OCR models
def load_models(language):
    doctr_model = ocr_predictor(pretrained=True)
    easyocr_reader = easyocr.Reader([language])
    paddleocr_reader = PaddleOCR(use_angle_cls=True, lang=language)
    return doctr_model, easyocr_reader, paddleocr_reader

# AWS Textract client
textract_client = boto3.client('textract', region_name='us-west-2')

def extract_text_aws(image_bytes):
    try:
        response = textract_client.detect_document_text(Document={'Bytes': image_bytes})
        return [(item['Text'], item['Geometry']['BoundingBox'], item['Confidence'])
                for item in response['Blocks'] if item['BlockType'] == 'WORD']
    except Exception as e:
        print(f"Error in AWS Textract: {str(e)}")
        return []

def extract_text_doctr(image_path, doctr_model):
    try:
        doc = DocumentFile.from_images(image_path)
        result = doctr_model(doc)
        return [(word.value, word.geometry, word.confidence)
                for block in result.pages[0].blocks for line in block.lines for word in line.words]
    except Exception as e:
        print(f"Error in Doctr OCR: {str(e)}")
        return []

def extract_text_easyocr(image_path, easyocr_reader):
    try:
        result = easyocr_reader.readtext(image_path)
        # EasyOCR yields (bounding_box, text, confidence); reorder to (text, geometry, confidence)
        return [(detection[1], detection[0], detection[2]) for detection in result]
    except Exception as e:
        print(f"Error in EasyOCR: {str(e)}")
        return []
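Each extract_text_* helper normalizes its engine's output to (text, geometry, confidence) tuples, so downstream merging code (such as the process_file that final-cv-analyzer.py imports, which is not shown in this upload) can treat the engines interchangeably. A hedged usage sketch, assuming AWS credentials are configured and cv_page.png is a placeholder image:

# Hypothetical use of the normalized (text, geometry, confidence) tuples.
with open("cv_page.png", "rb") as f:  # placeholder path
    words = extract_text_aws(f.read())

for text, geometry, confidence in words:
    print(f"{text!r} detected with confidence {confidence:.1f}")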
final-openai-utils.py
ADDED
@@ -0,0 +1,22 @@
import os
from openai import OpenAI

# OpenAI configuration
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
OPENAI_MODEL = "gpt-3.5-turbo"

def get_ai_response(messages):
    """
    Get a response from the AI model using the OpenAI client.
    :param messages: List of message dictionaries as expected by OpenAI API
    :return: The content of the AI's response
    """
    try:
        response = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=messages
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error getting AI response: {str(e)}")
        return None
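A minimal usage sketch (it assumes OPENAI_API_KEY is set in the environment; get_ai_response returns None instead of raising on failure, which is why the callers in final-cv-analyzer.py check for None):

# OpenAI-style chat messages in, assistant text (or None on error) out.
reply = get_ai_response([{"role": "user", "content": "Reply with the word ready."}])
if reply is None:
    print("Request failed; see the printed error above.")
else:
    print(reply)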