Upload 13 files
- app.py +59 -0
- cv_analyzer.py +33 -0
- cv_prompt.py +28 -0
- cv_sections.json +13 -0
- error_scoring.json +8 -0
- ocr_extractor.py +139 -0
- ocr_weights.json +6 -0
- openai_utils.py +21 -0
- personal_info_scores.json +6 -0
- personal_information.py +71 -0
- prompt.py +42 -0
- requirements.txt +43 -0
- spelling_grammar_checker.py +51 -0
app.py
ADDED
@@ -0,0 +1,59 @@
import streamlit as st
import tempfile
import os
import json
from cv_analyzer import analyze_cv
from personal_information import analyze_personal_info
from spelling_grammar_checker import evaluate_cv_text

st.set_page_config(page_title="CV Analyzer", page_icon="📄")

st.title("CV Analyzer")

uploaded_file = st.file_uploader("Choose a CV file", type=["pdf", "png", "jpg", "jpeg"])

if uploaded_file is not None:
    st.write("Analyzing your CV...")

    # Persist the upload to a temporary file so the analyzers can work from a path
    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name

    # Analyze CV sections
    cv_analysis = analyze_cv(temp_file_path)

    # Analyze personal information (returned as a JSON string)
    personal_info = json.loads(analyze_personal_info(temp_file_path))

    # Evaluate spelling and grammar (returns a negative penalty score)
    spelling_grammar_score = evaluate_cv_text(temp_file_path, 'ocr_weights.json')

    # Display results
    st.header("Analysis Results")

    # CV Sections
    st.subheader("CV Sections")
    st.write(f"Present sections: {', '.join(cv_analysis['present_sections'])}")
    st.write(f"Sections score: {cv_analysis['score_sections']}")

    # Personal Information
    st.subheader("Personal Information")
    st.write(f"Email: {'Found' if personal_info['email']['exists'] else 'Not found'}")
    st.write(f"Phone: {'Found' if personal_info['phone']['exists'] else 'Not found'}")
    st.write(f"City: {'Found' if personal_info['city']['exists'] else 'Not found'}")
    st.write(f"Country: {'Found' if personal_info['country']['exists'] else 'Not found'}")
    st.write(f"Personal information score: {personal_info['score_personal_information']}")

    # Spelling and Grammar
    st.subheader("Spelling and Grammar")
    st.write(f"Spelling and grammar score: {spelling_grammar_score}")

    # Total Score
    total_score = cv_analysis['score_sections'] + personal_info['score_personal_information'] + spelling_grammar_score
    st.subheader("Total Score")
    st.write(f"Total CV score: {total_score}")

    # Clean up the temporary file
    os.unlink(temp_file_path)
else:
    st.write("Upload a CV to start the analysis.")
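Not part of the commit, but for context: once the packages in requirements.txt are installed, the app would typically be launched with

    streamlit run app.py

and it assumes OPENAI_API_KEY (and AWS credentials for Textract) are available in the environment.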
cv_analyzer.py
ADDED
@@ -0,0 +1,33 @@
import json
from openai_utils import get_ai_response
from cv_prompt import get_cv_prompt, cv_sections_data
from ocr_extractor import process_file

def detect_cv_sections_and_score(text):
    prompt = get_cv_prompt(text)
    messages = [
        {"role": "user", "content": prompt}
    ]
    response = get_ai_response(messages)

    if not response:
        print("Unexpected response format from OpenAI API")
        return {"present_sections": [], "score_sections": 0}

    try:
        detected_sections = json.loads(response)
        present_sections = detected_sections.get("present_sections", [])
    except json.JSONDecodeError:
        print("Failed to parse JSON from response")
        return {"present_sections": [], "score_sections": 0}

    # Sum the per-section weights defined in cv_sections.json
    score_sections = sum(cv_sections_data["sections"].get(section, 0) for section in present_sections)

    return {
        "present_sections": present_sections,
        "score_sections": score_sections
    }

def analyze_cv(file_path):
    text = process_file(file_path, 'ocr_weights.json')
    return detect_cv_sections_and_score(text)
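A minimal usage sketch for this module (not part of the commit; sample_cv.pdf is a placeholder path, and cv_sections.json plus ocr_weights.json must sit in the working directory):

    from cv_analyzer import analyze_cv

    result = analyze_cv("sample_cv.pdf")
    print(result["present_sections"])  # e.g. ["Education", "Experience"]
    print(result["score_sections"])    # sum of the matched sections' weights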
cv_prompt.py
ADDED
@@ -0,0 +1,28 @@
import json

# Load the JSON file
with open('cv_sections.json', 'r') as f:
    cv_sections_data = json.load(f)

def get_cv_prompt(text):
    sections_list = ", ".join(cv_sections_data["sections"].keys())
    return f"""<s>[INST]Analyze this CV text in any language. Return a JSON object with key "present_sections" containing an array of sections present from this list: {sections_list}. A section is present if its content is identifiable, even without an explicit title. Consider synonyms and alternative phrasings in any language.

CV text:
{text}

Respond only with the JSON object, no explanation.[/INST]"""

def get_location_prompt(text):
    return f"""<s>[INST]Extract the city and country from the following text. The text may be in any language. Respond with a JSON object in the format {{"city": {{"extracted city name": true/false}}, "country": {{"extracted country name": true/false}}}}. If you can't find the information, set the value to false.

Text:
{text}[/INST]"""

def get_spelling_grammar_prompt(text):
    return f"""<s>[INST]Analyze the following text for spelling and grammar errors. The text may be in any language. Do not correct the errors, just count them. Calculate the percentage of errors.

Text to analyze:
{text}

Respond with a JSON object containing the key 'error_percentage' with the calculated percentage (0-100) of errors.[/INST]"""
cv_sections.json
ADDED
@@ -0,0 +1,13 @@
{
    "sections": {
        "Profile": 1,
        "Skills": 4,
        "Education": 5,
        "Interests": 2,
        "Experience": 5,
        "Languages": 3,
        "Contact": 2,
        "Certificates": 2,
        "References": 1
    }
}
error_scoring.json
ADDED
@@ -0,0 +1,8 @@
{
    "error_scores": [
        {"min": 0, "max": 1.5, "score": -1},
        {"min": 1.5, "max": 2.5, "score": -2},
        {"min": 2.5, "max": 3.5, "score": -3},
        {"min": 3.5, "max": 100, "score": -10}
    ]
}
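Read together with spelling_grammar_checker.py below, these bands translate the model-reported error percentage into a penalty: a 2% error rate falls in the 1.5-2.5 band and scores -2, while anything above 3.5% scores -10. The bands share boundary values, but the first matching band wins because the lookup breaks on the first hit.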
ocr_extractor.py
ADDED
@@ -0,0 +1,139 @@
import sys
import importlib.util
from PIL import Image
import boto3
import os
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import easyocr
from shapely.geometry import Polygon
from paddleocr import PaddleOCR
import langid
import json
import PyPDF2

# Check if python-bidi is installed (required by EasyOCR for right-to-left scripts)
if importlib.util.find_spec("bidi") is None:
    print("Error: python-bidi is not installed. Please install it using pip install python-bidi")
    sys.exit(1)

# Initialize OCR models
def load_models(language):
    doctr_model = ocr_predictor(pretrained=True)
    easyocr_reader = easyocr.Reader([language])
    paddleocr_reader = PaddleOCR(use_angle_cls=True, lang=language)
    return doctr_model, easyocr_reader, paddleocr_reader

# AWS Textract client
textract_client = boto3.client('textract', region_name='us-west-2')

def extract_text_aws(image_bytes):
    try:
        response = textract_client.detect_document_text(Document={'Bytes': image_bytes})
        return [(item['Text'], item['Geometry']['BoundingBox'], item['Confidence'])
                for item in response['Blocks'] if item['BlockType'] == 'WORD']
    except Exception as e:
        print(f"Error in AWS Textract: {str(e)}")
        return []

def extract_text_doctr(image_path, doctr_model):
    try:
        doc = DocumentFile.from_images(image_path)
        result = doctr_model(doc)
        return [(word.value, word.geometry, word.confidence)
                for block in result.pages[0].blocks for line in block.lines for word in line.words]
    except Exception as e:
        print(f"Error in Doctr OCR: {str(e)}")
        return []

def extract_text_easyocr(image_path, easyocr_reader):
    try:
        result = easyocr_reader.readtext(image_path)
        return [(detection[1], detection[0], detection[2]) for detection in result]
    except Exception as e:
        print(f"Error in EasyOCR: {str(e)}")
        return []

def extract_text_paddleocr(image_path, paddleocr_reader):
    try:
        result = paddleocr_reader.ocr(image_path, cls=True)
        return [(line[1][0], line[0], line[1][1]) for line in result[0]]
    except Exception as e:
        print(f"Error in PaddleOCR: {str(e)}")
        return []

def bbox_to_polygon(bbox):
    if isinstance(bbox, dict):  # AWS format
        return Polygon([(bbox['Left'], bbox['Top']),
                        (bbox['Left'] + bbox['Width'], bbox['Top']),
                        (bbox['Left'] + bbox['Width'], bbox['Top'] + bbox['Height']),
                        (bbox['Left'], bbox['Top'] + bbox['Height'])])
    elif len(bbox) == 4 and all(isinstance(p, (list, tuple)) for p in bbox):  # EasyOCR format
        return Polygon(bbox)
    elif len(bbox) == 2:  # Doctr format
        x, y, w, h = bbox[0][0], bbox[0][1], bbox[1][0] - bbox[0][0], bbox[1][1] - bbox[0][1]
        return Polygon([(x, y), (x + w, y), (x + w, y + h), (x, y + h)])
    else:
        raise ValueError(f"Unsupported bbox format: {bbox}")

def combine_ocr_results(results, weights):
    combined_words = []
    for method, words in results.items():
        for word, bbox, confidence in words:
            try:
                polygon = bbox_to_polygon(bbox)
                combined_words.append((word, polygon, float(confidence) * weights[method]))
            except Exception as e:
                print(f"Error processing word '{word}' from {method}: {str(e)}")

    # Keep the highest weighted-confidence word among overlapping detections
    final_words = []
    while combined_words:
        current_word = combined_words.pop(0)
        overlapping = [w for w in combined_words if current_word[1].intersects(w[1])]
        if overlapping:
            best_word = max([current_word] + overlapping, key=lambda x: x[2])
            final_words.append(best_word[0])
            for word in overlapping:
                combined_words.remove(word)
        else:
            final_words.append(current_word[0])

    return ' '.join(final_words)

def detect_language(text):
    language, _ = langid.classify(text)
    return language

def process_file(file_path, weights_file):
    _, file_extension = os.path.splitext(file_path)

    if file_extension.lower() == '.pdf':
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
            return text

    else:  # Assume it's an image file
        with open(weights_file, 'r') as f:
            weights = json.load(f)

        with open(file_path, 'rb') as image_file:
            image_bytes = image_file.read()

        # Detect language using a sample of text from AWS Textract
        aws_results = extract_text_aws(image_bytes)
        sample_text = ' '.join([item[0] for item in aws_results[:10]])
        detected_language = detect_language(sample_text)

        doctr_model, easyocr_reader, paddleocr_reader = load_models(detected_language)

        results = {
            "aws": aws_results,
            "doctr": extract_text_doctr(file_path, doctr_model),
            "easyocr": extract_text_easyocr(file_path, easyocr_reader),
            "paddleocr": extract_text_paddleocr(file_path, paddleocr_reader),
        }

        return combine_ocr_results(results, weights)
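A hedged sketch of calling the extractor directly (not part of the commit; scan.png is a placeholder, and image input requires AWS credentials for Textract, whereas PDFs are read locally with PyPDF2 and skip OCR entirely):

    from ocr_extractor import process_file

    text = process_file("scan.png", "ocr_weights.json")
    print(text[:200])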
ocr_weights.json
ADDED
@@ -0,0 +1,6 @@
{
    "aws": 0.4,
    "doctr": 0.3,
    "easyocr": 0.2,
    "paddleocr": 0.1
}
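These per-engine weights scale each engine's confidence in combine_ocr_results. The keys must match those of the results dict built in process_file ('aws', 'doctr', 'easyocr', 'paddleocr'); a missing key would cause every word from that engine to be skipped with a per-word error message.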
openai_utils.py
ADDED
@@ -0,0 +1,21 @@
import os
from openai import OpenAI

# OpenAI configuration
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def get_ai_response(messages):
    """
    Get a response from the AI model using the OpenAI client.

    :param messages: List of message dictionaries as expected by OpenAI API
    :return: The content of the AI's response
    """
    try:
        chat_completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        print(f"Error getting AI response: {str(e)}")
        return None
personal_info_scores.json
ADDED
@@ -0,0 +1,6 @@
{
    "email": 3,
    "phone": 3,
    "city": 1,
    "country": 2
}
personal_information.py
ADDED
@@ -0,0 +1,71 @@
import json
import re
from openai_utils import get_ai_response
from ocr_extractor import process_file
from cv_prompt import get_location_prompt

# Load the scoring data
with open('personal_info_scores.json', 'r') as f:
    score_data = json.load(f)

def extract_email(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(email_pattern, text)
    return emails[0] if emails else None

def extract_phone(text):
    phone_pattern = r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
    phones = re.findall(phone_pattern, text)
    return phones[0] if phones else None

def extract_location(text):
    prompt = get_location_prompt(text)
    messages = [
        {"role": "user", "content": prompt}
    ]

    response = get_ai_response(messages)

    if response:
        try:
            location_data = json.loads(response)
            city = any(v for v in location_data.get('city', {}).values())
            country = any(v for v in location_data.get('country', {}).values())
        except json.JSONDecodeError:
            print("Failed to parse JSON from response")
            city, country = False, False
    else:
        city, country = False, False

    return city, country

def calculate_score(email_exists, phone_exists, city_exists, country_exists):
    score = 0
    if email_exists:
        score += score_data['email']
    if phone_exists:
        score += score_data['phone']
    if city_exists:
        score += score_data['city']
    if country_exists:
        score += score_data['country']
    return score

def analyze_personal_info(file_path):
    text = process_file(file_path, 'ocr_weights.json')

    email = extract_email(text)
    phone = extract_phone(text)
    city_exists, country_exists = extract_location(text)

    score = calculate_score(email is not None, phone is not None, city_exists, country_exists)

    result = {
        "email": {"exists": email is not None, "value": email},
        "phone": {"exists": phone is not None, "value": phone},
        "city": {"exists": city_exists},
        "country": {"exists": country_exists},
        "score_personal_information": score
    }

    return json.dumps(result)
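Because analyze_personal_info returns a JSON string rather than a dict, callers decode it first, as app.py does. A small sketch (not part of the commit; placeholder path):

    import json
    from personal_information import analyze_personal_info

    info = json.loads(analyze_personal_info("sample_cv.pdf"))
    print(info["email"]["exists"], info["score_personal_information"])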
prompt.py
ADDED
@@ -0,0 +1,42 @@
import json

# Load the JSON file
with open('cv_sections.json', 'r') as f:
    cv_sections_data = json.load(f)

def get_cv_prompt(text):
    sections_list = ", ".join(cv_sections_data["sections"].keys())
    return f"""Analyze the following CV text in any language. Identify which sections from the list below are present in the CV. A section is considered present if its content is identifiable, even without an explicit title. Consider synonyms and alternative phrasings in any language.

Sections to identify: {sections_list}

CV text:
{text}

Respond with a JSON object containing a single key "present_sections" with an array of identified section names. Do not include any explanations or additional text."""

def get_location_prompt(text):
    return f"""Extract the city and country from the following text. The text may be in any language.

Text:
{text}

Respond with a JSON object in the following format:
{{
    "city": "extracted city name or 'Unknown' if not found",
    "country": "extracted country name or 'Unknown' if not found"
}}
Do not include any explanations or additional text."""

def get_spelling_grammar_prompt(text):
    return f"""Analyze the following text for spelling and grammar errors. The text may be in any language. Do not correct the errors, just evaluate the overall quality.

Text to analyze:
{text}

Respond with a JSON object in the following format:
{{
    "score": "a number from 0 to 100 representing the quality of spelling and grammar"
}}
Where 0 means extremely poor quality and 100 means perfect spelling and grammar.
Do not include any explanations or additional text."""
requirements.txt
ADDED
@@ -0,0 +1,43 @@
# Core dependencies
streamlit
langchain
langchain-openai
openai
pydantic

# PDF and document processing
PyMuPDF
pdf2image
python-docx
PyPDF2

# Image processing and OCR
Pillow
pytesseract
opencv-python-headless
easyocr
paddleocr
paddlepaddle==2.4.2

# Machine Learning and Computer Vision
ultralytics
torch==1.13.0+cpu
torchvision==0.14.0+cpu
-f https://download.pytorch.org/whl/torch_stable.html

# Natural Language Processing
langdetect
langid

# AWS Integration
boto3

# Utility libraries
numpy
shapely
python-bidi==0.4.2
pyyaml
python-dotenv==1.0.0

# DocTR (Document Text Recognition)
python-doctr
spelling_grammar_checker.py
ADDED
@@ -0,0 +1,51 @@
import json
import os
from openai_utils import get_ai_response
from ocr_extractor import process_file
from cv_prompt import get_spelling_grammar_prompt

def check_spelling_and_grammar(text):
    prompt = get_spelling_grammar_prompt(text)
    messages = [
        {"role": "user", "content": prompt}
    ]
    response = get_ai_response(messages)

    if not response:
        print("Unexpected response from OpenAI API")
        return 100  # Assume 100% errors if we can't get a response

    try:
        result = json.loads(response)
        error_percentage = result.get('error_percentage', 100)
        return min(max(float(error_percentage), 0), 100)  # Clamp the percentage to [0, 100]
    except (json.JSONDecodeError, ValueError):
        print(f"Unable to parse error percentage from API response: {response}")
        return 100  # Assume 100% errors if we can't parse the response

def evaluate_cv_text(file_path, weights_file):
    cv_text = process_file(file_path, weights_file)
    error_percentage = check_spelling_and_grammar(cv_text)

    error_scoring_file = 'error_scoring.json'
    if not os.path.exists(error_scoring_file):
        print(f"Error: {error_scoring_file} not found. Using default scoring.")
        return -10  # Default score if file not found

    try:
        with open(error_scoring_file, 'r') as f:
            error_scoring = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error parsing {error_scoring_file}: {str(e)}. Using default scoring.")
        return -10  # Default score if JSON is invalid

    # Map the error percentage to a penalty via the bands in error_scoring.json
    score = None
    for error_score in error_scoring.get('error_scores', []):
        if error_score['min'] <= error_percentage <= error_score['max']:
            score = error_score['score']
            break

    if score is None:
        score = -10  # Default score if no matching range is found

    return score
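A standalone usage sketch (not part of the commit; placeholder path). The return value is a negative penalty from -1 to -10, which app.py adds to the positive section and personal-information scores:

    from spelling_grammar_checker import evaluate_cv_text

    penalty = evaluate_cv_text("sample_cv.pdf", "ocr_weights.json")
    print(penalty)  # e.g. -1 for an error rate below 1.5%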