Nassiraaa committed on
Commit dea3e5d · verified · 1 Parent(s): dfa2915

Upload 13 files

app.py ADDED
@@ -0,0 +1,59 @@
+ import streamlit as st
+ import tempfile
+ import os
+ import json
+ from cv_analyzer import analyze_cv
+ from personal_information import analyze_personal_info
+ from spelling_grammar_checker import evaluate_cv_text
+
+ st.set_page_config(page_title="CV Analyzer", page_icon="📄")
+
+ st.title("CV Analyzer")
+
+ uploaded_file = st.file_uploader("Choose a CV file", type=["pdf", "png", "jpg", "jpeg"])
+
+ if uploaded_file is not None:
+     st.write("Analyzing your CV...")
+
+     with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as temp_file:
+         temp_file.write(uploaded_file.getvalue())
+         temp_file_path = temp_file.name
+
+     # Analyze CV sections
+     cv_analysis = analyze_cv(temp_file_path)
+
+     # Analyze personal information
+     personal_info = json.loads(analyze_personal_info(temp_file_path))
+
+     # Evaluate spelling and grammar
+     spelling_grammar_score = evaluate_cv_text(temp_file_path, 'ocr_weights.json')
+
+     # Display results
+     st.header("Analysis Results")
+
+     # CV Sections
+     st.subheader("CV Sections")
+     st.write(f"Present sections: {', '.join(cv_analysis['present_sections'])}")
+     st.write(f"Sections score: {cv_analysis['score_sections']}")
+
+     # Personal Information
+     st.subheader("Personal Information")
+     st.write(f"Email: {'Found' if personal_info['email']['exists'] else 'Not found'}")
+     st.write(f"Phone: {'Found' if personal_info['phone']['exists'] else 'Not found'}")
+     st.write(f"City: {'Found' if personal_info['city']['exists'] else 'Not found'}")
+     st.write(f"Country: {'Found' if personal_info['country']['exists'] else 'Not found'}")
+     st.write(f"Personal information score: {personal_info['score_personal_information']}")
+
+     # Spelling and Grammar
+     st.subheader("Spelling and Grammar")
+     st.write(f"Spelling and grammar score: {spelling_grammar_score}")
+
+     # Total Score
+     total_score = cv_analysis['score_sections'] + personal_info['score_personal_information'] + spelling_grammar_score
+     st.subheader("Total Score")
+     st.write(f"Total CV score: {total_score}")
+
+     # Clean up the temporary file
+     os.unlink(temp_file_path)
+
+ st.write("Upload a CV to start the analysis.")
cv_analyzer.py ADDED
@@ -0,0 +1,33 @@
+ import json
+ from openai_utils import get_ai_response
+ from cv_prompt import get_cv_prompt, cv_sections_data
+ from ocr_extractor import process_file
+
+ def detect_cv_sections_and_score(text):
+     prompt = get_cv_prompt(text)
+     messages = [
+         {"role": "user", "content": prompt}
+     ]
+     response = get_ai_response(messages)
+
+     if not response:
+         print("Unexpected response format from OpenAI API")
+         return {"present_sections": [], "score_sections": 0}
+
+     try:
+         detected_sections = json.loads(response)
+         present_sections = detected_sections.get("present_sections", [])
+     except json.JSONDecodeError:
+         print("Failed to parse JSON from response")
+         return {"present_sections": [], "score_sections": 0}
+
+     score_sections = sum(cv_sections_data["sections"].get(section, 0) for section in present_sections)
+
+     return {
+         "present_sections": present_sections,
+         "score_sections": score_sections
+     }
+
+ def analyze_cv(file_path):
+     text = process_file(file_path, 'ocr_weights.json')
+     return detect_cv_sections_and_score(text)
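A minimal sketch of the scoring step in detect_cv_sections_and_score, assuming the model replied with the JSON shape the prompt asks for (the reply values are illustrative; the weights are a subset of the real cv_sections.json):

    import json

    response = '{"present_sections": ["Education", "Experience", "Skills"]}'  # hypothetical model reply
    weights = {"Education": 5, "Experience": 5, "Skills": 4}  # subset of cv_sections.json
    present = json.loads(response).get("present_sections", [])
    score = sum(weights.get(section, 0) for section in present)
    print(score)  # 5 + 5 + 4 = 14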
cv_prompt.py ADDED
@@ -0,0 +1,28 @@
+ import json
+
+ # Load the JSON file
+ with open('cv_sections.json', 'r') as f:
+     cv_sections_data = json.load(f)
+
+ def get_cv_prompt(text):
+     sections_list = ", ".join(cv_sections_data["sections"].keys())
+     return f"""<s>[INST]Analyze this CV text in any language. Return a JSON object with key "present_sections" containing an array of sections present from this list: {sections_list}. A section is present if its content is identifiable, even without an explicit title. Consider synonyms and alternative phrasings in any language.
+
+ CV text:
+ {text}
+
+ Respond only with the JSON object, no explanation.[/INST]"""
+
+ def get_location_prompt(text):
+     return f"""<s>[INST]Extract the city and country from the following text. The text may be in any language. Respond with a JSON object in the format {{"city": {{"extracted city name": true/false}}, "country": {{"extracted country name": true/false}}}}. If you can't find the information, set the value to false.
+
+ Text:
+ {text}[/INST]"""
+
+ def get_spelling_grammar_prompt(text):
+     return f"""<s>[INST]Analyze the following text for spelling and grammar errors. The text may be in any language. Do not correct the errors, just count them. Calculate the percentage of errors.
+
+ Text to analyze:
+ {text}
+
+ Respond with a JSON object containing the key 'error_percentage' with the calculated percentage (0-100) of errors.[/INST]"""
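These prompts fix the JSON shapes the downstream parsers rely on. Hedged examples of well-formed model replies (values are illustrative only):

    {"present_sections": ["Education", "Experience", "Skills"]}  # parsed by cv_analyzer.py
    {"city": {"Paris": true}, "country": {"France": true}}       # parsed by personal_information.py
    {"error_percentage": 1.2}                                    # parsed by spelling_grammar_checker.py

Note that the <s>[INST] ... [/INST] wrappers are Mistral-style instruction tags, even though openai_utils.py sends these prompts to gpt-3.5-turbo.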
cv_sections.json ADDED
@@ -0,0 +1,13 @@
+ {
+     "sections": {
+         "Profile": 1,
+         "Skills": 4,
+         "Education": 5,
+         "Interests": 2,
+         "Experience": 5,
+         "Languages": 3,
+         "Contact": 2,
+         "Certificates": 2,
+         "References": 1
+     }
+ }
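Education and Experience carry the most weight (5 each); a CV with all nine sections present would score 1 + 4 + 5 + 2 + 5 + 3 + 2 + 2 + 1 = 25, the maximum possible score_sections.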
error_scoring.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "error_scores": [
+         {"min": 0, "max": 1.5, "score": -1},
+         {"min": 1.5, "max": 2.5, "score": -2},
+         {"min": 2.5, "max": 3.5, "score": -3},
+         {"min": 3.5, "max": 100, "score": -10}
+     ]
+ }
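evaluate_cv_text takes the first band whose inclusive min-max range contains the error percentage, so a 2.0% error rate maps to -2 and anything above 3.5% drops to -10. A boundary value such as exactly 1.5% matches the first band listed (-1), because both endpoints are inclusive and the loop breaks on the first hit.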
ocr_extractor.py ADDED
@@ -0,0 +1,139 @@
+ import sys
+ import importlib.util
+ from PIL import Image
+ import boto3
+ import os
+ from doctr.io import DocumentFile
+ from doctr.models import ocr_predictor
+ import easyocr
+ from shapely.geometry import Polygon
+ from paddleocr import PaddleOCR
+ import langid
+ import json
+ import PyPDF2
+
+ # Check if python-bidi is installed
+ if importlib.util.find_spec("bidi") is None:
+     print("Error: python-bidi is not installed. Please install it using pip install python-bidi")
+     sys.exit(1)
+
+ # Initialize OCR models
+ def load_models(language):
+     doctr_model = ocr_predictor(pretrained=True)
+     easyocr_reader = easyocr.Reader([language])
+     paddleocr_reader = PaddleOCR(use_angle_cls=True, lang=language)
+     return doctr_model, easyocr_reader, paddleocr_reader
+
+ # AWS Textract client
+ textract_client = boto3.client('textract', region_name='us-west-2')
+
+ def extract_text_aws(image_bytes):
+     try:
+         response = textract_client.detect_document_text(Document={'Bytes': image_bytes})
+         return [(item['Text'], item['Geometry']['BoundingBox'], item['Confidence'])
+                 for item in response['Blocks'] if item['BlockType'] == 'WORD']
+     except Exception as e:
+         print(f"Error in AWS Textract: {str(e)}")
+         return []
+
+ def extract_text_doctr(image_path, doctr_model):
+     try:
+         doc = DocumentFile.from_images(image_path)
+         result = doctr_model(doc)
+         return [(word.value, word.geometry, word.confidence)
+                 for block in result.pages[0].blocks for line in block.lines for word in line.words]
+     except Exception as e:
+         print(f"Error in Doctr OCR: {str(e)}")
+         return []
+
+ def extract_text_easyocr(image_path, easyocr_reader):
+     try:
+         result = easyocr_reader.readtext(image_path)
+         return [(detection[1], detection[0], detection[2]) for detection in result]
+     except Exception as e:
+         print(f"Error in EasyOCR: {str(e)}")
+         return []
+
+ def extract_text_paddleocr(image_path, paddleocr_reader):
+     try:
+         result = paddleocr_reader.ocr(image_path, cls=True)
+         return [(line[1][0], line[0], line[1][1]) for line in result[0]]
+     except Exception as e:
+         print(f"Error in PaddleOCR: {str(e)}")
+         return []
+
+ def bbox_to_polygon(bbox):
+     if isinstance(bbox, dict):  # AWS format
+         return Polygon([(bbox['Left'], bbox['Top']),
+                         (bbox['Left'] + bbox['Width'], bbox['Top']),
+                         (bbox['Left'] + bbox['Width'], bbox['Top'] + bbox['Height']),
+                         (bbox['Left'], bbox['Top'] + bbox['Height'])])
+     elif len(bbox) == 4 and all(isinstance(p, (list, tuple)) for p in bbox):  # EasyOCR format
+         return Polygon(bbox)
+     elif len(bbox) == 2:  # Doctr format
+         x, y, w, h = bbox[0][0], bbox[0][1], bbox[1][0] - bbox[0][0], bbox[1][1] - bbox[0][1]
+         return Polygon([(x, y), (x + w, y), (x + w, y + h), (x, y + h)])
+     else:
+         raise ValueError(f"Unsupported bbox format: {bbox}")
+
+ def combine_ocr_results(results, weights):
+     combined_words = []
+     for method, words in results.items():
+         for word, bbox, confidence in words:
+             try:
+                 polygon = bbox_to_polygon(bbox)
+                 combined_words.append((word, polygon, float(confidence) * weights[method]))
+             except Exception as e:
+                 print(f"Error processing word '{word}' from {method}: {str(e)}")
+
+     final_words = []
+     while combined_words:
+         current_word = combined_words.pop(0)
+         overlapping = [w for w in combined_words if current_word[1].intersects(w[1])]
+         if overlapping:
+             best_word = max([current_word] + overlapping, key=lambda x: x[2])
+             final_words.append(best_word[0])
+             for word in overlapping:
+                 combined_words.remove(word)
+         else:
+             final_words.append(current_word[0])
+
+     return ' '.join(final_words)
+
+ def detect_language(text):
+     language, _ = langid.classify(text)
+     return language
+
+ def process_file(file_path, weights_file):
+     _, file_extension = os.path.splitext(file_path)
+
+     if file_extension.lower() == '.pdf':
+         with open(file_path, 'rb') as file:
+             pdf_reader = PyPDF2.PdfReader(file)
+             text = ""
+             for page in pdf_reader.pages:
+                 text += page.extract_text() + "\n"
+             return text
+
+     else:  # Assume it's an image file
+         with open(weights_file, 'r') as f:
+             weights = json.load(f)
+
+         with open(file_path, 'rb') as image_file:
+             image_bytes = image_file.read()
+
+         # Detect language using a sample of text from AWS Textract
+         aws_results = extract_text_aws(image_bytes)
+         sample_text = ' '.join([item[0] for item in aws_results[:10]])
+         detected_language = detect_language(sample_text)
+
+         doctr_model, easyocr_reader, paddleocr_reader = load_models(detected_language)
+
+         results = {
+             "aws": aws_results,
+             "doctr": extract_text_doctr(file_path, doctr_model),
+             "easyocr": extract_text_easyocr(file_path, easyocr_reader),
+             "paddleocr": extract_text_paddleocr(file_path, paddleocr_reader),
+         }
+
+         return combine_ocr_results(results, weights)
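A minimal self-contained sketch of the overlap vote in combine_ocr_results, with two made-up detections of the same word (box coordinates match the EasyOCR four-point branch of bbox_to_polygon). One caveat visible in the code: Textract reports confidences on a 0-100 scale while the other engines use 0-1, and combine_ocr_results multiplies by the weights as-is, so overlapping AWS detections effectively always win:

    from shapely.geometry import Polygon

    aws_conf, easyocr_conf = 99.0, 0.80      # Textract is 0-100, EasyOCR is 0-1
    weights = {"aws": 0.4, "easyocr": 0.2}   # from ocr_weights.json
    aws_box = Polygon([(0.10, 0.10), (0.30, 0.10), (0.30, 0.15), (0.10, 0.15)])
    easy_box = Polygon([(0.12, 0.10), (0.30, 0.10), (0.30, 0.16), (0.12, 0.16)])
    assert aws_box.intersects(easy_box)      # overlapping boxes compete for one slot
    scores = {"aws": aws_conf * weights["aws"], "easyocr": easyocr_conf * weights["easyocr"]}
    print(max(scores, key=scores.get))       # 'aws' (39.6 vs 0.16): the AWS word is kept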
ocr_weights.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "aws": 0.4,
+     "doctr": 0.3,
+     "easyocr": 0.2,
+     "paddleocr": 0.1
+ }
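The weights sum to 1.0 and rank Textract highest. Since a word's effective score in combine_ocr_results is confidence × weight, even on a common 0-1 scale an AWS word at 0.5 confidence (0.5 × 0.4 = 0.20) would beat an EasyOCR word at 0.9 (0.9 × 0.2 = 0.18).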
openai_utils.py ADDED
@@ -0,0 +1,21 @@
+ import os
+ from openai import OpenAI
+
+ # OpenAI configuration
+ client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+ def get_ai_response(messages):
+     """
+     Get a response from the AI model using the OpenAI client.
+     :param messages: List of message dictionaries as expected by OpenAI API
+     :return: The content of the AI's response
+     """
+     try:
+         chat_completion = client.chat.completions.create(
+             model="gpt-3.5-turbo",
+             messages=messages
+         )
+         return chat_completion.choices[0].message.content
+     except Exception as e:
+         print(f"Error getting AI response: {str(e)}")
+         return None
+
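A hedged usage sketch (the prompt string is illustrative): the client reads OPENAI_API_KEY at import time, and the helper returns None on any API error, so callers such as cv_analyzer.py must handle that case:

    # Requires OPENAI_API_KEY in the environment before import.
    from openai_utils import get_ai_response

    reply = get_ai_response([{"role": "user", "content": "Reply with the JSON {\"ok\": true}"}])
    print(reply)  # model text, or None if the API call failed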
personal_info_scores.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "email": 3,
+     "phone": 3,
+     "city": 1,
+     "country": 2
+ }
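A CV where email, phone, city, and country are all found earns the maximum personal-information score of 3 + 3 + 1 + 2 = 9.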
personal_information.py ADDED
@@ -0,0 +1,71 @@
+ import json
+ import re
+ from openai_utils import get_ai_response
+ from ocr_extractor import process_file
+ from cv_prompt import get_location_prompt
+
+ # Load the scoring data
+ with open('personal_info_scores.json', 'r') as f:
+     score_data = json.load(f)
+
+ def extract_email(text):
+     email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
+     emails = re.findall(email_pattern, text)
+     return emails[0] if emails else None
+
+ def extract_phone(text):
+     phone_pattern = r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
+     phones = re.findall(phone_pattern, text)
+     return phones[0] if phones else None
+
+ def extract_location(text):
+     prompt = get_location_prompt(text)
+     messages = [
+         {"role": "user", "content": prompt}
+     ]
+
+     response = get_ai_response(messages)
+
+     if response:
+         try:
+             location_data = json.loads(response)
+             city = any(v for v in location_data.get('city', {}).values())
+             country = any(v for v in location_data.get('country', {}).values())
+         except json.JSONDecodeError:
+             print("Failed to parse JSON from response")
+             city, country = False, False
+     else:
+         city, country = False, False
+
+     return city, country
+
+ def calculate_score(email_exists, phone_exists, city_exists, country_exists):
+     score = 0
+     if email_exists:
+         score += score_data['email']
+     if phone_exists:
+         score += score_data['phone']
+     if city_exists:
+         score += score_data['city']
+     if country_exists:
+         score += score_data['country']
+     return score
+
+ def analyze_personal_info(file_path):
+     text = process_file(file_path, 'ocr_weights.json')
+
+     email = extract_email(text)
+     phone = extract_phone(text)
+     city_exists, country_exists = extract_location(text)
+
+     score = calculate_score(email is not None, phone is not None, city_exists, country_exists)
+
+     result = {
+         "email": {"exists": email is not None, "value": email},
+         "phone": {"exists": phone is not None, "value": phone},
+         "city": {"exists": city_exists},
+         "country": {"exists": country_exists},
+         "score_personal_information": score
+     }
+
+     return json.dumps(result)
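A quick self-contained check of the two regex extractors on a made-up contact line. The phone pattern targets North-American formats; note that its leading \b keeps it from matching a parenthesized area code that follows whitespace (e.g. "(555) 123-4567" after a space), since no word boundary exists before "(":

    import re

    EMAIL = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    PHONE = r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
    line = "Jane Doe | jane.doe@example.com | 555-123-4567 | Paris, France"
    print(re.findall(EMAIL, line))  # ['jane.doe@example.com']
    print(re.findall(PHONE, line))  # ['555-123-4567']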
prompt.py ADDED
@@ -0,0 +1,42 @@
+ import json
+
+ # Load the JSON file
+ with open('cv_sections.json', 'r') as f:
+     cv_sections_data = json.load(f)
+
+ def get_cv_prompt(text):
+     sections_list = ", ".join(cv_sections_data["sections"].keys())
+     return f"""Analyze the following CV text in any language. Identify which sections from the list below are present in the CV. A section is considered present if its content is identifiable, even without an explicit title. Consider synonyms and alternative phrasings in any language.
+
+ Sections to identify: {sections_list}
+
+ CV text:
+ {text}
+
+ Respond with a JSON object containing a single key "present_sections" with an array of identified section names. Do not include any explanations or additional text."""
+
+ def get_location_prompt(text):
+     return f"""Extract the city and country from the following text. The text may be in any language.
+
+ Text:
+ {text}
+
+ Respond with a JSON object in the following format:
+ {{
+     "city": "extracted city name or 'Unknown' if not found",
+     "country": "extracted country name or 'Unknown' if not found"
+ }}
+ Do not include any explanations or additional text."""
+
+ def get_spelling_grammar_prompt(text):
+     return f"""Analyze the following text for spelling and grammar errors. The text may be in any language. Do not correct the errors, just evaluate the overall quality.
+
+ Text to analyze:
+ {text}
+
+ Respond with a JSON object in the following format:
+ {{
+     "score": "a number from 0 to 100 representing the quality of spelling and grammar"
+ }}
+ Where 0 means extremely poor quality and 100 means perfect spelling and grammar.
+ Do not include any explanations or additional text."""
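Note: the analyzers import their prompts from cv_prompt.py, not from this module; prompt.py appears to be an alternative prompt set. Its spelling prompt asks for a 0-100 "score", while spelling_grammar_checker.py parses the "error_percentage" key produced by the cv_prompt.py variant.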
requirements.txt ADDED
@@ -0,0 +1,43 @@
+ # Core dependencies
+ streamlit
+ langchain
+ langchain-openai
+ openai
+ pydantic
+
+ # PDF and document processing
+ PyMuPDF
+ pdf2image
+ python-docx
+ PyPDF2
+
+ # Image processing and OCR
+ Pillow
+ pytesseract
+ opencv-python-headless
+ easyocr
+ paddleocr
+ paddlepaddle==2.4.2
+
+ # Machine Learning and Computer Vision
+ ultralytics
+ torch==1.13.0+cpu
+ torchvision==0.14.0+cpu
+ -f https://download.pytorch.org/whl/torch_stable.html
+
+ # Natural Language Processing
+ langdetect
+ langid
+
+ # AWS Integration
+ boto3
+
+ # Utility libraries
+ numpy
+ shapely
+ python-bidi==0.4.2
+ pyyaml
+ python-dotenv==1.0.0
+
+ # DocTR (Document Text Recognition)
+ python-doctr
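Presumably installed with pip install -r requirements.txt; the -f https://download.pytorch.org/whl/torch_stable.html find-links entry is what lets pip resolve the +cpu builds of torch and torchvision pinned above.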
spelling_grammar_checker.py ADDED
@@ -0,0 +1,51 @@
+ import json
+ import os
+ from openai_utils import get_ai_response
+ from ocr_extractor import process_file
+ from cv_prompt import get_spelling_grammar_prompt
+
+ def check_spelling_and_grammar(text):
+     prompt = get_spelling_grammar_prompt(text)
+     messages = [
+         {"role": "user", "content": prompt}
+     ]
+     response = get_ai_response(messages)
+
+     if not response:
+         print("Unexpected response from OpenAI API")
+         return 100  # Assume 100% errors if we can't get a response
+
+     try:
+         result = json.loads(response)
+         error_percentage = result.get('error_percentage', 100)
+         return min(max(float(error_percentage), 0), 100)  # Ensure the percentage is between 0 and 100
+     except (json.JSONDecodeError, ValueError):
+         print(f"Unable to parse error percentage from API response: {response}")
+         return 100  # Assume 100% errors if we can't parse the response
+
+ def evaluate_cv_text(file_path, weights_file):
+     cv_text = process_file(file_path, weights_file)
+     error_percentage = check_spelling_and_grammar(cv_text)
+
+     error_scoring_file = 'error_scoring.json'
+     if not os.path.exists(error_scoring_file):
+         print(f"Error: {error_scoring_file} not found. Using default scoring.")
+         return -10  # Default score if file not found
+
+     try:
+         with open(error_scoring_file, 'r') as f:
+             error_scoring = json.load(f)
+     except json.JSONDecodeError as e:
+         print(f"Error parsing {error_scoring_file}: {str(e)}. Using default scoring.")
+         return -10  # Default score if JSON is invalid
+
+     score = None
+     for error_score in error_scoring.get('error_scores', []):
+         if error_score['min'] <= error_percentage <= error_score['max']:
+             score = error_score['score']
+             break
+
+     if score is None:
+         score = -10  # Default score if no matching range is found
+
+     return score
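To tie the three modules together, a hedged end-to-end example with illustrative values (all weights come from the JSON configs in this commit):

    score_sections = 5 + 5 + 4 + 3 + 2           # Education, Experience, Skills, Languages, Contact
    score_personal_information = 3 + 3 + 1 + 2   # email, phone, city, country all found
    spelling_grammar_score = -1                  # a 1.2% error rate falls in the 0-1.5 band
    total_score = score_sections + score_personal_information + spelling_grammar_score
    print(total_score)  # 19 + 9 - 1 = 27, the value app.py reports as "Total CV score"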