Nassiraaa committed on
Commit dea3e5d · verified · 1 Parent(s): dfa2915

Upload 13 files

app.py ADDED
@@ -0,0 +1,59 @@
+ import streamlit as st
+ import tempfile
+ import os
+ import json
+ from cv_analyzer import analyze_cv
+ from personal_information import analyze_personal_info
+ from spelling_grammar_checker import evaluate_cv_text
+
+ st.set_page_config(page_title="CV Analyzer", page_icon="📄")
+
+ st.title("CV Analyzer")
+
+ uploaded_file = st.file_uploader("Choose a CV file", type=["pdf", "png", "jpg", "jpeg"])
+
+ if uploaded_file is not None:
+     st.write("Analyzing your CV...")
+
+     with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as temp_file:
+         temp_file.write(uploaded_file.getvalue())
+         temp_file_path = temp_file.name
+
+     # Analyze CV sections
+     cv_analysis = analyze_cv(temp_file_path)
+
+     # Analyze personal information
+     personal_info = json.loads(analyze_personal_info(temp_file_path))
+
+     # Evaluate spelling and grammar
+     spelling_grammar_score = evaluate_cv_text(temp_file_path, 'ocr_weights.json')
+
+     # Display results
+     st.header("Analysis Results")
+
+     # CV Sections
+     st.subheader("CV Sections")
+     st.write(f"Present sections: {', '.join(cv_analysis['present_sections'])}")
+     st.write(f"Sections score: {cv_analysis['score_sections']}")
+
+     # Personal Information
+     st.subheader("Personal Information")
+     st.write(f"Email: {'Found' if personal_info['email']['exists'] else 'Not found'}")
+     st.write(f"Phone: {'Found' if personal_info['phone']['exists'] else 'Not found'}")
+     st.write(f"City: {'Found' if personal_info['city']['exists'] else 'Not found'}")
+     st.write(f"Country: {'Found' if personal_info['country']['exists'] else 'Not found'}")
+     st.write(f"Personal information score: {personal_info['score_personal_information']}")
+
+     # Spelling and Grammar
+     st.subheader("Spelling and Grammar")
+     st.write(f"Spelling and grammar score: {spelling_grammar_score}")
+
+     # Total Score
+     total_score = cv_analysis['score_sections'] + personal_info['score_personal_information'] + spelling_grammar_score
+     st.subheader("Total Score")
+     st.write(f"Total CV score: {total_score}")
+
+     # Clean up the temporary file
+     os.unlink(temp_file_path)
+
+ st.write("Upload a CV to start the analysis.")
cv_analyzer.py ADDED
@@ -0,0 +1,33 @@
+ import json
+ from openai_utils import get_ai_response
+ from cv_prompt import get_cv_prompt, cv_sections_data
+ from ocr_extractor import process_file
+
+ def detect_cv_sections_and_score(text):
+     prompt = get_cv_prompt(text)
+     messages = [
+         {"role": "user", "content": prompt}
+     ]
+     response = get_ai_response(messages)
+
+     if not response:
+         print("Unexpected response format from OpenAI API")
+         return {"present_sections": [], "score_sections": 0}
+
+     try:
+         detected_sections = json.loads(response)
+         present_sections = detected_sections.get("present_sections", [])
+     except json.JSONDecodeError:
+         print("Failed to parse JSON from response")
+         return {"present_sections": [], "score_sections": 0}
+
+     score_sections = sum(cv_sections_data["sections"].get(section, 0) for section in present_sections)
+
+     return {
+         "present_sections": present_sections,
+         "score_sections": score_sections
+     }
+
+ def analyze_cv(file_path):
+     text = process_file(file_path, 'ocr_weights.json')
+     return detect_cv_sections_and_score(text)
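A minimal sketch of the scoring step in detect_cv_sections_and_score, assuming the model replied with the JSON shape the prompt asks for (the reply values are illustrative; the weights are a subset of the real cv_sections.json):

    import json

    response = '{"present_sections": ["Education", "Experience", "Skills"]}'  # hypothetical model reply
    weights = {"Education": 5, "Experience": 5, "Skills": 4}  # subset of cv_sections.json
    present = json.loads(response).get("present_sections", [])
    score = sum(weights.get(section, 0) for section in present)
    print(score)  # 5 + 5 + 4 = 14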
cv_prompt.py ADDED
@@ -0,0 +1,28 @@
+ import json
+
+ # Load the JSON file
+ with open('cv_sections.json', 'r') as f:
+     cv_sections_data = json.load(f)
+
+ def get_cv_prompt(text):
+     sections_list = ", ".join(cv_sections_data["sections"].keys())
+     return f"""<s>[INST]Analyze this CV text in any language. Return a JSON object with key "present_sections" containing an array of sections present from this list: {sections_list}. A section is present if its content is identifiable, even without an explicit title. Consider synonyms and alternative phrasings in any language.
+
+ CV text:
+ {text}
+
+ Respond only with the JSON object, no explanation.[/INST]"""
+
+ def get_location_prompt(text):
+     return f"""<s>[INST]Extract the city and country from the following text. The text may be in any language. Respond with a JSON object in the format {{"city": {{"extracted city name": true/false}}, "country": {{"extracted country name": true/false}}}}. If you can't find the information, set the value to false.
+
+ Text:
+ {text}[/INST]"""
+
+ def get_spelling_grammar_prompt(text):
+     return f"""<s>[INST]Analyze the following text for spelling and grammar errors. The text may be in any language. Do not correct the errors, just count them. Calculate the percentage of errors.
+
+ Text to analyze:
+ {text}
+
+ Respond with a JSON object containing the key 'error_percentage' with the calculated percentage (0-100) of errors.[/INST]"""
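These prompts fix the JSON shapes the downstream parsers rely on. Hedged examples of well-formed model replies (values are illustrative only):

    {"present_sections": ["Education", "Experience", "Skills"]}  # parsed by cv_analyzer.py
    {"city": {"Paris": true}, "country": {"France": true}}       # parsed by personal_information.py
    {"error_percentage": 1.2}                                    # parsed by spelling_grammar_checker.py

Note that the <s>[INST] ... [/INST] wrappers are Mistral-style instruction tags, even though openai_utils.py sends these prompts to gpt-3.5-turbo.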
cv_sections.json ADDED
@@ -0,0 +1,13 @@
+ {
+     "sections": {
+         "Profile": 1,
+         "Skills": 4,
+         "Education": 5,
+         "Interests": 2,
+         "Experience": 5,
+         "Languages": 3,
+         "Contact": 2,
+         "Certificates": 2,
+         "References": 1
+     }
+ }
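Education and Experience carry the most weight (5 each); a CV with all nine sections present would score 1 + 4 + 5 + 2 + 5 + 3 + 2 + 2 + 1 = 25, the maximum possible score_sections.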
error_scoring.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "error_scores": [
+         {"min": 0, "max": 1.5, "score": -1},
+         {"min": 1.5, "max": 2.5, "score": -2},
+         {"min": 2.5, "max": 3.5, "score": -3},
+         {"min": 3.5, "max": 100, "score": -10}
+     ]
+ }
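evaluate_cv_text takes the first band whose inclusive min-max range contains the error percentage, so a 2.0% error rate maps to -2 and anything above 3.5% drops to -10. A boundary value such as exactly 1.5% matches the first band listed (-1), because both endpoints are inclusive and the loop breaks on the first hit.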
ocr_extractor.py ADDED
@@ -0,0 +1,139 @@
+ import sys
+ import importlib.util
+ from PIL import Image
+ import boto3
+ import os
+ from doctr.io import DocumentFile
+ from doctr.models import ocr_predictor
+ import easyocr
+ from shapely.geometry import Polygon
+ from paddleocr import PaddleOCR
+ import langid
+ import json
+ import PyPDF2
+
+ # Check if python-bidi is installed
+ if importlib.util.find_spec("bidi") is None:
+     print("Error: python-bidi is not installed. Please install it using pip install python-bidi")
+     sys.exit(1)
+
+ # Initialize OCR models
+ def load_models(language):
+     doctr_model = ocr_predictor(pretrained=True)
+     easyocr_reader = easyocr.Reader([language])
+     paddleocr_reader = PaddleOCR(use_angle_cls=True, lang=language)
+     return doctr_model, easyocr_reader, paddleocr_reader
+
+ # AWS Textract client
+ textract_client = boto3.client('textract', region_name='us-west-2')
+
+ def extract_text_aws(image_bytes):
+     try:
+         response = textract_client.detect_document_text(Document={'Bytes': image_bytes})
+         return [(item['Text'], item['Geometry']['BoundingBox'], item['Confidence'])
+                 for item in response['Blocks'] if item['BlockType'] == 'WORD']
+     except Exception as e:
+         print(f"Error in AWS Textract: {str(e)}")
+         return []
+
+ def extract_text_doctr(image_path, doctr_model):
+     try:
+         doc = DocumentFile.from_images(image_path)
+         result = doctr_model(doc)
+         return [(word.value, word.geometry, word.confidence)
+                 for block in result.pages[0].blocks for line in block.lines for word in line.words]
+     except Exception as e:
+         print(f"Error in Doctr OCR: {str(e)}")
+         return []
+
+ def extract_text_easyocr(image_path, easyocr_reader):
+     try:
+         result = easyocr_reader.readtext(image_path)
+         return [(detection[1], detection[0], detection[2]) for detection in result]
+     except Exception as e:
+         print(f"Error in EasyOCR: {str(e)}")
+         return []
+
+ def extract_text_paddleocr(image_path, paddleocr_reader):
+     try:
+         result = paddleocr_reader.ocr(image_path, cls=True)
+         return [(line[1][0], line[0], line[1][1]) for line in result[0]]
+     except Exception as e:
+         print(f"Error in PaddleOCR: {str(e)}")
+         return []
+
+ def bbox_to_polygon(bbox):
+     if isinstance(bbox, dict):  # AWS format
+         return Polygon([(bbox['Left'], bbox['Top']),
+                         (bbox['Left'] + bbox['Width'], bbox['Top']),
+                         (bbox['Left'] + bbox['Width'], bbox['Top'] + bbox['Height']),
+                         (bbox['Left'], bbox['Top'] + bbox['Height'])])
+     elif len(bbox) == 4 and all(isinstance(p, (list, tuple)) for p in bbox):  # EasyOCR format
+         return Polygon(bbox)
+     elif len(bbox) == 2:  # Doctr format
+         x, y, w, h = bbox[0][0], bbox[0][1], bbox[1][0] - bbox[0][0], bbox[1][1] - bbox[0][1]
+         return Polygon([(x, y), (x + w, y), (x + w, y + h), (x, y + h)])
+     else:
+         raise ValueError(f"Unsupported bbox format: {bbox}")
+
+ def combine_ocr_results(results, weights):
+     combined_words = []
+     for method, words in results.items():
+         for word, bbox, confidence in words:
+             try:
+                 polygon = bbox_to_polygon(bbox)
+                 combined_words.append((word, polygon, float(confidence) * weights[method]))
+             except Exception as e:
+                 print(f"Error processing word '{word}' from {method}: {str(e)}")
+
+     final_words = []
+     while combined_words:
+         current_word = combined_words.pop(0)
+         overlapping = [w for w in combined_words if current_word[1].intersects(w[1])]
+         if overlapping:
+             best_word = max([current_word] + overlapping, key=lambda x: x[2])
+             final_words.append(best_word[0])
+             for word in overlapping:
+                 combined_words.remove(word)
+         else:
+             final_words.append(current_word[0])
+
+     return ' '.join(final_words)
+
+ def detect_language(text):
+     language, _ = langid.classify(text)
+     return language
+
+ def process_file(file_path, weights_file):
+     _, file_extension = os.path.splitext(file_path)
+
+     if file_extension.lower() == '.pdf':
+         with open(file_path, 'rb') as file:
+             pdf_reader = PyPDF2.PdfReader(file)
+             text = ""
+             for page in pdf_reader.pages:
+                 text += page.extract_text() + "\n"
+             return text
+
+     else:  # Assume it's an image file
+         with open(weights_file, 'r') as f:
+             weights = json.load(f)
+
+         with open(file_path, 'rb') as image_file:
+             image_bytes = image_file.read()
+
+         # Detect language using a sample of text from AWS Textract
+         aws_results = extract_text_aws(image_bytes)
+         sample_text = ' '.join([item[0] for item in aws_results[:10]])
+         detected_language = detect_language(sample_text)
+
+         doctr_model, easyocr_reader, paddleocr_reader = load_models(detected_language)
+
+         results = {
+             "aws": aws_results,
+             "doctr": extract_text_doctr(file_path, doctr_model),
+             "easyocr": extract_text_easyocr(file_path, easyocr_reader),
+             "paddleocr": extract_text_paddleocr(file_path, paddleocr_reader),
+         }
+
+         return combine_ocr_results(results, weights)
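A minimal self-contained sketch of the overlap vote in combine_ocr_results, with two made-up detections of the same word (box coordinates match the EasyOCR four-point branch of bbox_to_polygon). One caveat visible in the code: Textract reports confidences on a 0-100 scale while the other engines use 0-1, and combine_ocr_results multiplies by the weights as-is, so overlapping AWS detections effectively always win:

    from shapely.geometry import Polygon

    aws_conf, easyocr_conf = 99.0, 0.80      # Textract is 0-100, EasyOCR is 0-1
    weights = {"aws": 0.4, "easyocr": 0.2}   # from ocr_weights.json
    aws_box = Polygon([(0.10, 0.10), (0.30, 0.10), (0.30, 0.15), (0.10, 0.15)])
    easy_box = Polygon([(0.12, 0.10), (0.30, 0.10), (0.30, 0.16), (0.12, 0.16)])
    assert aws_box.intersects(easy_box)      # overlapping boxes compete for one slot
    scores = {"aws": aws_conf * weights["aws"], "easyocr": easyocr_conf * weights["easyocr"]}
    print(max(scores, key=scores.get))       # 'aws' (39.6 vs 0.16): the AWS word is kept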
ocr_weights.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "aws": 0.4,
+     "doctr": 0.3,
+     "easyocr": 0.2,
+     "paddleocr": 0.1
+ }
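The weights sum to 1.0 and rank Textract highest. Since a word's effective score in combine_ocr_results is confidence × weight, even on a common 0-1 scale an AWS word at 0.5 confidence (0.5 × 0.4 = 0.20) would beat an EasyOCR word at 0.9 (0.9 × 0.2 = 0.18).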
openai_utils.py ADDED
@@ -0,0 +1,21 @@
+ import os
+ from openai import OpenAI
+
+ # OpenAI configuration
+ client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+ def get_ai_response(messages):
+     """
+     Get a response from the AI model using the OpenAI client.
+     :param messages: List of message dictionaries as expected by OpenAI API
+     :return: The content of the AI's response
+     """
+     try:
+         chat_completion = client.chat.completions.create(
+             model="gpt-3.5-turbo",
+             messages=messages
+         )
+         return chat_completion.choices[0].message.content
+     except Exception as e:
+         print(f"Error getting AI response: {str(e)}")
+         return None
+
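A hedged usage sketch (the prompt string is illustrative): the client reads OPENAI_API_KEY at import time, and the helper returns None on any API error, so callers such as cv_analyzer.py must handle that case:

    # Requires OPENAI_API_KEY in the environment before import.
    from openai_utils import get_ai_response

    reply = get_ai_response([{"role": "user", "content": "Reply with the JSON {\"ok\": true}"}])
    print(reply)  # model text, or None if the API call failed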
personal_info_scores.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "email": 3,
+     "phone": 3,
+     "city": 1,
+     "country": 2
+ }
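A CV where email, phone, city, and country are all found earns the maximum personal-information score of 3 + 3 + 1 + 2 = 9.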
personal_information.py ADDED
@@ -0,0 +1,71 @@
+ import json
+ import re
+ from openai_utils import get_ai_response
+ from ocr_extractor import process_file
+ from cv_prompt import get_location_prompt
+
+ # Load the scoring data
+ with open('personal_info_scores.json', 'r') as f:
+     score_data = json.load(f)
+
+ def extract_email(text):
+     email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
+     emails = re.findall(email_pattern, text)
+     return emails[0] if emails else None
+
+ def extract_phone(text):
+     phone_pattern = r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
+     phones = re.findall(phone_pattern, text)
+     return phones[0] if phones else None
+
+ def extract_location(text):
+     prompt = get_location_prompt(text)
+     messages = [
+         {"role": "user", "content": prompt}
+     ]
+
+     response = get_ai_response(messages)
+
+     if response:
+         try:
+             location_data = json.loads(response)
+             city = any(v for v in location_data.get('city', {}).values())
+             country = any(v for v in location_data.get('country', {}).values())
+         except json.JSONDecodeError:
+             print("Failed to parse JSON from response")
+             city, country = False, False
+     else:
+         city, country = False, False
+
+     return city, country
+
+ def calculate_score(email_exists, phone_exists, city_exists, country_exists):
+     score = 0
+     if email_exists:
+         score += score_data['email']
+     if phone_exists:
+         score += score_data['phone']
+     if city_exists:
+         score += score_data['city']
+     if country_exists:
+         score += score_data['country']
+     return score
+
+ def analyze_personal_info(file_path):
+     text = process_file(file_path, 'ocr_weights.json')
+
+     email = extract_email(text)
+     phone = extract_phone(text)
+     city_exists, country_exists = extract_location(text)
+
+     score = calculate_score(email is not None, phone is not None, city_exists, country_exists)
+
+     result = {
+         "email": {"exists": email is not None, "value": email},
+         "phone": {"exists": phone is not None, "value": phone},
+         "city": {"exists": city_exists},
+         "country": {"exists": country_exists},
+         "score_personal_information": score
+     }
+
+     return json.dumps(result)
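A quick self-contained check of the two regex extractors on a made-up contact line. The phone pattern targets North-American formats; note that its leading \b keeps it from matching a parenthesized area code that follows whitespace (e.g. "(555) 123-4567" after a space), since no word boundary exists before "(":

    import re

    EMAIL = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    PHONE = r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
    line = "Jane Doe | jane.doe@example.com | 555-123-4567 | Paris, France"
    print(re.findall(EMAIL, line))  # ['jane.doe@example.com']
    print(re.findall(PHONE, line))  # ['555-123-4567']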
prompt.py ADDED
@@ -0,0 +1,42 @@
+ import json
+
+ # Load the JSON file
+ with open('cv_sections.json', 'r') as f:
+     cv_sections_data = json.load(f)
+
+ def get_cv_prompt(text):
+     sections_list = ", ".join(cv_sections_data["sections"].keys())
+     return f"""Analyze the following CV text in any language. Identify which sections from the list below are present in the CV. A section is considered present if its content is identifiable, even without an explicit title. Consider synonyms and alternative phrasings in any language.
+
+ Sections to identify: {sections_list}
+
+ CV text:
+ {text}
+
+ Respond with a JSON object containing a single key "present_sections" with an array of identified section names. Do not include any explanations or additional text."""
+
+ def get_location_prompt(text):
+     return f"""Extract the city and country from the following text. The text may be in any language.
+
+ Text:
+ {text}
+
+ Respond with a JSON object in the following format:
+ {{
+     "city": "extracted city name or 'Unknown' if not found",
+     "country": "extracted country name or 'Unknown' if not found"
+ }}
+ Do not include any explanations or additional text."""
+
+ def get_spelling_grammar_prompt(text):
+     return f"""Analyze the following text for spelling and grammar errors. The text may be in any language. Do not correct the errors, just evaluate the overall quality.
+
+ Text to analyze:
+ {text}
+
+ Respond with a JSON object in the following format:
+ {{
+     "score": "a number from 0 to 100 representing the quality of spelling and grammar"
+ }}
+ Where 0 means extremely poor quality and 100 means perfect spelling and grammar.
+ Do not include any explanations or additional text."""
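Note: the analyzers import their prompts from cv_prompt.py, not from this module; prompt.py appears to be an alternative prompt set. Its spelling prompt asks for a 0-100 "score", while spelling_grammar_checker.py parses the "error_percentage" key produced by the cv_prompt.py variant.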
requirements.txt ADDED
@@ -0,0 +1,43 @@
+ # Core dependencies
+ streamlit
+ langchain
+ langchain-openai
+ openai
+ pydantic
+
+ # PDF and document processing
+ PyMuPDF
+ pdf2image
+ python-docx
+ PyPDF2
+
+ # Image processing and OCR
+ Pillow
+ pytesseract
+ opencv-python-headless
+ easyocr
+ paddleocr
+ paddlepaddle==2.4.2
+
+ # Machine Learning and Computer Vision
+ ultralytics
+ torch==1.13.0+cpu
+ torchvision==0.14.0+cpu
+ -f https://download.pytorch.org/whl/torch_stable.html
+
+ # Natural Language Processing
+ langdetect
+ langid
+
+ # AWS Integration
+ boto3
+
+ # Utility libraries
+ numpy
+ shapely
+ python-bidi==0.4.2
+ pyyaml
+ python-dotenv==1.0.0
+
+ # DocTR (Document Text Recognition)
+ python-doctr
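Presumably installed with pip install -r requirements.txt; the -f https://download.pytorch.org/whl/torch_stable.html find-links entry is what lets pip resolve the +cpu builds of torch and torchvision pinned above.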
spelling_grammar_checker.py ADDED
@@ -0,0 +1,51 @@
+ import json
+ import os
+ from openai_utils import get_ai_response
+ from ocr_extractor import process_file
+ from cv_prompt import get_spelling_grammar_prompt
+
+ def check_spelling_and_grammar(text):
+     prompt = get_spelling_grammar_prompt(text)
+     messages = [
+         {"role": "user", "content": prompt}
+     ]
+     response = get_ai_response(messages)
+
+     if not response:
+         print("Unexpected response from OpenAI API")
+         return 100  # Assume 100% errors if we can't get a response
+
+     try:
+         result = json.loads(response)
+         error_percentage = result.get('error_percentage', 100)
+         return min(max(float(error_percentage), 0), 100)  # Ensure the percentage is between 0 and 100
+     except (json.JSONDecodeError, ValueError):
+         print(f"Unable to parse error percentage from API response: {response}")
+         return 100  # Assume 100% errors if we can't parse the response
+
+ def evaluate_cv_text(file_path, weights_file):
+     cv_text = process_file(file_path, weights_file)
+     error_percentage = check_spelling_and_grammar(cv_text)
+
+     error_scoring_file = 'error_scoring.json'
+     if not os.path.exists(error_scoring_file):
+         print(f"Error: {error_scoring_file} not found. Using default scoring.")
+         return -10  # Default score if file not found
+
+     try:
+         with open(error_scoring_file, 'r') as f:
+             error_scoring = json.load(f)
+     except json.JSONDecodeError as e:
+         print(f"Error parsing {error_scoring_file}: {str(e)}. Using default scoring.")
+         return -10  # Default score if JSON is invalid
+
+     score = None
+     for error_score in error_scoring.get('error_scores', []):
+         if error_score['min'] <= error_percentage <= error_score['max']:
+             score = error_score['score']
+             break
+
+     if score is None:
+         score = -10  # Default score if no matching range is found
+
+     return score
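To tie the three modules together, a hedged end-to-end example with illustrative values (all weights come from the JSON configs in this commit):

    score_sections = 5 + 5 + 4 + 3 + 2           # Education, Experience, Skills, Languages, Contact
    score_personal_information = 3 + 3 + 1 + 2   # email, phone, city, country all found
    spelling_grammar_score = -1                  # a 1.2% error rate falls in the 0-1.5 band
    total_score = score_sections + score_personal_information + spelling_grammar_score
    print(total_score)  # 19 + 9 - 1 = 27, the value app.py reports as "Total CV score"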