Upload 5 files

- final-app.py +44 -0
- final-cv-analyzer.py +88 -0
- final-cv-prompt.py +127 -0
- final-ocr-extractor.py +52 -0
- final-openai-utils.py +22 -0
final-app.py
ADDED
@@ -0,0 +1,44 @@
import streamlit as st
from cv_analyzer import analyze_cv

st.set_page_config(page_title="CV Analyzer", layout="wide")

st.title('CV Analyzer')

uploaded_file = st.file_uploader("Choose a CV file", type=['pdf', 'docx', 'txt'])

if uploaded_file is not None:
    file_content = uploaded_file.read()
    with st.spinner('Analyzing CV...'):
        result = analyze_cv(file_content)

    if "error" in result:
        st.error(result["error"])
    else:
        st.header("Personal Information")
        st.json(result["personal_info"])

        st.header("Spelling and Grammar")
        st.write(f"Score: {result['spelling_grammar_score']:.2f} / 100")

        st.header("Detected Sections")
        st.write(result["detected_sections"])

        st.header("Section Detection Score")
        st.write(f"Score: {result['section_detection_score']}")

        st.header("Content Quality Analysis")
        for section, evaluation in result['content_analysis'].items():
            st.subheader(section.capitalize())
            st.json(evaluation)

        st.header("Overall Content Quality Score")
        st.write(f"Score: {result['overall_score']:.2f} / 10")

if __name__ == "__main__":
    st.sidebar.title("About")
    st.sidebar.info(
        "This CV Analyzer detects sections, extracts personal information, "
        "checks spelling and grammar, analyzes content quality, "
        "and provides a detailed evaluation of the CV."
    )
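For a quick check outside the Streamlit UI, a minimal sketch (it assumes the uploaded modules are saved under the names they import, such as cv_analyzer.py rather than final-cv-analyzer.py, and that sample_cv.pdf is a placeholder path):

# Hypothetical smoke test: feed analyze_cv raw file bytes and inspect the result.
from cv_analyzer import analyze_cv

with open("sample_cv.pdf", "rb") as f:  # placeholder path, not part of this upload
    result = analyze_cv(f.read())

if "error" in result:
    print("Analysis failed:", result["error"])
else:
    print("Overall content quality score:", result["overall_score"])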
final-cv-analyzer.py
ADDED
@@ -0,0 +1,88 @@
import json
import logging
import tempfile
import os
from cv_prompt import (
    ResumeQualityEvaluation,
    get_personal_info_prompt,
    get_spelling_grammar_prompt,
    get_section_detection_prompt,
    get_content_quality_prompt,
    calculate_section_detection_score,
    calculate_overall_score
)
from openai_utils import get_ai_response
from ocr_extractor import process_file
from langchain.output_parsers import PydanticOutputParser

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def analyze_cv(file_content):
    try:
        # Save the file content temporarily
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_file.write(file_content)
            temp_file_path = temp_file.name

        extracted_text = process_file(temp_file_path, 'ocr_weights.json')
        logging.info("Text extracted successfully")

        # Personal Information Extraction
        personal_info_prompt = get_personal_info_prompt(extracted_text)
        personal_info_response = get_ai_response([{"role": "user", "content": personal_info_prompt}])
        if personal_info_response is None:
            return {"error": "Failed to get AI response for personal information"}
        personal_info = json.loads(personal_info_response)

        # Spelling and Grammar Check
        spelling_grammar_prompt = get_spelling_grammar_prompt(extracted_text)
        spelling_grammar_response = get_ai_response([{"role": "user", "content": spelling_grammar_prompt}])
        if spelling_grammar_response is None:
            return {"error": "Failed to get AI response for spelling and grammar"}
        spelling_grammar_result = json.loads(spelling_grammar_response)
        error_percentage = spelling_grammar_result.get('error_percentage', 100)
        spelling_grammar_score = 100 - error_percentage  # Convert error percentage to a score

        # Section Detection
        sections_prompt = get_section_detection_prompt(extracted_text)
        sections_response = get_ai_response([{"role": "user", "content": sections_prompt}])
        if sections_response is None:
            return {"error": "Failed to get AI response for sections"}

        sections_data = json.loads(sections_response)
        detected_sections = sections_data.get('present_sections', [])
        section_detection_score = calculate_section_detection_score(detected_sections)
        logging.info(f"Detected sections: {detected_sections}")
        logging.info(f"Section detection score: {section_detection_score}")

        # Content Quality Analysis
        quality_prompt = get_content_quality_prompt(extracted_text)
        quality_response = get_ai_response([{"role": "user", "content": quality_prompt}])

        if quality_response is None:
            return {"error": "Failed to get AI response for content quality"}

        parser = PydanticOutputParser(pydantic_object=ResumeQualityEvaluation)
        evaluation_result = parser.parse(quality_response)

        overall_score = calculate_overall_score(evaluation_result)

        logging.info("All analyses completed")
        logging.info(f"Overall score: {overall_score}")

        return {
            "extracted_text": extracted_text,
            "personal_info": personal_info,
            "spelling_grammar_score": spelling_grammar_score,
            "detected_sections": detected_sections,
            "section_detection_score": section_detection_score,
            "content_analysis": evaluation_result.dict(),
            "overall_score": overall_score
        }
    except Exception as e:
        logging.error(f"Error in CV analysis: {str(e)}", exc_info=True)
        return {"error": str(e)}
    finally:
        # Clean up the temporary file
        if 'temp_file_path' in locals():
            os.unlink(temp_file_path)
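Because analyze_cv parses the model's replies directly with json.loads, the prompts in cv_prompt below pin down the exact shapes it expects. For the personal-information step, for instance, a well-formed reply deserializes to something like this (values are illustrative placeholders only):

# Illustrative shape of personal_info after json.loads(personal_info_response);
# per get_personal_info_prompt, any field the model cannot find comes back as null.
personal_info = {
    "name": "Jane Doe",
    "email": "jane.doe@example.com",
    "phone": "+1 555 0100",
    "location": "Berlin, Germany"
}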
final-cv-prompt.py
ADDED
@@ -0,0 +1,127 @@
import json
from typing import List
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate

def load_json_file(filename):
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in {filename}: {e}")
        return None

cv_structure = load_json_file('cv_structure.json')
cv_sections = load_json_file('cv_sections.json')

class EducationElement(BaseModel):
    degree_present: bool = Field(description="Whether the degree is present")
    year_present: bool = Field(description="Whether the year is present")
    institution_present: bool = Field(description="Whether the institution is present")
    score: float = Field(description="Score for this education element", ge=0, le=10)

class Education(BaseModel):
    overall_score: float = Field(description="Overall score for the education section", ge=0, le=10)
    elements: List[EducationElement] = Field(description="List of education elements")

class WorkExperienceElement(BaseModel):
    job_title_present: bool = Field(description="Whether the job title is present")
    company_present: bool = Field(description="Whether the company name is present")
    dates_present: bool = Field(description="Whether the start and end dates are present")
    technologies_present: bool = Field(description="Whether the used technologies are present")
    responsibilities_present: bool = Field(description="Whether responsibilities are present")
    achievements_present: bool = Field(description="Whether achievements are present")
    responsibilities_quality: float = Field(description="Quality of responsibilities description", ge=0, le=10)
    achievements_quality: float = Field(description="Quality of achievements description", ge=0, le=10)
    score: float = Field(description="Score for this work experience element", ge=0, le=10)

class WorkExperience(BaseModel):
    overall_score: float = Field(description="Overall score for the work experience section", ge=0, le=10)
    elements: List[WorkExperienceElement] = Field(description="List of work experience elements")

class Profile(BaseModel):
    overall_score: float = Field(description="Overall score for the profile section", ge=0, le=10)
    brief_overview_present: bool = Field(description="Whether a brief overview is present")
    career_goals_present: bool = Field(description="Whether career goals are present")
    objective_present: bool = Field(description="Whether an objective is present")

class ResumeQualityEvaluation(BaseModel):
    education: Education = Field(description="Evaluation of the education section")
    work_experience: WorkExperience = Field(description="Evaluation of the work experience section")
    profile: Profile = Field(description="Evaluation of the profile section")

def get_personal_info_prompt(text):
    return f"""<s>[INST]Extract the personal information from the following CV text. The text may be in any language. Respond with a JSON object in the format {{"name": "extracted name", "email": "extracted email", "phone": "extracted phone number", "location": "extracted location"}}. If you can't find any of the information, set the value to null.

CV text:
{text}[/INST]"""

def get_spelling_grammar_prompt(text):
    return f"""<s>[INST]Analyze the following text for spelling and grammar errors. The text may be in any language. Do not correct the errors, just count them. Calculate the percentage of errors.

Text to analyze:
{text}

Respond with a JSON object containing the key 'error_percentage' with the calculated percentage (0-100) of errors.[/INST]"""

def get_section_detection_prompt(text):
    if cv_sections is None:
        return None
    sections_list = ", ".join(cv_sections['sections'].keys())
    return f"""<s>[INST] Analyze this CV text and identify which of the following sections are present: {sections_list}.
A section is considered present if its content is identifiable, even without an explicit title.
Consider synonyms and alternative phrasings for section titles.

Sections to look for:
{sections_list}

CV text:
{text}

Respond with a JSON object with a key "present_sections" containing an array of the identified sections.
Only include sections that are actually present in the CV. [/INST]"""

def get_content_quality_prompt(text):
    parser = PydanticOutputParser(pydantic_object=ResumeQualityEvaluation)

    prompt = PromptTemplate(
        template="""<s>[INST]Evaluate the quality of the following resume sections:

{resume}

Provide a detailed evaluation following this format:
{format_instructions}

For each section, evaluate the presence of required elements.
For the Work Experience section, also evaluate the quality of the Responsibilities and Achievements descriptions on a scale of 0-10.
Provide an overall score for each section on a scale of 0-10 based on the presence of elements and their quality where applicable.[/INST]""",
        input_variables=["resume"],
        partial_variables={"format_instructions": parser.get_format_instructions()}
    )

    return prompt.format(resume=text)

def calculate_section_detection_score(detected_sections):
    total_score = 0
    for section in detected_sections:
        if section in cv_sections['sections']:
            total_score += cv_sections['sections'][section]
    return total_score

def calculate_overall_score(evaluation: ResumeQualityEvaluation) -> float:
    education_weight = 0.3
    work_experience_weight = 0.5
    profile_weight = 0.2

    overall_score = (
        evaluation.education.overall_score * education_weight +
        evaluation.work_experience.overall_score * work_experience_weight +
        evaluation.profile.overall_score * profile_weight
    )

    return round(overall_score, 2)

__all__ = ['ResumeQualityEvaluation', 'get_personal_info_prompt', 'get_spelling_grammar_prompt',
           'get_section_detection_prompt', 'get_content_quality_prompt',
           'calculate_section_detection_score', 'calculate_overall_score']
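cv_structure.json and cv_sections.json are not part of this upload. From the way the module reads them, cv_sections needs a "sections" object mapping each section name to the points it contributes in calculate_section_detection_score. A hypothetical example, with made-up names and weights:

# Hypothetical cv_sections.json matching what get_section_detection_prompt and
# calculate_section_detection_score expect; these section names and point values
# are illustrative, not the ones this Space actually ships.
import json

example_cv_sections = {
    "sections": {
        "profile": 15,
        "work_experience": 30,
        "education": 25,
        "skills": 20,
        "languages": 10
    }
}

with open("cv_sections.json", "w", encoding="utf-8") as f:
    json.dump(example_cv_sections, f, indent=2)

For the overall score, the fixed weights in calculate_overall_score mean that section scores of, say, education 8.0, work experience 6.0, and profile 7.0 combine as 8.0 * 0.3 + 6.0 * 0.5 + 7.0 * 0.2 = 6.8.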
final-ocr-extractor.py
ADDED
@@ -0,0 +1,52 @@
import sys
import importlib.util
from PIL import Image
import boto3
import os
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import easyocr
from shapely.geometry import Polygon
from paddleocr import PaddleOCR
import langid
import json
import PyPDF2

# Check if python-bidi is installed
if importlib.util.find_spec("bidi") is None:
    print("Error: python-bidi is not installed. Please install it using pip install python-bidi")
    sys.exit(1)

# Initialize OCR models
def load_models(language):
    doctr_model = ocr_predictor(pretrained=True)
    easyocr_reader = easyocr.Reader([language])
    paddleocr_reader = PaddleOCR(use_angle_cls=True, lang=language)
    return doctr_model, easyocr_reader, paddleocr_reader

# AWS Textract client
textract_client = boto3.client('textract', region_name='us-west-2')

def extract_text_aws(image_bytes):
    try:
        response = textract_client.detect_document_text(Document={'Bytes': image_bytes})
        return [(item['Text'], item['Geometry']['BoundingBox'], item['Confidence'])
                for item in response['Blocks'] if item['BlockType'] == 'WORD']
    except Exception as e:
        print(f"Error in AWS Textract: {str(e)}")
        return []

def extract_text_doctr(image_path, doctr_model):
    try:
        doc = DocumentFile.from_images(image_path)
        result = doctr_model(doc)
        return [(word.value, word.geometry, word.confidence)
                for block in result.pages[0].blocks for line in block.lines for word in line.words]
    except Exception as e:
        print(f"Error in Doctr OCR: {str(e)}")
        return []

def extract_text_easyocr(image_path, easyocr_reader):
    try:
        result = easyocr_reader.readtext(image_path)
        # EasyOCR yields (bounding_box, text, confidence); reorder to (text, geometry, confidence)
        return [(detection[1], detection[0], detection[2]) for detection in result]
    except Exception as e:
        print(f"Error in EasyOCR: {str(e)}")
        return []
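Each extract_text_* helper normalizes its engine's output to (text, geometry, confidence) tuples, so downstream merging code (such as the process_file that final-cv-analyzer.py imports, which is not shown in this upload) can treat the engines interchangeably. A hedged usage sketch, assuming AWS credentials are configured and cv_page.png is a placeholder image:

# Hypothetical use of the normalized (text, geometry, confidence) tuples.
with open("cv_page.png", "rb") as f:  # placeholder path
    words = extract_text_aws(f.read())

for text, geometry, confidence in words:
    print(f"{text!r} detected with confidence {confidence:.1f}")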
final-openai-utils.py
ADDED
@@ -0,0 +1,22 @@
import os
from openai import OpenAI

# OpenAI configuration
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
OPENAI_MODEL = "gpt-3.5-turbo"

def get_ai_response(messages):
    """
    Get a response from the AI model using the OpenAI client.
    :param messages: List of message dictionaries as expected by OpenAI API
    :return: The content of the AI's response
    """
    try:
        response = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=messages
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error getting AI response: {str(e)}")
        return None
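A minimal usage sketch (it assumes OPENAI_API_KEY is set in the environment; get_ai_response returns None instead of raising on failure, which is why the callers in final-cv-analyzer.py check for None):

# OpenAI-style chat messages in, assistant text (or None on error) out.
reply = get_ai_response([{"role": "user", "content": "Reply with the word ready."}])
if reply is None:
    print("Request failed; see the printed error above.")
else:
    print(reply)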