from PIL import Image
import pytesseract
import os
import pymupdf
import spaces
import torch
import gradio as gr
from prepare import prepare
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import YoutubeLoader, DataFrameLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
import huggingface_hub
#zero = torch.Tensor([0]).cuda()
load_dotenv()
api_token = os.getenv("HF_TOKEN")
huggingface_hub.login(token=api_token)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b')
model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b').to(device)
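# Note: BitsAndBytesConfig is imported above but unused. A minimal, optional sketch of
# 4-bit loading (assumes bitsandbytes and accelerate are installed and a CUDA GPU is
# available; `quant_config` is a hypothetical name):
#   quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
#   model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b',
#                                                quantization_config=quant_config,
#                                                device_map='auto')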
#@spaces.GPU
def read_pdf(file_path):
    output = ''
    doc = pymupdf.open(file_path)
    for page in range(len(doc)):
        text = doc[page].get_text()
        if text:
            output += text
        else:
            # No text layer on this page: OCR the embedded images instead
            image_list = doc[page].get_images()
            for image_index, img in enumerate(image_list, start=1):  # enumerate the image list
                xref = img[0]  # get the XREF of the image
                pix = pymupdf.Pixmap(doc, xref)  # create a Pixmap
                if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                path = "page_{}-image_{}.png".format(page, image_index)
                pix.save(path)  # save the image as PNG
                pix = None
                with Image.open(path) as img:
                    output += pytesseract.image_to_string(img, lang='vie') + '\n'
                os.remove(path)
    return output
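# Example usage (hypothetical file name): returns the text layer of each page, or the
# Vietnamese OCR output for image-only pages.
#   cv_text = read_pdf("sample_cv.pdf")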
@spaces.GPU(duration=60)
def LLM_Inference(cv_text):
    text = f'''
You are an AI designed to extract structured information from unstructured text. Your task is to analyze the content of a candidate's CV and extract the following details:
**CV**
{cv_text}
**Information extraction and output format**
1. Candidate Information
- Full Name
- Contact Information (Phone, Email, Address, etc.)
- Date of Birth (if available)
2. Education
- Degree Name (e.g., Bachelor's, Master's, Ph.D.)
- Field of Study (e.g., Computer Science, Business Administration)
- Institution Name
- Year(s) of Graduation
3. Professional Experience
For each job, extract:
- Job Title
- Company Name
- Duration (start and end dates)
- Summarize key Responsibilities and Achievements
4. Skills
- List of technical, soft, or industry-specific skills mentioned.
5. Certifications
- Name of Certification
- Issuing Organization
- Year of Issuance
6. Language
- List the languages mentioned in the CV along with proficiency levels (if specified).
Do not explain, comment, or make up any information that is not relevant to the list above. Respond in the language of the CV. Let's work this out step by step to ensure the correct answer. Do not repeat the steps.
    '''
    inputs = tokenizer(text, return_tensors='pt', max_length=2048, truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False,  # greedy decoding: deterministic output for structured extraction
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
def process(file_path):
    cv_text = read_pdf(file_path)
    cv_summary = LLM_Inference(cv_text)
    return cv_text, cv_summary
# Create Gradio App
interface = gr.Interface(
    fn=process,
    inputs=gr.File(label="Upload a PDF file"),
    outputs=[
        gr.Textbox(label="PDF Content"),  # display extracted PDF text
        gr.Textbox(label="CV Summary"),   # display structured extraction from the LLM
    ],
    title="PDF Processor",
    description="Upload a PDF file and extract its content."
)
# Launch the Gradio App
if __name__ == "__main__":
    prepare()
    interface.launch()