File size: 4,866 Bytes
5823725
 
 
 
b98edd0
ee3e2d2
5823725
 
 
b2a9bc3
f54310e
5823725
 
 
 
 
 
 
 
 
 
 
bb5d24f
b98edd0
0d93a82
021d971
5ab8b76
 
021d971
 
0b5b10c
 
bb5d24f
72ead82
5823725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d26fee
5823725
 
bb5d24f
31b0aec
82698c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0cbe6c
82698c6
c9beefa
b98edd0
 
c0cbe6c
 
 
 
b98edd0
82698c6
5823725
a4f64ec
 
 
 
 
5823725
a4f64ec
 
82698c6
a4f64ec
 
 
 
82698c6
 
5823725
 
82698c6
5823725
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from PIL import Image
import pytesseract
import os
import pymupdf
import spaces
import torch
import gradio as gr
from prepare import prepare

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import YoutubeLoader, DataFrameLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
import huggingface_hub
#zero = torch.Tensor([0]).cuda()

# Load variables from a .env file (if present) into the process environment,
# then authenticate against the Hugging Face Hub with HF_TOKEN.
load_dotenv()
api_token = os.environ.get("HF_TOKEN")
huggingface_hub.login(token=api_token)

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Tokenizer and causal LM are loaded once at import time; LLM_Inference
# reads them as module globals.
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b')
model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b')
model = model.to(device)

#@spaces.GPU
def read_pdf(file_path):
    """Extract the text of a PDF, falling back to OCR for pages without text.

    Each page is first asked for its embedded text. When a page yields no
    text at all, every image embedded on that page is rendered to PNG in
    memory and passed to Tesseract with the ``'vie'`` language model.

    Args:
        file_path: Location of the PDF, in any form ``pymupdf.open`` accepts.

    Returns:
        One string: the page texts concatenated in page order, with each OCR
        result followed by a newline.
    """
    import io  # local import: only needed for the in-memory OCR hand-off

    chunks = []
    # The context manager closes the document even if extraction or OCR raises.
    with pymupdf.open(file_path) as doc:
        for page in doc:
            text = page.get_text()
            if text:
                chunks.append(text)
                continue

            # No text on this page: OCR every image embedded on it.
            for img in page.get_images():
                xref = img[0]  # first field of the image entry is its XREF
                pix = pymupdf.Pixmap(doc, xref)

                if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

                # Encode to PNG in memory rather than writing a fixed-name
                # file into the working directory: no collisions between
                # concurrent requests and nothing left behind on failure.
                png_bytes = pix.tobytes("png")
                pix = None  # release the pixmap before running OCR
                with Image.open(io.BytesIO(png_bytes)) as image:
                    ocr_text = pytesseract.image_to_string(image, lang='vie')
                chunks.append(ocr_text + '\n')
    return ''.join(chunks)


@spaces.GPU(duration=60)
def LLM_Inference(cv_text):
    """Ask the language model to extract structured fields from CV text.

    The CV text is embedded in a fixed instruction prompt, tokenized (the
    whole prompt is truncated to 2048 tokens), and completed with greedy
    decoding for up to 1024 new tokens.

    Args:
        cv_text: Raw text of the candidate's CV.

    Returns:
        The model's completion as a string. Only the newly generated tokens
        are decoded, so the prompt (and the CV embedded in it) is not echoed
        back in the result.
    """
    text = f'''
    You are an AI designed to extract structured information from unstructured text. Your task is to analyze the content of a candidate's CV and extract the following details:

    **CV**
    {cv_text}

    **Information extraction and output format**
    1. Candidate Information
    - Full Name
    - Contact Information (Phone, Email, Address, etc.)
    - Date of Birth (if available)
    
    2. Education
    - Degree Name (e.g., Bachelor's, Master's, Ph.D.)
    - Field of Study (e.g., Computer Science, Business Administration)
    - Institution Name
    - Year(s) of Graduation
    
    3. Professional Experience
    For each job, extract:
    - Job Title
    - Company Name
    - Duration (start and end dates)
    - Summarize key Responsibilities and Achievements
    
    4. Skills
    - List of technical, soft, or industry-specific skills mentioned.
    
    5. Certifications
    - Name of Certification
    - Issuing Organization
    - Year of Issuance
    
    6. Language
    - List the languages mentioned in the CV along with proficiency levels (if specified).

    Do not explain, comment or make up any more information that is not relative to the list of Information extraction. Respond in the CV language. Let's work this out in a step by step way to ensure the correct answer. Do not repeat the step
    '''
    # NOTE(review): truncation applies to the whole prompt and the CV comes
    # before the instructions, so a very long CV presumably pushes the
    # instructions past the 2048-token limit - confirm the tokenizer's
    # truncation side and consider truncating cv_text on its own.
    inputs = tokenizer(text, return_tensors='pt', max_length=2048, truncation=True).to(device)
    prompt_length = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            pad_token_id=tokenizer.eos_token_id,
            # Deterministic greedy decoding, stated explicitly. The previous
            # temperature/top_p/top_k arguments only take effect when
            # do_sample=True, so they were ignored and merely raised warnings.
            do_sample=False,
        )

    # generate() returns prompt + continuation for decoder-only models;
    # drop the prompt tokens so only the model's answer is returned.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

def process(file_path):
    """Run the full pipeline for one uploaded PDF.

    Args:
        file_path: The uploaded file as handed over by the Gradio input.

    Returns:
        A ``(extracted_text, summary)`` tuple: the text read from the PDF and
        the language model's output for that text.
    """
    extracted = read_pdf(file_path)
    return extracted, LLM_Inference(extracted)

# Gradio front end: one PDF upload in, two text panes out
# (the extracted PDF text and the model's summary of it).
interface = gr.Interface(
    title="PDF Processor",
    description="Upload a PDF file and extract its content.",
    fn=process,
    inputs=gr.File(label="Upload a PDF file"),
    outputs=[
        gr.Textbox(label="PDF Content"),
        gr.Textbox(label="CV Summary"),
    ],
)


# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    prepare()  # setup step from the local `prepare` module, run before the UI starts
    interface.launch()