Commit · 82698c6
Parent(s): 89a8148
test cv extraction

app.py CHANGED
@@ -51,75 +51,85 @@ def read_pdf(file_path):
         output += pytesseract.image_to_string(img, lang='vie') + '\n'
     return output
 
-# Function to query Hugging Face endpoint
-#@spaces.GPU
-# def respond(
-#     message,
-#     history: list[tuple[str, str]],
-#     system_message,
-#     max_tokens,
-#     temperature,
-#     top_p,
-# ):
-#     messages = [{"role": "system", "content": system_message}]
-
-#     for val in history:
-#         if val[0]:
-#             messages.append({"role": "user", "content": val[0]})
-#         if val[1]:
-#             messages.append({"role": "assistant", "content": val[1]})
-
-#     messages.append({"role": "user", "content": message})
-
-#     response = ""
-
-#     for message in client.chat_completion(
-#         messages,
-#         max_tokens=max_tokens,
-#         stream=True,
-#         temperature=temperature,
-#         top_p=top_p,
-#     ):
-#         token = message.choices[0].delta.content
-
-#         response += token
-#     return response
 
 @spaces.GPU(duration=30)
-def LLM_Inference(text):
+def LLM_Inference(cv_text):
     huggingface_hub.login(token=api_token)
     device = torch.device('cuda')
     tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b-it')
     model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it').to(device)
+
+    text = f'''
+    You are an AI designed to extract structured information from unstructured text. Your task is to analyze the content of a candidate's CV and extract the following details:
+
+    **CV**
+    {cv_text}
+
+    **Information extraction and output format**
+    1. Candidate Information
+    - Full Name
+    - Contact Information (Phone, Email, Address, etc.)
+    - Date of Birth (if available)
+
+    2. Education
+    - Degree Name (e.g., Bachelor's, Master's, Ph.D.)
+    - Field of Study (e.g., Computer Science, Business Administration)
+    - Institution Name
+    - Year(s) of Graduation
+
+    3. Professional Experience
+    For each job, extract:
+    - Job Title
+    - Company Name
+    - Duration (start and end dates)
+    - Summarize key Responsibilities and Achievements
+
+    4. Skills
+    - List of technical, soft, or industry-specific skills mentioned.
+
+    5. Certifications
+    - Name of Certification
+    - Issuing Organization
+    - Year of Issuance
+
+    6. Languages
+    - List the languages mentioned in the CV along with proficiency levels (if specified).
+
+    Do not explain, comment, or make up any information that is not relevant to the extraction list above. Respond in Vietnamese. Let's work this out step by step to ensure the correct answer. [END]
+    '''
     inputs = tokenizer(text, return_tensors='pt').to(device)
     with torch.no_grad():
         outputs = model.generate(
             **inputs, max_new_tokens=1024, pad_token_id=tokenizer.eos_token_id
         )
-    return tokenizer.decode(outputs[0])
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-# Gradio Interface for PDF Processing
-def process_file(file, query):
-    pdf_output = read_pdf(file.name)
-    # huggingface_output = respond(query, history=[], system_message="You are a friendly chatbot.",
-    #                              max_tokens=1024, temperature=0.0, top_p=0.99)
-    huggingface_output = LLM_Inference(query)
-    return pdf_output, huggingface_output
 
 # Create Gradio App
-interface = gr.Interface(
-    fn=process_file,
-    inputs=[
-        gr.File(label="Upload a PDF file"),
-        gr.Textbox(label="Enter your query for Hugging Face"),
-    ],
-    outputs=[
-        gr.Textbox(label="PDF Content"),
-        gr.Textbox(label="Hugging Face Output"),
-    ],
-    title="PDF Processor"
-)
+pdf_interface = gr.Interface(
+    fn=read_pdf,
+    inputs=gr.File(label="Upload a PDF file"),
+    outputs=[
+        gr.Textbox(label="PDF Content"),  # Display PDF content
+        gr.State(),  # Store PDF content in a shared state
+    ],
+    title="PDF Processor",
+    description="Upload a PDF file and extract its content."
+)
+
+# Create Gradio interface for Hugging Face inference
+llm_interface = gr.Interface(
+    fn=LLM_Inference,
+    inputs=gr.State(),
+    outputs=gr.Textbox(label="Hugging Face Output"),
+    title="Hugging Face Query",
+    description="Enter a query and get a response from the Hugging Face model."
+)
+
+# Combine both interfaces into a tabbed app
+interface = gr.TabbedInterface(
+    interface_list=[pdf_interface, llm_interface],
+    tab_names=["PDF Processor", "Hugging Face Query"]
+)
 
 # Launch the Gradio App
 if __name__ == "__main__":
     prepare()
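
The new code path after this commit: `read_pdf` OCRs the uploaded PDF (Vietnamese, `lang='vie'`), and `LLM_Inference` wraps that text in the CV-extraction prompt and generates with gemma-2-2b-it. A minimal sketch of that flow outside Gradio, assuming `app.py` is importable and a local `sample_cv.pdf` exists (both are assumptions for illustration, not part of the commit):

```python
# Sketch only: exercises the two functions from this commit directly.
# 'sample_cv.pdf' and the import form are assumptions, not part of app.py.
from app import read_pdf, LLM_Inference

cv_text = read_pdf('sample_cv.pdf')  # OCR the PDF pages with pytesseract (lang='vie')
result = LLM_Inference(cv_text)      # prompt gemma-2-2b-it to extract the CV fields
print(result)
```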
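One wiring caveat: `pdf_interface` and `llm_interface` each create their own `gr.State`, and separate `gr.Interface` objects combined in a `gr.TabbedInterface` do not share session state, so the OCR text stored by the first tab likely never reaches `LLM_Inference` in the second. If cross-tab hand-off is the intent, the usual pattern is a single `gr.Blocks` app with one shared `gr.State`. A hedged sketch under that assumption (tab and component labels follow the commit; the button labels and `ocr` helper are assumptions):

```python
import gradio as gr
from app import read_pdf, LLM_Inference  # assumes these are importable

def ocr(file_path):
    text = read_pdf(file_path)  # gr.File passes a file path by default
    return text, text           # one copy to display, one into shared state

with gr.Blocks() as demo:
    cv_state = gr.State("")  # single State shared by both tabs
    with gr.Tab("PDF Processor"):
        pdf_file = gr.File(label="Upload a PDF file")
        pdf_text = gr.Textbox(label="PDF Content")
        gr.Button("Extract text").click(ocr, inputs=pdf_file,
                                        outputs=[pdf_text, cv_state])
    with gr.Tab("Hugging Face Query"):
        llm_out = gr.Textbox(label="Hugging Face Output")
        gr.Button("Run extraction").click(LLM_Inference, inputs=cv_state,
                                          outputs=llm_out)

if __name__ == "__main__":
    demo.launch()
```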
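On the generation step itself: gemma-2-2b-it is an instruction-tuned chat model, so tokenizing the prompt as a raw string works, but the model generally behaves better when the prompt goes through its chat template. A hedged variant of the tokenize-and-generate lines inside `LLM_Inference` (a sketch, not part of the commit):

```python
# Variant of the tokenize-and-generate step using the tokenizer's chat template.
# 'text', 'tokenizer', 'model', and 'device' are as defined in LLM_Inference.
messages = [{"role": "user", "content": text}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors='pt'
).to(device)
with torch.no_grad():
    outputs = model.generate(
        input_ids, max_new_tokens=1024, pad_token_id=tokenizer.eos_token_id
    )
```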