khoatran94 committed on
Commit
82698c6
·
1 Parent(s): 89a8148

test cv extraction

Files changed (1)
  1. app.py +63 -53
app.py CHANGED
@@ -51,75 +51,85 @@ def read_pdf(file_path):
         output += pytesseract.image_to_string(img, lang='vie') + '\n'
     return output
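Only the tail of read_pdf is visible in this hunk's context. For orientation, a minimal sketch of what the surrounding function plausibly looks like (the pdf2image rasterization step is an assumption; only the pytesseract OCR line is taken from the diff):

    from pdf2image import convert_from_path
    import pytesseract

    def read_pdf(file_path):
        # Assumption: each PDF page is rendered to an image with pdf2image,
        # then OCR'd in Vietnamese; only the image_to_string line below
        # actually appears in the diff.
        output = ''
        for img in convert_from_path(file_path):
            output += pytesseract.image_to_string(img, lang='vie') + '\n'
        return output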
 
-# Function to query Hugging Face endpoint
-#@spaces.GPU
-# def respond(
-#     message,
-#     history: list[tuple[str, str]],
-#     system_message,
-#     max_tokens,
-#     temperature,
-#     top_p,
-# ):
-#     messages = [{"role": "system", "content": system_message}]
-
-#     for val in history:
-#         if val[0]:
-#             messages.append({"role": "user", "content": val[0]})
-#         if val[1]:
-#             messages.append({"role": "assistant", "content": val[1]})
-
-#     messages.append({"role": "user", "content": message})
-
-#     response = ""
-
-#     for message in client.chat_completion(
-#         messages,
-#         max_tokens=max_tokens,
-#         stream=True,
-#         temperature=temperature,
-#         top_p=top_p,
-#     ):
-#         token = message.choices[0].delta.content
-
-#         response += token
-#     return response
 
 @spaces.GPU(duration=30)
-def LLM_Inference(text):
+def LLM_Inference(cv_text):
     huggingface_hub.login(token=api_token)
     device = torch.device('cuda')
     tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b-it')
     model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it').to(device)
+
+    text = f'''
+    You are an AI designed to extract structured information from unstructured text. Your task is to analyze the content of a candidate's CV and extract the following details:
+
+    **CV**
+    {cv_text}
+
+    **Information extraction and output format**
+    1. Candidate Information
+    - Full Name
+    - Contact Information (Phone, Email, Address, etc.)
+    - Date of Birth (if available)
+
+    2. Education
+    - Degree Name (e.g., Bachelor's, Master's, Ph.D.)
+    - Field of Study (e.g., Computer Science, Business Administration)
+    - Institution Name
+    - Year(s) of Graduation
+
+    3. Professional Experience
+    For each job, extract:
+    - Job Title
+    - Company Name
+    - Duration (start and end dates)
+    - Summarize key Responsibilities and Achievements
+
+    4. Skills
+    - List of technical, soft, or industry-specific skills mentioned.
+
+    5. Certifications
+    - Name of Certification
+    - Issuing Organization
+    - Year of Issuance
+
+    6. Languages
+    - List the languages mentioned in the CV along with proficiency levels (if specified).
+
+    Do not explain, comment, or make up any information that is not related to the extraction list above. Respond in Vietnamese. Let's work this out step by step to ensure the correct answer. [END].
+    '''
     inputs = tokenizer(text, return_tensors='pt').to(device)
     with torch.no_grad():
         outputs = model.generate(
             **inputs, max_new_tokens=1024, pad_token_id=tokenizer.eos_token_id
         )
-    return tokenizer.decode(outputs[0])
-
-# Gradio Interface for PDF Processing
-def process_file(file, query):
-    pdf_output = read_pdf(file.name)
-    #huggingface_output = respond(query, history=[], system_message="You are a friendly sChatbot.",
-    #                             max_tokens=1024, temperature=0.0, top_p=0.99)
-    huggingface_output = LLM_Inference(query)
-    return pdf_output, huggingface_output
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
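A note on the prompt handling above: google/gemma-2-2b-it is an instruction-tuned chat model, and the new code feeds it the raw f-string and then decodes the whole output sequence, so the returned text will still include the echoed prompt. A minimal sketch of a drop-in replacement for the tokenization and decoding steps, assuming the same tokenizer, model, device, and text as in LLM_Inference:

    # Wrap the prompt in Gemma's chat template instead of passing raw text
    messages = [{"role": "user", "content": text}]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors='pt'
    ).to(device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids, max_new_tokens=1024, pad_token_id=tokenizer.eos_token_id
        )
    # Decode only the newly generated tokens so the prompt is not echoed back
    return tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)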
 
 # Create Gradio App
-interface = gr.Interface(
-    fn=process_file,
-    inputs=[
-        gr.File(label="Upload a PDF file"),
-        gr.Textbox(label="Enter your query for Hugging Face"),
-    ],
+pdf_interface = gr.Interface(
+    fn=read_pdf,
+    inputs=gr.File(label="Upload a PDF file"),
     outputs=[
-        gr.Textbox(label="PDF Content"),
-        gr.Textbox(label="Hugging Face Output"),
+        gr.Textbox(label="PDF Content"),  # Display PDF content
+        gr.State(),  # Store PDF content in a shared state
     ],
-    title="PDF Processor with Hugging Face Query"
+    title="PDF Processor",
+    description="Upload a PDF file and extract its content."
 )
 
+# Create Gradio interface for Hugging Face inference
+llm_interface = gr.Interface(
+    fn=LLM_Inference,
+    inputs=gr.State(),
+    outputs=gr.Textbox(label="Hugging Face Output"),
+    title="Hugging Face Query",
+    description="Enter a query and get a response from the Hugging Face model."
+)
+
+# Combine both interfaces into a tabbed app
+interface = gr.TabbedInterface(
+    interface_list=[pdf_interface, llm_interface],
+    tab_names=["PDF Processor", "Hugging Face Query"]
+)
 # Launch the Gradio App
 if __name__ == "__main__":
     prepare()
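One caveat with the new tab wiring: read_pdf returns a single string while pdf_interface declares two outputs, and the gr.State() components passed to pdf_interface and llm_interface are two separate instances, so the extracted CV text will likely never reach LLM_Inference. A minimal sketch of one way to actually share the text across tabs with gr.Blocks (component names are illustrative; read_pdf, LLM_Inference, and prepare come from this app):

    import gradio as gr

    with gr.Blocks() as demo:
        cv_state = gr.State("")  # one State instance, visible to both tabs

        with gr.Tab("PDF Processor"):
            pdf_in = gr.File(label="Upload a PDF file")
            pdf_out = gr.Textbox(label="PDF Content")
            # Extract the text once, show it, and stash it in the shared state
            pdf_in.upload(
                fn=lambda f: (read_pdf(f.name),) * 2,
                inputs=pdf_in,
                outputs=[pdf_out, cv_state],
            )

        with gr.Tab("Hugging Face Query"):
            run_btn = gr.Button("Extract CV information")
            llm_out = gr.Textbox(label="Hugging Face Output")
            # Feed the stored CV text to the Gemma prompt in LLM_Inference
            run_btn.click(fn=LLM_Inference, inputs=cv_state, outputs=llm_out)

    if __name__ == "__main__":
        prepare()
        demo.launch()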