khoatran94 committed on
Commit
82698c6
·
1 Parent(s): 89a8148

test cv extraction

Files changed (1)
  1. app.py +63 -53
app.py CHANGED
@@ -51,75 +51,85 @@ def read_pdf(file_path):
         output += pytesseract.image_to_string(img, lang='vie') + '\n'
     return output
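Only the tail of read_pdf is visible in this hunk's context. For orientation, a minimal sketch of what the surrounding function plausibly looks like (the pdf2image rasterization step is an assumption; only the pytesseract OCR line is taken from the diff):

    from pdf2image import convert_from_path
    import pytesseract

    def read_pdf(file_path):
        # Assumption: each PDF page is rendered to an image with pdf2image,
        # then OCR'd in Vietnamese; only the image_to_string line below
        # actually appears in the diff.
        output = ''
        for img in convert_from_path(file_path):
            output += pytesseract.image_to_string(img, lang='vie') + '\n'
        return output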
 
-# Function to query Hugging Face endpoint
-#@spaces.GPU
-# def respond(
-#     message,
-#     history: list[tuple[str, str]],
-#     system_message,
-#     max_tokens,
-#     temperature,
-#     top_p,
-# ):
-#     messages = [{"role": "system", "content": system_message}]
-
-#     for val in history:
-#         if val[0]:
-#             messages.append({"role": "user", "content": val[0]})
-#         if val[1]:
-#             messages.append({"role": "assistant", "content": val[1]})
-
-#     messages.append({"role": "user", "content": message})
-
-#     response = ""
-
-#     for message in client.chat_completion(
-#         messages,
-#         max_tokens=max_tokens,
-#         stream=True,
-#         temperature=temperature,
-#         top_p=top_p,
-#     ):
-#         token = message.choices[0].delta.content
-
-#         response += token
-#     return response
 
 @spaces.GPU(duration=30)
-def LLM_Inference(text):
+def LLM_Inference(cv_text):
     huggingface_hub.login(token=api_token)
     device = torch.device('cuda')
     tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b-it')
     model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it').to(device)
+
+    text = f'''
+    You are an AI designed to extract structured information from unstructured text. Your task is to analyze the content of a candidate's CV and extract the following details:
+
+    **CV**
+    {cv_text}
+
+    **Information extraction and output format**
+    1. Candidate Information
+    - Full Name
+    - Contact Information (Phone, Email, Address, etc.)
+    - Date of Birth (if available)
+
+    2. Education
+    - Degree Name (e.g., Bachelor's, Master's, Ph.D.)
+    - Field of Study (e.g., Computer Science, Business Administration)
+    - Institution Name
+    - Year(s) of Graduation
+
+    3. Professional Experience
+    For each job, extract:
+    - Job Title
+    - Company Name
+    - Duration (start and end dates)
+    - Summarize key Responsibilities and Achievements
+
+    4. Skills
+    - List of technical, soft, or industry-specific skills mentioned.
+
+    5. Certifications
+    - Name of Certification
+    - Issuing Organization
+    - Year of Issuance
+
+    6. Languages
+    - List the languages mentioned in the CV along with proficiency levels (if specified).
+
+    Do not explain, comment, or make up any information that is not related to the extraction list above. Respond in Vietnamese. Let's work this out step by step to ensure the correct answer. [END].
+    '''
     inputs = tokenizer(text, return_tensors='pt').to(device)
     with torch.no_grad():
         outputs = model.generate(
             **inputs, max_new_tokens=1024, pad_token_id=tokenizer.eos_token_id
         )
-    return tokenizer.decode(outputs[0])
-
-# Gradio Interface for PDF Processing
-def process_file(file, query):
-    pdf_output = read_pdf(file.name)
-    #huggingface_output = respond(query, history=[], system_message="You are a friendly sChatbot.",
-    #                             max_tokens=1024, temperature=0.0, top_p=0.99)
-    huggingface_output = LLM_Inference(query)
-    return pdf_output, huggingface_output
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
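A note on the prompt handling above: google/gemma-2-2b-it is an instruction-tuned chat model, and the new code feeds it the raw f-string and then decodes the whole output sequence, so the returned text will still include the echoed prompt. A minimal sketch of a drop-in replacement for the tokenization and decoding steps, assuming the same tokenizer, model, device, and text as in LLM_Inference:

    # Wrap the prompt in Gemma's chat template instead of passing raw text
    messages = [{"role": "user", "content": text}]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors='pt'
    ).to(device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids, max_new_tokens=1024, pad_token_id=tokenizer.eos_token_id
        )
    # Decode only the newly generated tokens so the prompt is not echoed back
    return tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)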
 
 # Create Gradio App
-interface = gr.Interface(
-    fn=process_file,
-    inputs=[
-        gr.File(label="Upload a PDF file"),
-        gr.Textbox(label="Enter your query for Hugging Face"),
-    ],
+pdf_interface = gr.Interface(
+    fn=read_pdf,
+    inputs=gr.File(label="Upload a PDF file"),
     outputs=[
-        gr.Textbox(label="PDF Content"),
-        gr.Textbox(label="Hugging Face Output"),
+        gr.Textbox(label="PDF Content"),  # Display PDF content
+        gr.State(),  # Store PDF content in a shared state
     ],
-    title="PDF Processor with Hugging Face Query"
+    title="PDF Processor",
+    description="Upload a PDF file and extract its content."
 )
 
+# Create Gradio interface for Hugging Face inference
+llm_interface = gr.Interface(
+    fn=LLM_Inference,
+    inputs=gr.State(),
+    outputs=gr.Textbox(label="Hugging Face Output"),
+    title="Hugging Face Query",
+    description="Enter a query and get a response from the Hugging Face model."
+)
+
+# Combine both interfaces into a tabbed app
+interface = gr.TabbedInterface(
+    interface_list=[pdf_interface, llm_interface],
+    tab_names=["PDF Processor", "Hugging Face Query"]
+)
 # Launch the Gradio App
 if __name__ == "__main__":
     prepare()
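One caveat with the new tab wiring: read_pdf returns a single string while pdf_interface declares two outputs, and the gr.State() components passed to pdf_interface and llm_interface are two separate instances, so the extracted CV text will likely never reach LLM_Inference. A minimal sketch of one way to actually share the text across tabs with gr.Blocks (component names are illustrative; read_pdf, LLM_Inference, and prepare come from this app):

    import gradio as gr

    with gr.Blocks() as demo:
        cv_state = gr.State("")  # one State instance, visible to both tabs

        with gr.Tab("PDF Processor"):
            pdf_in = gr.File(label="Upload a PDF file")
            pdf_out = gr.Textbox(label="PDF Content")
            # Extract the text once, show it, and stash it in the shared state
            pdf_in.upload(
                fn=lambda f: (read_pdf(f.name),) * 2,
                inputs=pdf_in,
                outputs=[pdf_out, cv_state],
            )

        with gr.Tab("Hugging Face Query"):
            run_btn = gr.Button("Extract CV information")
            llm_out = gr.Textbox(label="Hugging Face Output")
            # Feed the stored CV text to the Gemma prompt in LLM_Inference
            run_btn.click(fn=LLM_Inference, inputs=cv_state, outputs=llm_out)

    if __name__ == "__main__":
        prepare()
        demo.launch()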