from PIL import Image
import pytesseract
import os
import pymupdf
import spaces
import torch
import gradio as gr
from prepare import prepare
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import YoutubeLoader, DataFrameLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
import huggingface_hub
#zero = torch.Tensor([0]).cuda()
load_dotenv()
api_token = os.getenv("HF_TOKEN")
huggingface_hub.login(token=api_token)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b')
model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b').to(device)
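# Note: BitsAndBytesConfig is imported above but unused. A minimal, optional sketch of
# 4-bit loading (assumes bitsandbytes and accelerate are installed and a CUDA GPU is
# available; `quant_config` is a hypothetical name):
#   quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
#   model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b',
#                                                quantization_config=quant_config,
#                                                device_map='auto')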
#@spaces.GPU
def read_pdf(file_path):
    output = ''
    doc = pymupdf.open(file_path)
    for page in range(len(doc)):
        text = doc[page].get_text()
        if text:
            output += text
        else:
            # No text layer on this page: OCR the embedded images instead
            image_list = doc[page].get_images()
            for image_index, img in enumerate(image_list, start=1):  # enumerate the image list
                xref = img[0]  # get the XREF of the image
                pix = pymupdf.Pixmap(doc, xref)  # create a Pixmap
                if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                path = "page_{}-image_{}.png".format(page, image_index)
                pix.save(path)  # save the image as PNG
                pix = None
                with Image.open(path) as img:
                    output += pytesseract.image_to_string(img, lang='vie') + '\n'
                os.remove(path)
    return output
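# Example usage (hypothetical file name): returns the text layer of each page, or the
# Vietnamese OCR output for image-only pages.
#   cv_text = read_pdf("sample_cv.pdf")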
@spaces.GPU(duration=60)
def LLM_Inference(cv_text):
    text = f'''
You are an AI designed to extract structured information from unstructured text. Your task is to analyze the content of a candidate's CV and extract the following details:
**CV**
{cv_text}
**Information extraction and output format**
1. Candidate Information
- Full Name
- Contact Information (Phone, Email, Address, etc.)
- Date of Birth (if available)
2. Education
- Degree Name (e.g., Bachelor's, Master's, Ph.D.)
- Field of Study (e.g., Computer Science, Business Administration)
- Institution Name
- Year(s) of Graduation
3. Professional Experience
For each job, extract:
- Job Title
- Company Name
- Duration (start and end dates)
- Summarize key Responsibilities and Achievements
4. Skills
- List of technical, soft, or industry-specific skills mentioned.
5. Certifications
- Name of Certification
- Issuing Organization
- Year of Issuance
6. Language
- List the languages mentioned in the CV along with proficiency levels (if specified).
Do not explain, comment, or make up any information that is not relevant to the list above. Respond in the language of the CV. Let's work this out step by step to ensure the correct answer. Do not repeat the steps.
    '''
    inputs = tokenizer(text, return_tensors='pt', max_length=2048, truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False,  # greedy decoding: deterministic output for structured extraction
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
def process(file_path):
    cv_text = read_pdf(file_path)
    cv_summary = LLM_Inference(cv_text)
    return cv_text, cv_summary
# Create Gradio App
interface = gr.Interface(
    fn=process,
    inputs=gr.File(label="Upload a PDF file"),
    outputs=[
        gr.Textbox(label="PDF Content"),  # display extracted PDF text
        gr.Textbox(label="CV Summary"),   # display structured extraction from the LLM
    ],
    title="PDF Processor",
    description="Upload a PDF file and extract its content."
)
# Launch the Gradio App
if __name__ == "__main__":
    prepare()
    interface.launch()