CR7CAD committed on
Commit
08361f0
·
verified ·
1 Parent(s): 9753cc9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -15
app.py CHANGED
@@ -5,11 +5,11 @@ from PIL import Image
5
  from transformers import pipeline
6
  from pdfminer.high_level import extract_text
7
 
8
- # Load and cache the OCR model once at startup
9
  @st.cache_resource(show_spinner=False)
10
  def load_ocr_pipeline():
11
  try:
12
- # Ensure your transformers library is up-to-date (>=4.x)
13
  ocr_pipe = pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")
14
  return ocr_pipe
15
  except Exception as e:
@@ -20,7 +20,7 @@ ocr_pipeline = load_ocr_pipeline()
20
  st.write("Model loaded successfully!")
21
 
22
  #####################################
23
- # Extract Text from File Function
24
  #####################################
25
  def extract_text_from_file(file_obj):
26
  full_text = ""
@@ -43,10 +43,17 @@ def extract_text_from_file(file_obj):
43
  return full_text
44
 
45
  #####################################
46
- # Information Extraction Functions
47
  #####################################
48
  def extract_basic_resume_info(text):
49
- """Extract basic resume info: Name, Age, Job Experience, Skills, Expected Industry/Direction."""
 
 
 
 
 
 
 
50
  info = {
51
  "Name": None,
52
  "Age": None,
@@ -55,22 +62,22 @@ def extract_basic_resume_info(text):
55
  "Expected Industry/Direction": None,
56
  }
57
 
58
- # Extract name (e.g., "Name: John Doe")
59
  name_match = re.search(r"[Nn]ame[:\-]\s*([A-Za-z\s]+)", text)
60
  if name_match:
61
  info["Name"] = name_match.group(1).strip()
62
  else:
63
- # Heuristic: pick the first sequence of capitalized words
64
  potential_names = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', text)
65
  if potential_names:
66
  info["Name"] = potential_names[0]
67
 
68
- # Extract age
69
  age_match = re.search(r"[Aa]ge[:\-]\s*(\d{1,2})", text)
70
  if age_match:
71
  info["Age"] = age_match.group(1)
72
 
73
- # Extract job experience (years)
74
  exp_match = re.search(r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE)
75
  if exp_match:
76
  info["Job Experience"] = exp_match.group(1) + " years"
@@ -79,14 +86,14 @@ def extract_basic_resume_info(text):
79
  if exp_line:
80
  info["Job Experience"] = exp_line.group(2).strip()
81
 
82
- # Extract skills (e.g., "Skills: Python, Java, SQL")
83
  skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
84
  if skills_match:
85
  skills_text = skills_match.group(1)
86
  skills = [s.strip() for s in re.split(r",|\n", skills_text) if s.strip()]
87
  info["Skills"] = skills
88
 
89
- # Extract expected industry/direction
90
  industry_match = re.search(r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE)
91
  if industry_match:
92
  info["Expected Industry/Direction"] = industry_match.group(2).strip()
@@ -94,15 +101,15 @@ def extract_basic_resume_info(text):
94
  return info
95
 
96
  #####################################
97
- # Main Processing Logic
98
  #####################################
99
  def process_resume(file_obj):
100
  if file_obj is None:
101
  return None, None
102
 
103
- # Extract text based on file type (PDF or image)
104
  resume_text = extract_text_from_file(file_obj)
105
- # Parse basic resume info using heuristics
106
  resume_info = extract_basic_resume_info(resume_text)
107
  return resume_text, resume_info
108
 
@@ -111,7 +118,7 @@ def process_resume(file_obj):
111
  #####################################
112
  st.title("Resume Extraction and Basic Info Parsing")
113
  st.markdown("""
114
- Upload a resume file (PDF, PNG, JPG, or JPEG) to extract basic text and candidate information.
115
  """)
116
 
117
  uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])
 
5
  from transformers import pipeline
6
  from pdfminer.high_level import extract_text
7
 
8
+ # Load and cache the OCR model once at startup.
9
  @st.cache_resource(show_spinner=False)
10
  def load_ocr_pipeline():
11
  try:
12
+ # Initialize the OCR pipeline from transformers. Change the model as needed.
13
  ocr_pipe = pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")
14
  return ocr_pipe
15
  except Exception as e:
 
20
  st.write("Model loaded successfully!")
21
 
22
  #####################################
23
+ # Text Extraction Function
24
  #####################################
25
  def extract_text_from_file(file_obj):
26
  full_text = ""
 
43
  return full_text
44
 
45
  #####################################
46
+ # Resume Information Extraction Functions
47
  #####################################
48
  def extract_basic_resume_info(text):
49
+ """
50
+ Extract basic resume information, such as:
51
+ - Name
52
+ - Age
53
+ - Job Experience
54
+ - Skills
55
+ - Expected Industry/Direction
56
+ """
57
  info = {
58
  "Name": None,
59
  "Age": None,
 
62
  "Expected Industry/Direction": None,
63
  }
64
 
65
+ # Extract Name: trigger words like 'Name:'
66
  name_match = re.search(r"[Nn]ame[:\-]\s*([A-Za-z\s]+)", text)
67
  if name_match:
68
  info["Name"] = name_match.group(1).strip()
69
  else:
70
+ # Fallback: heuristic for sequences of capitalized words.
71
  potential_names = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', text)
72
  if potential_names:
73
  info["Name"] = potential_names[0]
74
 
75
+ # Extract Age:
76
  age_match = re.search(r"[Aa]ge[:\-]\s*(\d{1,2})", text)
77
  if age_match:
78
  info["Age"] = age_match.group(1)
79
 
80
+ # Extract Job Experience (years)
81
  exp_match = re.search(r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE)
82
  if exp_match:
83
  info["Job Experience"] = exp_match.group(1) + " years"
 
86
  if exp_line:
87
  info["Job Experience"] = exp_line.group(2).strip()
88
 
89
+ # Extract Skills (e.g., "Skills: Python, Java, SQL")
90
  skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
91
  if skills_match:
92
  skills_text = skills_match.group(1)
93
  skills = [s.strip() for s in re.split(r",|\n", skills_text) if s.strip()]
94
  info["Skills"] = skills
95
 
96
+ # Extract Expected Industry/Direction (e.g., "Interest: Software Development")
97
  industry_match = re.search(r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE)
98
  if industry_match:
99
  info["Expected Industry/Direction"] = industry_match.group(2).strip()
 
101
  return info
102
 
103
  #####################################
104
+ # Main Resume Processing Logic
105
  #####################################
106
  def process_resume(file_obj):
107
  if file_obj is None:
108
  return None, None
109
 
110
+ # Extract text based on file type (PDF or image).
111
  resume_text = extract_text_from_file(file_obj)
112
+ # Parse basic resume details from the extracted text.
113
  resume_info = extract_basic_resume_info(resume_text)
114
  return resume_text, resume_info
115
 
 
118
  #####################################
119
  st.title("Resume Extraction and Basic Info Parsing")
120
  st.markdown("""
121
+ Upload a resume file in PDF or image format (PNG, JPG, JPEG) to extract text and candidate information.
122
  """)
123
 
124
  uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])