Nasma committed on
Commit
bfbf372
·
verified ·
1 Parent(s): 2925c24

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +22 -31
main.py CHANGED
@@ -8,7 +8,6 @@ from PyPDF2 import PdfReader
8
  from PIL import Image
9
  import fitz # PyMuPDF
10
  import openai
11
- import pytesseract
12
  from dotenv import load_dotenv
13
 
14
  # Load environment variables
@@ -28,7 +27,6 @@ app.add_middleware(
28
  allow_headers=["*"],
29
  )
30
 
31
-
32
  def vision(file_content):
33
  """Extract text from images inside a PDF using PyMuPDF & OCR."""
34
  pdf_document = fitz.open(stream=file_content, filetype="pdf")
@@ -65,7 +63,6 @@ def vision(file_content):
65
  except Exception as e:
66
  raise HTTPException(status_code=500, detail=f"Error in GPT-4o vision processing: {str(e)}")
67
 
68
-
69
  @app.post("/get_ocr_data/")
70
  def get_data(input_file: UploadFile = File(...)):
71
  """Extract structured data from a PDF resume."""
@@ -77,7 +74,7 @@ def get_data(input_file: UploadFile = File(...)):
77
 
78
  if file_type == "application/pdf":
79
  pdf_reader = PdfReader(io.BytesIO(file_content))
80
-
81
  for page in pdf_reader.pages:
82
  text = page.extract_text()
83
  if text:
@@ -86,36 +83,36 @@ def get_data(input_file: UploadFile = File(...)):
86
  if not extracted_text.strip(): # If no text found, use vision processing
87
  print("\nVision OCR running...\n")
88
  extracted_text = vision(file_content)
89
-
90
  else:
91
  raise HTTPException(status_code=400, detail="Unsupported file type")
92
 
93
  print("Extracted Text:\n", extracted_text.strip())
94
 
95
- # Call GPT-4o to structure extracted text into JSON format
96
- prompt = f"""This is CV data: {extracted_text.strip()}.
97
- IMPORTANT: The output should be a JSON array! Make sure the JSON is valid.
98
- If no data is found, fill missing fields with "none". Do not include extra explanation text.
 
99
 
100
  Example Output:
101
  ```json
102
- {{
103
- "firstname": "First Name",
104
- "lastname": "Last Name",
105
- "email": "Email Address",
106
- "contact_number": "Contact Number",
107
- "home_address": "Full Home Address",
108
- "home_town": "Home Town or City",
109
- "total_years_of_experience": "Total Years of Experience",
110
- "education": "Institution Name, Degree Name",
111
- "LinkedIn_link": "LinkedIn URL",
112
- "experience": "Job Title, Start Date - End Date, Company Name; Job Title, Start Date - End Date, Company Name; Job Title, Start Date - End Date, Company Name",
113
  "industry": "industry of work",
114
- "skills": "Skill 1, Skill 2, Skill 3",
115
- "positions": ["Job Title 1", "Job Title 2"],
116
- "summary": "Summary of qualifications and experience"
117
- }}
118
- ```"""
119
 
120
  response = openai.ChatCompletion.create(
121
  model="gpt-4o",
@@ -133,9 +130,3 @@ def get_data(input_file: UploadFile = File(...)):
133
 
134
  except Exception as e:
135
  raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")
136
-
137
-
138
-
139
-
140
-
141
-
 
8
  from PIL import Image
9
  import fitz # PyMuPDF
10
  import openai
 
11
  from dotenv import load_dotenv
12
 
13
  # Load environment variables
 
27
  allow_headers=["*"],
28
  )
29
 
 
30
  def vision(file_content):
31
  """Extract text from images inside a PDF using PyMuPDF & OCR."""
32
  pdf_document = fitz.open(stream=file_content, filetype="pdf")
 
63
  except Exception as e:
64
  raise HTTPException(status_code=500, detail=f"Error in GPT-4o vision processing: {str(e)}")
65
 
 
66
  @app.post("/get_ocr_data/")
67
  def get_data(input_file: UploadFile = File(...)):
68
  """Extract structured data from a PDF resume."""
 
74
 
75
  if file_type == "application/pdf":
76
  pdf_reader = PdfReader(io.BytesIO(file_content))
77
+
78
  for page in pdf_reader.pages:
79
  text = page.extract_text()
80
  if text:
 
83
  if not extracted_text.strip(): # If no text found, use vision processing
84
  print("\nVision OCR running...\n")
85
  extracted_text = vision(file_content)
 
86
  else:
87
  raise HTTPException(status_code=400, detail="Unsupported file type")
88
 
89
  print("Extracted Text:\n", extracted_text.strip())
90
 
91
+ # Call GPT-4o to process extracted text into structured JSON
92
+ prompt = f"""
93
+ This is CV data: {extracted_text.strip()}.
94
+ IMPORTANT: The output should be a JSON array! Make sure the JSON is valid. If no data is found, fill missing fields with "none". Do not add any extra explanation text.
95
+ Need only JSON output.
96
 
97
  Example Output:
98
  ```json
99
+ [
100
+ "firstname": "firstname",
101
+ "lastname": "lastname",
102
+ "email": "email",
103
+ "contact_number": "contact number",
104
+ "home_address": "full home address",
105
+ "home_town": "home town or city",
106
+ "total_years_of_experience": "total years of experience",
107
+ "education": "Institution Name, Country, Degree Name, Graduation Year; Institution Name, Country, Degree Name, Graduation Year",
108
+ "LinkedIn_link": "LinkedIn link",
109
+ "experience": "experience",
110
  "industry": "industry of work",
111
+ "skills": "skills (Identify and list specific skills mentioned in both the skills section and inferred from the experience section), formatted as: Skill 1, Skill 2, Skill 3, Skill 4, Skill 5",
112
+ "positions": ["Job title 1, Job title 2, Job title 3"]
113
+ ]
114
+ ```
115
+ """
116
 
117
  response = openai.ChatCompletion.create(
118
  model="gpt-4o",
 
130
 
131
  except Exception as e:
132
  raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")