gneya-bacancy committed on
Commit
efe9805
·
verified ·
1 Parent(s): e0d8f76

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +177 -0
  2. demo.pdf +0 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import tempfile
4
+ # from pypdf import PdfReader, PdfWriter
5
+ # from pdf2image import convert_from_path
6
+ from langchain_community.document_loaders import PyPDFLoader
7
+ from langchain.prompts import PromptTemplate
8
+ from langchain_groq import ChatGroq
9
+ from langchain.chains.llm import LLMChain
10
+ import tempfile
11
+ import markdown2
12
+ from weasyprint import HTML
13
+ from io import BytesIO
14
+
15
def format_string(input_string):
    """Drop a leading "introduction" line from the model's reply.

    Chat models often prefix their answer with a sentence such as
    "Here is the anonymized resume:".  That line is removed only when the
    first non-blank line ends with a colon.  Colons that appear later in the
    text (for example "Skills: Python") are part of the resume and are left
    untouched, so a reply without a preamble is returned unchanged.

    Args:
        input_string: Raw text returned by the language model.

    Returns:
        The text following the preamble line (including the line break that
        terminated it), or ``input_string`` itself when no preamble is found.
    """
    # Ignore blank lines the model may emit before its first sentence.
    first_line, line_break, remainder = input_string.lstrip().partition("\n")

    # Only a line that *ends* with ":" is treated as a preamble; cutting at
    # the first colon anywhere would truncate real resume content.
    if first_line.rstrip().endswith(":"):
        return line_break + remainder
    return input_string
27
def save_uploaded_file(uploadedfile):
    """Write an uploaded file into the system temporary directory.

    Args:
        uploadedfile: Object exposing a ``name`` attribute and a
            ``getbuffer()`` method returning the file's bytes (the object
            returned by ``st.file_uploader`` is what this file passes in).

    Returns:
        The path of the file that was written.

    Raises:
        ValueError: If the upload's name contains no usable file name.
    """
    # The name is supplied by the client.  Keep only its last component so
    # values such as "../../x" or "/etc/x" cannot escape the temp directory.
    # Backslashes are normalised first so Windows-style paths are covered too.
    safe_name = os.path.basename(uploadedfile.name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid upload file name: {uploadedfile.name!r}")

    save_path = os.path.join(tempfile.gettempdir(), safe_name)

    with open(save_path, "wb") as f:
        f.write(uploadedfile.getbuffer())

    return save_path
36
+
37
def read_pdf(file_path):
    """Return the text of a PDF as a single string.

    The file is loaded with ``PyPDFLoader(...).load_and_split()``; the
    ``page_content`` of every returned document is prefixed with a space,
    followed by a blank line, and the pieces are concatenated in order.

    The extracted text is deliberately not printed or logged: it is the raw,
    not-yet-anonymized resume and contains personal data.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated text; an empty string when the loader yields
        no documents.
    """
    loader = PyPDFLoader(file_path)
    documents = loader.load_and_split()
    return "".join(f" {document.page_content}\n\n" for document in documents)
46
def generate_pdf_from_markup(markup_text):
    """Render Markdown text to a PDF file and return the file's path.

    The Markdown is converted to HTML with ``markdown2`` and the HTML is
    written as a PDF with WeasyPrint.

    Args:
        markup_text: Markdown source to render.

    Returns:
        Path of the generated PDF.  The file is created in the system
        temporary directory and is not deleted by this function.
    """
    # Convert Markdown to HTML
    html_content = markdown2.markdown(markup_text)

    # Reserve a unique output path per call.  A fixed name such as
    # "generated.pdf" would be shared by every session on the server, so
    # concurrent users could overwrite or download each other's documents.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
        pdf_path = pdf_file.name

    # Convert HTML to PDF
    HTML(string=html_content).write_pdf(pdf_path)

    return pdf_path
58
+
59
def parse_resume(data):
    """Ask the Groq-hosted LLM to anonymize and reformat resume text.

    The anonymization guidelines and the resume text are combined into a
    single prompt template with one variable, ``resume_text``, and run
    through an ``LLMChain``.

    Args:
        data: Resume text; passed directly to the chain's ``invoke``.

    Returns:
        Whatever ``LLMChain.invoke`` returns.  The caller in this file reads
        the model's answer from its ``"text"`` entry.
    """
    # The API key is read from the GROQ_API_KEY environment variable.
    groq_model = ChatGroq(api_key=os.getenv("GROQ_API_KEY"),model="llama3-70b-8192")

    # Instructions describing what to strip, what to generalise and how the
    # result should be formatted.
    guidelines = """
You are an AI assistant designed to remove and format resume data. When provided with extracted text from a PDF resume, your task is to remove personal information and certain details while maintaining the professional content and structure.
Follow the guidelines below:
Keep projects, experience, technical skills as it is without any change.
Remove Salutations: Mr, Mrs, Ms, etc.
Remove Names: All instances of the candidate's names.
Remove Gender: Any mention of gender.
Remove Age/D.O.B./Astrology Info: Any references to age, date of birth, or astrological signs.
Remove Links of personal accounts for example: exail id, github url, linkedin url and all the other urls except the project and experience urls.
Remove email address, mobile number, or any other information that has personal identity.
Anonymize Location: Replace specific locations with more general terms (e.g., "Willing to relocate, currently based in Leicester").
Anonymize Education Institutions: Replace the names of educational institutions/schools with "top university (e.g. highly reputable university on the global stage) or top school" if applicable.
Anonymize Language Skills: Replace specific languages with regional groupings for multilingual candidates (e.g., "proficient in multiple European languages").
Remove Hobbies and INTERESTS: Remove specific details related to hobbies and interests
Anonymize Other Fields: Make specific removals as needed to protect the candidate's identity.
Remove professional summary, objective, agenda and all these type of sections.
Add only professional achievment, awards and certifactes
Ensure the remaining sections and information are formatted properly to maintain the professional appearance of the resume.
Ensure proper formatting of the resume with proper content justifications, add markdown, add bullet points and spacing wherever required.
Return the output of resume content only. Don't include any notes or comments.
"""

    # Placeholder that receives the resume text itself.
    resume_slot = """
{resume_text}
"""

    # Build the chain and run it in one expression; the guidelines come
    # first in the template, followed by the resume.
    return LLMChain(
        llm=groq_model,
        prompt=PromptTemplate(
            input_variables=["resume_text"],
            template=guidelines + resume_slot,
        ),
    ).invoke(data)
97
+
98
def handle_pdf(file_path):
    """Anonymize the resume at ``file_path`` and offer it as a PDF download.

    The PDF is read, sent to the LLM, cleaned with ``format_string`` and
    shown on the page.  A "Generate PDF" button then renders the displayed
    text to a PDF and exposes a download button for it.

    Streamlit re-executes the whole script on every widget interaction, so
    without caching each click on "Generate PDF" would call the LLM again:
    slow, billed twice, and the PDF could differ from the text on screen.
    The anonymized text is therefore stored in ``st.session_state`` under a
    key derived from the file's content and reused on subsequent reruns.

    Args:
        file_path: Path of the PDF resume to process.
    """
    import hashlib  # local import: only needed for the cache key

    # Key the cache on the file's bytes rather than its path, so uploading a
    # different file under the same name is not served a stale result.
    with open(file_path, "rb") as source:
        digest = hashlib.sha256(source.read()).hexdigest()
    cache_key = f"anonymized_resume::{digest}"

    if cache_key not in st.session_state:
        with st.spinner("Parsing Resume..."):
            data = read_pdf(file_path)
            modified_data = parse_resume(data)
            st.session_state[cache_key] = format_string(modified_data["text"])

    formatted_data = st.session_state[cache_key]
    st.write(formatted_data)

    if st.button("Generate PDF"):
        # Add spinner while generating the PDF
        with st.spinner("Generating PDF..."):
            # Generate the PDF from markup text
            pdf_path = generate_pdf_from_markup(formatted_data)

        st.success("PDF generated successfully.")

        # Offer the generated PDF for download
        with open(pdf_path, "rb") as f:
            pdf_bytes = f.read()
        st.download_button(
            label="Download PDF",
            data=pdf_bytes,
            file_name="generated.pdf",
            mime="application/pdf"
        )
124
+
125
def main():
    """Render the Streamlit page and dispatch to the chosen input source.

    The user either processes the bundled ``demo.pdf`` or uploads a PDF of
    their own; in both cases the file is handed to ``handle_pdf``.
    """
    st.title("Resume Parser")
    option = st.radio(
        "Choose an option:",
        ("Use Demo PDF", "Browse Files"),
    )

    if option == "Use Demo PDF":
        demo_pdf_path = "demo.pdf"
        st.info("You have selected the demo PDF.")

        # st.button() is True only during the rerun triggered by its own
        # click.  handle_pdf() draws further buttons; clicking one of them
        # reruns the script with this button False again, so calling
        # handle_pdf() directly under the button would make the page vanish
        # before the inner click is ever handled.  Remember the choice in
        # session state instead.
        if st.button("Click to go with Demo pdf"):
            st.session_state["demo_pdf_selected"] = True

        if st.session_state.get("demo_pdf_selected", False):
            handle_pdf(demo_pdf_path)

    elif option == "Browse Files":
        # Leaving the demo flow: require a fresh click if the user returns.
        st.session_state["demo_pdf_selected"] = False

        uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

        if uploaded_file is not None:
            original_file_path = save_uploaded_file(uploaded_file)

            st.success(f"File saved at {original_file_path}")

            handle_pdf(original_file_path)
148
+
149
+
150
+ # with st.spinner("Parsing Resume..."):
151
+ # data = read_pdf(original_file_path)
152
+ # modified_data = parse_resume(data)
153
+ # formatted_data = format_string(modified_data["text"])
154
+ # st.write(formatted_data)
155
+ # pdf_path = ""
156
+
157
+ # if st.button("Generate PDF"):
158
+ # # Add spinner while generating the PDF
159
+ # with st.spinner("Generating PDF..."):
160
+ # # Generate the PDF from markup text
161
+ # pdf_path = generate_pdf_from_markup(formatted_data)
162
+
163
+ # st.success("PDF generated successfully.")
164
+
165
+ # # Show the preview of the first page of the PDF
166
+ # with open(pdf_path, "rb") as f:
167
+ # pdf_bytes = f.read()
168
+ # st.download_button(
169
+ # label="Download PDF",
170
+ # data=pdf_bytes,
171
+ # file_name="generated.pdf",
172
+ # mime="application/pdf"
173
+ # )
174
+
175
+
176
+ if __name__ == "__main__":
177
+ main()
demo.pdf ADDED
Binary file (86.8 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pypdf
2
+ streamlit
3
+ langchain
4
+ langchain_groq
5
+ langchain_community
6
+ markdown2
7
+ weasyprint