Spaces:

tensora
/

wg-candidate-search

Sleeping

App Files Files Community

wg-candidate-search / app.py

mbosse99

Upload 4 files

9a804ac over 1 year ago

raw

history blame

19.2 kB

	import io
	import os
	import openai
	import re
	import sqlite3
	import streamlit as st
	from streamlit_js_eval import streamlit_js_eval
	from langchain.embeddings.openai import OpenAIEmbeddings
	from langchain.vectorstores.azuresearch import AzureSearch
	from PyPDF2 import PdfReader

	# Service configuration for Azure OpenAI and Azure Cognitive Search.
	#
	# Secrets are read from the deployment environment (for example the secret
	# store of the hosting platform) and are never written into this file.
	# Any key that was ever committed to source control must be treated as
	# leaked and rotated.
	_REQUIRED_SECRETS = ("OPENAI_API_KEY", "AZURE_SEARCH_KEY")
	_missing_secrets = [name for name in _REQUIRED_SECRETS if not os.environ.get(name)]
	if _missing_secrets:
	    # Fail fast with a clear message instead of failing later inside a client.
	    raise RuntimeError(
	        "Missing required environment variable(s): " + ", ".join(_missing_secrets)
	    )

	# Non-secret settings: the previous values stay as defaults, but the
	# environment may override them. These variables are also read further down
	# via os.environ (AZURE_SEARCH_ENDPOINT) when the search client is built.
	os.environ.setdefault("OPENAI_API_BASE", "https://tensora-oai.openai.azure.com/")
	os.environ.setdefault("OPENAI_API_TYPE", "azure")
	os.environ.setdefault("OPENAI_API_VERSION", "2023-03-15-preview")
	os.environ.setdefault("AZURE_SEARCH_ENDPOINT", "https://tensora-search.search.windows.net")

	openai.api_key = os.environ["OPENAI_API_KEY"]
	openai.api_base = os.environ["OPENAI_API_BASE"]
	openai.api_type = os.environ["OPENAI_API_TYPE"]
	# NOTE(review): this differs from OPENAI_API_VERSION above
	# ("2023-03-15-preview"); both values are kept as they were - confirm
	# whether the two clients are meant to use different API versions.
	openai.api_version = "2023-05-15"

	# Page-wide CSS: centre the content of every Streamlit column and keep
	# <h3> headings left-aligned.
	_PAGE_CSS = """
	<style>
	[data-testid=column]{
	text-align: center;
	display: flex;
	align-items: center;
	justify-content: center;
	}
	h3{
	text-align: left;
	}
	</style>
	"""
	st.markdown(_PAGE_CSS, unsafe_allow_html=True)

	# Template of the system prompt; it is filled in later with str.format().
	with open("sys_prompt_frontend.txt") as prompt_file:
	    sys_prompt = prompt_file.read()

	def adjust_numbering(lst):
	    """Renumber a list of question strings sequentially, starting at 1.

	    A leading "<number>." prefix (with or without a following space) is
	    removed from every item before the new "<position>. " prefix is added.
	    Items that carry no such prefix are numbered as they are, instead of
	    raising IndexError or losing the text before their first ". ".

	    Args:
	        lst: Iterable of question strings, e.g. ["2. Why?", "1. How?"].

	    Returns:
	        A new list such as ["1. Why?", "2. How?"].
	    """
	    # Anchored at the start so that only the numbering is stripped; the
	    # negative look-ahead keeps decimals such as "3.5 years" intact.
	    numbering = re.compile(r"^\s*\d+\.(?!\d)\s*")
	    return [
	        f"{position}. {numbering.sub('', item)}"
	        for position, item in enumerate(lst, start=1)
	    ]

	def check_keywords_in_content(database_path, table_name, input_id, keywords):
	    """Check whether all keywords occur in the stored content of one row.

	    The row is looked up by its ``id``; its second column is taken to be the
	    content. The comparison is case-insensitive, and surrounding whitespace
	    of each keyword is ignored so that a comma separated input such as
	    "master, phd" matches "PhD" at the start of a line as well. Empty
	    keywords are skipped.

	    Args:
	        database_path: Path of the SQLite database file.
	        table_name: Name of the table to query. It is interpolated into the
	            SQL text (identifiers cannot be bound as parameters), so it must
	            come from trusted code, never from user input.
	        input_id: Value of the ``id`` column to look up.
	        keywords: Iterable of keyword strings.

	    Returns:
	        True if the row exists and contains every non-empty keyword,
	        otherwise False.
	    """
	    wanted = [keyword.strip().lower() for keyword in keywords]
	    wanted = [keyword for keyword in wanted if keyword]

	    conn = sqlite3.connect(database_path)
	    try:
	        cursor = conn.cursor()
	        cursor.execute(f'SELECT * FROM {table_name} WHERE id = ?', (input_id,))
	        row = cursor.fetchone()
	    finally:
	        # Close the connection even if the query raises.
	        conn.close()

	    if not row:
	        print("ID not found")
	        return False

	    # Assumption carried over from the original code: content is the second
	    # column. A NULL content is treated as empty text.
	    content = (row[1] or "").lower()
	    return all(keyword in content for keyword in wanted)

	# Seed the session-state slots with their initial values. Existing values
	# are left alone so that they survive Streamlit reruns.
	_SESSION_DEFAULTS = (
	    ("similarity_search_string", None),
	    ("job_string", None),
	    ("docs_res", None),
	    ("final_candidates", None),
	    ("final_question_string", []),
	    ("ai_questions", None),
	)
	for _slot, _initial in _SESSION_DEFAULTS:
	    if _slot not in st.session_state:
	        st.session_state[_slot] = _initial

	# Create the Azure Search vector store only when the session has none yet
	# and keep it in session state.
	if "db" not in st.session_state:
	    embedder = OpenAIEmbeddings(deployment="text-embedding-ada-002", chunk_size=1)
	    st.session_state["db"] = AzureSearch(
	        index_name="wg-cvs",
	        azure_search_endpoint=os.environ.get("AZURE_SEARCH_ENDPOINT"),
	        azure_search_key=os.environ.get("AZURE_SEARCH_KEY"),
	        embedding_function=embedder.embed_query,
	    )


	# Header row: page title in the wide column, brand image in the narrow one.
	title_col, image_col = st.columns([2, 1])
	title_col.title("Candidate Search")
	image_col.image("https://www.workgenius.com/wp-content/uploads/2023/03/WorkGenius_navy-1.svg")

	st.write("Please upload the job description for which you would like candidates to be proposed.")

	# Upload row: PDF uploader (stored under the session key "job") with a
	# "Clear" button next to it.
	upload_col, clear_col = st.columns([6, 1])
	uploaded_file_jobdescription = upload_col.file_uploader(
	    "Upload the job description:", type=["pdf"], key="job"
	)
	with clear_col:
	    clear_clicked = st.button("Clear", use_container_width=True)
	    if clear_clicked:
	        # Reloads the browser page.
	        streamlit_js_eval(js_expressions="parent.window.location.reload()")

	# Free-text keywords; read again when the candidate search is executed.
	text_area_params = st.text_area(
	    label="Add additional search parameters, which are separated by commas (e.g. master, phd, web developer, spanish)"
	)

	# Candidate search -> candidate review -> interview question workflow.
	#
	# The script runs top to bottom on every Streamlit rerun; all progress is
	# kept in st.session_state:
	#   job_string            text of the uploaded job description
	#   docs_res              candidates proposed by the last search
	#   final_candidates      candidates accepted by the user
	#   ai_questions          generated questions that can still be edited
	#   final_question_string accepted, numbered questions
	# Earlier experiments that only existed as commented-out code (single
	# search, temporary index, GPT-4 screening) were dropped from this block;
	# they remain available in version control history.
	# NOTE(review): confirm the placement of the UI sections against the
	# running app.

	CV_DB_PATH = "cvdb.db"
	CV_TABLE_NAME = "files"
	CANDIDATES_PER_SEARCH = 100  # window of ranked results inspected per round
	TARGET_CANDIDATES_COUNT = 10  # stop once this many CVs passed the keyword check
	MAX_SEARCH_DEPTH = 600  # never inspect more than this many ranked results
	# Leading one- or two-digit numbering such as "3." at the start of a question.
	QUESTION_NUMBER_PATTERN = re.compile(r"^[1-9][0-9]?\.")


	def _extract_pdf_text(uploaded_pdf):
	    """Return the concatenated text of all pages of an uploaded PDF."""
	    # getvalue() returns the whole buffer regardless of the current stream
	    # position, so parsing also works when the file was read before.
	    reader = PdfReader(io.BytesIO(uploaded_pdf.getvalue()))
	    return "".join(page.extract_text() or "" for page in reader.pages)


	def _search_candidates(job_text, raw_keywords):
	    """Return up to TARGET_CANDIDATES_COUNT CVs that contain every keyword.

	    The similarity search is repeated with a growing ``k``; in every round
	    only the results that were not inspected before are checked against the
	    CV database. The search stops when enough candidates were found, when
	    MAX_SEARCH_DEPTH results were inspected, or when the index returns no
	    further results.
	    """
	    query_string = "The following keywords must be included: " + raw_keywords + " " + job_text
	    # Strip the blanks that follow the commas ("master, phd") and drop empty
	    # entries; an empty list means that no keyword filter is applied.
	    keywords = [part.strip() for part in raw_keywords.split(",") if part.strip()]

	    matches = []
	    offset = 0
	    while len(matches) < TARGET_CANDIDATES_COUNT and offset < MAX_SEARCH_DEPTH:
	        ranked = st.session_state["db"].similarity_search(
	            query_string, k=CANDIDATES_PER_SEARCH + offset
	        )
	        unseen = ranked[offset:]
	        if not unseen:
	            # The index has no more results; further rounds cannot add any.
	            break
	        for candidate in unseen:
	            candidate_id = candidate.metadata["source"].split("/")[-1]
	            if check_keywords_in_content(CV_DB_PATH, CV_TABLE_NAME, candidate_id, keywords):
	                matches.append(candidate)
	                if len(matches) >= TARGET_CANDIDATES_COUNT:
	                    break
	        offset += CANDIDATES_PER_SEARCH
	    return matches


	def _render_candidate_review():
	    """List the proposed candidates with "Remove" buttons and an accept button."""
	    candidates = st.session_state["docs_res"]
	    for i, doc in enumerate(candidates):
	        content_col, action_col = st.columns([6, 1])
	        with action_col:
	            if st.button("Remove", use_container_width=True, key="btn_rm_cv_row_" + str(i)):
	                candidates.pop(i)
	                # st.rerun() aborts this run, so the loop never continues on
	                # the mutated list.
	                st.rerun()
	        with content_col:
	            with st.expander(doc.metadata["source"]):
	                st.write(doc.page_content)
	    if st.button("Accept candidates", key="accept_candidates_btn"):
	        st.session_state["final_candidates"] = candidates.copy()
	        st.rerun()


	def _generate_questions():
	    """Ask the chat model for interview questions and store them in session state."""
	    final_candidates = st.session_state["final_candidates"]
	    # NOTE(review): only the first accepted candidate is inserted into the
	    # prompt, and as the document object itself rather than its page_content.
	    # This is unchanged from the previous behaviour - confirm whether the
	    # CVs of all accepted candidates should be used instead.
	    system = sys_prompt.format(
	        job=st.session_state["job_string"], resume=final_candidates[0], n=15
	    )
	    st.info("The questions are generated. This may take a short moment.", icon="ℹ️")
	    try:
	        with st.spinner("Loading..."):
	            res = openai.ChatCompletion.create(
	                engine="gpt-4",
	                temperature=0.2,
	                messages=[{"role": "system", "content": system}],
	            )
	        lines = res.choices[0]["message"]["content"].split("\n")
	        questions = [line for line in lines if line.strip()]
	    except Exception as e:
	        print(f"Fehler beim generieren der Fragen: {str(e)}")
	        st.error("An error has occurred. Please reload the page or contact the admin.", icon="🚨")
	        return
	    if not questions:
	        # Without this guard an empty answer would trigger a new request on
	        # every rerun.
	        st.error("An error has occurred. Please reload the page or contact the admin.", icon="🚨")
	        return
	    st.session_state["ai_questions"] = questions
	    for i in range(len(questions)):
	        st.session_state["disable_row_" + str(i)] = False
	    st.rerun()


	def _render_question_editor():
	    """Show the generated questions as editable text areas with delete buttons."""
	    questions = st.session_state["ai_questions"]
	    for i, question in enumerate(questions):
	        text_col, action_col = st.columns([5, 1])
	        with action_col:
	            if st.button("Delete", use_container_width=True, key="btn_del_row_" + str(i)):
	                # Remove by position; removing by value would hit the first
	                # of two identical questions.
	                questions.pop(i)
	                st.rerun()
	        with text_col:
	            st.text_area(
	                label="Question " + str(i + 1) + ":",
	                value=question,
	                label_visibility="collapsed",
	                key="text_area_" + str(i),
	                disabled=st.session_state.get("disable_row_" + str(i), False),
	            )
	    st.write("If you are satisfied with the questions, then accept them. You can still sort them afterwards.")
	    if st.button("Accept all questions", use_container_width=True, key="accept_all_questions"):
	        accepted = st.session_state["final_question_string"]
	        for i in range(len(questions)):
	            edited_text = st.session_state["text_area_" + str(i)]
	            body = QUESTION_NUMBER_PATTERN.sub("", edited_text).strip()
	            # Always store "<n>. <text>" so that the renumbering after a
	            # reorder can rely on one format.
	            accepted.append(f"{len(accepted) + 1}. {body}")
	            st.session_state["disable_row_" + str(i)] = True
	        st.rerun()


	def _move_question(index, target):
	    """Swap two accepted questions, renumber the list and rerun the script."""
	    accepted = st.session_state["final_question_string"]
	    accepted[index], accepted[target] = accepted[target], accepted[index]
	    st.session_state["final_question_string"] = adjust_numbering(accepted)
	    st.rerun()


	def _render_final_questions():
	    """Show the accepted questions with "Up"/"Down" buttons for reordering."""
	    accepted = st.session_state["final_question_string"]
	    last_index = len(accepted) - 1
	    for i, final_q in enumerate(accepted):
	        text_col, action_col = st.columns([5, 1])
	        with action_col:
	            if st.button("Up", use_container_width=True, key="btn_up_row_" + str(i), disabled=(i == 0)):
	                if i > 0:
	                    _move_question(i, i - 1)
	            if st.button("Down", use_container_width=True, key="btn_down_row_" + str(i), disabled=(i == last_index)):
	                if i < last_index:
	                    _move_question(i, i + 1)
	        with text_col:
	            st.write(final_q)


	# The search button is locked once candidates have been accepted.
	submit = st.button("Search candidates", disabled=bool(st.session_state["final_candidates"]))

	if submit:
	    job_file = st.session_state["job"]
	    if not job_file:
	        st.error("Please upload a job description to search for candidates")
	    else:
	        # Parse the PDF on every search so that a newly uploaded job
	        # description is used, then run the search exactly once.
	        st.session_state["job_string"] = _extract_pdf_text(job_file)
	        with st.spinner("Load the candidates, this may take a moment..."):
	            st.session_state["docs_res"] = _search_candidates(
	                st.session_state["job_string"], text_area_params
	            )
	        if not st.session_state["docs_res"]:
	            st.error("No candidates can be found with these keywords. Please adjust the keywords and try again.", icon="🚨")

	if st.session_state["docs_res"]:
	    if not st.session_state["final_candidates"]:
	        _render_candidate_review()
	    else:
	        st.subheader("Your Candidates:")
	        st.write(", ".join(candidate.metadata["source"] for candidate in st.session_state["final_candidates"]))
	        if not st.session_state["ai_questions"]:
	            _generate_questions()
	        else:
	            if not st.session_state["final_question_string"]:
	                _render_question_editor()
	            _render_final_questions()
	            if st.button("Submit", use_container_width=True):
	                st.success('Successful search for candidates and generation of questions')