# Page residue from the hosting site's file viewer (not part of the program): uploaded by mbosse99, "Upload 4 files", commit 9a804ac, 19.2 kB.
import io
import os
import openai
import re
import sqlite3
import streamlit as st
from streamlit_js_eval import streamlit_js_eval
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch
from PyPDF2 import PdfReader
# --- Service configuration ---------------------------------------------------
# SECURITY: the OpenAI and Azure Search API keys used to be hard-coded in this
# file. Secrets must never live in source control; they are now expected in the
# environment (e.g. as deployment secrets). The keys that were committed
# previously should be considered leaked and be rotated.
_missing_secrets = [
    name for name in ("OPENAI_API_KEY", "AZURE_SEARCH_KEY") if not os.environ.get(name)
]
if _missing_secrets:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_secrets)
    )

# Non-secret endpoint settings. They are exported through the environment as
# well; AZURE_SEARCH_ENDPOINT is read back when the vector store is created,
# the OPENAI_* values are presumably picked up by the langchain embeddings
# client (verify against the installed langchain version).
os.environ["OPENAI_API_BASE"] = "https://tensora-oai.openai.azure.com/"
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview"
os.environ["AZURE_SEARCH_ENDPOINT"] = "https://tensora-search.search.windows.net"

# Settings for direct calls through the openai module (chat completions).
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = "https://tensora-oai.openai.azure.com/"
openai.api_type = "azure"
# NOTE(review): this differs from OPENAI_API_VERSION above ("2023-03-15-preview");
# kept as-is, confirm whether the two versions are meant to diverge.
openai.api_version = "2023-05-15"
# Global CSS overrides: centre the content of every Streamlit column (text and
# flex alignment) and keep h3 headings left-aligned. The stylesheet is injected
# as raw HTML, which is why unsafe_allow_html is required.
_PAGE_STYLE = """
<style>
[data-testid=column]{
text-align: center;
display: flex;
align-items: center;
justify-content: center;
}
h3{
text-align: left;
}
</style>
"""
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)
# Load the system prompt template used for question generation. It is filled
# in later via str.format with the placeholders job, resume and n.
# The encoding is explicit so decoding does not depend on the platform locale.
with open("sys_prompt_frontend.txt", encoding="utf-8") as f:
    sys_prompt = f.read()
def adjust_numbering(lst):
    """Renumber question strings as "1. ...", "2. ..." following list order.

    A leading number prefix such as "3." or "3. " is removed from every item
    before the new number is prepended. Items without such a prefix are kept
    in full instead of raising IndexError or being cut at the first ". " that
    happens to occur inside the question text.

    Args:
        lst: Sequence of question strings, optionally already numbered.

    Returns:
        A new list with each item rewritten as "<position>. <text>".
    """
    # Anchored at the start so only the numbering is stripped, never a
    # number that appears later in the question.
    number_prefix = re.compile(r"^\s*\d+\.\s*")
    return [
        f"{position}. {number_prefix.sub('', item, count=1)}"
        for position, item in enumerate(lst, start=1)
    ]
def check_keywords_in_content(database_path, table_name, input_id, keywords):
    """Return True if the stored content for ``input_id`` contains every keyword.

    The row whose ``id`` equals ``input_id`` is fetched from ``table_name`` in
    the SQLite database at ``database_path``; each keyword is then matched as a
    case-insensitive substring of the row's second column.

    Args:
        database_path: Path of the SQLite database file.
        table_name: Table to query. It is interpolated into the SQL text
            (table names cannot be bound as parameters), so it must come from
            trusted code, never from user input.
        input_id: Value compared against the ``id`` column (bound parameter).
        keywords: Iterable of keyword strings. Surrounding whitespace is
            ignored and empty entries are skipped, so the pieces of a
            comma-separated input such as "master, phd" match as expected.

    Returns:
        False when no row with that id exists (a notice is printed) or when at
        least one keyword is missing; True otherwise, including when no
        non-empty keyword was given.
    """
    # Normalise once up front: trim whitespace left over from splitting on
    # commas and drop blanks, which carry no constraint.
    wanted = [kw.strip().lower() for kw in keywords if kw and kw.strip()]

    conn = sqlite3.connect(database_path)
    try:
        row = conn.execute(
            f'SELECT * FROM {table_name} WHERE id = ?', (input_id,)
        ).fetchone()
    finally:
        # Always release the connection, also when the query raises.
        conn.close()

    if not row:
        print("ID not found")
        return False

    # Assumption carried over from the original: content is the second column.
    # A NULL content is treated as empty text instead of crashing on .lower().
    content = (row[1] or "").lower()
    return all(kw in content for kw in wanted)
# Seed every session-state slot the page relies on. Streamlit re-executes the
# script on each interaction, so a slot is only initialised when it is absent.
_session_defaults = {
    "similarity_search_string": None,
    "job_string": None,
    "docs_res": None,
    "final_candidates": None,
    "final_question_string": [],
    "ai_questions": None,
}
for _slot, _initial in _session_defaults.items():
    if _slot not in st.session_state:
        st.session_state[_slot] = _initial

# Create the Azure Search vector store once per session and keep it in the
# session state so later reruns reuse the same client.
if "db" not in st.session_state:
    embedder = OpenAIEmbeddings(deployment="text-embedding-ada-002", chunk_size=1)
    embedding_function = embedder.embed_query
    st.session_state["db"] = AzureSearch(
        index_name="wg-cvs",
        azure_search_endpoint=os.environ.get("AZURE_SEARCH_ENDPOINT"),
        azure_search_key=os.environ.get("AZURE_SEARCH_KEY"),
        embedding_function=embedding_function,
    )
# Page header: title on the left, company logo on the right.
title_col, logo_col = st.columns([2, 1])
with title_col:
    st.title("Candidate Search")
with logo_col:
    st.image("https://www.workgenius.com/wp-content/uploads/2023/03/WorkGenius_navy-1.svg")

st.write("Please upload the job description for which you would like candidates to be proposed.")

# Job-description upload next to a "Clear" button that reloads the browser
# page (and thereby starts a fresh session).
col_file, col_clear = st.columns([6, 1])
with col_file:
    uploaded_file_jobdescription = st.file_uploader(
        "Upload the job description:",
        type=["pdf"],
        key="job",
    )
with col_clear:
    clear_clicked = st.button("Clear", use_container_width=True)
    if clear_clicked:
        streamlit_js_eval(js_expressions="parent.window.location.reload()")

# Optional comma-separated keywords every proposed CV has to contain.
text_area_params = st.text_area(
    label="Add additional search parameters, which are separated by commas (e.g. master, phd, web developer, spanish)"
)

# Searching is locked once a candidate selection has been accepted.
submit = st.button(
    "Search candidates",
    disabled=bool(st.session_state["final_candidates"]),
)
# Number prefix ("1.", "12.") that generated questions usually carry; it is
# removed before an accepted question receives its final number.
_QUESTION_NUMBER_PREFIX = re.compile(r"^[1-9][0-9]?\.")


def _parse_keywords(raw):
    """Split the comma-separated keyword field into trimmed, non-empty keywords."""
    return [part.strip() for part in raw.split(",") if part.strip()]


def _extract_pdf_text(pdf_bytes):
    """Return the concatenated text of all pages of a PDF given as bytes."""
    reader = PdfReader(io.BytesIO(pdf_bytes))
    # extract_text() may yield nothing for image-only pages; treat that as "".
    return "".join(page.extract_text() or "" for page in reader.pages)


def _search_candidates(vector_db, query, keywords, db_path="cvdb.db", table_name="files",
                       page_size=100, target_count=10, max_results=600):
    """Collect up to ``target_count`` similarity hits whose CV contains all keywords.

    The vector store is queried with a growing ``k`` (page_size, 2 * page_size,
    ...) and only the hits not inspected in an earlier round are checked. The
    search stops when enough candidates were found, when ``max_results`` hits
    have been inspected, or when the store returns fewer hits than requested
    (a larger ``k`` could not add anything new).
    """
    matches = []
    offset = 0
    while len(matches) < target_count and offset < max_results:
        requested = offset + page_size
        hits = vector_db.similarity_search(query, k=requested)
        for hit in hits[offset:]:
            # The CV id is the last path segment of the document source.
            cv_id = hit.metadata["source"].split("/")[-1]
            if check_keywords_in_content(db_path, table_name, cv_id, keywords):
                matches.append(hit)
                if len(matches) >= target_count:
                    break
        if len(hits) < requested:
            break
        offset += page_size
    return matches


def _move_question(index, neighbour):
    """Swap two accepted questions, renumber the list and rerun the script."""
    questions = st.session_state["final_question_string"]
    if 0 <= index < len(questions) and 0 <= neighbour < len(questions):
        questions[index], questions[neighbour] = questions[neighbour], questions[index]
        st.session_state["final_question_string"] = adjust_numbering(questions)
        st.rerun()


job_file = st.session_state["job"]

if submit and not job_file:
    st.error("Please upload a job description to search for candidates")

# Run the search on every click on "Search candidates" as long as there is
# something to search with: a fresh upload, or the job text of an earlier
# search. Doing this in one place avoids the previous behaviour where a
# re-search without hits immediately triggered a second identical search.
# NOTE: earlier experiments that lived here as commented-out code (a single
# similarity search, re-ranking through a temporary index, GPT-4 screening of
# each CV) were removed; see the version history if they are needed again.
if submit and (st.session_state["docs_res"] or job_file):
    if not st.session_state["job_string"] and job_file:
        # getvalue() returns the whole upload regardless of the stream position.
        st.session_state["job_string"] = _extract_pdf_text(job_file.getvalue())
    with st.spinner("Load the candidates, this may take a moment..."):
        query_string = (
            "The following keywords must be included: "
            + text_area_params
            + " "
            + (st.session_state["job_string"] or "")
        )
        st.session_state["docs_res"] = _search_candidates(
            st.session_state["db"],
            query_string,
            _parse_keywords(text_area_params),
        )
        if not st.session_state["docs_res"]:
            st.error("No candidates can be found with these keywords. Please adjust the keywords and try again.", icon="🚨")

if (job_file and submit) or st.session_state["docs_res"]:
    if not st.session_state["final_candidates"]:
        # Phase 1: review the proposed candidates and remove unwanted ones.
        for i, doc in enumerate(st.session_state["docs_res"]):
            cols_final = st.columns([6, 1])
            with cols_final[1]:
                if st.button("Remove", use_container_width=True, key="btn_rm_cv_row_" + str(i)):
                    st.session_state["docs_res"].pop(i)
                    st.rerun()
            with cols_final[0]:
                with st.expander(doc.metadata["source"]):
                    st.write(doc.page_content)
        if st.button("Accept candidates", key="accept_candidates_btn"):
            st.session_state["final_candidates"] = st.session_state["docs_res"].copy()
            st.rerun()
    else:
        # Phase 2: interview questions for the accepted candidates.
        final_candidates = st.session_state["final_candidates"]
        st.subheader("Your Candidates:")
        st.write(", ".join(candidate.metadata["source"] for candidate in final_candidates))

        if not st.session_state["ai_questions"]:
            # NOTE(review): only the first accepted candidate is passed as the
            # resume, and as the document object rather than its page_content.
            # The original also built a "; Next CV: "-joined string of all CVs
            # but never used it - confirm which input the prompt should get.
            system = sys_prompt.format(
                job=st.session_state["job_string"],
                resume=final_candidates[0],
                n=15,
            )
            try:
                st.info("The questions are generated. This may take a short moment.", icon="ℹ️")
                with st.spinner("Loading..."):
                    res = openai.ChatCompletion.create(
                        engine="gpt-4",
                        temperature=0.2,
                        messages=[
                            {
                                "role": "system",
                                "content": system,
                            },
                        ],
                    )
                    answer = res.choices[0]["message"]["content"]
                    questions = [line for line in answer.split("\n") if line.strip()]
                    if not questions:
                        # An empty list would make every rerun call the model again.
                        raise ValueError("the model returned no questions")
                    st.session_state["ai_questions"] = questions
                    for i in range(len(questions)):
                        st.session_state["disable_row_" + str(i)] = False
                    st.rerun()
            except Exception as e:
                print(f"Fehler beim generieren der Fragen: {str(e)}")
                st.error("An error has occurred. Please reload the page or contact the admin.", icon="🚨")
        else:
            if not st.session_state["final_question_string"]:
                # Phase 2a: edit or delete the generated questions.
                for i, question in enumerate(st.session_state["ai_questions"]):
                    cols = st.columns([5, 1])
                    with cols[1]:
                        if st.button("Delete", use_container_width=True, key="btn_del_row_" + str(i)):
                            # Remove exactly this row, not the first equal text.
                            st.session_state["ai_questions"].pop(i)
                            st.rerun()
                    with cols[0]:
                        st.text_area(
                            label="Question " + str(i + 1) + ":",
                            value=question,
                            label_visibility="collapsed",
                            key="text_area_" + str(i),
                            disabled=st.session_state["disable_row_" + str(i)],
                        )
                st.write("If you are satisfied with the questions, then accept them. You can still sort them afterwards.")
                if st.button("Accept all questions", use_container_width=True, key="accept_all_questions"):
                    for i in range(len(st.session_state["ai_questions"])):
                        edited = st.session_state["text_area_" + str(i)]
                        body = _QUESTION_NUMBER_PREFIX.sub("", edited).strip()
                        st.session_state["disable_row_" + str(i)] = True
                        if not body:
                            # A cleared text area does not become a question.
                            continue
                        number = len(st.session_state["final_question_string"]) + 1
                        # Always "<n>. <text>": adjust_numbering relies on the ". " separator.
                        st.session_state["final_question_string"].append(f"{number}. {body}")
                    st.rerun()

            # Phase 2b: accepted questions, reorderable with Up/Down.
            last_index = len(st.session_state["final_question_string"]) - 1
            for i, final_q in enumerate(st.session_state["final_question_string"]):
                cols_final = st.columns([5, 1])
                with cols_final[1]:
                    if st.button("Up", use_container_width=True, key="btn_up_row_" + str(i), disabled=(i == 0)):
                        _move_question(i, i - 1)
                    if st.button("Down", use_container_width=True, key="btn_down_row_" + str(i), disabled=(i == last_index)):
                        _move_question(i, i + 1)
                with cols_final[0]:
                    st.write(final_q)
            if st.button("Submit", use_container_width=True):
                st.success('Successful search for candidates and generation of questions')