mbosse99 commited on
Commit
9a804ac
·
1 Parent(s): 9536a3c

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. app.py +362 -0
  3. cvdb.db +3 -0
  4. requirements.txt +5 -0
  5. sys_prompt_frontend.txt +15 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ cvdb.db filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import io
import os
import openai
import re
import sqlite3
import streamlit as st
from streamlit_js_eval import streamlit_js_eval
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch
from PyPDF2 import PdfReader

# SECURITY NOTE(review): real-looking API keys are hard-coded and committed
# below. They should be revoked and loaded from the environment or a secrets
# manager; kept verbatim here because the running app reads them as-is.
os.environ["OPENAI_API_KEY"] = "201b389eda7b48a496fa81c091f8e51e"
os.environ["OPENAI_API_BASE"] = "https://tensora-oai.openai.azure.com/"
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview"
os.environ["AZURE_SEARCH_ENDPOINT"] = "https://tensora-search.search.windows.net"
os.environ["AZURE_SEARCH_KEY"] = "LABhDdbb8NPPilxOwPpZ4nXRyHzABsKyXdMiSQ50CKAzSeB1fy1x"

# Configure the openai module for Azure OpenAI.
# NOTE(review): api_version here ("2023-05-15") differs from the
# OPENAI_API_VERSION env var above ("2023-03-15-preview") -- confirm which
# one is intended; the openai module uses the value set on the module.
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = "https://tensora-oai.openai.azure.com/"
openai.api_type = "azure"
openai.api_version = "2023-05-15"
24
# Inject page-level CSS: center the contents of Streamlit columns and
# left-align h3 headings (used by st.subheader).
st.markdown(
    """
    <style>
    [data-testid=column]{
        text-align: center;
        display: flex;
        align-items: center;
        justify-content: center;
    }
    h3{
        text-align: left;
    }
    </style>
    """,
    unsafe_allow_html=True,
)

# System-prompt template for interview-question generation; it is formatted
# later with {n} (question count), {job} (job-description text) and
# {resume} (candidate data).
with open("sys_prompt_frontend.txt") as f:
    sys_prompt = f.read()
43
+
44
def adjust_numbering(lst):
    """Renumber a list of "N. text" question strings to run 1..len(lst).

    Each item is expected to start with a numeric prefix such as "3. " (as
    produced by the question-accept flow); the old prefix is stripped and
    replaced with the item's new 1-based position.

    Unlike the previous ``item.split('. ', 1)[1]`` implementation, an item
    without a recognizable prefix no longer raises IndexError -- its full
    text is kept and only the new number is prepended.

    Args:
        lst: list of question strings, possibly carrying stale numbering.

    Returns:
        A new list with consecutive "1. ", "2. ", ... prefixes.
    """
    renumbered = []
    for position, item in enumerate(lst, start=1):
        # Strip an optional leading "<digits>." plus surrounding whitespace.
        text = re.sub(r"^\s*\d+\.\s*", "", item)
        renumbered.append(f"{position}. {text}")
    return renumbered
46
+
47
def check_keywords_in_content(database_path, table_name, input_id, keywords):
    """Return True iff row ``input_id`` contains every keyword (case-insensitive).

    Opens the SQLite database at ``database_path``, fetches the row of
    ``table_name`` whose ``id`` column equals ``input_id``, and checks that
    each string in ``keywords`` occurs as a substring of the row's content
    (column index 1 -- assumed to be the CV text; TODO confirm schema).

    Args:
        database_path: path to the SQLite database file.
        table_name: name of the table to query. SECURITY NOTE: identifiers
            cannot be bound as SQL parameters, so this is interpolated into
            the statement -- it must come from trusted code, never from
            user input.
        input_id: value matched against the ``id`` column.
        keywords: iterable of keyword strings; an empty iterable matches
            any existing row (``all`` of nothing is True).

    Returns:
        True when the row exists and contains every keyword, else False
        (including when the id is not found, which is also logged).
    """
    conn = sqlite3.connect(database_path)
    try:
        cursor = conn.cursor()
        # input_id is bound safely; only the table name is interpolated.
        cursor.execute(f'SELECT * FROM {table_name} WHERE id = ?', (input_id,))
        row = cursor.fetchone()

        if not row:
            print("ID not found")
            return False

        # Case-insensitive substring match for every keyword.
        content = row[1].lower()
        keywords_lower = [keyword.lower() for keyword in keywords]
        return all(keyword in content for keyword in keywords_lower)
    finally:
        # Close on every path -- the original leaked the connection when
        # the query itself raised.
        conn.close()
74
+
75
# ---------------------------------------------------------------------------
# Streamlit page flow (executed top-to-bottom on every rerun).
# The UI is a small state machine driven by st.session_state:
#   1. upload a job-description PDF and optional comma-separated keywords
#   2. similarity-search CVs in Azure Search, filter them through the local
#      SQLite keyword check (check_keywords_in_content)
#   3. let the user remove/accept candidates
#   4. generate interview questions with GPT-4, then edit/delete/reorder them
# ---------------------------------------------------------------------------

# Initialise every session-state slot on first run so later reads are safe.
if "similarity_search_string" not in st.session_state:
    # NOTE(review): this slot is never read or written again in this file.
    st.session_state["similarity_search_string"] = None
if "job_string" not in st.session_state:
    # Extracted plain text of the uploaded job-description PDF.
    st.session_state["job_string"] = None
if "docs_res" not in st.session_state:
    # Candidate documents that passed the keyword filter.
    st.session_state["docs_res"] = None
if "final_candidates" not in st.session_state:
    # Candidates explicitly accepted by the user.
    st.session_state["final_candidates"] = None
if "final_question_string" not in st.session_state:
    # Accepted, ordered interview questions ("N. text" strings).
    st.session_state["final_question_string"] = []
if "ai_questions" not in st.session_state:
    # Raw question list proposed by GPT-4, before user editing.
    st.session_state["ai_questions"] = None
if "db" not in st.session_state:
    # Build the Azure Cognitive Search vector store once per session.
    embedder = OpenAIEmbeddings(deployment="text-embedding-ada-002", chunk_size=1)
    embedding_function = embedder.embed_query

    db = AzureSearch(
        index_name="wg-cvs",
        azure_search_endpoint=os.environ.get("AZURE_SEARCH_ENDPOINT"),
        azure_search_key=os.environ.get("AZURE_SEARCH_KEY"),
        embedding_function=embedding_function,
    )
    st.session_state["db"] = db


# --- Header -----------------------------------------------------------------
col1, col2 = st.columns([2, 1])

col1.title("Candidate Search")
col2.image("https://www.workgenius.com/wp-content/uploads/2023/03/WorkGenius_navy-1.svg")

st.write("Please upload the job description for which you would like candidates to be proposed.")
col_file, col_clear = st.columns([6,1])

with col_file:
    # key="job" exposes the uploaded file as st.session_state["job"].
    uploaded_file_jobdescription = st.file_uploader("Upload the job description:", type=["pdf"], key="job")
with col_clear:
    if st.button("Clear", use_container_width=True):
        # Full browser reload -- wipes the entire session state.
        streamlit_js_eval(js_expressions="parent.window.location.reload()")

text_area_params = st.text_area(label="Add additional search parameters, which are separated by commas (e.g. master, phd, web developer, spanish)")

# Searching is disabled once the candidate list has been accepted.
submit = st.button("Search candidates",disabled= True if st.session_state["final_candidates"] else False)
if not st.session_state["job"] and submit:
    st.error("Please upload a job description to search for candidates")
# Re-run the search when the button is pressed while results already exist.
# NOTE(review): this whole search loop is duplicated below for the first
# search -- consider extracting it into a helper function.
if st.session_state["docs_res"] and submit:
    with st.spinner("Load the candidates, this may take a moment..."):
        query_string = "The following keywords must be included: " + text_area_params + " " + st.session_state["job_string"]
        checked_candidates = []
        db_path = 'cvdb.db'
        table_name = 'files'
        candidates_per_search = 100   # window size fetched from Azure per round
        target_candidates_count = 10  # stop once this many pass the keyword filter
        current_offset = 0

        while len(checked_candidates) < target_candidates_count:
            # Run a similarity search over a growing window and skip the
            # prefix that was already inspected in earlier rounds.
            raw_candidates = st.session_state["db"].similarity_search(query_string, k=candidates_per_search+current_offset)

            for candidate in raw_candidates[current_offset:]:
                # The candidate id is the file-name part of the source path.
                candidates_id = candidate.metadata["source"].split("/")[-1]
                # NOTE(review): keywords are split on ',' without stripping,
                # so " phd" keeps its leading space -- confirm intended.
                keyword_bool = check_keywords_in_content(db_path, table_name, candidates_id, text_area_params.split(','))

                if keyword_bool:
                    checked_candidates.append(candidate)

                # Stop as soon as the target count is reached.
                if len(checked_candidates) >= target_candidates_count:
                    break

            current_offset += candidates_per_search
            if current_offset == 600:
                # Hard cap: give up after inspecting 600 search hits.
                break

        st.session_state["docs_res"] = checked_candidates
        if len(checked_candidates) == 0:
            st.error("No candidates can be found with these keywords. Please adjust the keywords and try again.", icon="🚨")
if (st.session_state["job"] and submit) or st.session_state["docs_res"]:
    if not st.session_state["job_string"]:
        # Extract plain text from every page of the uploaded PDF (once).
        pdf_data_jobdescription = st.session_state["job"].read()
        pdf_data_jobdescription_string = ""
        pdf_reader_job = PdfReader(io.BytesIO(pdf_data_jobdescription))
        for page_num in range(len(pdf_reader_job.pages)):
            page = pdf_reader_job.pages[page_num]
            pdf_data_jobdescription_string += page.extract_text()
        # st.session_state["pdf_data_jobdescription"] = pdf_data_jobdescription  # activate and add session state if the raw bytes are needed
        st.session_state["job_string"] = pdf_data_jobdescription_string
    if not st.session_state["docs_res"]:
        # First search -- identical to the re-search loop above.
        with st.spinner("Load the candidates, this may take a moment..."):
            query_string = "The following keywords must be included: " + text_area_params + " " + st.session_state["job_string"]
            checked_candidates = []
            db_path = 'cvdb.db'
            table_name = 'files'
            candidates_per_search = 100
            target_candidates_count = 10
            current_offset = 0

            while len(checked_candidates) < target_candidates_count:
                # Similarity search over a growing window; skip what was
                # already inspected.
                raw_candidates = st.session_state["db"].similarity_search(query_string, k=candidates_per_search+current_offset)

                for candidate in raw_candidates[current_offset:]:
                    candidates_id = candidate.metadata["source"].split("/")[-1]
                    keyword_bool = check_keywords_in_content(db_path, table_name, candidates_id, text_area_params.split(','))

                    if keyword_bool:
                        checked_candidates.append(candidate)

                    # Stop as soon as the target count is reached.
                    if len(checked_candidates) >= target_candidates_count:
                        break

                current_offset += candidates_per_search
                if current_offset == 600:
                    break

            st.session_state["docs_res"] = checked_candidates
            if len(checked_candidates) == 0:
                st.error("No candidates can be found with these keywords. Please adjust the keywords and try again.", icon="🚨")
            # --- Alternative selection strategies (dead code, condensed) ---
            # The original file kept three commented-out experiments here:
            #  * a single similarity_search with k=100 plus keyword
            #    post-filtering (no paging),
            #  * building a temporary Azure index ("wg-cvs-temp") from the
            #    raw candidates and re-searching it,
            #  * asking gpt-4 per candidate ("answer '1' or '0'") whether it
            #    fulfils the requirements.

    if not st.session_state["final_candidates"]:
        # Review stage: list every found candidate with a Remove button.
        for i,doc in enumerate(st.session_state["docs_res"]):
            cols_final = st.columns([6,1])
            with cols_final[1]:
                if st.button("Remove",use_container_width=True,key="btn_rm_cv_row_"+str(i)):
                    st.session_state["docs_res"].pop(i)
                    st.rerun()
            with cols_final[0]:
                with st.expander(doc.metadata["source"]):
                    st.write(doc.page_content)
        if st.button("Accept candidates", key="accept_candidates_btn"):
            print("hello")  # NOTE(review): leftover debug output
            st.session_state["final_candidates"] = st.session_state["docs_res"].copy()
            st.rerun()
    else:
        # Question stage: candidates are fixed; generate / edit questions.
        print("Now Questions")  # NOTE(review): leftover debug output
        st.subheader("Your Candidates:")
        st.write(", ".join(candidate.metadata["source"] for candidate in st.session_state["final_candidates"]))
        # NOTE(review): cv_strings is computed but never used -- presumably it
        # was meant to be passed as resume= below instead of the raw first
        # candidate object; verify.
        cv_strings = "; Next CV: ".join(candidate.page_content for candidate in st.session_state["final_candidates"])
        system = sys_prompt.format(job=st.session_state["job_string"], resume=st.session_state["final_candidates"][0], n=15)
        if not st.session_state["ai_questions"]:
            try:
                st.info("The questions are generated. This may take a short moment.", icon="ℹ️")
                with st.spinner("Loading..."):
                    res = openai.ChatCompletion.create(
                        engine="gpt-4",
                        temperature=0.2,
                        messages=[
                            {
                                "role": "system",
                                "content": system,
                            },
                        ],
                    )
                # Split the completion into one question per non-empty line.
                st.session_state["ai_questions"] = [item for item in res.choices[0]["message"]["content"].split("\n") if len(item) > 0]
                for i,q in enumerate(res.choices[0]["message"]["content"].split("\n")):
                    st.session_state["disable_row_"+str(i)] = False
                st.rerun()
            except Exception as e:
                print(f"Fehler beim generieren der Fragen: {str(e)}")
                st.error("An error has occurred. Please reload the page or contact the admin.", icon="🚨")
        else:
            if len(st.session_state["final_question_string"]) <= 0:
                # Editable question list, one text area + Delete per question.
                for i,question in enumerate(st.session_state["ai_questions"]):
                    cols = st.columns([5,1])
                    with cols[1]:
                        if st.button("Delete",use_container_width=True,key="btn_del_row_"+str(i)):
                            print("delete")  # NOTE(review): leftover debug output
                            # NOTE(review): removes by value while enumerating;
                            # works because rerun() follows immediately.
                            st.session_state["ai_questions"].remove(question)
                            st.rerun()
                    with cols[0]:
                        st.text_area(label="Question "+str(i+1)+":",value=question,label_visibility="collapsed",key="text_area_"+str(i),disabled=st.session_state["disable_row_"+str(i)])
                st.write("If you are satisfied with the questions, then accept them. You can still sort them afterwards.")
                if st.button("Accept all questions",use_container_width=True,key="accept_all_questions"):
                    # Freeze the (possibly user-edited) questions, renumbering
                    # them consecutively and disabling further editing.
                    for i,question in enumerate(st.session_state["ai_questions"]):
                        pattern = re.compile(r"^[1-9][0-9]?\.")
                        questions_length = len(st.session_state["final_question_string"])
                        question_from_text_area = st.session_state["text_area_"+str(i)]
                        question_to_append = str(questions_length+1)+"."+re.sub(pattern, "", question_from_text_area)
                        st.session_state["final_question_string"].append(question_to_append)
                        st.session_state["disable_row_"+str(i)] = True
                    st.rerun()
            # Reordering UI for the accepted questions (Up/Down swap
            # neighbours, then adjust_numbering renumbers the prefixes).
            for i,final_q in enumerate(st.session_state["final_question_string"]):
                cols_final = st.columns([5,1])
                with cols_final[1]:
                    if st.button("Up",use_container_width=True,key="btn_up_row_"+str(i),disabled=True if i == 0 else False):
                        if i > 0:
                            # Swap the current element with the previous one.
                            st.session_state.final_question_string[i], st.session_state.final_question_string[i - 1] = \
                                st.session_state.final_question_string[i - 1], st.session_state.final_question_string[i]
                            st.session_state.final_question_string = adjust_numbering(st.session_state.final_question_string)
                            st.rerun()
                    if st.button("Down",use_container_width=True,key="btn_down_row_"+str(i), disabled=True if i == len(st.session_state["final_question_string"])-1 else False):
                        if i < len(st.session_state.final_question_string) - 1:
                            # Swap the current element with the next one.
                            st.session_state.final_question_string[i], st.session_state.final_question_string[i + 1] = \
                                st.session_state.final_question_string[i + 1], st.session_state.final_question_string[i]
                            st.session_state.final_question_string = adjust_numbering(st.session_state.final_question_string)
                            st.rerun()
                with cols_final[0]:
                    st.write(final_q)
            if st.button("Submit", use_container_width=True):
                st.success('Successful search for candidates and generation of questions')
361
+
362
+
cvdb.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a37e1cd92c164e099e5e935c776458f4771f4f9b1eb8ad899e8124df64a1dd2
3
+ size 395046912
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ openai==0.28.1
2
+ streamlit
3
+ langchain
4
+ PyPDF2
5
+ streamlit_js_eval
sys_prompt_frontend.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a professional recruiter specialized in conducting interviews. Your task is to generate {n} questions for an interview and collect as much relevant information from the applicant (the user) as possible. As context you will be given the job description. You will also receive one or more resumes from potential candidates to get an overview of the applicants.
2
+
3
+ Please follow these rules:
4
+
5
+ - Try to ask open-ended questions to collect more information from the applicant.
6
+ - Concentrate on questions that the resume alone cannot answer. Aim to fill in the gaps.
7
+ - Don't give feedback, don't summarize and don't explain yourself. Your role is investigative.
8
+ - Use the {n} questions wisely to get an overall impression of the applicant.
9
+ - Just generate the {n} questions, nothing else.
10
+
11
+ JOB DESCRIPTION:
12
+ {job}
13
+
14
+ RESUME(S):
15
+ {resume}