|
|
|
|
|
import gradio as gr |
|
import pandas as pd |
|
from langchain_core.prompts import PromptTemplate |
|
from langchain_openai import ChatOpenAI |
|
from langchain_core.output_parsers import StrOutputParser |
|
from pydantic import BaseModel, Field, validator |
|
from pydantic import BaseModel, Field, field_validator |
|
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader |
|
import os |
|
|
|
|
|
|
|
class QuestionAnswer(BaseModel):
    """
    A single question/answer pair produced by the job-ad analysis.

    Attributes:
        question_number (int): Sequential number of the question.
        answer (str): Answer to the question, strictly "TAK" or "NIE".
        citation (str): Quote from the ad text justifying the answer.
    """
    # Field descriptions stay in Polish: they are emitted into the model
    # prompt via the PydanticOutputParser format instructions.
    question_number: int = Field(..., description="Numer pytania")
    answer: str = Field(..., description="Odpowiedź, tylko TAK lub NIE")
    citation: str = Field(..., description="Fragment cytatu")

    @field_validator("answer")
    @classmethod  # pydantic v2 convention: field validators are classmethods
    def validate_answer(cls, v: str) -> str:
        # Reject anything other than the two literal values the prompt
        # instructs the model to produce.
        if v not in {"TAK", "NIE"}:
            raise ValueError("Odpowiedź musi być TAK lub NIE")
        return v
|
|
|
|
|
class JobAdAnalysis(BaseModel):
    """
    Complete analysis of a single job advertisement.

    Attributes:
        answers (list[QuestionAnswer]): One answer per question asked
            in the prompt (one per row of the question matrix).
    """
    answers: list[QuestionAnswer]
|
|
|
|
|
|
|
|
|
from langchain.output_parsers import PydanticOutputParser

# Parses the model's JSON output directly into a JobAdAnalysis instance,
# enforcing the TAK/NIE validation declared on QuestionAnswer.
parser = PydanticOutputParser(pydantic_object=JobAdAnalysis)
|
|
|
|
|
question_to_area_map = {} |
|
|
|
|
|
|
|
|
|
|
|
# Question matrix: one row per accessibility criterion. Columns: area name,
# the question text sent to the model, feedback shown for a TAK answer,
# feedback for a NIE answer, an extended explanation, and a hint.
matryca_df = pd.read_csv('matryca.csv', header=None,
                         names=['area', 'prompt', 'true', 'false', 'more', 'hint'])
|
|
|
|
|
|
|
def prepare_questions(df):
    """
    Build the numbered question list that is sent to the language model.

    Args:
        df (pandas.DataFrame): Question matrix with columns
            'area', 'prompt', 'true', 'false', 'more', 'hint'.

    Returns:
        str: All prompts, one per line, each prefixed with its 1-based
        question number.

    Note:
        Side effect: rebuilds the module-level ``question_to_area_map`` so
        that answers returned by the model can be joined back to the
        matrix rows by question number.
    """
    global question_to_area_map
    question_to_area_map = {}

    # Collect lines and join once at the end instead of quadratic
    # string concatenation inside the loop.
    lines = []
    for index, row in df.iterrows():
        question_number = index + 1
        lines.append(f"{question_number} {row['prompt']}\n")

        question_to_area_map[question_number] = {
            'area': row['area'],
            'true': row['true'],
            'false': row['false'],
            'hint': row['hint'],
            'more': row['more'],
        }

    return "".join(lines)
|
|
|
|
|
|
|
|
|
def doc_to_text(file):
    """
    Extract plain text from an uploaded .docx or .pdf document.

    Args:
        file (str): Path to the uploaded file.

    Returns:
        str: Concatenated page contents (each followed by a newline), or
        the sentinel string "error" for any unsupported extension.
    """
    extension = os.path.splitext(file)[1].lower()
    if extension not in (".docx", ".pdf"):
        # Callers check for this sentinel rather than catching exceptions.
        return "error"
    loader = Docx2txtLoader(file) if extension == ".docx" else PyPDFLoader(file)
    pages = loader.load()
    return "".join(page.page_content + "\n" for page in pages)
|
|
|
|
|
def analyze_job_ad(job_ad, file):
    """
    Analyze a job advertisement for accessibility using LangChain and OpenAI.

    Args:
        job_ad (str): Ad text pasted by the user; ignored when *file* is given.
        file (str | None): Optional path to an uploaded .docx/.pdf file.

    Returns:
        tuple: (JSON string of the per-area findings, path to a Word report),
        or (None, None) when the uploaded file type is unsupported.
    """
    if file:
        job_ad = doc_to_text(file)
        if job_ad == "error":
            return None, None
    questions = prepare_questions(matryca_df)
    prompt_template = PromptTemplate.from_template(
        """Przeanalizuj poniższe ogłoszenie o pracę pod kątem dostępności dla osób z niepełnosprawnościami.

Ogłoszenie:
{job_ad}

Odpowiedz na następujące pytania:
{questions}

Format odpowiedzi powinien być w następującej strukturze JSON:
{{
    "answers": [
        {{
            "question_number": 1,
            "answer": "TAK/NIE",
            "citation": "dokładny cytat z tekstu"
        }}
    ]
}}
"""
    )

    model = ChatOpenAI(temperature=0, model="gpt-4o-mini")
    chain = prompt_template | model | parser
    response = chain.invoke({"job_ad": job_ad, "questions": questions})

    # Iterate over what the model actually returned instead of a hard-coded
    # range(16): the question matrix alone drives how many answers exist.
    # Answers are paired positionally with matrix rows, as before.
    rows = []
    for i, ans in enumerate(response.answers):
        # The validator guarantees answer is "TAK" or "NIE"; the matrix
        # 'true'/'false' columns hold the matching feedback text.
        content = matryca_df.true[i] if ans.answer == 'TAK' else matryca_df.false[i]
        rows.append({
            'area': matryca_df.area[i],
            'answer': ans.answer,
            'citation': ans.citation,
            'content': content,
            'more': matryca_df.more[i],
        })
    # Build the DataFrame once instead of repeated pd.concat in a loop.
    output_df = pd.DataFrame(rows, columns=['area', 'answer', 'citation', 'content', 'more'])
    word = create_report(output_df)
    return output_df.to_json(orient='index'), word
|
|
|
|
|
|
|
|
|
|
|
from docx import Document |
|
from io import BytesIO |
|
from datetime import datetime |
|
import tempfile |
|
def _add_paragraph_lines(doc, text):
    """Add each non-empty line of *text* to *doc* as its own paragraph."""
    for line in text.split('\n'):
        if len(line) > 0:
            doc.add_paragraph(line)


def create_report(result: pd.DataFrame) -> str:
    """
    Render the analysis results into a Word document based on template.docx.

    Args:
        result (pd.DataFrame): Findings with columns
            'area', 'answer', 'citation', 'content', 'more'.

    Returns:
        str: Path to a temporary .docx file containing the report.
        (The previous ``BytesIO`` annotation was wrong — a file path
        string has always been returned, as Gradio's DownloadButton expects.)
    """
    doc = Document('template.docx')
    doc.add_heading('Raport analizy ogłoszenia o pracę', 0)
    doc.add_paragraph(f'Data wygenerowania: {datetime.now().strftime("%d.%m.%Y %H:%M")}')
    for _, row in result.iterrows():
        doc.add_heading(row['area'], 1)

        # The model's citation is shown as a quote, followed by the
        # feedback text from the question matrix.
        doc.add_paragraph(row['citation'], style='Intense Quote')
        _add_paragraph_lines(doc, row['content'])

        # 'more' may be NaN when the matrix row has no extended explanation.
        if pd.notna(row['more']):
            _add_paragraph_lines(doc, row['more'])

    # delete=False: the file must outlive this function so Gradio can serve it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp_file:
        doc.save(tmp_file.name)
        return tmp_file.name
|
|
|
|
|
|
|
|
|
# Gradio UI: a text area for pasted ad text plus an optional file upload;
# outputs the findings as JSON and a download button for the Word report.
# NOTE(review): `demo` holds the return value of .launch(), not the
# Interface object, and launching at import time blocks reuse as a
# module — consider an `if __name__ == "__main__":` guard.
demo=gr.Interface(
    fn=analyze_job_ad,
    inputs=[gr.TextArea(), gr.File()],
    outputs=[gr.JSON(), gr.DownloadButton(label='Pobierz raport w formacie Word')],
    title="KoREKtor"
).launch(inbrowser=True)
|
|
|
|
|
|
|
|