|
import gradio as gr |
|
import pandas as pd |
|
from langchain_core.prompts import PromptTemplate |
|
from langchain_openai import ChatOpenAI |
|
from langchain_core.output_parsers import StrOutputParser |
|
from pydantic import BaseModel, Field, field_validator |
|
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader |
|
from langchain.output_parsers import PydanticOutputParser |
|
from docx import Document |
|
from datetime import datetime |
|
import os |
|
import tempfile |
|
|
|
|
|
class QuestionAnswer(BaseModel):
    # One answered question extracted from the model's reply.
    # Field descriptions stay in Polish because they are runtime strings.

    question_number: int = Field(..., description="Numer pytania")
    answer: str = Field(..., description="Odpowiedź, tylko TAK lub NIE")
    citation: str = Field(..., description="Fragment cytatu")

    @field_validator("answer")
    @classmethod
    def validate_answer(cls, v: str) -> str:
        """Return the answer in canonical form, rejecting anything but TAK/NIE.

        Surrounding whitespace and letter case are normalised first so that
        replies such as "tak" or "NIE " do not invalidate the whole response.

        Raises:
            ValueError: if the normalised value is neither "TAK" nor "NIE".
        """
        normalized = v.strip().upper()
        if normalized not in {"TAK", "NIE"}:
            raise ValueError("Odpowiedź musi być TAK lub NIE")
        # Return the canonical spelling: callers compare against 'TAK' exactly.
        return normalized
|
|
|
class JobAdAnalysis(BaseModel):
    # Top-level shape of the parsed reply: a list of per-question answers.

    answers: list[QuestionAnswer]
|
|
|
# Output parser that turns the model's reply into a JobAdAnalysis instance.
parser = PydanticOutputParser(pydantic_object=JobAdAnalysis)

# Assessment matrix loaded once at import time. The CSV is read without a
# header row, so the column names are supplied here.
matryca_df = pd.read_csv(
    'matryca.csv',
    header=None,
    names=['area', 'prompt', 'true', 'false', 'more', 'hint'],
)

# Rebuilt by prepare_questions(): question number -> matrix fields for that row.
question_to_area_map = {}
|
|
|
def prepare_questions(df):
    """Build the numbered question list and refresh ``question_to_area_map``.

    Questions are numbered 1..N by row position, not by index label, so the
    numbering stays correct for frames whose index is not the default 0-based
    range (for example after filtering).

    Args:
        df: frame with the columns 'area', 'prompt', 'true', 'false', 'hint'
            and 'more'.

    Returns:
        One line per row in the form "<number> <prompt>\\n", concatenated.
        An empty frame yields an empty string.

    Side effects:
        Rebinds the module-level ``question_to_area_map`` to a new dict that
        maps each question number to that row's matrix fields.
    """
    global question_to_area_map

    lines = []
    mapping = {}
    for question_number, (_, row) in enumerate(df.iterrows(), start=1):
        lines.append(f"{question_number} {row['prompt']}\n")
        mapping[question_number] = {
            'area': row['area'],
            'true': row['true'],
            'false': row['false'],
            'hint': row['hint'],
            'more': row['more'],
        }

    question_to_area_map = mapping
    return "".join(lines)
|
|
|
def doc_to_text(file):
    """Extract the plain text of an uploaded .docx or .pdf document.

    Args:
        file: either a filesystem path (``str`` or ``os.PathLike``) or an
            upload object exposing the path as its ``name`` attribute.

    Returns:
        The page contents joined with newlines, or the sentinel string
        "error" when the extension is neither .docx nor .pdf.
    """
    # Path-like values are used directly; their own `.name` (for pathlib.Path)
    # would be only the basename, so it must not be consulted for them.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    extension = os.path.splitext(path)[1].lower()
    if extension == ".docx":
        loader = Docx2txtLoader(path)
    elif extension == ".pdf":
        loader = PyPDFLoader(path)
    else:
        # Callers compare against this sentinel, so it is kept as is.
        return "error"

    pages = loader.load()
    return "\n".join(page.page_content for page in pages)
|
|
|
def _add_text_lines(doc, text) -> None:
    """Append each non-blank line of ``text`` to ``doc`` as its own paragraph.

    Missing values (NaN/None) are skipped so that an empty matrix cell does
    not show up in the report as a literal "nan" paragraph.
    """
    if pd.isna(text):
        return
    for line in str(text).split('\n'):
        if line.strip():
            doc.add_paragraph(line)


def create_report(result: pd.DataFrame) -> str:
    """Render the analysis rows into a Word document and return its path.

    Args:
        result: frame with the columns 'area', 'citation', 'content' and
            'more'; each row becomes one section of the report.

    Returns:
        Path of a newly created temporary .docx file. The file is created
        with ``delete=False`` and is not removed by this function.
    """
    doc = Document('template.docx')
    doc.add_heading('Raport analizy ogłoszenia o pracę', 0)
    doc.add_paragraph(f'Data wygenerowania: {datetime.now().strftime("%d.%m.%Y %H:%M")}')

    for _, row in result.iterrows():
        doc.add_heading(str(row['area']), 1)
        doc.add_paragraph(str(row['citation']), style='Intense Quote')
        # 'content' and 'more' are handled the same way, including the
        # missing-value check that previously applied to 'more' only.
        _add_text_lines(doc, row['content'])
        _add_text_lines(doc, row['more'])

    # Reserve a unique file name, close the placeholder handle, then let the
    # document write the file itself so only one handle is ever writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
        report_path = tmp.name
    doc.save(report_path)
    return report_path
|
|
|
def analyze_job_ad(job_ad, file):
    """Analyse a job advertisement and build the JSON result and Word report.

    Args:
        job_ad: advertisement text typed into the form; ignored when a file
            is supplied.
        file: optional uploaded .pdf/.docx document; when given, its text
            replaces ``job_ad``.

    Returns:
        ``(records, report_path)`` where ``records`` is a list of dicts with
        the keys 'area', 'answer', 'citation', 'content' and 'more', and
        ``report_path`` is the generated .docx file. ``(None, None)`` is
        returned when the file type is unsupported or there is no text to
        analyse.
    """
    if file:
        job_ad = doc_to_text(file)
        if job_ad == "error":
            return None, None

    # Nothing to analyse: do not send an empty advertisement to the model.
    if not job_ad or not job_ad.strip():
        return None, None

    questions = prepare_questions(matryca_df)
    prompt_template = PromptTemplate.from_template(
        """Przeanalizuj poniższe ogłoszenie o pracę pod kątem dostępności dla osób z niepełnosprawnościami.

Ogłoszenie:
{job_ad}

Odpowiedz na następujące pytania:
{questions}

Format odpowiedzi powinien być w następującej strukturze JSON:
{{
"answers": [
{{
"question_number": 1,
"answer": "TAK/NIE",
"citation": "dokładny cytat z tekstu"
}}
]
}}
"""
    )

    model = ChatOpenAI(temperature=0, model="gpt-4o-mini")
    chain = prompt_template | model | parser
    response = chain.invoke({"job_ad": job_ad, "questions": questions})

    # Match each answer to its matrix row through question_number (1-based,
    # as numbered in the prompt) instead of assuming a fixed count of 16
    # answers in list order. Sorting keeps the report in matrix order.
    rows = []
    total_questions = len(matryca_df)
    for item in sorted(response.answers, key=lambda a: a.question_number):
        position = item.question_number - 1
        if not 0 <= position < total_questions:
            # The model referred to a question that is not in the matrix.
            continue
        if item.answer not in {"TAK", "NIE"}:
            continue
        matrix_row = matryca_df.iloc[position]
        rows.append({
            'area': matrix_row['area'],
            'answer': item.answer,
            'citation': item.citation,
            'content': matrix_row['true'] if item.answer == 'TAK' else matrix_row['false'],
            'more': matrix_row['more'],
        })

    # Build the frame once; explicit columns keep the schema when rows is empty.
    output_df = pd.DataFrame(rows, columns=['area', 'answer', 'citation', 'content', 'more'])

    word_file_path = create_report(output_df)
    json_output = output_df.to_dict(orient="records")
    return json_output, word_file_path
|
|
|
|
|
# Bind `demo` to the Interface itself. Chaining `.launch()` onto the
# constructor would bind it to whatever `launch()` returns instead.
demo = gr.Interface(
    fn=analyze_job_ad,
    inputs=[gr.TextArea(label="Ogłoszenie (opcjonalnie)"), gr.File(label="Plik PDF lub DOCX")],
    outputs=[gr.JSON(label="Wyniki analizy"), gr.File(label="Pobierz raport w formacie Word")],
    title="KoREKtor – analiza ogłoszenia",
)

# The app still starts as soon as the module is executed, as before.
demo.launch()