File size: 4,911 Bytes
79e8897 dedd8bb 3847c85 dedd8bb 3847c85 79e8897 3847c85 79e8897 3847c85 79e8897 3847c85 79e8897 3847c85 79e8897 dedd8bb 3847c85 dedd8bb 3847c85 dedd8bb 3847c85 dedd8bb 3847c85 dedd8bb abdb79a 79e8897 3847c85 79e8897 3847c85 79e8897 3847c85 79e8897 3847c85 79e8897 3847c85 4b17ded 79e8897 3847c85 79e8897 3847c85 79e8897 3847c85 79e8897 3847c85 79e8897 3847c85 009fbd9 3847c85 79e8897 3847c85 79e8897 3847c85 ff37902 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import gradio as gr
import pandas as pd
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from pydantic import BaseModel, Field, field_validator
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.output_parsers import PydanticOutputParser
from docx import Document
from datetime import datetime
import os
import tempfile
# Data model
class QuestionAnswer(BaseModel):
    """A single yes/no verdict from the LLM, tied to one checklist question.

    `answer` is constrained to the literal strings "TAK" / "NIE"; `citation`
    carries the quote from the job ad that supports the verdict.
    """
    question_number: int = Field(..., description="Numer pytania")
    answer: str = Field(..., description="Odpowiedź, tylko TAK lub NIE")
    citation: str = Field(..., description="Fragment cytatu")

    @field_validator("answer")
    def validate_answer(cls, v):
        """Accept only the exact strings TAK / NIE; reject everything else."""
        if v in {"TAK", "NIE"}:
            return v
        raise ValueError("Odpowiedź musi być TAK lub NIE")
class JobAdAnalysis(BaseModel):
    """Top-level parse target: the LLM's answers to all checklist questions."""
    answers: list[QuestionAnswer]
# Parses the model's JSON reply into a JobAdAnalysis instance.
parser = PydanticOutputParser(pydantic_object=JobAdAnalysis)
# Load the question matrix: one row per checklist question.
# NOTE: column order in the CSV is 'more' before 'hint'.
matryca_df = pd.read_csv('matryca.csv', header=None,
names=['area', 'prompt', 'true', 'false', 'more', 'hint'])
# Filled by prepare_questions(): question number -> matrix-row metadata.
question_to_area_map = {}
def prepare_questions(df):
    """Build the numbered question list for the prompt.

    Side effect: rebuilds the module-level `question_to_area_map`, mapping
    each 1-based question number to that row's metadata (area, true/false
    feedback texts, hint, more).

    Returns one "N prompt\\n" line per matrix row, concatenated.
    """
    global question_to_area_map
    question_to_area_map = {}
    lines = []
    # Row position in the (default RangeIndex) frame defines the question number.
    for idx, row in df.iterrows():
        number = idx + 1
        lines.append(f"{number} {row['prompt']}\n")
        question_to_area_map[number] = {
            'area': row['area'],
            'true': row['true'],
            'false': row['false'],
            'hint': row['hint'],
            'more': row['more'],
        }
    return "".join(lines)
def doc_to_text(file):
    """Extract plain text from an uploaded .docx or .pdf.

    Returns the sentinel string "error" for any other extension
    (callers check for it explicitly).
    """
    ext = os.path.splitext(file.name)[1].lower()
    # Loaders are referenced lazily, only in the branch that needs them.
    if ext == ".pdf":
        loader = PyPDFLoader(file.name)
    elif ext == ".docx":
        loader = Docx2txtLoader(file.name)
    else:
        return "error"
    return "\n".join(p.page_content for p in loader.load())
def create_report(result: pd.DataFrame) -> str:
    """Render the analysis results into a Word document.

    Args:
        result: one row per answered question, with columns
            'area', 'citation', 'content', 'more'.

    Returns:
        Path to a temporary .docx file (caller is responsible for cleanup).
    """
    def _add_lines(text):
        # One paragraph per non-blank line of a multi-line feedback text.
        for line in str(text).split('\n'):
            if line.strip():
                doc.add_paragraph(line)

    doc = Document('template.docx')  # template supplies the base styles
    doc.add_heading('Raport analizy ogłoszenia o pracę', 0)
    doc.add_paragraph(f'Data wygenerowania: {datetime.now().strftime("%d.%m.%Y %H:%M")}')
    for _, row in result.iterrows():
        doc.add_heading(str(row['area']), 1)
        doc.add_paragraph(str(row['citation']), style='Intense Quote')
        _add_lines(row['content'])
        if pd.notna(row['more']):  # 'more' is optional in the matrix
            _add_lines(row['more'])
    # mkstemp + close before saving: NamedTemporaryFile keeps its handle open,
    # and on Windows doc.save() cannot reopen a path that is still held open.
    fd, path = tempfile.mkstemp(suffix=".docx")
    os.close(fd)
    doc.save(path)
    return path  # path to the temporary report file
def analyze_job_ad(job_ad, file):
    """Analyze a job ad for accessibility and build a Word report.

    Args:
        job_ad: pasted ad text; ignored when `file` is provided.
        file: optional uploaded PDF/DOCX (takes precedence over `job_ad`).

    Returns:
        (list-of-row-dicts for the JSON widget, path to the .docx report),
        or (None, None) when the uploaded file type is unsupported.
    """
    if file:
        job_ad = doc_to_text(file)
        if job_ad == "error":
            return None, None
    questions = prepare_questions(matryca_df)
    prompt_template = PromptTemplate.from_template(
        """Przeanalizuj poniższe ogłoszenie o pracę pod kątem dostępności dla osób z niepełnosprawnościami.
Ogłoszenie:
{job_ad}
Odpowiedz na następujące pytania:
{questions}
Format odpowiedzi powinien być w następującej strukturze JSON:
{{
"answers": [
{{
"question_number": 1,
"answer": "TAK/NIE",
"citation": "dokładny cytat z tekstu"
}}
]
}}
"""
    )
    model = ChatOpenAI(temperature=0, model="gpt-4o-mini")
    chain = prompt_template | model | parser
    response = chain.invoke({"job_ad": job_ad, "questions": questions})
    # Align each answer with its matrix row via question_number instead of a
    # hard-coded range(16): this survives the LLM returning fewer/extra answers.
    rows = []
    for ans in response.answers:
        idx = ans.question_number - 1
        if not 0 <= idx < len(matryca_df):
            continue  # answer references a question outside the matrix
        # ans.answer is guaranteed TAK/NIE by the QuestionAnswer validator.
        rows.append({
            'area': matryca_df.area[idx],
            'answer': ans.answer,
            'citation': ans.citation,
            'content': matryca_df.true[idx] if ans.answer == 'TAK' else matryca_df.false[idx],
            'more': matryca_df.more[idx],
        })
    # Build the frame once instead of pd.concat inside the loop (quadratic,
    # and concat with an empty frame is deprecated in recent pandas).
    output_df = pd.DataFrame(rows, columns=['area', 'answer', 'citation', 'content', 'more'])
    word_file_path = create_report(output_df)
    json_output = output_df.to_dict(orient="records")
    return json_output, word_file_path
# Gradio UI: paste text and/or upload a file; get JSON results plus a Word report.
demo = gr.Interface(
    fn=analyze_job_ad,
    inputs=[
        gr.TextArea(label="Ogłoszenie (opcjonalnie)"),
        gr.File(label="Plik PDF lub DOCX"),
    ],
    outputs=[
        gr.JSON(label="Wyniki analizy"),
        gr.File(label="Pobierz raport w formacie Word"),
    ],
    title="KoREKtor – analiza ogłoszenia",
).launch()