Another version to test online
app.py
CHANGED
Previous version of app.py (lines removed by this commit are marked with -, unchanged context lines carry a leading space):

@@ -1,26 +1,18 @@
-# %%
-# załadowanie bibliotek
 import gradio as gr
 import pandas as pd
 from langchain_core.prompts import PromptTemplate
 from langchain_openai import ChatOpenAI
 from langchain_core.output_parsers import StrOutputParser
-from pydantic import BaseModel, Field, validator
 from pydantic import BaseModel, Field, field_validator
 from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
 import os
 
-
-# %%
 class QuestionAnswer(BaseModel):
-    """
-    Model reprezentujący pojedyncze pytanie i odpowiedź z analizy ogłoszenia.
-
-    Attributes:
-        question_number (int): Numer kolejny pytania
-        answer (str): Odpowiedź na pytanie (TAK/NIE)
-        citation (str): Cytat z tekstu ogłoszenia uzasadniający odpowiedź
-    """
     question_number: int = Field(..., description="Numer pytania")
     answer: str = Field(..., description="Odpowiedź, tylko TAK lub NIE")
     citation: str = Field(..., description="Fragment cytatu")
@@ -31,56 +23,24 @@ class QuestionAnswer(BaseModel):
         raise ValueError("Odpowiedź musi być TAK lub NIE")
         return v
 
-
 class JobAdAnalysis(BaseModel):
-    """
-    Model reprezentujący pełną analizę ogłoszenia o pracę.
-
-    Attributes:
-        answers (list[QuestionAnswer]): Lista odpowiedzi na wszystkie pytania
-    """
     answers: list[QuestionAnswer]
 
-
-# %%
-# Użycie wbudowanego parsera Pydantic w LangChain:
-from langchain.output_parsers import PydanticOutputParser
 parser = PydanticOutputParser(pydantic_object=JobAdAnalysis)
 
-#
-
-
-
-
-# %%
-# Wczytanie matrycy danych do DataFrame
-matryca_df = pd.read_csv('matryca.csv', header=None,
-                         names=['area', 'prompt', 'true', 'false', 'more', 'hint'])
 
 
-# %%
 def prepare_questions(df):
-    """
-    Przygotowuje tekst pytań na podstawie matrycy danych.
-
-    Args:
-        df (pandas.DataFrame): DataFrame zawierający matrycę pytań
-
-    Returns:
-        str: Sformatowany tekst wszystkich pytań
-
-    Note:
-        Funkcja aktualizuje również globalną mapę question_to_area_map
-    """
-    questions_text = ""
-    # Tworzymy słownik mapujący numer pytania na obszar i inne informacje
     global question_to_area_map
     question_to_area_map = {}
-
     for index, row in df.iterrows():
         question_number = index + 1
         questions_text += f"{question_number} {row['prompt']}\n"
-        # Zapisujemy wszystkie potrzebne informacje
         question_to_area_map[question_number] = {
             'area': row['area'],
             'true': row['true'],
@@ -88,44 +48,52 @@ def prepare_questions(df):
             'hint': row['hint'],
             'more': row['more']
         }
-
-
     return questions_text
 
-
-# %%
-# Konwersja plików PDF i Word do tekstu
 def doc_to_text(file):
-    extension = os.path.splitext(file)[1].lower()
-    if extension==".docx":
-        loader = Docx2txtLoader(file)
-    elif extension==".pdf":
-        loader = PyPDFLoader(file)
     else:
         return "error"
     pages = loader.load()
-
-    for page in pages:
-        text += page.page_content + "\n"
-    return text
 
 
 def analyze_job_ad(job_ad, file):
     if file:
-        job_ad=doc_to_text(file)
     if job_ad == "error":
         return None, None
-    """Analizuje ogłoszenie o pracę przy użyciu LangChain i OpenAI."""
     questions = prepare_questions(matryca_df)
     prompt_template = PromptTemplate.from_template(
         """Przeanalizuj poniższe ogłoszenie o pracę pod kątem dostępności dla osób z niepełnosprawnościami.
-
 Ogłoszenie:
 {job_ad}
-
 Odpowiedz na następujące pytania:
 {questions}
-
 Format odpowiedzi powinien być w następującej strukturze JSON:
 {{
 "answers": [
@@ -134,83 +102,35 @@ def analyze_job_ad(job_ad, file):
 "answer": "TAK/NIE",
 "citation": "dokładny cytat z tekstu"
 }}
-
 }}
 """
     )
-
     model = ChatOpenAI(temperature=0, model="gpt-4o-mini")
     chain = prompt_template | model | parser
     response = chain.invoke({"job_ad": job_ad, "questions": questions})
     output_df = pd.DataFrame(columns=['area', 'answer', 'citation', 'content', 'more'])
     for i in range(16):
-
-        if response.answers[i].answer == 'TAK':
             new_row = {
                 'area': matryca_df.area[i],
                 'answer': response.answers[i].answer,
                 'citation': response.answers[i].citation,
-                'content': matryca_df.true[i],
                 'more': matryca_df.more[i]
             }
-
-            output_df = pd.concat([output_df, temp_df], ignore_index=True)
-        elif response.answers[i].answer == 'NIE':
-            new_row = {
-                'area': matryca_df.area[i],
-                'answer': response.answers[i].answer,
-                'citation': response.answers[i].citation,
-                'content': matryca_df.false[i],
-                'more': matryca_df.more[i]
-            }
-            temp_df = pd.DataFrame([new_row])
-            output_df = pd.concat([output_df, temp_df], ignore_index=True)
-    word=create_report(output_df)
-    return output_df.to_json(orient='index'), word
-
 
 
-#
-
-from docx import Document
-from io import BytesIO
-from datetime import datetime
-import tempfile
-def create_report(result: pd.DataFrame) -> BytesIO:
-    doc = Document('template.docx')
-    doc.add_heading('Raport analizy ogłoszenia o pracę', 0)
-    doc.add_paragraph(f'Data wygenerowania: {datetime.now().strftime("%d.%m.%Y %H:%M")}')
-    for _, row in result.iterrows():
-        doc.add_heading(row['area'], 1)
-        # Dodanie znalezionego cytatu
-        doc.add_paragraph(row['citation'], style='Intense Quote')
-        # Dodanie interpretacji
-        citat=row['content']
-        lines = citat.split('\n')
-        for line in lines:
-            if len(line)>0:
-                doc.add_paragraph(line)
-
-        if pd.notna(row['more']):
-            explanation=row['more']
-            lines = explanation.split('\n')
-            for line in lines:
-                if len(line)>0:
-                    doc.add_paragraph(line)
-
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp_file:
-        doc.save(tmp_file.name)
-        return tmp_file.name
-
-
-
-# %%
-demo=gr.Interface(
     fn=analyze_job_ad,
-    inputs=[gr.TextArea(), gr.File()],
-    outputs=[gr.JSON(), gr.
-    title="KoREKtor"
 ).launch(inbrowser=True)
-
-
-
New version of app.py after this commit (lines added by this commit are marked with +, unchanged context lines carry a leading space; the listing follows the right-hand pane of the diff):

 import gradio as gr
 import pandas as pd
 from langchain_core.prompts import PromptTemplate
 from langchain_openai import ChatOpenAI
 from langchain_core.output_parsers import StrOutputParser
 from pydantic import BaseModel, Field, field_validator
 from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
+from langchain.output_parsers import PydanticOutputParser
+from docx import Document
+from datetime import datetime
 import os
+import tempfile
 
+# Model danych
 class QuestionAnswer(BaseModel):
     question_number: int = Field(..., description="Numer pytania")
     answer: str = Field(..., description="Odpowiedź, tylko TAK lub NIE")
     citation: str = Field(..., description="Fragment cytatu")
 … lines 19-22 unchanged (collapsed in the diff view) …
         raise ValueError("Odpowiedź musi być TAK lub NIE")
         return v
 
 class JobAdAnalysis(BaseModel):
     answers: list[QuestionAnswer]
 
 parser = PydanticOutputParser(pydantic_object=JobAdAnalysis)
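The import change above tracks the move to pydantic v2, where per-field checks are declared with @field_validator instead of the v1 validator. Only the last two lines of the answer check (the raise and the return) fall inside the diff context, so the decorator and signature are not shown; the sketch below is one plausible shape for that validator, with an illustrative class name rather than the file's actual code.

# Sketch only: reconstructs the hidden validator (lines 19-22) from its visible tail.
from pydantic import BaseModel, Field, field_validator

class QuestionAnswerSketch(BaseModel):
    question_number: int = Field(..., description="Numer pytania")
    answer: str = Field(..., description="Odpowiedź, tylko TAK lub NIE")
    citation: str = Field(..., description="Fragment cytatu")

    @field_validator("answer")
    @classmethod
    def validate_answer(cls, v: str) -> str:
        if v not in ("TAK", "NIE"):
            raise ValueError("Odpowiedź musi być TAK lub NIE")
        return v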
 
+# Wczytanie matrycy danych
+matryca_df = pd.read_csv('matryca.csv', header=None,
+                         names=['area', 'prompt', 'true', 'false', 'more', 'hint'])
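With header=None plus explicit names, matryca.csv is expected to have no header row and six columns per question. Their roles can be read off the rest of the file: area becomes a report heading, prompt is the question sent to the model, true and false hold the report text for a TAK or NIE answer, more is an optional longer explanation, and hint is kept in question_to_area_map. The for i in range(16) loop further down assumes sixteen such rows. A hypothetical row, purely to illustrate the layout (the real wording ships with the Space and is not part of this commit):

# Illustration only: a stand-in frame with the same six columns as matryca.csv.
example_matryca = pd.DataFrame(
    [["Przykładowy obszar",            # area: report section heading
      "Przykładowe pytanie?",          # prompt: question for the model
      "Tekst raportu dla TAK",         # true: report text when the answer is TAK
      "Tekst raportu dla NIE",         # false: report text when the answer is NIE
      "Dodatkowe wyjaśnienie",         # more: optional extra explanation
      "Podpowiedź"]],                  # hint: stored in question_to_area_map
    columns=['area', 'prompt', 'true', 'false', 'more', 'hint'],
)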
 
+question_to_area_map = {}
 
 def prepare_questions(df):
     global question_to_area_map
     question_to_area_map = {}
+    questions_text = ""
     for index, row in df.iterrows():
         question_number = index + 1
         questions_text += f"{question_number} {row['prompt']}\n"
         question_to_area_map[question_number] = {
             'area': row['area'],
             'true': row['true'],
 … line 47 unchanged (collapsed in the diff view) …
             'hint': row['hint'],
             'more': row['more']
         }
     return questions_text
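prepare_questions does two things at once: it builds the numbered question list that goes into the prompt, and it rebuilds the module-level question_to_area_map so later code can look a question up by its number. A small sketch of that behaviour with obviously fake wording, assuming the definitions from app.py have been loaded:

# Sketch: what prepare_questions returns and what it stores globally.
demo_df = pd.DataFrame(
    [["Przykładowy obszar", "Przykładowe pytanie?", "TAK-tekst", "NIE-tekst", None, None]],
    columns=['area', 'prompt', 'true', 'false', 'more', 'hint'],
)
print(prepare_questions(demo_df))        # -> "1 Przykładowe pytanie?\n"
print(question_to_area_map[1]['area'])   # -> "Przykładowy obszar"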
 
 def doc_to_text(file):
+    extension = os.path.splitext(file.name)[1].lower()
+    if extension == ".docx":
+        loader = Docx2txtLoader(file.name)
+    elif extension == ".pdf":
+        loader = PyPDFLoader(file.name)
     else:
         return "error"
     pages = loader.load()
+    return "\n".join(page.page_content for page in pages)
 
+def create_report(result: pd.DataFrame) -> str:
+    doc = Document('template.docx')
+    doc.add_heading('Raport analizy ogłoszenia o pracę', 0)
+    doc.add_paragraph(f'Data wygenerowania: {datetime.now().strftime("%d.%m.%Y %H:%M")}')
+    for _, row in result.iterrows():
+        doc.add_heading(str(row['area']), 1)
+        doc.add_paragraph(str(row['citation']), style='Intense Quote')
+        for line in str(row['content']).split('\n'):
+            if line.strip():
+                doc.add_paragraph(line)
+        if pd.notna(row['more']):
+            for line in str(row['more']).split('\n'):
+                if line.strip():
+                    doc.add_paragraph(line)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
+        doc.save(tmp.name)
+        return tmp.name  # Zwracamy ścieżkę do pliku tymczasowego
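create_report fills template.docx (which must sit next to app.py) and writes the result to a NamedTemporaryFile created with delete=False, so the document survives the with block and the returned path can be handed straight to the gr.File output; nothing in this version removes those temporary files afterwards. A sketch of calling it directly, with placeholder row values shaped like the rows analyze_job_ad builds:

# Sketch: requires python-docx and a template.docx in the working directory.
sample = pd.DataFrame([{
    'area': 'Przykładowy obszar',
    'answer': 'TAK',
    'citation': 'przykładowy cytat z ogłoszenia',
    'content': 'Tekst interpretacji dla raportu',
    'more': None,                     # pd.notna(None) is False, so this block is skipped
}])
report_path = create_report(sample)   # path to a .docx file in the temp directory
print(report_path)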
 
 def analyze_job_ad(job_ad, file):
     if file:
+        job_ad = doc_to_text(file)
     if job_ad == "error":
         return None, None
     questions = prepare_questions(matryca_df)
     prompt_template = PromptTemplate.from_template(
         """Przeanalizuj poniższe ogłoszenie o pracę pod kątem dostępności dla osób z niepełnosprawnościami.
+
 Ogłoszenie:
 {job_ad}
+
 Odpowiedz na następujące pytania:
 {questions}
+
 Format odpowiedzi powinien być w następującej strukturze JSON:
 {{
 "answers": [
 … lines 100-101 unchanged (collapsed in the diff view) …
 "answer": "TAK/NIE",
 "citation": "dokładny cytat z tekstu"
 }}
+]
 }}
 """
     )
+
     model = ChatOpenAI(temperature=0, model="gpt-4o-mini")
     chain = prompt_template | model | parser
     response = chain.invoke({"job_ad": job_ad, "questions": questions})
+
     output_df = pd.DataFrame(columns=['area', 'answer', 'citation', 'content', 'more'])
     for i in range(16):
+        if response.answers[i].answer in {"TAK", "NIE"}:
             new_row = {
                 'area': matryca_df.area[i],
                 'answer': response.answers[i].answer,
                 'citation': response.answers[i].citation,
+                'content': matryca_df.true[i] if response.answers[i].answer == 'TAK' else matryca_df.false[i],
                 'more': matryca_df.more[i]
             }
+            output_df = pd.concat([output_df, pd.DataFrame([new_row])], ignore_index=True)
 
+    word_file_path = create_report(output_df)
+    json_output = {str(k): v for k, v in output_df.to_dict(orient="index").items()}
+    return json_output, word_file_path
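analyze_job_ad wires everything together with a LangChain runnable: prompt_template | model | parser formats the prompt, calls gpt-4o-mini, and parses the reply into a JobAdAnalysis instance, so response.answers is already a list of validated QuestionAnswer objects. The JSON layout is spelled out by hand in the prompt; PydanticOutputParser also exposes get_format_instructions() for generating such a block from the schema. The call needs OPENAI_API_KEY in the environment, and the hard-coded range(16) ties the loop to sixteen rows in matryca.csv and at least sixteen returned answers. A sketch of a direct call with placeholder ad text and no uploaded file:

# Sketch: assumes OPENAI_API_KEY is set and matryca.csv/template.docx are present.
results_json, report_path = analyze_job_ad("Treść przykładowego ogłoszenia o pracę...", None)
print(results_json)   # dict keyed by row index, one entry per answered question
print(report_path)    # path to the generated .docx report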
 
+# Interfejs Gradio
+demo = gr.Interface(
     fn=analyze_job_ad,
+    inputs=[gr.TextArea(label="Ogłoszenie (opcjonalnie)"), gr.File(label="Plik PDF lub DOCX")],
+    outputs=[gr.JSON(label="Wyniki analizy"), gr.File(label="Pobierz raport w formacie Word")],
+    title="KoREKtor – analiza ogłoszenia",
 ).launch(inbrowser=True)
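The module launches the interface at import time, so running python app.py (or starting the Space) is enough to bring up the UI; OPENAI_API_KEY must be available to the process for the ChatOpenAI calls made inside analyze_job_ad, and note that demo ends up holding whatever launch() returns rather than the gr.Interface object itself. A sketch of the minimal local setup, with a placeholder key value:

# Sketch: export the key before running app.py; the value below is a placeholder.
import os
os.environ.setdefault("OPENAI_API_KEY", "sk-placeholder")
# python app.py  -> builds gr.Interface(...) and calls .launch(inbrowser=True)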