Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -181,6 +181,104 @@
|
|
181 |
|
182 |
|
183 |
# v2:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
@cl.on_chat_start
|
185 |
async def on_chat_start():
|
186 |
|
@@ -245,3 +343,20 @@ async def on_chat_start():
|
|
245 |
# Store the chain in user session
|
246 |
cl.user_session.set("chain", chain)
|
247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
|
182 |
|
183 |
# v2:
|
184 |
+
import re
|
185 |
+
import PyPDF2
|
186 |
+
from langchain_community.embeddings import OllamaEmbeddings
|
187 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
188 |
+
from langchain_community.vectorstores import Chroma
|
189 |
+
from langchain.chains import ConversationalRetrievalChain
|
190 |
+
from langchain_community.chat_models import ChatOllama
|
191 |
+
from langchain_groq import ChatGroq
|
192 |
+
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
|
193 |
+
import chainlit as cl
|
194 |
+
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
|
195 |
+
import logging
|
196 |
+
import pypandoc
|
197 |
+
import pdfkit
|
198 |
+
from paddleocr import PaddleOCR
|
199 |
+
import fitz
|
200 |
+
import asyncio
|
201 |
+
from langchain_nomic.embeddings import NomicEmbeddings
|
202 |
+
|
203 |
+
# Chat model served through Groq.
llm_groq = ChatGroq(model_name='llama3-70b-8192')

# Entity types handed to the anonymizer as ``analyzed_fields``.
_ANONYMIZED_ENTITY_TYPES = [
    'PERSON',
    'EMAIL_ADDRESS',
    'PHONE_NUMBER',
    'IBAN_CODE',
    'CREDIT_CARD',
    'CRYPTO',
    'IP_ADDRESS',
    'LOCATION',
    'DATE_TIME',
    'NRP',
    'MEDICAL_LICENSE',
    'URL',
]

# Initialize anonymizer (fixed faker seed; answers are de-anonymized with this
# same instance in the message handler).
anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=_ANONYMIZED_ENTITY_TYPES,
    faker_seed=18,
)
|
212 |
+
|
213 |
+
def extract_text_from_pdf(file_path):
    """Return the selectable text of every page in a PDF, concatenated.

    Args:
        file_path: Location of the PDF, passed straight to
            ``PyPDF2.PdfReader``.

    Returns:
        str: The text of all pages joined in page order with no separator.
        A page that yields no text contributes an empty string.
    """
    pdf = PyPDF2.PdfReader(file_path)
    # ``or ""`` keeps a page whose extraction yields None from raising
    # TypeError; join avoids rebuilding the string once per page.
    return "".join(page.extract_text() or "" for page in pdf.pages)
|
219 |
+
|
220 |
+
def has_sufficient_selectable_text(page, threshold=50):
    """Tell whether a page carries more than ``threshold`` characters of text.

    Args:
        page: Object exposing an ``extract_text()`` method.
        threshold: Number of characters the whitespace-stripped text must
            exceed. Defaults to 50.

    Returns:
        bool: True when the stripped text is strictly longer than
        ``threshold``; False otherwise, including when no text is extracted.
    """
    # Treat a None result as "no text" instead of failing on None.strip().
    text = page.extract_text() or ""
    return len(text.strip()) > threshold
|
225 |
+
|
226 |
+
async def get_text(file_path):
    """Run OCR over an image, PDF, or DOCX file and return the recognised text.

    A DOCX input is first converted to PDF with ``convert_docx_to_pdf`` and
    the resulting PDF is what gets passed to the OCR engine.

    Args:
        file_path: Path of the file. The text after its last dot, compared
            case-insensitively, must be one of jpg, jpeg, png, pdf, docx.

    Returns:
        str: Every recognised line followed by a single space, or the literal
        ``"Error occurred during OCR process."`` if any step raised.
        dict: ``{"error": <message>}`` when the extension is not allowed.
    """
    text = ""
    try:
        logging.info("Starting OCR process for file: %s", file_path)
        extension = file_path.split(".")[-1].lower()
        allowed_extension = ["jpg", "jpeg", "png", "pdf", "docx"]
        if extension not in allowed_extension:
            error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
            logging.error(error)
            return {"error": error}

        if extension == "docx":
            file_path = convert_docx_to_pdf(file_path)

        ocr = PaddleOCR(use_angle_cls=True, lang='en')
        result = ocr.ocr(file_path, cls=True)

        fragments = []
        for page_lines in result or []:
            # The OCR engine may report None for a page where nothing was
            # detected; skip it so one blank page does not discard the text
            # recognised on all the others.
            if not page_lines:
                continue
            fragments.extend(line[1][0] + " " for line in page_lines)
        text = "".join(fragments)
        logging.info("OCR process completed successfully for file: %s", file_path)
    except Exception as e:
        # Best-effort by design: the failure is logged (with traceback) and a
        # placeholder string is returned instead of propagating.
        logging.error(
            "Error occurred during OCR process for file %s: %s",
            file_path,
            e,
            exc_info=True,
        )
        text = "Error occurred during OCR process."
    logging.info("Extracted text: %s", text)
    return text
|
252 |
+
|
253 |
+
def convert_docx_to_pdf(input_path):
    """Convert a DOCX file to PDF by way of an intermediate HTML file.

    The document is rendered to HTML with pypandoc, then the HTML is rendered
    to PDF with pdfkit. Both files are written next to the input, sharing its
    name up to the last dot.

    Args:
        input_path: Path of the DOCX file.

    Returns:
        str: Path of the generated PDF (the input path with everything after
        its last dot replaced by ``pdf``).
    """
    # Build both sibling paths from the text before the last dot. Using
    # str.replace('.docx', '.html') would be case-sensitive and would touch
    # every occurrence in the path: for "REPORT.DOCX" the HTML path would
    # equal the input path and the conversion would overwrite the source.
    stem = ".".join(input_path.split(".")[:-1])
    html_path = stem + ".html"
    output_path = stem + ".pdf"
    pypandoc.convert_file(input_path, 'html', outputfile=html_path)
    pdfkit.from_file(html_path, output_path)
    logging.info("DOCX Format Handled")
    return output_path
|
260 |
+
|
261 |
+
async def extract_text_from_mixed_pdf(file_path):
    """Extract text from a PDF, adding OCR output for pages with little text.

    Each page's selectable text is read with PyPDF2. When
    ``has_sufficient_selectable_text`` rejects a page, the page is rendered
    to an image and the OCR lines are appended to whatever selectable text
    the page had.

    Args:
        file_path: Path of the PDF file.

    Returns:
        str: Text of all pages concatenated in page order with no separator.
    """
    pdf = PyPDF2.PdfReader(file_path)
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    # Opened lazily and only once: most PDFs need OCR on few or no pages.
    pdf_document = None
    page_texts = []
    try:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text() or ""
            if not has_sufficient_selectable_text(page):
                logging.info(
                    "Page %d has insufficient selectable text, performing OCR.",
                    i + 1,
                )
                if pdf_document is None:
                    pdf_document = fitz.open(file_path)
                text += _ocr_pdf_page(pdf_document, i, ocr)
            page_texts.append(text)
    finally:
        # Release the rendering handle even if a page fails part-way through.
        if pdf_document is not None:
            pdf_document.close()
    return "".join(page_texts)


def _ocr_pdf_page(pdf_document, page_index, ocr):
    """Render one page of an open fitz document to PNG and return its OCR text."""
    import os
    import tempfile

    pix = pdf_document.load_page(page_index).get_pixmap()
    # A unique temporary file avoids clashes between concurrent sessions that
    # a fixed "page_N.png" in the working directory would cause, and lets the
    # image be removed once OCR is done.
    fd, image_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        pix.save(image_path)
        result = ocr.ocr(image_path, cls=True)
    finally:
        os.remove(image_path)

    fragments = []
    for page_lines in result or []:
        # The OCR engine may report None when nothing was detected.
        if not page_lines:
            continue
        fragments.extend(line[1][0] + " " for line in page_lines)
    return "".join(fragments)
|
281 |
+
|
282 |
@cl.on_chat_start
|
283 |
async def on_chat_start():
|
284 |
|
|
|
343 |
# Store the chain in user session
|
344 |
cl.user_session.set("chain", chain)
|
345 |
|
346 |
+
@cl.on_message
async def main(message: cl.Message):
    """Answer an incoming chat message with the session's retrieval chain.

    The chain stored under the ``"chain"`` session key is invoked with the
    message content, the ``"answer"`` field of its result is de-anonymized
    with the module-level ``anonymizer``, and the result is sent back.

    Args:
        message: The incoming Chainlit message; only ``content`` is used.
    """
    # Retrieve the chain from user session
    chain = cl.user_session.get("chain")
    if chain is None:
        # No chain was stored for this session; report it instead of failing
        # with an AttributeError on None.
        await cl.Message(
            content="The chat session is not initialised yet. Please start a new chat."
        ).send()
        return

    # Callbacks happen asynchronously/parallel
    cb = cl.AsyncLangchainCallbackHandler()

    # Call the chain with user's message content. Callbacks are supplied via
    # the runnable config, which is where ``ainvoke`` reads them from.
    res = await chain.ainvoke(message.content, config={"callbacks": [cb]})
    answer = anonymizer.deanonymize(res["answer"])
    text_elements = []

    # Return results
    await cl.Message(content=answer, elements=text_elements).send()
|