akash015 committed on
Commit
482ac25
·
verified ·
1 Parent(s): 6e0b1c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -0
app.py CHANGED
@@ -181,6 +181,104 @@
181
 
182
 
183
  # v2:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  @cl.on_chat_start
185
  async def on_chat_start():
186
 
@@ -245,3 +343,20 @@ async def on_chat_start():
245
  # Store the chain in user session
246
  cl.user_session.set("chain", chain)
247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
 
183
  # v2:
184
+ import re
185
+ import PyPDF2
186
+ from langchain_community.embeddings import OllamaEmbeddings
187
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
188
+ from langchain_community.vectorstores import Chroma
189
+ from langchain.chains import ConversationalRetrievalChain
190
+ from langchain_community.chat_models import ChatOllama
191
+ from langchain_groq import ChatGroq
192
+ from langchain.memory import ChatMessageHistory, ConversationBufferMemory
193
+ import chainlit as cl
194
+ from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
195
+ import logging
196
+ import pypandoc
197
+ import pdfkit
198
+ from paddleocr import PaddleOCR
199
+ import fitz
200
+ import asyncio
201
+ from langchain_nomic.embeddings import NomicEmbeddings
202
+
203
# Chat model served through Groq; the model is selected by name.
llm_groq = ChatGroq(model_name='llama3-70b-8192')

# Module-level reversible anonymizer. The entity types listed in
# `analyzed_fields` are the ones it is configured to analyze; the on_message
# handler later calls `anonymizer.deanonymize(...)` on the model's answer.
# NOTE(review): `faker_seed` presumably makes the generated substitute values
# reproducible between runs -- confirm against the langchain_experimental docs.
anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=[
        'PERSON',
        'EMAIL_ADDRESS',
        'PHONE_NUMBER',
        'IBAN_CODE',
        'CREDIT_CARD',
        'CRYPTO',
        'IP_ADDRESS',
        'LOCATION',
        'DATE_TIME',
        'NRP',
        'MEDICAL_LICENSE',
        'URL',
    ],
    faker_seed=18,
)
212
+
213
def extract_text_from_pdf(file_path):
    """Return the selectable text of every page of a PDF, concatenated.

    Args:
        file_path: Path (or file object) handed straight to
            ``PyPDF2.PdfReader``.

    Returns:
        The page texts joined in page order with no separator between pages.
    """
    reader = PyPDF2.PdfReader(file_path)
    return "".join(page.extract_text() for page in reader.pages)
219
+
220
def has_sufficient_selectable_text(page, threshold=50):
    """Tell whether a PDF page carries enough extractable text.

    Args:
        page: Object exposing ``extract_text()`` that returns a string.
        threshold: Minimum number of characters (after stripping surrounding
            whitespace) that must be *exceeded* for the page to qualify.

    Returns:
        ``True`` when the stripped text is strictly longer than ``threshold``,
        otherwise ``False``.
    """
    stripped_length = len(page.extract_text().strip())
    return stripped_length > threshold
225
+
226
async def get_text(file_path):
    """Run OCR over an image, PDF or DOCX file and return the recognised text.

    DOCX input is first converted to PDF with ``convert_docx_to_pdf``; the
    resulting file is then passed to PaddleOCR.

    Note: the function is declared ``async`` but contains no ``await``; the
    OCR call runs synchronously inside the coroutine.

    Args:
        file_path: Path of the file to process. The extension (text after the
            last dot, case-insensitive) must be jpg, jpeg, png, pdf or docx.

    Returns:
        * ``{"error": <message>}`` when the extension is not supported.
        * Otherwise a string: each recognised line followed by one space, or
          the literal ``"Error occurred during OCR process."`` if anything
          raised while converting or running OCR.
    """
    text = ""
    try:
        logging.info("Starting OCR process for file: %s", file_path)
        extension = file_path.split(".")[-1].lower()
        allowed_extension = ["jpg", "jpeg", "png", "pdf", "docx"]
        if extension not in allowed_extension:
            error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
            logging.error(error)
            return {"error": error}

        if extension == "docx":
            file_path = convert_docx_to_pdf(file_path)

        ocr = PaddleOCR(use_angle_cls=True, lang='en')
        result = ocr.ocr(file_path, cls=True)
        # PaddleOCR yields one entry per image/page; an entry is None (or
        # empty) when nothing was detected there. Skipping those keeps the
        # text already gathered instead of failing the whole extraction.
        for page_result in result or []:
            if not page_result:
                continue
            for line in page_result:
                text += line[1][0] + " "
        logging.info("OCR process completed successfully for file: %s", file_path)
    except Exception as e:
        # Best-effort boundary: report the failure as text, but keep the
        # traceback in the log so the root cause is not lost.
        logging.exception("Error occurred during OCR process for file %s: %s", file_path, e)
        text = "Error occurred during OCR process."
    # NOTE(review): this writes the raw extracted text to the log; since the
    # app anonymizes content elsewhere, confirm that logging it is acceptable.
    logging.info("Extracted text: %s", text)
    return text
252
+
253
def convert_docx_to_pdf(input_path):
    """Convert a DOCX file to PDF via an intermediate HTML file.

    The document is rendered to HTML with pandoc (``pypandoc``) and the HTML
    is then rendered to PDF with ``pdfkit``. Both files are written next to
    the input, sharing its base name. The intermediate HTML file is left on
    disk.

    Args:
        input_path: Path of the DOCX file.

    Returns:
        Path of the generated PDF (input path with its extension replaced by
        ``.pdf``).
    """
    import os

    # Derive sibling paths from the real extension only. A plain
    # str.replace('.docx', ...) is case-sensitive (for "X.DOCX" the HTML path
    # would equal the input path and pandoc would overwrite the source) and
    # would also rewrite ".docx" appearing in a directory name.
    base_path, _ = os.path.splitext(input_path)
    html_path = base_path + ".html"
    output_path = base_path + ".pdf"

    pypandoc.convert_file(input_path, 'html', outputfile=html_path)
    pdfkit.from_file(html_path, output_path)
    logging.info("DOCX Format Handled")
    return output_path
260
+
261
async def extract_text_from_mixed_pdf(file_path):
    """Extract text from a PDF, using OCR for pages with little selectable text.

    Every page's selectable text is read with PyPDF2. When
    ``has_sufficient_selectable_text`` rejects a page, the page is rendered to
    a PNG with PyMuPDF (``fitz``), passed to PaddleOCR, and each recognised
    line (followed by a space) is appended to that page's text.

    Note: the function is declared ``async`` but contains no ``await``; all
    work runs synchronously inside the coroutine.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The per-page texts concatenated in page order.
    """
    import os
    import tempfile

    pdf = PyPDF2.PdfReader(file_path)
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    page_texts = []
    # Opened on first use and reused for later pages, then closed below;
    # reopening the file for every OCR page leaked one handle per page.
    pdf_document = None
    try:
        # Rendered page images go to a private temporary directory so that
        # concurrent sessions cannot overwrite each other's "page_N.png" and
        # nothing is left behind in the working directory.
        with tempfile.TemporaryDirectory() as image_dir:
            for i, page in enumerate(pdf.pages):
                text = page.extract_text()
                if not has_sufficient_selectable_text(page):
                    logging.info(
                        "Page %d has insufficient selectable text, performing OCR.",
                        i + 1,
                    )
                    if pdf_document is None:
                        pdf_document = fitz.open(file_path)
                    pix = pdf_document.load_page(i).get_pixmap()
                    image_path = os.path.join(image_dir, f"page_{i+1}.png")
                    pix.save(image_path)
                    result = ocr.ocr(image_path, cls=True)
                    # An entry is None/empty when OCR detected nothing on the
                    # image; skip it rather than fail on iteration.
                    for page_result in result or []:
                        if not page_result:
                            continue
                        for line in page_result:
                            text += line[1][0] + " "
                page_texts.append(text)
    finally:
        if pdf_document is not None:
            pdf_document.close()
    return "".join(page_texts)
281
+
282
  @cl.on_chat_start
283
  async def on_chat_start():
284
 
 
343
  # Store the chain in user session
344
  cl.user_session.set("chain", chain)
345
 
346
@cl.on_message
async def main(message: cl.Message):
    """Answer an incoming chat message with the session's stored chain.

    Fetches the object stored under ``"chain"`` in the user session, invokes
    it asynchronously with the message text, runs the ``"answer"`` field of
    the result through ``anonymizer.deanonymize`` and sends the outcome back
    as a new message with an empty elements list.

    Args:
        message: The incoming Chainlit message; only ``content`` is read.
    """
    # The chain is expected to have been stored in the session at chat start.
    session_chain = cl.user_session.get("chain")

    # Chainlit's async LangChain callback handler, passed to the invocation.
    callback_handler = cl.AsyncLangchainCallbackHandler()
    response = await session_chain.ainvoke(
        message.content,
        callbacks=[callback_handler],
    )

    # Map the answer back through the anonymizer before showing it.
    restored_answer = anonymizer.deanonymize(response["answer"])

    # No attachments are sent; the elements list is intentionally empty.
    reply = cl.Message(content=restored_answer, elements=[])
    await reply.send()