Spaces:

bacancydataprophets
/

MeDocChat

Runtime error

App Files Community

akash015 committed on Jun 27, 2024

Commit

c3ecbfd

verified ·

1 Parent(s): 95fb55f

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -199

app.py CHANGED Viewed

@@ -1,187 +1,4 @@
-# import re
-# import PyPDF2
-# from langchain_community.embeddings import OllamaEmbeddings
-# from langchain.text_splitter import RecursiveCharacterTextSplitter
-# from langchain_community.vectorstores import Chroma
-# from langchain.chains import ConversationalRetrievalChain
-# from langchain_community.chat_models import ChatOllama
-# from langchain_groq import ChatGroq
-# from langchain.memory import ChatMessageHistory, ConversationBufferMemory
-# import chainlit as cl
-# from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
-# import logging
-# import pypandoc
-# import pdfkit
-# from paddleocr import PaddleOCR
-# import fitz
-# import asyncio
-# from langchain_nomic.embeddings import NomicEmbeddings
-# llm_groq = ChatGroq(
-#             model_name='llama3-70b-8192'
-#     )
-# # Initialize anonymizer
-# anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'], faker_seed=18)
-# def extract_text_from_pdf(file_path):
-#     pdf = PyPDF2.PdfReader(file_path)
-#     pdf_text = ""
-#     for page in pdf.pages:
-#         pdf_text += page.extract_text()
-#     return pdf_text
-# def has_sufficient_selectable_text(page, threshold=50):
-#     text = page.extract_text()
-#     if len(text.strip()) > threshold:
-#         return True
-#     return False
-# async def get_text(file_path):
-#     text = ""
-#     try:
-#         logging.info("Starting OCR process for file: %s", file_path)
-#         extension = file_path.split(".")[-1].lower()
-#         allowed_extension = ["jpg", "jpeg", "png", "pdf", "docx"]
-#         if extension not in allowed_extension:
-#             error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
-#             logging.error(error)
-#             return {"error": error}
-#         if extension == "docx":
-#             file_path = convert_docx_to_pdf(file_path)
-#         ocr = PaddleOCR(use_angle_cls=True, lang='en')
-#         result = ocr.ocr(file_path, cls=True)
-#         for idx in range(len(result)):
-#             res = result[idx]
-#             for line in res:
-#                 text += line[1][0] + " "
-#         logging.info("OCR process completed successfully for file: %s", file_path)
-#     except Exception as e:
-#         logging.error("Error occurred during OCR process for file %s: %s", file_path, e)
-#         text = "Error occurred during OCR process."
-#     logging.info("Extracted text: %s", text)
-#     return text
-# def convert_docx_to_pdf(input_path):
-#     html_path = input_path.replace('.docx', '.html')
-#     output_path = ".".join(input_path.split(".")[:-1]) + ".pdf"
-#     pypandoc.convert_file(input_path, 'html', outputfile=html_path)
-#     pdfkit.from_file(html_path, output_path)
-#     logging.info("DOCX Format Handled")
-#     return output_path
-# async def extract_text_from_mixed_pdf(file_path):
-#     pdf = PyPDF2.PdfReader(file_path)
-#     ocr = PaddleOCR(use_angle_cls=True, lang='en')
-#     pdf_text = ""
-#     for i, page in enumerate(pdf.pages):
-#         text = page.extract_text()
-#         if not has_sufficient_selectable_text(page):
-#             logging.info(f"Page {i+1} has insufficient selectable text, performing OCR.")
-#             pdf_document = fitz.open(file_path)
-#             pdf_page = pdf_document.load_page(i)
-#             pix = pdf_page.get_pixmap()
-#             image_path = f"page_{i+1}.png"
-#             pix.save(image_path)
-#             result = ocr.ocr(image_path, cls=True)
-#             for idx in range(len(result)):
-#                 res = result[idx]
-#                 for line in res:
-#                     text += line[1][0] + " "
-#         pdf_text += text
-#     return pdf_text
-# @cl.on_chat_start
-# async def on_chat_start():
-#     files = None # Initialize variable to store uploaded files
-#     # Wait for the user to upload a file
-#     while files is None:
-#         files = await cl.AskFileMessage(
-#             content="Please upload a pdf file to begin!",
-#             # accept=["application/pdf"],
-#             accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
-#             max_size_mb=100,
-#             timeout=180,
-#         ).send()
-#     file = files[0] # Get the first uploaded file
-#     # Inform the user that processing has started
-#     msg = cl.Message(content=f"Processing `{file.name}`...")
-#     await msg.send()
-#     # Extract text from PDF, checking for selectable and handwritten text
-#     if file.name.endswith('.pdf'):
-#         pdf_text = await extract_text_from_mixed_pdf(file.path)
-#     else:
-#         pdf_text = await get_text(file.path)
-#     # Anonymize the text
-#     anonymized_text = anonymizer.anonymize(
-#         pdf_text
-#     )
-#     embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
-#     docsearch = await cl.make_async(Chroma.from_texts)(
-#         [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
-#     )
-#     # }
-#     # Initialize message history for conversation
-#     message_history = ChatMessageHistory()
-#     # Memory for conversational context
-#     memory = ConversationBufferMemory(
-#         memory_key="chat_history",
-#         output_key="answer",
-#         chat_memory=message_history,
-#         return_messages=True,
-#     )
-#     # Create a chain that uses the Chroma vector store
-#     chain = ConversationalRetrievalChain.from_llm(
-#         llm = llm_groq,
-#         chain_type="stuff",
-#         retriever=docsearch.as_retriever(),
-#         memory=memory,
-#         return_source_documents=True,
-#     )
-#     # Let the user know that the system is ready
-#     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
-#     await msg.update()
-#     # Store the chain in user session
-#     cl.user_session.set("chain", chain)
-# @cl.on_message
-# async def main(message: cl.Message):
-#     # Retrieve the chain from user session
-#     chain = cl.user_session.get("chain")
-#     # Callbacks happen asynchronously/parallel
-#     cb = cl.AsyncLangchainCallbackHandler()
-#     # Call the chain with user's message content
-#     res = await chain.ainvoke(message.content, callbacks=[cb])
-#     answer = anonymizer.deanonymize(
-#         res["answer"]
-#     )
-#     text_elements = []
-#     # Return results
-#     await cl.Message(content=answer, elements=text_elements).send()
-# v2:
-import re
 import PyPDF2
 from langchain_community.embeddings import OllamaEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -196,19 +13,16 @@ import logging
 import pypandoc
 import pdfkit
 from paddleocr import PaddleOCR
-import fitz
 import asyncio
 from langchain_nomic.embeddings import NomicEmbeddings
 llm_groq = ChatGroq(
-    model_name='llama3-70b-8192'
-)
 # Initialize anonymizer
-anonymizer = PresidioReversibleAnonymizer(
-    analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'],
-    faker_seed=18
-)
 def extract_text_from_pdf(file_path):
     pdf = PyPDF2.PdfReader(file_path)
@@ -233,10 +47,10 @@ async def get_text(file_path):
             error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
             logging.error(error)
             return {"error": error}
         if extension == "docx":
             file_path = convert_docx_to_pdf(file_path)
         ocr = PaddleOCR(use_angle_cls=True, lang='en')
         result = ocr.ocr(file_path, cls=True)
         for idx in range(len(result)):
@@ -281,19 +95,21 @@ async def extract_text_from_mixed_pdf(file_path):
 @cl.on_chat_start
 async def on_chat_start():
-    files = None  # Initialize variable to store uploaded files
     # Wait for the user to upload a file
     while files is None:
         files = await cl.AskFileMessage(
             content="Please upload a pdf file to begin!",
             accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
             max_size_mb=100,
             timeout=180,
         ).send()
-    file = files[0]  # Get the first uploaded file
     # Inform the user that processing has started
     msg = cl.Message(content=f"Processing `{file.name}`...")
     await msg.send()
@@ -314,6 +130,7 @@ async def on_chat_start():
     docsearch = await cl.make_async(Chroma.from_texts)(
         [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
     )
     # Initialize message history for conversation
     message_history = ChatMessageHistory()
@@ -338,14 +155,15 @@ async def on_chat_start():
     # Let the user know that the system is ready
     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
     await msg.update()
     # Store the chain in user session
     cl.user_session.set("chain", chain)
 @cl.on_message
 async def main(message: cl.Message):
     # Retrieve the chain from user session
-    chain = cl.user_session.get("chain")
     # Callbacks happen asynchronously/parallel
     cb = cl.AsyncLangchainCallbackHandler()
@@ -358,4 +176,6 @@ async def main(message: cl.Message):
     # Return results
     await cl.Message(content=answer, elements=text_elements).send()

+import re
 import PyPDF2
 from langchain_community.embeddings import OllamaEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import pypandoc
 import pdfkit
 from paddleocr import PaddleOCR
+import fitz
 import asyncio
 from langchain_nomic.embeddings import NomicEmbeddings
 llm_groq = ChatGroq(
+            model_name='llama3-70b-8192'
+    )
 # Initialize anonymizer
+anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'], faker_seed=18)
 def extract_text_from_pdf(file_path):
     pdf = PyPDF2.PdfReader(file_path)
             error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
             logging.error(error)
             return {"error": error}
         if extension == "docx":
             file_path = convert_docx_to_pdf(file_path)
         ocr = PaddleOCR(use_angle_cls=True, lang='en')
         result = ocr.ocr(file_path, cls=True)
         for idx in range(len(result)):
 @cl.on_chat_start
 async def on_chat_start():
+    files = None # Initialize variable to store uploaded files
     # Wait for the user to upload a file
     while files is None:
         files = await cl.AskFileMessage(
             content="Please upload a pdf file to begin!",
+            # accept=["application/pdf"],
             accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
             max_size_mb=100,
             timeout=180,
         ).send()
+    file = files[0] # Get the first uploaded file
     # Inform the user that processing has started
     msg = cl.Message(content=f"Processing `{file.name}`...")
     await msg.send()
     docsearch = await cl.make_async(Chroma.from_texts)(
         [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
     )
+    # }
     # Initialize message history for conversation
     message_history = ChatMessageHistory()
     # Let the user know that the system is ready
     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
     await msg.update()
     # Store the chain in user session
     cl.user_session.set("chain", chain)
 @cl.on_message
 async def main(message: cl.Message):
     # Retrieve the chain from user session
+    chain = cl.user_session.get("chain")
     # Callbacks happen asynchronously/parallel
     cb = cl.AsyncLangchainCallbackHandler()
     # Return results
     await cl.Message(content=answer, elements=text_elements).send()