# Spaces:
#
# bacancydataprophets
# /
#
# MeDocChat
#
# Runtime error
#
# File size: 12,760 Bytes

# import re 
# import PyPDF2
# from langchain_community.embeddings import OllamaEmbeddings
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores import Chroma
# from langchain.chains import ConversationalRetrievalChain
# from langchain_community.chat_models import ChatOllama
# from langchain_groq import ChatGroq
# from langchain.memory import ChatMessageHistory, ConversationBufferMemory
# import chainlit as cl
# from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
# import logging
# import pypandoc
# import pdfkit
# from paddleocr import PaddleOCR
# import fitz  
# import asyncio
# from langchain_nomic.embeddings import NomicEmbeddings

# llm_groq = ChatGroq(
#             model_name='llama3-70b-8192'
#     )

# # Initialize anonymizer
# anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'], faker_seed=18)

# def extract_text_from_pdf(file_path):
#     pdf = PyPDF2.PdfReader(file_path)
#     pdf_text = ""
#     for page in pdf.pages:
#         pdf_text += page.extract_text()
#     return pdf_text

# def has_sufficient_selectable_text(page, threshold=50):
#     text = page.extract_text()
#     if len(text.strip()) > threshold:
#         return True
#     return False

# async def get_text(file_path):
#     text = ""
#     try:
#         logging.info("Starting OCR process for file: %s", file_path)
#         extension = file_path.split(".")[-1].lower()
#         allowed_extension = ["jpg", "jpeg", "png", "pdf", "docx"]
#         if extension not in allowed_extension:
#             error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
#             logging.error(error)
#             return {"error": error}
        
#         if extension == "docx":
#             file_path = convert_docx_to_pdf(file_path)
        
#         ocr = PaddleOCR(use_angle_cls=True, lang='en')
#         result = ocr.ocr(file_path, cls=True)
#         for idx in range(len(result)):
#             res = result[idx]
#             for line in res:
#                 text += line[1][0] + " "
#         logging.info("OCR process completed successfully for file: %s", file_path)
#     except Exception as e:
#         logging.error("Error occurred during OCR process for file %s: %s", file_path, e)
#         text = "Error occurred during OCR process."
#     logging.info("Extracted text: %s", text)
#     return text

# def convert_docx_to_pdf(input_path):
#     html_path = input_path.replace('.docx', '.html')
#     output_path = ".".join(input_path.split(".")[:-1]) + ".pdf"
#     pypandoc.convert_file(input_path, 'html', outputfile=html_path)
#     pdfkit.from_file(html_path, output_path)
#     logging.info("DOCX Format Handled")
#     return output_path

# async def extract_text_from_mixed_pdf(file_path):
#     pdf = PyPDF2.PdfReader(file_path)
#     ocr = PaddleOCR(use_angle_cls=True, lang='en')
#     pdf_text = ""
#     for i, page in enumerate(pdf.pages):
#         text = page.extract_text()
#         if not has_sufficient_selectable_text(page):
#             logging.info(f"Page {i+1} has insufficient selectable text, performing OCR.")
#             pdf_document = fitz.open(file_path)
#             pdf_page = pdf_document.load_page(i)
#             pix = pdf_page.get_pixmap()
#             image_path = f"page_{i+1}.png"
#             pix.save(image_path)
#             result = ocr.ocr(image_path, cls=True)
#             for idx in range(len(result)):
#                 res = result[idx]
#                 for line in res:
#                     text += line[1][0] + " "
#         pdf_text += text
#     return pdf_text

# @cl.on_chat_start
# async def on_chat_start():
    
#     files = None # Initialize variable to store uploaded files

#     # Wait for the user to upload a file
#     while files is None:
#         files = await cl.AskFileMessage(
#             content="Please upload a pdf file to begin!",
#             # accept=["application/pdf"],
#             accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
#             max_size_mb=100,
#             timeout=180, 
#         ).send()

#     file = files[0] # Get the first uploaded file
    
#     # Inform the user that processing has started
#     msg = cl.Message(content=f"Processing `{file.name}`...")
#     await msg.send()

#     # Extract text from PDF, checking for selectable and handwritten text
#     if file.name.endswith('.pdf'):
#         pdf_text = await extract_text_from_mixed_pdf(file.path)
#     else:
#         pdf_text = await get_text(file.path)

#     # Anonymize the text
#     anonymized_text = anonymizer.anonymize(
#         pdf_text
#     )
    
#     embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
    
#     docsearch = await cl.make_async(Chroma.from_texts)(
#         [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
#     )
#     # }
    
#     # Initialize message history for conversation
#     message_history = ChatMessageHistory()
    
#     # Memory for conversational context
#     memory = ConversationBufferMemory(
#         memory_key="chat_history",
#         output_key="answer",
#         chat_memory=message_history,
#         return_messages=True,
#     )

#     # Create a chain that uses the Chroma vector store
#     chain = ConversationalRetrievalChain.from_llm(
#         llm = llm_groq,
#         chain_type="stuff",
#         retriever=docsearch.as_retriever(),
#         memory=memory,
#         return_source_documents=True,
#     )

#     # Let the user know that the system is ready
#     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
#     await msg.update()
#     # Store the chain in user session
#     cl.user_session.set("chain", chain)


# @cl.on_message
# async def main(message: cl.Message):
        
#     # Retrieve the chain from user session
#     chain = cl.user_session.get("chain") 
#     # Callbacks happen asynchronously/parallel 
#     cb = cl.AsyncLangchainCallbackHandler()
    
#     # Call the chain with user's message content
#     res = await chain.ainvoke(message.content, callbacks=[cb])
#     answer = anonymizer.deanonymize(
#         res["answer"]
#     )  
#     text_elements = [] 
            
#     # Return results
#     await cl.Message(content=answer, elements=text_elements).send()




# v2: active implementation. The commented-out code above is the earlier version, kept for reference.
import re
import PyPDF2
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain_community.chat_models import ChatOllama
from langchain_groq import ChatGroq
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
import chainlit as cl
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
import logging
import pypandoc
import pdfkit
from paddleocr import PaddleOCR
import fitz
import asyncio
from langchain_nomic.embeddings import NomicEmbeddings

# Chat model used by the retrieval chain to answer questions.
llm_groq = ChatGroq(model_name="llama3-70b-8192")

# Initialize anonymizer
# The same instance anonymizes document text and de-anonymizes model answers,
# so both directions share one mapping.
# NOTE(review): this is a single module-level instance; presumably its mapping
# is shared by every chat session served by this process - confirm that is
# acceptable.
anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=[
        "PERSON",
        "EMAIL_ADDRESS",
        "PHONE_NUMBER",
        "IBAN_CODE",
        "CREDIT_CARD",
        "CRYPTO",
        "IP_ADDRESS",
        "LOCATION",
        "DATE_TIME",
        "NRP",
        "MEDICAL_LICENSE",
        "URL",
    ],
    faker_seed=18,
)

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Location of the PDF, passed straight to ``PyPDF2.PdfReader``.

    Returns:
        The extracted text of all pages joined together with no separator.
    """
    reader = PyPDF2.PdfReader(file_path)
    return "".join(page.extract_text() for page in reader.pages)

def has_sufficient_selectable_text(page, threshold=50):
    """Tell whether a page yields more than ``threshold`` characters of text.

    Args:
        page: Object exposing ``extract_text()`` that returns the page text.
        threshold: Number of characters the stripped text must exceed.

    Returns:
        True when the extracted text, with surrounding whitespace removed, is
        strictly longer than ``threshold``; False otherwise.
    """
    stripped_length = len(page.extract_text().strip())
    return stripped_length > threshold

async def get_text(file_path):
    """Extract text from an image, PDF or DOCX file with PaddleOCR.

    Args:
        file_path: Path of the file to read. Its extension (compared
            case-insensitively) must be jpg, jpeg, png, pdf or docx. DOCX
            files are converted to PDF with ``convert_docx_to_pdf`` first.

    Returns:
        The recognised text fragments, each followed by one space. When OCR
        fails, the placeholder ``"Error occurred during OCR process."`` is
        returned. When the extension is not supported, a
        ``{"error": <message>}`` dict is returned instead of a string, so
        callers must check the result type.
    """
    text = ""
    try:
        logging.info("Starting OCR process for file: %s", file_path)
        extension = file_path.split(".")[-1].lower()
        allowed_extension = ["jpg", "jpeg", "png", "pdf", "docx"]
        if extension not in allowed_extension:
            error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
            logging.error(error)
            return {"error": error}

        if extension == "docx":
            file_path = convert_docx_to_pdf(file_path)

        ocr = PaddleOCR(use_angle_cls=True, lang='en')
        result = ocr.ocr(file_path, cls=True)

        fragments = []
        for page_result in result or []:
            # PaddleOCR can report a page with no detected text as None;
            # skip it instead of failing the whole extraction.
            if not page_result:
                continue
            fragments.extend(line[1][0] + " " for line in page_result)
        text = "".join(fragments)
        logging.info("OCR process completed successfully for file: %s", file_path)
    except Exception as e:
        # logging.exception keeps the original message and adds the traceback.
        logging.exception("Error occurred during OCR process for file %s: %s", file_path, e)
        text = "Error occurred during OCR process."
    # Log only the size: this text has not been anonymized yet, so writing it
    # to the log would expose whatever personal data the document contains.
    logging.info("Extracted %d characters of text", len(text))
    return text

def convert_docx_to_pdf(input_path):
    """Convert a DOCX file to PDF by way of an intermediate HTML file.

    The document is rendered to HTML with pypandoc, the HTML is rendered to
    PDF with pdfkit, and the intermediate HTML file is removed afterwards.

    Args:
        input_path: Path of the DOCX file to convert.

    Returns:
        Path of the generated PDF: ``input_path`` with its extension replaced
        by ``.pdf``.
    """
    import os

    # Derive both paths from the actual extension. A plain
    # str.replace('.docx', ...) is case-sensitive and touches every
    # occurrence, so for "REPORT.DOCX" the HTML path would equal the input
    # path and the conversion would overwrite the source document.
    base_path = os.path.splitext(input_path)[0]
    html_path = base_path + ".html"
    output_path = base_path + ".pdf"
    try:
        pypandoc.convert_file(input_path, 'html', outputfile=html_path)
        pdfkit.from_file(html_path, output_path)
    finally:
        # The HTML file is only a stepping stone; do not leave it on disk.
        if os.path.exists(html_path):
            os.remove(html_path)
    logging.info("DOCX Format Handled")
    return output_path

async def extract_text_from_mixed_pdf(file_path):
    """Extract text from a PDF, running OCR on pages with little selectable text.

    Every page is read with PyPDF2. When ``has_sufficient_selectable_text``
    rejects a page, the page is rendered to a PNG with PyMuPDF (``fitz``),
    run through PaddleOCR, and the recognised fragments (each followed by a
    space) are appended to that page's selectable text.

    Although declared ``async``, the function awaits nothing; all of its work
    runs synchronously.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The text of all pages concatenated in page order.
    """
    import os
    import tempfile

    pdf = PyPDF2.PdfReader(file_path)
    ocr = None           # built on first use; not needed for fully selectable PDFs
    pdf_document = None  # opened once, on first use, and closed in ``finally``
    page_texts = []
    try:
        # Render page images into a private temporary directory so they are
        # removed afterwards and cannot collide with files written for
        # another upload being processed at the same time.
        with tempfile.TemporaryDirectory() as image_dir:
            for i, page in enumerate(pdf.pages):
                text = page.extract_text()
                if not has_sufficient_selectable_text(page):
                    logging.info(f"Page {i+1} has insufficient selectable text, performing OCR.")
                    if ocr is None:
                        ocr = PaddleOCR(use_angle_cls=True, lang='en')
                    if pdf_document is None:
                        pdf_document = fitz.open(file_path)
                    pix = pdf_document.load_page(i).get_pixmap()
                    image_path = os.path.join(image_dir, f"page_{i+1}.png")
                    pix.save(image_path)
                    result = ocr.ocr(image_path, cls=True)
                    for page_result in result or []:
                        # PaddleOCR can report an image with no detected
                        # text as None; treat it as "nothing recognised".
                        if not page_result:
                            continue
                        for line in page_result:
                            text += line[1][0] + " "
                page_texts.append(text)
    finally:
        if pdf_document is not None:
            pdf_document.close()
    return "".join(page_texts)

@cl.on_chat_start
async def on_chat_start():
    """Set up a chat session: collect a document, index it and build the chain.

    Steps:
      1. Keep asking until the user uploads a file (PDF, JPEG, PNG or DOCX).
      2. Extract its text: PDFs go through ``extract_text_from_mixed_pdf``,
         everything else through ``get_text``.
      3. Anonymize the text, embed it into a Chroma vector store and build a
         ``ConversationalRetrievalChain`` with buffer memory on top of it.
      4. Store the chain in the user session under the key ``"chain"``.

    If extraction reports an error or yields no text, the user is told so and
    no chain is stored.
    """
    files = None  # Initialize variable to store uploaded files

    # Wait for the user to upload a file
    while files is None:
        files = await cl.AskFileMessage(
            content="Please upload a pdf file to begin!",
            accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
            max_size_mb=100,
            timeout=180,
        ).send()

    file = files[0]  # Get the first uploaded file

    # Inform the user that processing has started
    msg = cl.Message(content=f"Processing `{file.name}`...")
    await msg.send()

    # Extract text from PDF, checking for selectable and handwritten text.
    # Compare the extension case-insensitively so "REPORT.PDF" also takes the
    # mixed-PDF path.
    if file.name.lower().endswith('.pdf'):
        pdf_text = await extract_text_from_mixed_pdf(file.path)
    else:
        pdf_text = await get_text(file.path)

    # get_text signals an unsupported file with an {"error": ...} dict rather
    # than a string; report it instead of passing the dict to the anonymizer.
    if isinstance(pdf_text, dict):
        msg.content = f"Could not process `{file.name}`: {pdf_text.get('error', 'unknown error')}"
        await msg.update()
        return

    # Nothing to index: stop here rather than embedding an empty document.
    if not pdf_text or not pdf_text.strip():
        msg.content = f"No text could be extracted from `{file.name}`."
        await msg.update()
        return

    # Anonymize the text
    anonymized_text = anonymizer.anonymize(pdf_text)

    embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")

    # NOTE(review): the whole document is indexed as one chunk; for long
    # documents consider splitting it before embedding - confirm with owner.
    docsearch = await cl.make_async(Chroma.from_texts)(
        [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
    )

    # Initialize message history for conversation
    message_history = ChatMessageHistory()

    # Memory for conversational context
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )

    # Create a chain that uses the Chroma vector store
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm_groq,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )

    # Let the user know that the system is ready
    msg.content = f"Processing `{file.name}` done. You can now ask questions!"
    await msg.update()

    # Store the chain in user session
    cl.user_session.set("chain", chain)

@cl.on_message
async def main(message: cl.Message):
    """Answer a user message with the retrieval chain stored in the session.

    The question is anonymized with the same anonymizer that was applied to
    the indexed document, sent through the chain, and the answer is
    de-anonymized before it is shown to the user.

    Args:
        message: Incoming chat message; its ``content`` is the question.
    """
    # Retrieve the chain from user session
    chain = cl.user_session.get("chain")
    if chain is None:
        # No chain was stored for this session, so there is nothing to query.
        await cl.Message(
            content="No document is ready yet. Please upload a file and wait for processing to finish."
        ).send()
        return

    # Callbacks happen asynchronously/parallel
    cb = cl.AsyncLangchainCallbackHandler()

    # The index and the model only ever see anonymized text, so anonymize the
    # question too: real names then map to the placeholders used in the
    # document and personal data is not sent to the LLM.
    question = anonymizer.anonymize(message.content)

    # Call the chain with the anonymized question
    res = await chain.ainvoke(question, callbacks=[cb])
    answer = anonymizer.deanonymize(res["answer"])

    # Return results
    await cl.Message(content=answer, elements=[]).send()