akash015 committed
Commit 0a5c7ec · verified · Parent: 39ad3de

Upload 3 files

Files changed (3):
  1. app.py +199 -0
  2. public/test.css +20 -0
  3. requirements.txt +21 -0
app.py ADDED
@@ -0,0 +1,199 @@
+ import re
+ import PyPDF2
+ from langchain_community.embeddings import OllamaEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import Chroma
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain_community.chat_models import ChatOllama
+ from langchain_groq import ChatGroq
+ from langchain.memory import ChatMessageHistory, ConversationBufferMemory
+ import chainlit as cl
+ from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
+ import logging
+ import pypandoc
+ import pdfkit
+ from paddleocr import PaddleOCR
+ import fitz
+ import asyncio
+ from langchain_nomic.embeddings import NomicEmbeddings
+
+ # ChatGroq reads the GROQ_API_KEY environment variable
+ llm_groq = ChatGroq(
+     model_name='llama3-70b-8192'
+ )
+
+ # Initialize the reversible anonymizer with the PII entity types to detect
+ anonymizer = PresidioReversibleAnonymizer(
+     analyzed_fields=[
+         'PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD',
+         'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE',
+         'URL', 'US_BANK_NUMBER', 'US_DRIVER_LICENSE', 'US_ITIN', 'US_PASSPORT',
+         'US_SSN',
+     ],
+     faker_seed=18,
+ )
+
+ def extract_text_from_pdf(file_path):
+     pdf = PyPDF2.PdfReader(file_path)
+     pdf_text = ""
+     for page in pdf.pages:
+         pdf_text += page.extract_text()
+     return pdf_text
+
+ def has_sufficient_selectable_text(page, threshold=50):
+     text = page.extract_text()
+     return len(text.strip()) > threshold
+ async def get_text(file_path):
+     text = ""
+     try:
+         logging.info("Starting OCR process for file: %s", file_path)
+         extension = file_path.split(".")[-1].lower()
+         allowed_extensions = ["jpg", "jpeg", "png", "pdf", "docx"]
+         if extension not in allowed_extensions:
+             error = "Not a valid file. Allowed formats are jpg, jpeg, png, pdf, docx."
+             logging.error(error)
+             return {"error": error}
+
+         if extension == "docx":
+             file_path = convert_docx_to_pdf(file_path)
+
+         ocr = PaddleOCR(use_angle_cls=True, lang='en')
+         result = ocr.ocr(file_path, cls=True)
+         for res in result:
+             if res is None:  # PaddleOCR returns None for pages with no detected text
+                 continue
+             for line in res:
+                 text += line[1][0] + " "
+         logging.info("OCR process completed successfully for file: %s", file_path)
+     except Exception as e:
+         logging.error("Error occurred during OCR process for file %s: %s", file_path, e)
+         text = "Error occurred during OCR process."
+     logging.info("Extracted text: %s", text)
+     return text
+
+ def convert_docx_to_pdf(input_path):
+     # pypandoc needs the pandoc binary; pdfkit needs wkhtmltopdf
+     html_path = input_path.replace('.docx', '.html')
+     output_path = ".".join(input_path.split(".")[:-1]) + ".pdf"
+     pypandoc.convert_file(input_path, 'html', outputfile=html_path)
+     pdfkit.from_file(html_path, output_path)
+     logging.info("DOCX format handled")
+     return output_path
+
+ async def extract_text_from_mixed_pdf(file_path):
+     pdf = PyPDF2.PdfReader(file_path)
+     ocr = PaddleOCR(use_angle_cls=True, lang='en')
+     pdf_document = fitz.open(file_path)  # open once, not per page
+     pdf_text = ""
+     for i, page in enumerate(pdf.pages):
+         text = page.extract_text()
+         if not has_sufficient_selectable_text(page):
+             logging.info(f"Page {i+1} has insufficient selectable text, performing OCR.")
+             pdf_page = pdf_document.load_page(i)
+             pix = pdf_page.get_pixmap()
+             image_path = f"page_{i+1}.png"
+             pix.save(image_path)
+             result = ocr.ocr(image_path, cls=True)
+             for res in result:
+                 if res is None:  # no text detected on this page
+                     continue
+                 for line in res:
+                     text += line[1][0] + " "
+         pdf_text += text
+     pdf_document.close()
+     return pdf_text
+
+ @cl.on_chat_start
+ async def on_chat_start():
+
+     files = None  # Initialize variable to store uploaded files
+
+     # Wait for the user to upload a file
+     while files is None:
+         files = await cl.AskFileMessage(
+             content="Please upload a PDF, image, or DOCX file to begin!",
+             accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
+             max_size_mb=100,
+             timeout=180,
+         ).send()
+
+     file = files[0]  # Get the first uploaded file
+
+     # Inform the user that processing has started
+     msg = cl.Message(content=f"Processing `{file.name}`...")
+     await msg.send()
+
+     # Extract text, OCR-ing pages that lack enough selectable text
+     if file.name.endswith('.pdf'):
+         pdf_text = await extract_text_from_mixed_pdf(file.path)
+     else:
+         pdf_text = await get_text(file.path)
+
+     # Anonymize the text before it is embedded or sent to the LLM
+     anonymized_text = anonymizer.anonymize(pdf_text)
128
+ # with splitting into chunks
129
+ # {
130
+ # # Split the sanitized text into chunks
131
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
132
+ # texts = text_splitter.split_text(anonymized_text)
133
+
134
+ # # Create metadata for each chunk
135
+ # metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]
136
+
137
+ # # Create a Chroma vector store
138
+ # embeddings = OllamaEmbeddings(model="nomic-embed-text")
139
+ # docsearch = await cl.make_async(Chroma.from_texts)(
140
+ # texts, embeddings, metadatas=metadatas
141
+ # )
142
+ # }
143
+
144
+ # without splitting into chunks
145
+ # {
146
+ # Create a Chroma vector store
147
+
148
+ # embeddings = OllamaEmbeddings(model="nomic-embed-text")
149
+ embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
150
+
151
+ docsearch = await cl.make_async(Chroma.from_texts)(
152
+ [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
153
+ )
154
+ # }
+
+     # Initialize message history for conversation
+     message_history = ChatMessageHistory()
+
+     # Memory for conversational context
+     memory = ConversationBufferMemory(
+         memory_key="chat_history",
+         output_key="answer",
+         chat_memory=message_history,
+         return_messages=True,
+     )
+
+     # Create a chain that uses the Chroma vector store
+     chain = ConversationalRetrievalChain.from_llm(
+         llm=llm_groq,
+         chain_type="stuff",
+         retriever=docsearch.as_retriever(),
+         memory=memory,
+         return_source_documents=True,
+     )
+
+     # Let the user know that the system is ready
+     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
+     await msg.update()
+
+     # Store the chain in the user session
+     cl.user_session.set("chain", chain)
+
+
+ @cl.on_message
+ async def main(message: cl.Message):
+
+     # Retrieve the chain from the user session
+     chain = cl.user_session.get("chain")
+     # Callback handler that streams intermediate steps to the UI
+     cb = cl.AsyncLangchainCallbackHandler()
+
+     # Call the chain with the user's message content
+     res = await chain.ainvoke(message.content, config={"callbacks": [cb]})
+     # Restore the original (de-anonymized) values in the model's answer
+     answer = anonymizer.deanonymize(res["answer"])
+     text_elements = []
+
+     # Return results
+     await cl.Message(content=answer, elements=text_elements).send()
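
The piece that makes this flow work is PresidioReversibleAnonymizer's internal mapping between real and fake values: PII is masked before the text is embedded or sent to Groq, and the mapping is applied in reverse on the answer. A minimal sketch of that round trip (not part of this commit; the field list and example strings are illustrative, and presidio-analyzer expects a spaCy model such as en_core_web_lg to be installed):

from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=["PERSON", "EMAIL_ADDRESS"],  # subset for illustration
    faker_seed=18,  # fixed seed, so fake values are reproducible
)

original = "Contact Jane Doe at jane.doe@example.com."
masked = anonymizer.anonymize(original)    # real PII replaced with Faker values
restored = anonymizer.deanonymize(masked)  # internal mapping applied in reverse

print(masked)    # fake name and email address
print(restored)  # should match the original text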
public/test.css ADDED
@@ -0,0 +1,20 @@
+ .MuiButtonBase-root.MuiIconButton-root.MuiIconButton-sizeMedium.css-1egpgfe {
+     display: none;
+ }
+
+ .MuiStack-root.watermark.css-1705j0v {
+     display: none;
+ }
+
+ .MuiAvatar-img.css-1hy9t21 {
+     content: url("/public/image.png"); /* Path to your custom avatar image */
+ }
+
+ img[src="http://localhost:8000/logo?theme=dark"] {
+     display: none;
+ }
+
+ #open-sidebar-button {
+     display: none;
+ }
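
Note: this stylesheet only takes effect if Chainlit is pointed at it. The usual way (an assumption here, since no config file is part of this commit) is custom_css = "/public/test.css" under the [UI] section of .chainlit/config.toml.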
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ chainlit==1.1.304
+ langchain==0.2.5
+ langchain-community==0.2.5
+ langchain-core==0.2.9
+ langchain-groq==0.1.5
+ langchain-experimental==0.0.61
+ PyPDF2==3.0.1
+ chromadb==0.5.3
+ groq==0.9.0
+ ollama==0.2.1
+ pypandoc==1.13
+ pdfkit==1.0.0
+ docx2pdf==0.1.8
+ paddlepaddle==2.6.1
+ paddleocr==2.7.3
+ presidio-analyzer==2.2.354
+ presidio-anonymizer==2.2.354
+ spacy==3.7.5
+ Faker==25.9.1
+ langchain-nomic==0.1.2
+ # python 3.10.0
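
Beyond these pins, a few dependencies are implied by how app.py uses the libraries (an assumption, not stated in the commit): pdfkit needs the wkhtmltopdf binary on PATH, pypandoc needs pandoc, presidio-analyzer needs a spaCy model (e.g. python -m spacy download en_core_web_lg), and GROQ_API_KEY / NOMIC_API_KEY must be set for ChatGroq and NomicEmbeddings respectively.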