akash015 commited on
Commit
04e4114
·
verified ·
1 Parent(s): f5cc59f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +207 -196
app.py CHANGED
@@ -1,196 +1,207 @@
1
- import re
2
- import PyPDF2
3
- from langchain_community.embeddings import OllamaEmbeddings
4
- from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain_community.vectorstores import Chroma
6
- from langchain.chains import ConversationalRetrievalChain
7
- from langchain_community.chat_models import ChatOllama
8
- from langchain_groq import ChatGroq
9
- from langchain.memory import ChatMessageHistory, ConversationBufferMemory
10
- import chainlit as cl
11
- from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
12
- import logging
13
- import pypandoc
14
- import pdfkit
15
- from paddleocr import PaddleOCR
16
- import fitz
17
- import asyncio
18
-
19
- llm_groq = ChatGroq(
20
- model_name='llama3-70b-8192'
21
- )
22
-
23
- # Initialize anonymizer
24
- anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL', 'US_BANK_NUMBER', 'US_DRIVER_LICENSE', 'US_ITIN', 'US_PASSPORT', 'US_SSN'], faker_seed=18)
25
-
26
- def extract_text_from_pdf(file_path):
27
- pdf = PyPDF2.PdfReader(file_path)
28
- pdf_text = ""
29
- for page in pdf.pages:
30
- pdf_text += page.extract_text()
31
- return pdf_text
32
-
33
- def has_sufficient_selectable_text(page, threshold=50):
34
- text = page.extract_text()
35
- if len(text.strip()) > threshold:
36
- return True
37
- return False
38
-
39
- async def get_text(file_path):
40
- text = ""
41
- try:
42
- logging.info("Starting OCR process for file: %s", file_path)
43
- extension = file_path.split(".")[-1].lower()
44
- allowed_extension = ["jpg", "jpeg", "png", "pdf", "docx"]
45
- if extension not in allowed_extension:
46
- error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
47
- logging.error(error)
48
- return {"error": error}
49
-
50
- if extension == "docx":
51
- file_path = convert_docx_to_pdf(file_path)
52
-
53
- ocr = PaddleOCR(use_angle_cls=True, lang='en')
54
- result = ocr.ocr(file_path, cls=True)
55
- for idx in range(len(result)):
56
- res = result[idx]
57
- for line in res:
58
- text += line[1][0] + " "
59
- logging.info("OCR process completed successfully for file: %s", file_path)
60
- except Exception as e:
61
- logging.error("Error occurred during OCR process for file %s: %s", file_path, e)
62
- text = "Error occurred during OCR process."
63
- logging.info("Extracted text: %s", text)
64
- return text
65
-
66
- def convert_docx_to_pdf(input_path):
67
- html_path = input_path.replace('.docx', '.html')
68
- output_path = ".".join(input_path.split(".")[:-1]) + ".pdf"
69
- pypandoc.convert_file(input_path, 'html', outputfile=html_path)
70
- pdfkit.from_file(html_path, output_path)
71
- logging.info("DOCX Format Handled")
72
- return output_path
73
-
74
- async def extract_text_from_mixed_pdf(file_path):
75
- pdf = PyPDF2.PdfReader(file_path)
76
- ocr = PaddleOCR(use_angle_cls=True, lang='en')
77
- pdf_text = ""
78
- for i, page in enumerate(pdf.pages):
79
- text = page.extract_text()
80
- if not has_sufficient_selectable_text(page):
81
- logging.info(f"Page {i+1} has insufficient selectable text, performing OCR.")
82
- pdf_document = fitz.open(file_path)
83
- pdf_page = pdf_document.load_page(i)
84
- pix = pdf_page.get_pixmap()
85
- image_path = f"page_{i+1}.png"
86
- pix.save(image_path)
87
- result = ocr.ocr(image_path, cls=True)
88
- for idx in range(len(result)):
89
- res = result[idx]
90
- for line in res:
91
- text += line[1][0] + " "
92
- pdf_text += text
93
- return pdf_text
94
-
95
- @cl.on_chat_start
96
- async def on_chat_start():
97
-
98
- files = None # Initialize variable to store uploaded files
99
-
100
- # Wait for the user to upload a file
101
- while files is None:
102
- files = await cl.AskFileMessage(
103
- content="Please upload a pdf file to begin!",
104
- # accept=["application/pdf"],
105
- accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
106
- max_size_mb=100,
107
- timeout=180,
108
- ).send()
109
-
110
- file = files[0] # Get the first uploaded file
111
-
112
- # Inform the user that processing has started
113
- msg = cl.Message(content=f"Processing `{file.name}`...")
114
- await msg.send()
115
-
116
- # Extract text from PDF, checking for selectable and handwritten text
117
- if file.name.endswith('.pdf'):
118
- pdf_text = await extract_text_from_mixed_pdf(file.path)
119
- else:
120
- pdf_text = await get_text(file.path)
121
-
122
- # Anonymize the text
123
- anonymized_text = anonymizer.anonymize(
124
- pdf_text
125
- )
126
-
127
- # with splitting into chunks
128
- # {
129
- # # Split the sanitized text into chunks
130
- # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
131
- # texts = text_splitter.split_text(anonymized_text)
132
-
133
- # # Create metadata for each chunk
134
- # metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]
135
-
136
- # # Create a Chroma vector store
137
- # embeddings = OllamaEmbeddings(model="nomic-embed-text")
138
- # docsearch = await cl.make_async(Chroma.from_texts)(
139
- # texts, embeddings, metadatas=metadatas
140
- # )
141
- # }
142
-
143
- # without splitting into chunks
144
- # {
145
- # Create a Chroma vector store
146
- embeddings = OllamaEmbeddings(model="nomic-embed-text")
147
- docsearch = await cl.make_async(Chroma.from_texts)(
148
- [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
149
- )
150
- # }
151
-
152
- # Initialize message history for conversation
153
- message_history = ChatMessageHistory()
154
-
155
- # Memory for conversational context
156
- memory = ConversationBufferMemory(
157
- memory_key="chat_history",
158
- output_key="answer",
159
- chat_memory=message_history,
160
- return_messages=True,
161
- )
162
-
163
- # Create a chain that uses the Chroma vector store
164
- chain = ConversationalRetrievalChain.from_llm(
165
- llm = llm_groq,
166
- chain_type="stuff",
167
- retriever=docsearch.as_retriever(),
168
- memory=memory,
169
- return_source_documents=True,
170
- )
171
-
172
- # Let the user know that the system is ready
173
- msg.content = f"Processing `{file.name}` done. You can now ask questions!"
174
- await msg.update()
175
- # Store the chain in user session
176
- cl.user_session.set("chain", chain)
177
-
178
-
179
- @cl.on_message
180
- async def main(message: cl.Message):
181
-
182
- # Retrieve the chain from user session
183
- chain = cl.user_session.get("chain")
184
- # Callbacks happen asynchronously/parallel
185
- cb = cl.AsyncLangchainCallbackHandler()
186
-
187
- # Call the chain with user's message content
188
- res = await chain.ainvoke(message.content, callbacks=[cb])
189
- answer = anonymizer.deanonymize(
190
- "ok"+res["answer"]
191
- )
192
- text_elements = []
193
-
194
- # Return results
195
- await cl.Message(content=answer, elements=text_elements).send()
196
-
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import logging
import os
import re

import chainlit as cl
import fitz
import pdfkit
import PyPDF2
import pypandoc
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
from langchain_groq import ChatGroq
from langchain_nomic.embeddings import NomicEmbeddings
from paddleocr import PaddleOCR
19
+
20
# Initialise the Groq-hosted chat LLM used to answer questions.
# NOTE(review): ChatGroq reads its API key from the environment — confirm
# GROQ_API_KEY is set where this app is deployed.
llm_groq = ChatGroq(
    model_name='llama3-70b-8192'
)

# Initialize anonymizer
# Reversible Presidio anonymizer: replaces the listed PII entity types with
# deterministic fake values (faker_seed fixes the fakes across runs) and can
# restore the originals later via deanonymize().
anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL', 'US_BANK_NUMBER', 'US_DRIVER_LICENSE', 'US_ITIN', 'US_PASSPORT', 'US_SSN'], faker_seed=18)

# Initialise the Nomic embedding model used to vectorise document text.
embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
30
+
31
def embed_text(text):
    """Embed *text* with the module-level Nomic embedding model.

    Short snippets (<= 50 whitespace-separated tokens) use the query
    embedder; longer passages use the document embedder.

    Args:
        text: Raw text to embed.

    Returns:
        The embedding vector (list of floats).
    """
    if len(text.split()) <= 50:
        return embeddings.embed_query(text)
    # BUG FIX: the LangChain embeddings interface exposes embed_documents()
    # (plural, takes a list of texts) — embed_document() does not exist and
    # raised AttributeError for any text longer than 50 tokens.
    return embeddings.embed_documents([text])[0]
36
+
37
def extract_text_from_pdf(file_path):
    """Concatenate the selectable text of every page of a PDF.

    Args:
        file_path: Path to the PDF file.

    Returns:
        All extracted page text joined into one string ("" for an empty PDF).
    """
    pdf = PyPDF2.PdfReader(file_path)
    # extract_text() may return None for pages without a text layer; guard
    # with `or ""` so the join never raises TypeError. str.join also avoids
    # the quadratic `+=` accumulation of the original loop.
    return "".join(page.extract_text() or "" for page in pdf.pages)
43
+
44
def has_sufficient_selectable_text(page, threshold=50):
    """Return True if *page* exposes more than *threshold* chars of selectable text.

    Used to decide whether a PDF page needs OCR: scanned or handwritten
    pages yield little or no selectable text.

    Args:
        page: A PDF page object exposing extract_text().
        threshold: Minimum stripped character count to skip OCR.

    Returns:
        bool: True when the stripped text is longer than threshold.
    """
    # extract_text() can return None for image-only pages; treat that as
    # "no selectable text" instead of raising AttributeError on .strip().
    text = page.extract_text() or ""
    return len(text.strip()) > threshold
49
+
50
async def get_text(file_path):
    """OCR a jpg/jpeg/png/pdf/docx file with PaddleOCR and return its text.

    DOCX files are first converted to PDF via convert_docx_to_pdf().

    Args:
        file_path: Path of the file to OCR.

    Returns:
        str with the recognised text on success, the fixed string
        "Error occurred during OCR process." on OCR failure, or — kept for
        backward compatibility with existing callers — a dict
        {"error": msg} when the extension is unsupported.
    """
    text = ""
    try:
        logging.info("Starting OCR process for file: %s", file_path)
        extension = file_path.split(".")[-1].lower()
        allowed_extension = ["jpg", "jpeg", "png", "pdf", "docx"]
        if extension not in allowed_extension:
            error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
            logging.error(error)
            return {"error": error}

        if extension == "docx":
            file_path = convert_docx_to_pdf(file_path)

        ocr = PaddleOCR(use_angle_cls=True, lang='en')
        result = ocr.ocr(file_path, cls=True)
        # FIX: PaddleOCR yields None for pages with no detected text; skip
        # those entries instead of crashing on `for line in None`. Direct
        # iteration replaces the C-style range(len(...)) loop.
        for res in result:
            if not res:
                continue
            for line in res:
                # line is [box, (recognised_text, confidence)]
                text += line[1][0] + " "
        logging.info("OCR process completed successfully for file: %s", file_path)
    except Exception as e:
        logging.error("Error occurred during OCR process for file %s: %s", file_path, e)
        text = "Error occurred during OCR process."
    logging.info("Extracted text: %s", text)
    return text
76
+
77
def convert_docx_to_pdf(input_path):
    """Convert a .docx file to PDF via an intermediate HTML render.

    Uses pypandoc for docx->html and pdfkit (wkhtmltopdf) for html->pdf.

    Args:
        input_path: Path of the .docx file.

    Returns:
        Path of the generated PDF (same basename, .pdf suffix).
    """
    # BUG FIX: input_path.replace('.docx', '.html') substitutes anywhere in
    # the path (e.g. a directory named "jobs.docx.cache"); derive both
    # side-car paths by stripping only the final extension instead.
    stem = ".".join(input_path.split(".")[:-1])
    html_path = stem + ".html"
    output_path = stem + ".pdf"
    pypandoc.convert_file(input_path, 'html', outputfile=html_path)
    pdfkit.from_file(html_path, output_path)
    logging.info("DOCX Format Handled")
    return output_path
84
+
85
async def extract_text_from_mixed_pdf(file_path):
    """Extract text from a PDF whose pages mix selectable and scanned content.

    Pages with enough selectable text are read directly; the rest are
    rasterised with PyMuPDF and run through PaddleOCR.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The concatenated text of all pages.
    """
    pdf = PyPDF2.PdfReader(file_path)
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    # FIX: open the fitz document once and close it when done — the original
    # re-opened the whole document inside the loop for every OCR'd page and
    # never closed any handle.
    pdf_document = fitz.open(file_path)
    try:
        pdf_text = ""
        for i, page in enumerate(pdf.pages):
            # extract_text() may return None on image-only pages.
            text = page.extract_text() or ""
            if not has_sufficient_selectable_text(page):
                logging.info(f"Page {i+1} has insufficient selectable text, performing OCR.")
                pdf_page = pdf_document.load_page(i)
                pix = pdf_page.get_pixmap()
                image_path = f"page_{i+1}.png"
                pix.save(image_path)
                try:
                    result = ocr.ocr(image_path, cls=True)
                finally:
                    # FIX: delete the temporary page image — the original
                    # left one PNG per OCR'd page in the working directory.
                    os.remove(image_path)
                # PaddleOCR yields None for pages with no detections.
                for res in result:
                    if not res:
                        continue
                    for line in res:
                        text += line[1][0] + " "
            pdf_text += text
        return pdf_text
    finally:
        pdf_document.close()
105
+
106
@cl.on_chat_start
async def on_chat_start():
    """Chainlit session start: ingest one uploaded document and build the RAG chain.

    Flow: prompt the user for a pdf/image/docx upload, extract its text
    (OCR where needed), anonymize PII, index the anonymized text in a
    Chroma vector store, and stash a ConversationalRetrievalChain in the
    user session for use by the message handler.
    """

    files = None  # Initialize variable to store uploaded files

    # Wait for the user to upload a file
    while files is None:
        files = await cl.AskFileMessage(
            content="Please upload a pdf file to begin!",
            # accept=["application/pdf"],
            accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
            max_size_mb=100,
            timeout=180,
        ).send()

    file = files[0]  # Get the first uploaded file

    # Inform the user that processing has started
    msg = cl.Message(content=f"Processing `{file.name}`...")
    await msg.send()

    # Extract text from PDF, checking for selectable and handwritten text
    if file.name.endswith('.pdf'):
        pdf_text = await extract_text_from_mixed_pdf(file.path)
    else:
        # NOTE(review): get_text() returns {"error": ...} for unsupported
        # extensions, which would crash anonymize() below — presumably the
        # `accept` filter above prevents that; confirm.
        pdf_text = await get_text(file.path)

    # Anonymize the text (reversible — deanonymized again in main())
    anonymized_text = anonymizer.anonymize(
        pdf_text
    )

    # with splitting into chunks
    # {
    # # Split the sanitized text into chunks
    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    # texts = text_splitter.split_text(anonymized_text)

    # # Create metadata for each chunk
    # metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]

    # # Create a Chroma vector store
    # embeddings = OllamaEmbeddings(model="nomic-embed-text")
    # docsearch = await cl.make_async(Chroma.from_texts)(
    #     texts, embeddings, metadatas=metadatas
    # )
    # }

    # without splitting into chunks
    # {
    # Create a Chroma vector store from the whole document as one text,
    # using the module-level Nomic `embeddings`. Chroma.from_texts is
    # blocking, so it is wrapped with cl.make_async to avoid stalling the
    # event loop.
    # embeddings = OllamaEmbeddings(model="nomic-embed-text")
    docsearch = await cl.make_async(Chroma.from_texts)(
        [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
    )
    # }

    # Initialize message history for conversation
    message_history = ChatMessageHistory()

    # Memory for conversational context; output_key="answer" matches the
    # key read back in main().
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )

    # Create a chain that uses the Chroma vector store
    chain = ConversationalRetrievalChain.from_llm(
        llm = llm_groq,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )

    # Let the user know that the system is ready
    msg.content = f"Processing `{file.name}` done. You can now ask questions!"
    await msg.update()
    # Store the chain in user session
    cl.user_session.set("chain", chain)
188
+
189
+
190
@cl.on_message
async def main(message: cl.Message):
    """Answer a user message with the retrieval chain stored in the session.

    Runs the ConversationalRetrievalChain built in on_chat_start() on the
    message text, then de-anonymizes the PII placeholders in the answer
    before sending it back to the user.
    """

    # Retrieve the chain from user session
    chain = cl.user_session.get("chain")
    # Callbacks happen asynchronously/parallel
    cb = cl.AsyncLangchainCallbackHandler()

    # Call the chain with user's message content
    res = await chain.ainvoke(message.content, callbacks=[cb])
    # BUG FIX: the answer was prefixed with a stray "ok" debug literal
    # ('"ok" + res["answer"]'), so every reply started with "ok". Drop the
    # prefix and de-anonymize only the model's answer.
    answer = anonymizer.deanonymize(res["answer"])
    text_elements = []

    # Return results
    await cl.Message(content=answer, elements=text_elements).send()
207
+