akash015 committed on
Commit c3ecbfd · verified · 1 Parent(s): 95fb55f

Update app.py

Files changed (1)
  1. app.py +19 -199
app.py CHANGED
@@ -1,187 +1,4 @@
- # import re
- # import PyPDF2
- # from langchain_community.embeddings import OllamaEmbeddings
- # from langchain.text_splitter import RecursiveCharacterTextSplitter
- # from langchain_community.vectorstores import Chroma
- # from langchain.chains import ConversationalRetrievalChain
- # from langchain_community.chat_models import ChatOllama
- # from langchain_groq import ChatGroq
- # from langchain.memory import ChatMessageHistory, ConversationBufferMemory
- # import chainlit as cl
- # from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
- # import logging
- # import pypandoc
- # import pdfkit
- # from paddleocr import PaddleOCR
- # import fitz
- # import asyncio
- # from langchain_nomic.embeddings import NomicEmbeddings
-
- # llm_groq = ChatGroq(
- # model_name='llama3-70b-8192'
- # )
-
- # # Initialize anonymizer
- # anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'], faker_seed=18)
-
- # def extract_text_from_pdf(file_path):
- # pdf = PyPDF2.PdfReader(file_path)
- # pdf_text = ""
- # for page in pdf.pages:
- # pdf_text += page.extract_text()
- # return pdf_text
-
- # def has_sufficient_selectable_text(page, threshold=50):
- # text = page.extract_text()
- # if len(text.strip()) > threshold:
- # return True
- # return False
-
- # async def get_text(file_path):
- # text = ""
- # try:
- # logging.info("Starting OCR process for file: %s", file_path)
- # extension = file_path.split(".")[-1].lower()
- # allowed_extension = ["jpg", "jpeg", "png", "pdf", "docx"]
- # if extension not in allowed_extension:
- # error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
- # logging.error(error)
- # return {"error": error}
-
- # if extension == "docx":
- # file_path = convert_docx_to_pdf(file_path)
-
- # ocr = PaddleOCR(use_angle_cls=True, lang='en')
- # result = ocr.ocr(file_path, cls=True)
- # for idx in range(len(result)):
- # res = result[idx]
- # for line in res:
- # text += line[1][0] + " "
- # logging.info("OCR process completed successfully for file: %s", file_path)
- # except Exception as e:
- # logging.error("Error occurred during OCR process for file %s: %s", file_path, e)
- # text = "Error occurred during OCR process."
- # logging.info("Extracted text: %s", text)
- # return text
-
- # def convert_docx_to_pdf(input_path):
- # html_path = input_path.replace('.docx', '.html')
- # output_path = ".".join(input_path.split(".")[:-1]) + ".pdf"
- # pypandoc.convert_file(input_path, 'html', outputfile=html_path)
- # pdfkit.from_file(html_path, output_path)
- # logging.info("DOCX Format Handled")
- # return output_path
-
- # async def extract_text_from_mixed_pdf(file_path):
- # pdf = PyPDF2.PdfReader(file_path)
- # ocr = PaddleOCR(use_angle_cls=True, lang='en')
- # pdf_text = ""
- # for i, page in enumerate(pdf.pages):
- # text = page.extract_text()
- # if not has_sufficient_selectable_text(page):
- # logging.info(f"Page {i+1} has insufficient selectable text, performing OCR.")
- # pdf_document = fitz.open(file_path)
- # pdf_page = pdf_document.load_page(i)
- # pix = pdf_page.get_pixmap()
- # image_path = f"page_{i+1}.png"
- # pix.save(image_path)
- # result = ocr.ocr(image_path, cls=True)
- # for idx in range(len(result)):
- # res = result[idx]
- # for line in res:
- # text += line[1][0] + " "
- # pdf_text += text
- # return pdf_text
-
- # @cl.on_chat_start
- # async def on_chat_start():
-
- # files = None # Initialize variable to store uploaded files
-
- # # Wait for the user to upload a file
- # while files is None:
- # files = await cl.AskFileMessage(
- # content="Please upload a pdf file to begin!",
- # # accept=["application/pdf"],
- # accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
- # max_size_mb=100,
- # timeout=180,
- # ).send()
-
- # file = files[0] # Get the first uploaded file
-
- # # Inform the user that processing has started
- # msg = cl.Message(content=f"Processing `{file.name}`...")
- # await msg.send()
-
- # # Extract text from PDF, checking for selectable and handwritten text
- # if file.name.endswith('.pdf'):
- # pdf_text = await extract_text_from_mixed_pdf(file.path)
- # else:
- # pdf_text = await get_text(file.path)
-
- # # Anonymize the text
- # anonymized_text = anonymizer.anonymize(
- # pdf_text
- # )
-
- # embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
-
- # docsearch = await cl.make_async(Chroma.from_texts)(
- # [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
- # )
- # # }
-
- # # Initialize message history for conversation
- # message_history = ChatMessageHistory()
-
- # # Memory for conversational context
- # memory = ConversationBufferMemory(
- # memory_key="chat_history",
- # output_key="answer",
- # chat_memory=message_history,
- # return_messages=True,
- # )
-
- # # Create a chain that uses the Chroma vector store
- # chain = ConversationalRetrievalChain.from_llm(
- # llm = llm_groq,
- # chain_type="stuff",
- # retriever=docsearch.as_retriever(),
- # memory=memory,
- # return_source_documents=True,
- # )
-
- # # Let the user know that the system is ready
- # msg.content = f"Processing `{file.name}` done. You can now ask questions!"
- # await msg.update()
- # # Store the chain in user session
- # cl.user_session.set("chain", chain)
-
-
- # @cl.on_message
- # async def main(message: cl.Message):
-
- # # Retrieve the chain from user session
- # chain = cl.user_session.get("chain")
- # # Callbacks happen asynchronously/parallel
- # cb = cl.AsyncLangchainCallbackHandler()
-
- # # Call the chain with user's message content
- # res = await chain.ainvoke(message.content, callbacks=[cb])
- # answer = anonymizer.deanonymize(
- # res["answer"]
- # )
- # text_elements = []
-
- # # Return results
- # await cl.Message(content=answer, elements=text_elements).send()
-
-
-
-
- # v2:
- import re
+ import re
  import PyPDF2
  from langchain_community.embeddings import OllamaEmbeddings
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -196,19 +13,16 @@ import logging
  import pypandoc
  import pdfkit
  from paddleocr import PaddleOCR
- import fitz
+ import fitz
  import asyncio
  from langchain_nomic.embeddings import NomicEmbeddings

  llm_groq = ChatGroq(
- model_name='llama3-70b-8192'
- )
+ model_name='llama3-70b-8192'
+ )

  # Initialize anonymizer
- anonymizer = PresidioReversibleAnonymizer(
- analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'],
- faker_seed=18
- )
+ anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'], faker_seed=18)

  def extract_text_from_pdf(file_path):
  pdf = PyPDF2.PdfReader(file_path)
@@ -233,10 +47,10 @@ async def get_text(file_path):
  error = "Not a valid File. Allowed Format are jpg, jpeg, png, pdf, docx"
  logging.error(error)
  return {"error": error}
-
+
  if extension == "docx":
  file_path = convert_docx_to_pdf(file_path)
-
+
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
  result = ocr.ocr(file_path, cls=True)
  for idx in range(len(result)):
@@ -281,19 +95,21 @@ async def extract_text_from_mixed_pdf(file_path):

  @cl.on_chat_start
  async def on_chat_start():
- files = None # Initialize variable to store uploaded files
+
+ files = None # Initialize variable to store uploaded files

  # Wait for the user to upload a file
  while files is None:
  files = await cl.AskFileMessage(
  content="Please upload a pdf file to begin!",
+ # accept=["application/pdf"],
  accept=["application/pdf", "image/jpeg", "image/png", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
  max_size_mb=100,
  timeout=180,
  ).send()

- file = files[0] # Get the first uploaded file
-
+ file = files[0] # Get the first uploaded file
+
  # Inform the user that processing has started
  msg = cl.Message(content=f"Processing `{file.name}`...")
  await msg.send()
@@ -314,6 +130,7 @@ async def on_chat_start():
  docsearch = await cl.make_async(Chroma.from_texts)(
  [anonymized_text], embeddings, metadatas=[{"source": "0-pl"}]
  )
+ # }

  # Initialize message history for conversation
  message_history = ChatMessageHistory()
@@ -338,14 +155,15 @@ async def on_chat_start():
  # Let the user know that the system is ready
  msg.content = f"Processing `{file.name}` done. You can now ask questions!"
  await msg.update()
-
  # Store the chain in user session
  cl.user_session.set("chain", chain)

+
  @cl.on_message
  async def main(message: cl.Message):
+
  # Retrieve the chain from user session
- chain = cl.user_session.get("chain")
+ chain = cl.user_session.get("chain")
  # Callbacks happen asynchronously/parallel
  cb = cl.AsyncLangchainCallbackHandler()

@@ -358,4 +176,6 @@ async def main(message: cl.Message):

  # Return results
  await cl.Message(content=answer, elements=text_elements).send()
-
+
+
+
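
Note on the code this commit keeps: app.py relies on a reversible anonymization round trip. The extracted document text goes through anonymizer.anonymize() before it is embedded into Chroma, and the model's answer goes through anonymizer.deanonymize() before it is sent back to the user, so both calls must use the same anonymizer instance (here a single module-level one). The following is a minimal standalone sketch of that round trip, not part of the commit; it assumes langchain_experimental is installed with its Presidio and Faker dependencies, and the sample text and the reduced analyzed_fields list are purely illustrative.

# Hypothetical, self-contained sketch of the anonymize/deanonymize round trip used in app.py.
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

# Same constructor arguments as in app.py, with a shorter illustrative field list.
anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER"],
    faker_seed=18,
)

original = "Contact Jane Doe at jane.doe@example.com."  # illustrative sample text
masked = anonymizer.anonymize(original)      # detected PII replaced with fake values
restored = anonymizer.deanonymize(masked)    # fake values mapped back via the stored mapping

print(masked)
print(restored == original)  # True when every detected entity round-trips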