Luciferalive committed on
Commit
2d6c283
·
verified ·
1 Parent(s): fdd610d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -4
app.py CHANGED
@@ -13,6 +13,7 @@ from langchain_community.embeddings import SentenceTransformerEmbeddings
13
  from groq import Groq
14
  import gradio as gr
15
  import requests
 
16
 
17
  # Ensure the Tesseract OCR path is set correctly
18
  pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
@@ -22,9 +23,11 @@ GROQ_API_KEY = "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"
22
  def extract_text_from_doc(doc_content):
23
  """Extract text from DOC file content."""
24
  try:
25
- doc = Document(io.BytesIO(doc_content))
26
- extracted_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
27
- return extracted_text
 
 
28
  except Exception as e:
29
  print("Failed to extract text from DOC:", e)
30
  return ""
@@ -188,4 +191,4 @@ iface = gr.Interface(
188
  description="Ask a question and get an answer from the AI assistant."
189
  )
190
 
191
- iface.launch()
 
13
  from groq import Groq
14
  import gradio as gr
15
  import requests
16
+ from zipfile import ZipFile
17
 
18
  # Ensure the Tesseract OCR path is set correctly
19
  pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
 
23
  def extract_text_from_doc(doc_content):
24
  """Extract text from DOC file content."""
25
  try:
26
+ with ZipFile(io.BytesIO(doc_content)) as zip_file:
27
+ xml_content = zip_file.read('word/document.xml')
28
+ doc = Document(io.BytesIO(xml_content))
29
+ extracted_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
30
+ return extracted_text
31
  except Exception as e:
32
  print("Failed to extract text from DOC:", e)
33
  return ""
 
191
  description="Ask a question and get an answer from the AI assistant."
192
  )
193
 
194
+ iface.launch()