Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -13,6 +13,7 @@ from langchain_community.embeddings import SentenceTransformerEmbeddings
|
|
13 |
from groq import Groq
|
14 |
import gradio as gr
|
15 |
import requests
|
|
|
16 |
|
17 |
# Ensure the Tesseract OCR path is set correctly
|
18 |
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
|
@@ -22,9 +23,11 @@ GROQ_API_KEY = "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"
|
|
22 |
def extract_text_from_doc(doc_content):
|
23 |
"""Extract text from DOC file content."""
|
24 |
try:
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
28 |
except Exception as e:
|
29 |
print("Failed to extract text from DOC:", e)
|
30 |
return ""
|
@@ -188,4 +191,4 @@ iface = gr.Interface(
|
|
188 |
description="Ask a question and get an answer from the AI assistant."
|
189 |
)
|
190 |
|
191 |
-
iface.launch()
|
|
|
13 |
from groq import Groq
|
14 |
import gradio as gr
|
15 |
import requests
|
16 |
+
from zipfile import ZipFile
|
17 |
|
18 |
# Ensure the Tesseract OCR path is set correctly
|
19 |
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
|
|
|
23 |
def extract_text_from_doc(doc_content):
    """Extract plain text from the raw bytes of a .docx (Office Open XML) file.

    The archive's ``word/document.xml`` part is parsed directly with the
    standard library. The previous implementation passed that XML part to
    ``Document(...)``, which expects a whole .docx package rather than a
    single part, so it always failed and returned an empty string.

    Args:
        doc_content: Bytes of the .docx file.

    Returns:
        The text of every body-level paragraph joined with newlines (empty
        paragraphs yield empty lines), or "" if the content cannot be read
        or parsed. Text inside tables is not included.
    """
    # Function-scope import so the block does not depend on a file-level
    # import that may not exist.
    import xml.etree.ElementTree as ET

    # Every WordprocessingML element lives in this namespace.
    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    tag_body = w_ns + "body"
    tag_paragraph = w_ns + "p"
    tag_run = w_ns + "r"
    tag_text = w_ns + "t"
    tag_tab = w_ns + "tab"
    line_break_tags = (w_ns + "br", w_ns + "cr")

    try:
        with ZipFile(io.BytesIO(doc_content)) as zip_file:
            xml_content = zip_file.read('word/document.xml')

        body = ET.fromstring(xml_content).find(tag_body)
        if body is None:
            return ""

        paragraphs = []
        for paragraph in body.findall(tag_paragraph):
            pieces = []
            # Only look at the direct children of runs: a <w:tab> inside
            # paragraph properties is a tab-stop definition, not a tab
            # character, and must not leak into the text.
            for run in paragraph.iter(tag_run):
                for child in run:
                    if child.tag == tag_text:
                        pieces.append(child.text or "")
                    elif child.tag == tag_tab:
                        pieces.append("\t")
                    elif child.tag in line_break_tags:
                        pieces.append("\n")
            paragraphs.append("".join(pieces))
        return "\n".join(paragraphs)
    except Exception as e:
        # Deliberate best-effort boundary: any failure (not a zip archive,
        # missing document part, malformed XML) yields an empty result.
        print("Failed to extract text from DOC:", e)
        return ""
|
|
|
191 |
description="Ask a question and get an answer from the AI assistant."
|
192 |
)
|
193 |
|
194 |
+
# Start serving the Gradio interface defined above.
iface.launch()
|