Spaces:

Luciferalive
/

goosev9

Sleeping

Luciferalive committed on Jun 17, 2024

Commit

2d6c283

verified ·

1 Parent(s): fdd610d

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -13,6 +13,7 @@ from langchain_community.embeddings import SentenceTransformerEmbeddings
 from groq import Groq
 import gradio as gr
 import requests
 # Ensure the Tesseract OCR path is set correctly
 pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
@@ -22,9 +23,11 @@ GROQ_API_KEY = "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"
 def extract_text_from_doc(doc_content):
     """Extract text from DOC file content."""
     try:
-        doc = Document(io.BytesIO(doc_content))
-        extracted_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
-        return extracted_text
     except Exception as e:
         print("Failed to extract text from DOC:", e)
         return ""
@@ -188,4 +191,4 @@ iface = gr.Interface(
     description="Ask a question and get an answer from the AI assistant."
 )
-iface.launch()

 from groq import Groq
 import gradio as gr
 import requests
+from zipfile import ZipFile
 # Ensure the Tesseract OCR path is set correctly
 pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
 def extract_text_from_doc(doc_content):
     """Extract text from DOC file content."""
     try:
+        with ZipFile(io.BytesIO(doc_content)) as zip_file:
+            xml_content = zip_file.read('word/document.xml')
+            doc = Document(io.BytesIO(xml_content))
+            extracted_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
+            return extracted_text
     except Exception as e:
         print("Failed to extract text from DOC:", e)
         return ""
     description="Ask a question and get an answer from the AI assistant."
 )
+iface.launch()