Update app.py
app.py CHANGED
@@ -1,6 +1,6 @@
 from fastapi import FastAPI, File, UploadFile
-import …
-import …
+import fitz  # PyMuPDF for PDF parsing
+from tika import parser  # Apache Tika for document parsing
 import openpyxl
 from pptx import Presentation
 import torch
@@ -16,11 +16,8 @@ import easyocr
 # Initialize FastAPI
 app = FastAPI()
 
-# Load AI Model for Question Answering (…
-qa_pipeline = pipeline("…
-
-# Initialize Translator for Multilingual Support
-translator = pipeline("translation", model="facebook/m2m100_418M")
+# Load AI Model for Question Answering (DeepSeek-V2-Chat)
+qa_pipeline = pipeline("text-generation", model="deepseek-ai/DeepSeek-V2-Chat")
 
 # Load Pretrained Object Detection Model (if needed)
 model = fasterrcnn_resnet50_fpn(pretrained=True)
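Note on this hunk: deepseek-ai/DeepSeek-V2-Chat ships custom modeling code and is a very large MoE model, so loading it through a plain transformers pipeline generally needs trust_remote_code=True and far more GPU memory than a default Space provides. A minimal sketch of a guarded load; the dtype, device_map, and fallback model are illustrative assumptions, not part of this commit:

import torch
from transformers import pipeline

def load_qa_pipeline():
    try:
        # DeepSeek-V2 repositories rely on custom code, hence trust_remote_code.
        return pipeline(
            "text-generation",
            model="deepseek-ai/DeepSeek-V2-Chat",
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
    except Exception as exc:
        # Hypothetical fallback so the Space still starts on small hardware.
        print(f"Falling back to a small model: {exc}")
        return pipeline("text-generation", model="distilgpt2")

qa_pipeline = load_qa_pipeline()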
@@ -48,25 +45,21 @@ def truncate_text(text, max_tokens=450):
     words = text.split()
     return " ".join(words[:max_tokens])
 
-# Text Extraction Functions
+# Document Text Extraction Functions
 def extract_text_from_pdf(pdf_file):
-    text = ""
     try:
-        …
-        …
-        …
-        if page_text:
-            text += page_text + "\n"
+        doc = fitz.open(pdf_file)
+        text = "\n".join([page.get_text("text") for page in doc])
+        return text if text else "No text found."
     except Exception as e:
         return f"Error reading PDF: {str(e)}"
-    return text.strip() if text else "No text found."
 
-def …
+def extract_text_with_tika(file):
     try:
-        …
-        return "…
+        parsed = parser.from_buffer(file)
+        return parsed.get("content", "No text found.").strip()
     except Exception as e:
-        return f"Error reading …
+        return f"Error reading document: {str(e)}"
 
 def extract_text_from_pptx(pptx_file):
     try:
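Review note on extract_text_from_pdf: fitz.open() accepts a filesystem path or raw bytes passed via its stream argument. If Gradio hands the handler a file object rather than a temp-file path, opening by stream is the safer route. A minimal sketch, assuming either form may arrive; the helper name is hypothetical:

import fitz  # PyMuPDF

def read_pdf_text(pdf_input):
    # Hypothetical helper: open from a path when given a string,
    # otherwise read the file-like object and open from bytes.
    if isinstance(pdf_input, str):
        doc = fitz.open(pdf_input)
    else:
        data = pdf_input.read() if hasattr(pdf_input, "read") else pdf_input
        doc = fitz.open(stream=data, filetype="pdf")
    return "\n".join(page.get_text("text") for page in doc) or "No text found."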
@@ -99,9 +92,6 @@ def extract_text_from_image(image_file):
     result = reader.readtext(np.array(image))
     return " ".join([res[1] for res in result]) if result else "No text found."
 
-def translate_text(text, target_lang="en"):
-    return translator(text, src_lang="auto", tgt_lang=target_lang)[0]["translation_text"]
-
 # Function to answer questions based on document content
 def answer_question_from_document(file, question):
     validation_error = validate_file_type(file)
@@ -111,10 +101,8 @@ def answer_question_from_document(file, question):
     file_ext = file.name.split(".")[-1].lower()
     if file_ext == "pdf":
         text = extract_text_from_pdf(file)
-    elif file_ext …
-        text = …
-    elif file_ext == "pptx":
-        text = extract_text_from_pptx(file)
+    elif file_ext in ["docx", "pptx"]:
+        text = extract_text_with_tika(file)
     elif file_ext == "xlsx":
         text = extract_text_from_excel(file)
     else:
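Two review notes on this routing change: pptx files now go through Apache Tika even though extract_text_from_pptx remains defined further down, and parser.from_buffer() can return a dict whose "content" key is present but None, in which case the .strip() in extract_text_with_tika would raise. A minimal None-safe sketch; the helper name is hypothetical:

from tika import parser

def read_with_tika(file_obj):
    # Hypothetical None-safe variant: Tika reports "content": None for
    # documents it cannot extract any text from.
    data = file_obj.read() if hasattr(file_obj, "read") else file_obj
    parsed = parser.from_buffer(data)
    content = parsed.get("content") or ""
    return content.strip() or "No text found."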
@@ -123,22 +111,20 @@ def answer_question_from_document(file, question):
     if not text:
         return "No text extracted from the document."
 
-    text = translate_text(text)  # Translate non-English text to English
     truncated_text = truncate_text(text)
-    response = qa_pipeline(…
+    response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
 
-    return response["…
+    return response[0]["generated_text"]
 
 def answer_question_from_image(image, question):
     image_text = extract_text_from_image(image)
     if not image_text:
         return "No meaningful content detected in the image."
 
-    image_text = translate_text(image_text)  # Translate non-English text to English
     truncated_text = truncate_text(image_text)
-    response = qa_pipeline(…
+    response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
 
-    return response["…
+    return response[0]["generated_text"]
 
 # Gradio UI for Document & Image QA
 doc_interface = gr.Interface(
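One last note on the new qa_pipeline calls: a transformers text-generation pipeline returns a list of dicts, and generated_text echoes the prompt itself unless return_full_text=False is passed; answer length is also bounded by max_new_tokens. A minimal sketch of how the call could be tightened; the prompt wording and token limit are illustrative, not from this commit:

def generate_answer(question, context):
    # Illustrative parameters: return_full_text=False drops the echoed prompt,
    # max_new_tokens caps the length of the generated answer.
    prompt = f"Question: {question}\nContext: {context}\nAnswer:"
    result = qa_pipeline(prompt, max_new_tokens=256, return_full_text=False)
    return result[0]["generated_text"].strip()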