ikraamkb committed on
Commit
2852c90
·
verified ·
1 Parent(s): 73efb2c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -32
app.py CHANGED
@@ -1,6 +1,6 @@
1
  from fastapi import FastAPI, File, UploadFile
2
- import pdfplumber
3
- import docx
4
  import openpyxl
5
  from pptx import Presentation
6
  import torch
@@ -16,11 +16,8 @@ import easyocr
16
  # Initialize FastAPI
17
  app = FastAPI()
18
 
19
- # Load AI Model for Question Answering (Proper Extractive QA Model)
20
- qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
21
-
22
- # Initialize Translator for Multilingual Support
23
- translator = pipeline("translation", model="facebook/m2m100_418M")
24
 
25
  # Load Pretrained Object Detection Model (if needed)
26
  model = fasterrcnn_resnet50_fpn(pretrained=True)
@@ -48,25 +45,21 @@ def truncate_text(text, max_tokens=450):
48
  words = text.split()
49
  return " ".join(words[:max_tokens])
50
 
51
- # Text Extraction Functions
52
  def extract_text_from_pdf(pdf_file):
53
- text = ""
54
  try:
55
- with pdfplumber.open(pdf_file) as pdf:
56
- for page in pdf.pages:
57
- page_text = page.extract_text()
58
- if page_text:
59
- text += page_text + "\n"
60
  except Exception as e:
61
  return f"Error reading PDF: {str(e)}"
62
- return text.strip() if text else "No text found."
63
 
64
- def extract_text_from_docx(docx_file):
65
  try:
66
- doc = docx.Document(docx_file)
67
- return "\n".join([para.text for para in doc.paragraphs])
68
  except Exception as e:
69
- return f"Error reading DOCX: {str(e)}"
70
 
71
  def extract_text_from_pptx(pptx_file):
72
  try:
@@ -99,9 +92,6 @@ def extract_text_from_image(image_file):
99
  result = reader.readtext(np.array(image))
100
  return " ".join([res[1] for res in result]) if result else "No text found."
101
 
102
- def translate_text(text, target_lang="en"):
103
- return translator(text, src_lang="auto", tgt_lang=target_lang)[0]["translation_text"]
104
-
105
  # Function to answer questions based on document content
106
  def answer_question_from_document(file, question):
107
  validation_error = validate_file_type(file)
@@ -111,10 +101,8 @@ def answer_question_from_document(file, question):
111
  file_ext = file.name.split(".")[-1].lower()
112
  if file_ext == "pdf":
113
  text = extract_text_from_pdf(file)
114
- elif file_ext == "docx":
115
- text = extract_text_from_docx(file)
116
- elif file_ext == "pptx":
117
- text = extract_text_from_pptx(file)
118
  elif file_ext == "xlsx":
119
  text = extract_text_from_excel(file)
120
  else:
@@ -123,22 +111,20 @@ def answer_question_from_document(file, question):
123
  if not text:
124
  return "No text extracted from the document."
125
 
126
- text = translate_text(text) # Translate non-English text to English
127
  truncated_text = truncate_text(text)
128
- response = qa_pipeline({"question": question, "context": truncated_text})
129
 
130
- return response["answer"]
131
 
132
  def answer_question_from_image(image, question):
133
  image_text = extract_text_from_image(image)
134
  if not image_text:
135
  return "No meaningful content detected in the image."
136
 
137
- image_text = translate_text(image_text) # Translate non-English text to English
138
  truncated_text = truncate_text(image_text)
139
- response = qa_pipeline({"question": question, "context": truncated_text})
140
 
141
- return response["answer"]
142
 
143
  # Gradio UI for Document & Image QA
144
  doc_interface = gr.Interface(
 
1
  from fastapi import FastAPI, File, UploadFile
2
+ import fitz # PyMuPDF for PDF parsing
3
+ from tika import parser # Apache Tika for document parsing
4
  import openpyxl
5
  from pptx import Presentation
6
  import torch
 
16
  # Initialize FastAPI
17
  app = FastAPI()
18
 
19
+ # Load AI Model for Question Answering (DeepSeek-V2-Chat)
20
+ qa_pipeline = pipeline("text-generation", model="deepseek-ai/DeepSeek-V2-Chat")
 
 
 
21
 
22
  # Load Pretrained Object Detection Model (if needed)
23
  model = fasterrcnn_resnet50_fpn(pretrained=True)
 
45
  words = text.split()
46
  return " ".join(words[:max_tokens])
47
 
48
+ # Document Text Extraction Functions
49
  def extract_text_from_pdf(pdf_file):
 
50
  try:
51
+ doc = fitz.open(pdf_file)
52
+ text = "\n".join([page.get_text("text") for page in doc])
53
+ return text if text else "No text found."
 
 
54
  except Exception as e:
55
  return f"Error reading PDF: {str(e)}"
 
56
 
57
+ def extract_text_with_tika(file):
58
  try:
59
+ parsed = parser.from_buffer(file)
60
+ return parsed.get("content", "No text found.").strip()
61
  except Exception as e:
62
+ return f"Error reading document: {str(e)}"
63
 
64
  def extract_text_from_pptx(pptx_file):
65
  try:
 
92
  result = reader.readtext(np.array(image))
93
  return " ".join([res[1] for res in result]) if result else "No text found."
94
 
 
 
 
95
  # Function to answer questions based on document content
96
  def answer_question_from_document(file, question):
97
  validation_error = validate_file_type(file)
 
101
  file_ext = file.name.split(".")[-1].lower()
102
  if file_ext == "pdf":
103
  text = extract_text_from_pdf(file)
104
+ elif file_ext in ["docx", "pptx"]:
105
+ text = extract_text_with_tika(file)
 
 
106
  elif file_ext == "xlsx":
107
  text = extract_text_from_excel(file)
108
  else:
 
111
  if not text:
112
  return "No text extracted from the document."
113
 
 
114
  truncated_text = truncate_text(text)
115
+ response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
116
 
117
+ return response[0]["generated_text"]
118
 
119
  def answer_question_from_image(image, question):
120
  image_text = extract_text_from_image(image)
121
  if not image_text:
122
  return "No meaningful content detected in the image."
123
 
 
124
  truncated_text = truncate_text(image_text)
125
+ response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
126
 
127
+ return response[0]["generated_text"]
128
 
129
  # Gradio UI for Document & Image QA
130
  doc_interface = gr.Interface(