sachin committed on
Commit
a10c7a6
·
1 Parent(s): 6ea2bcc
Files changed (1) hide show
  1. app.py +69 -30
app.py CHANGED
@@ -1,52 +1,91 @@
1
  import gradio as gr
2
  import requests
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  # Function to send the POST request to the API
5
  def extract_text_from_pdf(pdf_file, page_number):
 
 
 
 
 
 
 
 
6
  # API endpoint
7
  url = "http://209.20.158.215:7861/extract-text-eng/"
8
 
9
  # Prepare the payload
10
- files = {
11
- "file": (pdf_file.name, pdf_file, "application/pdf")
12
- }
13
- data = {
14
- "page_number": str(page_number),
15
- "src_lang": "eng_Latn",
16
- "tgt_lang": "eng_Latn",
17
- "prompt": "describe the image"
18
- }
19
-
20
- # Headers
21
- headers = {
22
- "accept": "application/json"
23
- }
24
 
25
- try:
26
- # Send the POST request
27
- response = requests.post(url, files=files, data=data, headers=headers)
28
-
29
- # Check if the request was successful
30
- if response.status_code == 200:
31
- return response.json().get("result", "No result returned from API")
32
- else:
33
- return f"Error: {response.status_code} - {response.text}"
34
- except Exception as e:
35
- return f"Error: Failed to connect to the API - {str(e)}"
 
 
 
 
 
 
 
36
 
37
  # Gradio interface
38
- with gr.Blocks(title="PDF Text Extraction") as demo:
39
- gr.Markdown("# Extract Text from PDF and Describe Content")
 
 
 
 
 
 
40
 
41
  # Input components
42
- pdf_input = gr.File(label="Upload PDF File", file_types=[".pdf"])
43
  page_number_input = gr.Number(label="Page Number", value=1, precision=0, minimum=1)
44
 
45
  # Submit button
46
- submit_button = gr.Button("Extract and Describe")
47
 
48
  # Output component
49
- output_text = gr.Textbox(label="API Response", lines=10)
 
 
 
 
50
 
51
  # Connect the button to the function
52
  submit_button.click(
 
1
  import gradio as gr
2
  import requests
3
+ from PyPDF2 import PdfReader
4
+ import io
5
+ import os
6
+
7
+ # Function to validate PDF file
8
def is_valid_pdf(file_path):
    """Check that *file_path* names a parseable PDF with at least one page.

    Args:
        file_path: Filesystem path of the uploaded file, as a ``str`` or any
            ``os.PathLike`` object (e.g. ``pathlib.Path``).

    Returns:
        A ``(valid, message)`` tuple. ``valid`` is ``True`` only when the file
        could be opened, parsed by ``PdfReader`` and contains one or more
        pages; ``message`` is a human-readable explanation in either case.
    """
    # Only genuine path values are accepted: open() would interpret an int as
    # a raw file descriptor, and a directory can never be a PDF.
    if not isinstance(file_path, (str, os.PathLike)) or not os.path.isfile(file_path):
        return False, "Invalid PDF: File path is not valid"

    try:
        with open(file_path, "rb") as f:
            # Count the pages once; the value is reused for the check and the message.
            page_count = len(PdfReader(f).pages)
    except Exception as e:
        # Any failure while opening or parsing is reported to the caller as an
        # invalid PDF instead of propagating into the UI callback.
        return False, f"Invalid PDF: {str(e)}"

    if page_count > 0:
        return True, f"Valid PDF with {page_count} pages"
    return False, "Invalid PDF: No pages found"
21
 
22
  # Function to send the POST request to the API
23
def extract_text_from_pdf(pdf_file, page_number, timeout=60):
    """Send one page of an uploaded PDF to the description API and return its reply.

    Args:
        pdf_file: Path of the uploaded PDF (the Gradio file input is configured
            with ``type="filepath"``). A falsy value means nothing was uploaded.
        page_number: Page to describe; sent to the API as a string.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout a stalled server would block this callback indefinitely.

    Returns:
        The ``page_content`` field of the API's JSON reply, or a string starting
        with ``"Error:"`` describing what went wrong. This function reports
        failures through its return value so the UI can display them.
    """
    if not pdf_file:
        return "Error: No file uploaded. Please upload a PDF file."

    # Validate the PDF locally before spending a network round trip on it.
    valid, message = is_valid_pdf(pdf_file)
    if not valid:
        return f"Error: {message}. Please upload a valid PDF file or repair the current one."

    # NOTE(review): plain-HTTP endpoint on a hard-coded IP; the uploaded PDF is
    # sent unencrypted. Consider moving this to configuration / HTTPS.
    url = "http://209.20.158.215:7861/extract-text-eng/"

    data = {
        "page_number": str(page_number),
        "src_lang": "eng_Latn",
        "tgt_lang": "eng_Latn",
        "prompt": "describe the image",
    }
    headers = {"accept": "application/json"}

    try:
        # The request is issued inside the ``with`` block because requests
        # reads the file handle while sending the multipart body.
        with open(pdf_file, "rb") as f:
            files = {"file": ("uploaded.pdf", f, "application/pdf")}
            response = requests.post(
                url,
                files=files,
                data=data,
                headers=headers,
                timeout=timeout,
            )
    except (OSError, requests.RequestException) as e:
        return f"Error: Failed to connect to the API - {str(e)}"

    if response.status_code != 200:
        return f"Error: {response.status_code} - {response.text}"

    try:
        result = response.json()
    except ValueError:
        # A 200 reply with a non-JSON body is a server-side problem, not a
        # connection failure, so it gets its own message.
        return "Error: API returned a response that is not valid JSON"

    if not isinstance(result, dict):
        return "No description returned from API"
    return result.get("page_content", "No description returned from API")
65
 
66
  # Gradio interface
67
+ with gr.Blocks(title="PDF Content Description") as demo:
68
+ gr.Markdown("# PDF Content Description Extractor")
69
+ gr.Markdown(
70
+ """
71
+ Upload a PDF file (e.g., Dhwani-AI-Pitch-Europe.pdf) and specify a page number to extract a description of its content.
72
+ The API will analyze the page and return a textual description, such as details about images, text, or layout.
73
+ """
74
+ )
75
 
76
  # Input components
77
+ pdf_input = gr.File(label="Upload PDF File", file_types=[".pdf"], type="filepath")
78
  page_number_input = gr.Number(label="Page Number", value=1, precision=0, minimum=1)
79
 
80
  # Submit button
81
+ submit_button = gr.Button("Extract Description")
82
 
83
  # Output component
84
+ output_text = gr.Textbox(
85
+ label="Content Description",
86
+ lines=10,
87
+ placeholder="The API response will appear here, describing the content of the specified PDF page."
88
+ )
89
 
90
  # Connect the button to the function
91
  submit_button.click(