Dileep7729 committed on
Commit
bd746ed
·
verified ·
1 Parent(s): 0cf955c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -68
app.py CHANGED
@@ -1,68 +1,75 @@
1
- import gradio as gr
2
- import torch
3
- from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
4
- import pytesseract
5
-
6
- # Set the Tesseract executable path (for Windows users)
7
- import pytesseract
8
- pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
9
-
10
-
11
- # Load the model and processor
12
- processor = LayoutLMv3Processor.from_pretrained("quadranttechnologies/Table_OCR")
13
- model = LayoutLMv3ForTokenClassification.from_pretrained("quadranttechnologies/Table_OCR")
14
- model.eval()
15
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
- model.to(device)
17
-
18
- def process_image(image):
19
- try:
20
- # Preprocess the image using the processor
21
- encoding = processor(image, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
22
-
23
- # Move inputs to the same device as the model
24
- encoding = {key: val.to(device) for key, val in encoding.items()}
25
-
26
- # Perform inference
27
- with torch.no_grad():
28
- outputs = model(**encoding)
29
- predictions = torch.argmax(outputs.logits, dim=-1)
30
-
31
- # Extract input IDs, bounding boxes, and predicted labels
32
- words = encoding["input_ids"]
33
- bboxes = encoding["bbox"]
34
- labels = predictions.squeeze().tolist()
35
-
36
- # Format output as JSON
37
- structured_output = []
38
- for word_id, bbox, label in zip(words.squeeze().tolist(), bboxes.squeeze().tolist(), labels):
39
- # Decode the word ID to text
40
- word = processor.tokenizer.decode([word_id]).strip()
41
- if word: # Avoid adding empty words
42
- structured_output.append({
43
- "word": word,
44
- "bounding_box": bbox,
45
- "label": model.config.id2label[label] # Convert label ID to label name
46
- })
47
-
48
- return structured_output
49
-
50
- except Exception as e:
51
- return {"error": str(e)} # Return error details if any issue occurs
52
-
53
- # Define the Gradio interface
54
- interface = gr.Interface(
55
- fn=process_image,
56
- inputs=gr.Image(type="pil"), # Accepts image input
57
- outputs="json", # Outputs JSON structure
58
- title="Table OCR",
59
- description="Upload an image (e.g., receipt or document) to extract structured information in JSON format."
60
- )
61
-
62
- # Launch the app
63
- if __name__ == "__main__":
64
- interface.launch(share=True)
65
-
66
-
67
-
68
-
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
4
+ import pytesseract
5
+
6
+ import pytesseract
7
+
8
+ # Explicitly set the Tesseract path
9
+ # For Hugging Face Spaces, set this to the default Linux path
10
+ pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
11
+
12
+ # For local development on Windows
13
+ # Uncomment the line below if running locally on Windows
14
+ # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
15
+
16
+
17
+
18
+ # Load the model and processor
19
+ processor = LayoutLMv3Processor.from_pretrained("quadranttechnologies/Table_OCR")
20
+ model = LayoutLMv3ForTokenClassification.from_pretrained("quadranttechnologies/Table_OCR")
21
+ model.eval()
22
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
+ model.to(device)
24
+
25
+ def process_image(image):
26
+ try:
27
+ # Preprocess the image using the processor
28
+ encoding = processor(image, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
29
+
30
+ # Move inputs to the same device as the model
31
+ encoding = {key: val.to(device) for key, val in encoding.items()}
32
+
33
+ # Perform inference
34
+ with torch.no_grad():
35
+ outputs = model(**encoding)
36
+ predictions = torch.argmax(outputs.logits, dim=-1)
37
+
38
+ # Extract input IDs, bounding boxes, and predicted labels
39
+ words = encoding["input_ids"]
40
+ bboxes = encoding["bbox"]
41
+ labels = predictions.squeeze().tolist()
42
+
43
+ # Format output as JSON
44
+ structured_output = []
45
+ for word_id, bbox, label in zip(words.squeeze().tolist(), bboxes.squeeze().tolist(), labels):
46
+ # Decode the word ID to text
47
+ word = processor.tokenizer.decode([word_id]).strip()
48
+ if word: # Avoid adding empty words
49
+ structured_output.append({
50
+ "word": word,
51
+ "bounding_box": bbox,
52
+ "label": model.config.id2label[label] # Convert label ID to label name
53
+ })
54
+
55
+ return structured_output
56
+
57
+ except Exception as e:
58
+ return {"error": str(e)} # Return error details if any issue occurs
59
+
60
+ # Define the Gradio interface
61
+ interface = gr.Interface(
62
+ fn=process_image,
63
+ inputs=gr.Image(type="pil"), # Accepts image input
64
+ outputs="json", # Outputs JSON structure
65
+ title="Table OCR",
66
+ description="Upload an image (e.g., receipt or document) to extract structured information in JSON format."
67
+ )
68
+
69
+ # Launch the app
70
+ if __name__ == "__main__":
71
+ interface.launch(share=True)
72
+
73
+
74
+
75
+