Commit · 99c2b2d
Duplicate from ai-based/azure_ocr
Co-authored-by: Meet Jani <[email protected]>
- .gitattributes +34 -0
- README.md +14 -0
- app.py +67 -0
- extract_text.py +47 -0
- packages.txt +1 -0
- preprocessing_images.py +91 -0
- requirements.txt +25 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,14 @@
+---
+title: Azure Ocr
+emoji: 🏢
+colorFrom: yellow
+colorTo: green
+sdk: gradio
+sdk_version: 3.24.1
+app_file: app.py
+pinned: false
+license: other
+duplicated_from: ai-based/azure_ocr
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,67 @@
+import os
+import openai
+import gradio as gr
+from azure.cognitiveservices.vision.computervision import ComputerVisionClient
+from msrest.authentication import CognitiveServicesCredentials
+from azure.storage.blob import BlobClient
+# import utils functions
+from preprocessing_images import preprocessing_function
+from extract_text import azure_ocr
+
+my_container = os.getenv("AZURE_CONTAINER")
+subscription_key = os.getenv("SUB_KEY")
+endpoint = os.getenv("AZURE_ENDPOINT")
+connection_string = os.getenv("AZURE_CON_STRING")
+openai.api_key = os.getenv("OPENAI_API_KEY")
+computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))
+
+def ocr_pdf(pdf_url):
+    # download and clean the PDF, upload it to blob storage, then OCR it
+    preprocessing_function(pdf_url)
+    my_blob = pdf_url.split('/')[-1]
+    blob = BlobClient.from_connection_string(conn_str=connection_string, container_name=my_container, blob_name=my_blob)
+    with open("answer_paper.pdf", "rb") as data:
+        blob.upload_blob(data, overwrite=True)
+    text = azure_ocr(blob.url, computervision_client)
+    return text.strip()
+
+def classify_cause(incident_description):
+    response = openai.Completion.create(
+        engine="text-davinci-003",
+        prompt=f"Identify the root cause from the below list:\nincident_description:{incident_description}\n",
+        temperature=0,
+        max_tokens=50,
+        n=1,
+        stop=None,
+        # timeout=15,
+    )
+    classification = response.choices[0].text.strip()
+    return classification
+
+def classify_class(incident_description):
+    response = openai.Completion.create(
+        engine="text-davinci-003",
+        prompt=f"Classify the following incident description into one of the given classes: Aircraft Autopilot Problem, Auxiliary Power Problem, Cabin Pressure Problem, Engine Problem, Fuel System Problem, Avionics Problem, Communications Problem, Electrical System Problem, Fire/Smoke Problem, Ground Service Problem, Hydraulic System Problem, Ice/Frost Problem, Landing Gear Problem, Maintenance Problem, Oxygen System Problem, other problem\nincident_description:{incident_description}\n",
+        temperature=0,
+        max_tokens=50,
+        n=1,
+        stop=None,
+        # timeout=15,
+    )
+    classification = response.choices[0].text.strip()
+    return classification
+
+def avatiation(pdf_url):
+    pdftext = ocr_pdf(pdf_url)
+    defect_class = classify_class(pdftext)
+    main_issue = classify_cause(pdftext)
+    # return order matches the output labels below: main issue first, then category
+    return main_issue, defect_class
+
+inputs = gr.inputs.Textbox(label="Link for aviation log reports")
+
+outputs = [gr.outputs.Textbox(label="Main issue of the log report"),
+           gr.outputs.Textbox(label="Category of the log report")]
+
+demo = gr.Interface(fn=avatiation, inputs=inputs, outputs=outputs, title="ATA auto-classification using OCR and GPT-3")
+if __name__ == "__main__":
+    demo.launch()
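For reference, a minimal local smoke test of the pipeline, assuming the AZURE_CONTAINER, SUB_KEY, AZURE_ENDPOINT, AZURE_CON_STRING and OPENAI_API_KEY environment variables are set; the report URL below is a hypothetical placeholder, and the __name__ guard above keeps the import from launching the Gradio server:

# smoke_test.py - hypothetical standalone check, not part of this commit
from app import avatiation

main_issue, defect_class = avatiation("https://example.com/sample_log_report.pdf")
print("Main issue:", main_issue)
print("Category:", defect_class)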
extract_text.py
ADDED
@@ -0,0 +1,47 @@
+import os
+import time
+from datetime import datetime
+from preprocessing_images import preprocessing_function
+from azure.storage.blob import BlobClient
+from msrest.authentication import CognitiveServicesCredentials
+# importing azure packages
+from azure.cognitiveservices.vision.computervision import ComputerVisionClient
+from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
+
+# these settings were read from the environment in app.py but never defined
+# here; define them at module level so extract_text_from_url works standalone
+my_container = os.getenv("AZURE_CONTAINER")
+subscription_key = os.getenv("SUB_KEY")
+endpoint = os.getenv("AZURE_ENDPOINT")
+connection_string = os.getenv("AZURE_CON_STRING")
+
+# OCR extraction using the Azure Computer Vision Read API
+def azure_ocr(pdf_url, computervision_client):
+    try:
+        read_response = computervision_client.read(pdf_url, raw=True)
+        read_operation_location = read_response.headers["Operation-Location"]
+        operation_id = read_operation_location.split("/")[-1]
+        # the Read API is asynchronous: poll until the operation finishes
+        while True:
+            read_result = computervision_client.get_read_result(operation_id)
+            if read_result.status not in ['notStarted', 'running']:
+                break
+            time.sleep(1)  # avoid busy-waiting on the service
+        words = []
+        if read_result.status == OperationStatusCodes.succeeded:
+            for text_result in read_result.analyze_result.read_results:
+                for line in text_result.lines:
+                    words.append(line.text)
+        # joined outside the if so a failed read returns '' instead of raising NameError
+        all_text = ' '.join(words)
+        return all_text
+    except Exception:
+        raise
+
+def extract_text_from_url(test_pdf_url):
+    try:
+        preprocessing_function(test_pdf_url)
+        my_blob = 'test_clean_pdf' + datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
+        blob = BlobClient.from_connection_string(conn_str=connection_string, container_name=my_container, blob_name=my_blob)
+        with open("answer_paper.pdf", "rb") as data:
+            blob.upload_blob(data)
+        computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))
+        text = azure_ocr(blob.url, computervision_client)
+        text = text.lower()
+        # skip everything up to the "150 word"/"150 ward" marker if present
+        # (str.find returns -1 when missing, so compare against -1, not 0)
+        n = text.find("150 word")
+        if n != -1:
+            text = text[n+10:]
+        elif text.find("150 ward") != -1:
+            nn = text.find("150 ward")
+            text = text[nn+10:]
+        return text
+    except Exception:
+        raise
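The Read call in azure_ocr is asynchronous: computervision_client.read only queues the job, and the result has to be polled via the Operation-Location header. A bounded variant of that polling loop, as a sketch (the one-second interval and 60-attempt cap are arbitrary choices, not part of this commit):

import time

def poll_read_result(computervision_client, operation_id, max_attempts=60, interval=1.0):
    # poll until the Read operation leaves the queued/running states,
    # instead of looping forever as an unbounded loop can
    for _ in range(max_attempts):
        read_result = computervision_client.get_read_result(operation_id)
        if read_result.status not in ['notStarted', 'running']:
            return read_result
        time.sleep(interval)
    raise TimeoutError("Read operation %s did not finish in time" % operation_id)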
packages.txt
ADDED
@@ -0,0 +1 @@
+poppler-utils
preprocessing_images.py
ADDED
@@ -0,0 +1,91 @@
+# importing packages
+from pdf2image import convert_from_path
+from fpdf import FPDF
+import cv2
+import numpy as np
+import requests
+from PIL import Image
+from PIL import ImageEnhance
+
+def boxcutter(img):
+    # crop the page to its second-largest bounding box (the content area),
+    # falling back to the full image when no suitable box is found
+    img = np.array(img)
+    height, width = img.shape[:2]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # convert to grayscale
+    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
+    (T, threshInv) = cv2.threshold(blurred, 230, 255, cv2.THRESH_BINARY_INV)
+    contours, hierarchy = cv2.findContours(threshInv, cv2.RETR_LIST,
+                                           cv2.CHAIN_APPROX_SIMPLE)
+    my_dict = {}
+    for cont in contours:
+        x, y, w, h = cv2.boundingRect(cont)
+        area = w * h
+        nw = x, y, w, h
+        my_dict[nw] = area
+    if len(my_dict) <= 2:
+        return img
+    else:
+        x, y, w, h = sorted(my_dict, key=my_dict.get)[-2]
+        roi = img[y:y+h, x:x+w]
+        height_r, width_r = roi.shape[:2]
+        if height_r > int(height*0.6) and width_r > int(width*0.6):
+            return roi
+        else:
+            return img
+
+def noise_removal(image):
+    # increasing contrast of text
+    image = Image.fromarray(image)
+    enhancer = ImageEnhance.Contrast(image)
+    enhanced_image = enhancer.enhance(2.5)
+    image = np.array(enhanced_image)
+    # noise removal using dilate, erode, morphology and median blur
+    kernel = np.ones((1, 1), np.uint8)
+    image = cv2.dilate(image, kernel, iterations=1)
+    image = cv2.erode(image, kernel, iterations=1)
+    image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel, iterations=1)
+    image = cv2.medianBlur(image, 3)
+    image = cv2.bitwise_not(image)
+    image = cv2.erode(image, kernel, iterations=3)
+    image = cv2.bitwise_not(image)
+    image = cv2.bitwise_not(image)
+    image = cv2.dilate(image, kernel, iterations=3)
+    image = cv2.bitwise_not(image)
+    return image
+
+def preprocessing_function(url):
+    # reading and converting the pdf into image format
+    response = requests.get(url)
+    with open('metadata.pdf', 'wb') as f:
+        f.write(response.content)
+    images = convert_from_path('metadata.pdf', 200)
+    # removing red color from the answer paper
+    for i in range(len(images)):
+        img = boxcutter(images[i])
+        image = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+        img_hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
+        lower_red = np.array([161, 50, 50])
+        upper_red = np.array([189, 255, 255])
+        mask1 = cv2.inRange(img_hsv, lower_red, upper_red)
+        image[np.where(mask1 == 255)] = 255
+        # using the noise removal function to clean up and enhance the pages
+        img_bw = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        no_noise = noise_removal(img_bw)
+        cv2.imwrite("no_noise" + str(i) + ".jpg", no_noise)
+    # saving the final preprocessed images as a pdf
+    pdf = FPDF()
+    for j in range(len(images)):
+        pdf.add_page()
+        pdf.image("no_noise" + str(j) + ".jpg", 0, 0, 200, 300)
+    pdf.output("answer_paper.pdf", "F")
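To sanity-check the preprocessing step on its own, a minimal sketch assuming poppler-utils is installed (see packages.txt) and using a hypothetical PDF URL; the cleaned, denoised pages are written to answer_paper.pdf in the working directory:

from preprocessing_images import preprocessing_function

# hypothetical URL; any reachable scanned PDF will do
preprocessing_function("https://example.com/scanned_answer_paper.pdf")
print("wrote answer_paper.pdf")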
requirements.txt
ADDED
@@ -0,0 +1,25 @@
+absl-py==1.0.0
+astunparse==1.6.3
+azure-cognitiveservices-vision-computervision==0.9.0
+azure-common==1.1.27
+azure-core==1.21.1
+azure-storage-blob==12.9.0
+flatbuffers==2.0
+fpdf==1.7.2
+h5py==3.6.0
+jsonify==0.5
+joblib==1.1.0
+numpy==1.22.1
+msrest==0.6.21
+nltk==3.7
+opencv-python-headless==4.6.0.66
+Pillow==9.2.0
+pdf2image==1.16.0
+PyPDF2==2.9.0
+pandas==1.4.3
+requests==2.28.1
+scikit-learn==1.1.1
+scipy==1.8.1
+urllib3==1.26.11
+gradio==3.26.0
+openai==0.27.8