anupam210 and themeetjani committed
Commit 99c2b2d · 0 Parent(s)

Duplicate from ai-based/azure_ocr

Co-authored-by: Meet Jani <[email protected]>

Files changed (7):
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. app.py +67 -0
  4. extract_text.py +47 -0
  5. packages.txt +1 -0
  6. preprocessing_images.py +91 -0
  7. requirements.txt +25 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Azure Ocr
+ emoji: 🏢
+ colorFrom: yellow
+ colorTo: green
+ sdk: gradio
+ sdk_version: 3.24.1
+ app_file: app.py
+ pinned: false
+ license: other
+ duplicated_from: ai-based/azure_ocr
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,67 @@
+ import os
+ import openai
+ import gradio as gr
+ from azure.cognitiveservices.vision.computervision import ComputerVisionClient
+ from msrest.authentication import CognitiveServicesCredentials
+ from azure.storage.blob import BlobClient
+ # import utility functions
+ from preprocessing_images import preprocessing_function
+ from extract_text import azure_ocr
+
+ # Azure and OpenAI credentials come from environment variables
+ my_container = os.getenv("AZURE_CONTAINER")
+ subscription_key = os.getenv("SUB_KEY")
+ endpoint = os.getenv("AZURE_ENDPOINT")
+ connection_string = os.getenv("AZURE_CON_STRING")
+ openai.api_key = os.getenv("OPENAI_API_KEY")
+ computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))
+
+ def ocr_pdf(pdf_url):
+     # preprocess the PDF, upload the cleaned copy to blob storage, then OCR it
+     preprocessing_function(pdf_url)
+     my_blob = pdf_url.split('/')[-1]
+     blob = BlobClient.from_connection_string(conn_str=connection_string, container_name=my_container, blob_name=my_blob)
+     with open("answer_paper.pdf", "rb") as data:
+         blob.upload_blob(data, overwrite=True)
+     text = azure_ocr(blob.url, computervision_client)
+     return text.strip()
+
+ def classify_cause(incident_description):
+     # ask GPT-3 for the root cause of the incident
+     response = openai.Completion.create(
+         engine="text-davinci-003",
+         prompt=f"Identify the root cause from the below list:\nincident_description:{incident_description}\n",
+         temperature=0,
+         max_tokens=50,
+         n=1,
+         stop=None,
+     )
+     return response.choices[0].text.strip()
+
+ def classify_class(incident_description):
+     # ask GPT-3 to assign the incident to one of the known problem classes
+     response = openai.Completion.create(
+         engine="text-davinci-003",
+         prompt=f"Classify the following incident description into one of the given classes: Aircraft Autopilot Problem, Auxiliary Power Problem, Cabin Pressure Problem, Engine Problem, Fuel System Problem, Avionics Problem, Communications Problem, Electrical System Problem, Fire/Smoke Problem, Ground Service Problem, Hydraulic System Problem, Ice/Frost Problem, Landing Gear Problem, Maintenance Problem, Oxygen System Problem, other problem\nincident_description:{incident_description}\n",
+         temperature=0,
+         max_tokens=50,
+         n=1,
+         stop=None,
+     )
+     return response.choices[0].text.strip()
+
+ def aviation(pdf_url):
+     # OCR the report, then classify its main issue and its category
+     pdftext = ocr_pdf(pdf_url)
+     defect_class = classify_class(pdftext)
+     main_issue = classify_cause(pdftext)
+     # return in the same order as the output components below
+     return main_issue, defect_class
+
+ inputs = gr.Textbox(label="Link for aviation log reports")
+ outputs = [gr.Textbox(label="Main issue of the log report"),
+            gr.Textbox(label="Category of the log report")]
+
+ demo = gr.Interface(fn=aviation, inputs=inputs, outputs=outputs, title="ATA auto-classification using OCR and GPT-3")
+
+ if __name__ == "__main__":
+     demo.launch()
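
A minimal local smoke test for app.py might look like the sketch below. It assumes the AZURE_CONTAINER, SUB_KEY, AZURE_ENDPOINT, AZURE_CON_STRING, and OPENAI_API_KEY environment variables are set; the report URL is a hypothetical placeholder, not part of this commit.

# local_test.py -- hypothetical smoke test, not part of this commit
from app import aviation

# any publicly reachable PDF log report would do here
sample_url = "https://example.com/reports/incident_0001.pdf"
main_issue, defect_class = aviation(sample_url)
print("Main issue:", main_issue)
print("Category:", defect_class)
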
extract_text.py ADDED
@@ -0,0 +1,47 @@
+ import os
+ import time
+ from datetime import datetime
+ from azure.storage.blob import BlobClient
+ from msrest.authentication import CognitiveServicesCredentials
+ # importing azure packages
+ from azure.cognitiveservices.vision.computervision import ComputerVisionClient
+ from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
+ from preprocessing_images import preprocessing_function
+
+ # Azure credentials, read from the environment as in app.py
+ my_container = os.getenv("AZURE_CONTAINER")
+ subscription_key = os.getenv("SUB_KEY")
+ endpoint = os.getenv("AZURE_ENDPOINT")
+ connection_string = os.getenv("AZURE_CON_STRING")
+
+ # OCR extraction using the Azure Computer Vision Read API
+ def azure_ocr(pdf_url, computervision_client):
+     # submit the document, then poll until the async Read operation finishes
+     read_response = computervision_client.read(pdf_url, raw=True)
+     read_operation_location = read_response.headers["Operation-Location"]
+     operation_id = read_operation_location.split("/")[-1]
+     while True:
+         read_result = computervision_client.get_read_result(operation_id)
+         if read_result.status not in ['notStarted', 'running']:
+             break
+         time.sleep(1)  # avoid hammering the API while polling
+     words = []
+     if read_result.status == OperationStatusCodes.succeeded:
+         for text_result in read_result.analyze_result.read_results:
+             for line in text_result.lines:
+                 words.append(line.text)
+     return ' '.join(words)
+
+ def extract_text_from_url(test_pdf_url):
+     # preprocess, upload under a timestamped blob name, then OCR
+     preprocessing_function(test_pdf_url)
+     my_blob = 'test_clean_pdf' + datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
+     blob = BlobClient.from_connection_string(conn_str=connection_string, container_name=my_container, blob_name=my_blob)
+     with open("answer_paper.pdf", "rb") as data:
+         blob.upload_blob(data)
+     computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))
+     text = azure_ocr(blob.url, computervision_client).lower()
+     # if the scan contains a "150 word"/"150 ward" marker, keep only what follows it
+     n = text.find("150 word")
+     if n > 0:
+         text = text[n + 10:]
+     elif text.find("150 ward") > 0:
+         nn = text.find("150 ward")
+         text = text[nn + 10:]
+     return text
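
A short, hedged usage sketch for this module; it assumes the same four Azure environment variables are set, and the URL is a placeholder.

# hypothetical example, not part of this commit
from extract_text import extract_text_from_url

text = extract_text_from_url("https://example.com/scans/answer_sheet.pdf")
print(text[:500])  # first 500 characters of the recognized text
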
packages.txt ADDED
@@ -0,0 +1 @@
+ poppler-utils
preprocessing_images.py ADDED
@@ -0,0 +1,91 @@
+ # importing packages
+ from pdf2image import convert_from_path
+ from fpdf import FPDF
+ import cv2
+ import numpy as np
+ import requests
+ from PIL import Image
+ from PIL import ImageEnhance
+
+ def boxcutter(img):
+     # crop the page to its dominant content box, if one exists
+     img = np.array(img)
+     height, width = img.shape[:2]
+     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # convert to grayscale
+     blurred = cv2.GaussianBlur(gray, (5, 5), 0)
+     T, threshInv = cv2.threshold(blurred, 230, 255, cv2.THRESH_BINARY_INV)
+     contours, hierarchy = cv2.findContours(threshInv, cv2.RETR_LIST,
+                                            cv2.CHAIN_APPROX_SIMPLE)
+     my_dict = {}  # bounding box -> area, for every contour
+     for cont in contours:
+         x, y, w, h = cv2.boundingRect(cont)
+         my_dict[(x, y, w, h)] = w * h
+     if len(my_dict) <= 2:
+         return img
+     # second-largest box; the largest is usually the page border itself
+     x, y, w, h = sorted(my_dict, key=my_dict.get)[-2]
+     roi = img[y:y + h, x:x + w]
+     height_r, width_r = roi.shape[:2]
+     # only crop when the box covers most of the page in both dimensions
+     if height_r > int(height * 0.6) and width_r > int(width * 0.6):
+         return roi
+     return img
+
+ def noise_removal(image):
+     # increase the contrast of the text
+     image = Image.fromarray(image)
+     enhancer = ImageEnhance.Contrast(image)
+     image = np.array(enhancer.enhance(2.5))
+     # noise removal using dilate, erode, morphology and median blur
+     kernel = np.ones((1, 1), np.uint8)
+     image = cv2.dilate(image, kernel, iterations=1)
+     image = cv2.erode(image, kernel, iterations=1)
+     image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel, iterations=1)
+     image = cv2.medianBlur(image, 3)
+     # invert so text is foreground, thin then thicken it, invert back
+     image = cv2.bitwise_not(image)
+     image = cv2.erode(image, kernel, iterations=3)
+     image = cv2.dilate(image, kernel, iterations=3)
+     image = cv2.bitwise_not(image)
+     return image
+
+ def preprocessing_function(url):
+     # download the PDF and convert each page to an image
+     response = requests.get(url)
+     with open('metadata.pdf', 'wb') as f:
+         f.write(response.content)
+     images = convert_from_path('metadata.pdf', 200)
+     for i in range(len(images)):
+         # crop each page, then remove red marks by masking red hues in HSV space
+         img = boxcutter(images[i])
+         image = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+         img_hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
+         lower_red = np.array([161, 50, 50])
+         upper_red = np.array([189, 255, 255])
+         mask1 = cv2.inRange(img_hsv, lower_red, upper_red)
+         image[np.where(mask1 == 255)] = 255
+         # use the noise removal function to denoise and enhance the page
+         img_bw = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+         no_noise = noise_removal(img_bw)
+         cv2.imwrite("no_noise" + str(i) + ".jpg", no_noise)
+     # save the final preprocessed pages as a single PDF
+     pdf = FPDF()
+     for j in range(len(images)):
+         pdf.add_page()
+         pdf.image("no_noise" + str(j) + ".jpg", 0, 0, 200, 300)
+     pdf.output("answer_paper.pdf", "F")
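
A minimal usage sketch, again with a placeholder URL; preprocessing_function needs no Azure credentials and writes its output to answer_paper.pdf in the working directory.

# hypothetical example, not part of this commit
from preprocessing_images import preprocessing_function

preprocessing_function("https://example.com/scans/answer_sheet.pdf")
# the cleaned, denoised document is now available as answer_paper.pdf
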
requirements.txt ADDED
@@ -0,0 +1,25 @@
+ absl-py==1.0.0
+ astunparse==1.6.3
+ azure-cognitiveservices-vision-computervision==0.9.0
+ azure-common==1.1.27
+ azure-core==1.21.1
+ azure-storage-blob==12.9.0
+ flatbuffers==2.0
+ fpdf==1.7.2
+ h5py==3.6.0
+ jsonify==0.5
+ joblib==1.1.0
+ numpy==1.22.1
+ msrest==0.6.21
+ nltk==3.7
+ opencv-python-headless==4.6.0.66
+ Pillow==9.2.0
+ pdf2image==1.16.0
+ PyPDF2==2.9.0
+ pandas==1.4.3
+ requests==2.28.1
+ scikit-learn==1.1.1
+ scipy==1.8.1
+ urllib3==1.26.11
+ gradio==3.26.0
+ openai==0.27.8