Genzo1010 committed on
Commit
125b268
·
verified ·
1 Parent(s): e832cbb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -26
app.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  from fastapi import FastAPI, File, UploadFile
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from fastapi.middleware.gzip import GZipMiddleware
@@ -16,41 +19,47 @@ app.add_middleware(
16
  allow_methods=["*"],
17
  allow_headers=["*"]
18
  )
 
19
 
20
  # Initialize models once at startup
21
  ocr_model = ocr_predictor(pretrained=True)
22
- paddle_ocr = PaddleOCR(lang='en', use_angle_cls=True)
23
-
24
- def ocr_with_doctr(file):
25
- text_output = ''
26
- doc = DocumentFile.from_pdf(file)
27
- ocr_model = ocr_predictor(pretrained=True)
28
- result = ocr_model(doc)
29
- for page in result.pages:
30
- for block in page.blocks:
31
- for line in block.lines:
32
- text_output += " ".join([word.value for word in line.words]) + "\n"
33
- return text_output
34
-
35
- def ocr_with_paddle(img):
36
- finaltext = ''
37
- result = paddle_ocr.ocr(img)
38
- for i in range(len(result[0])):
39
- text = result[0][i][1][0]
40
- finaltext += ' ' + text
41
- return finaltext
42
-
43
- def generate_text_from_image(img):
44
- return ocr_with_paddle(img)
 
 
 
 
45
 
46
  @app.post("/ocr/")
47
  async def perform_ocr(file: UploadFile = File(...)):
48
  file_bytes = await file.read()
49
  if file.filename.endswith('.pdf'):
50
- text_output = ocr_with_doctr(io.BytesIO(file_bytes))
51
  else:
52
- img = np.array(Image.open(io.BytesIO(file_bytes)))
53
- text_output = generate_text_from_image(img)
 
54
  return {"ocr_text": text_output}
55
 
56
  @app.get("/test/")
 
1
+ import os
2
+ import asyncio
3
+ from asyncio import BoundedSemaphore
4
  from fastapi import FastAPI, File, UploadFile
5
  from fastapi.middleware.cors import CORSMiddleware
6
  from fastapi.middleware.gzip import GZipMiddleware
 
19
  allow_methods=["*"],
20
  allow_headers=["*"]
21
  )
22
# Gzip responses; minimum_size=1000 is the size threshold passed to the middleware.
app.add_middleware(GZipMiddleware, minimum_size=1000)

# Build both OCR engines a single time at import so requests share them.
ocr_model = ocr_predictor(pretrained=True)
paddle_ocr = PaddleOCR(
    lang='en',
    use_angle_cls=True,
    use_gpu=False,
)

# Shared by the OCR coroutines: at most 10 may hold it at once.
semaphore = BoundedSemaphore(10)
30
+
31
async def ocr_with_doctr(file):
    """Extract the text of a PDF with the shared docTR predictor.

    The PDF given by ``file`` is loaded with ``DocumentFile.from_pdf`` and run
    through the module-level ``ocr_model`` while ``semaphore`` is held.

    Returns:
        One output line per recognised line, in page/block/line order: the
        line's words joined by single spaces and terminated by a newline.
    """
    async with semaphore:
        # NOTE(review): both calls below are synchronous, so this coroutine
        # does not yield to the event loop while they run.
        pages = ocr_model(DocumentFile.from_pdf(file)).pages
        rendered_lines = [
            " ".join([word.value for word in text_line.words]) + "\n"
            for page in pages
            for block in page.blocks
            for text_line in block.lines
        ]
    return "".join(rendered_lines)
41
+
42
async def ocr_with_paddle(img):
    """Run the shared PaddleOCR engine on one image and return its text.

    Args:
        img: The image, in a form accepted by ``paddle_ocr.ocr``.

    Returns:
        The recognised text fragments in the order PaddleOCR reports them,
        each preceded by a single space so adjacent fragments do not run
        together; ``''`` when nothing was detected.
    """
    async with semaphore:
        result = paddle_ocr.ocr(img)

    # result[0] holds the detections for the input image. Guard against it
    # being missing, None or empty instead of failing on len(None).
    # NOTE(review): recent PaddleOCR releases appear to return [None] when no
    # text is found -- confirm against the installed version.
    detections = result[0] if result else None
    if not detections:
        return ''

    # Each detection is indexed as [1][0] to reach its text string.
    return ''.join(' ' + detection[1][0] for detection in detections)
50
+
51
async def generate_text_from_image(img):
    """Return the text recognised in ``img`` by delegating to ``ocr_with_paddle``."""
    recognised_text = await ocr_with_paddle(img)
    return recognised_text
53
 
54
@app.post("/ocr/")
async def perform_ocr(file: UploadFile = File(...)):
    """OCR an uploaded file and return its text.

    Uploads whose filename ends in ``.pdf`` (any letter case) go through
    ``ocr_with_doctr``; everything else is opened as an image, shrunk to fit
    within 1024x1024 and passed to ``generate_text_from_image``.

    Args:
        file: The uploaded PDF or image.

    Returns:
        ``{"ocr_text": <recognised text>}``.
    """
    # Imported locally so this handler does not depend on a file-level alias.
    import numpy as np

    file_bytes = await file.read()
    # The filename may be absent, and its extension may be upper-case.
    if (file.filename or '').lower().endswith('.pdf'):
        text_output = await ocr_with_doctr(io.BytesIO(file_bytes))
    else:
        with Image.open(io.BytesIO(file_bytes)) as img:
            img.thumbnail((1024, 1024))  # Reduce image size (in place)
            # Hand PaddleOCR a pixel array, as the previous revision did,
            # rather than the PIL image object itself.
            pixels = np.array(img)
        text_output = await generate_text_from_image(pixels)
    return {"ocr_text": text_output}
64
 
65
  @app.get("/test/")