Spaces:
Sleeping
Sleeping
UPDATE: New Endpoints
Browse files
- functions.py +5 -12
- requirements.txt +1 -2
functions.py
CHANGED
|
@@ -20,7 +20,7 @@ from qdrant_client import QdrantClient
|
|
| 20 |
from langchain_groq import ChatGroq
|
| 21 |
from pdf2image import convert_from_bytes
|
| 22 |
import numpy as np
|
| 23 |
-
|
| 24 |
from bs4 import BeautifulSoup
|
| 25 |
from urllib.parse import urlparse, urljoin
|
| 26 |
from supabase import create_client
|
|
@@ -40,7 +40,7 @@ vectorEmbeddings = HuggingFaceEmbeddings(
|
|
| 40 |
model_kwargs = model_kwargs,
|
| 41 |
encode_kwargs = encode_kwargs
|
| 42 |
)
|
| 43 |
-
|
| 44 |
sparseEmbeddings = FastEmbedSparse(model = "Qdrant/BM25")
|
| 45 |
prompt = """
|
| 46 |
INSTRUCTIONS:
|
|
@@ -290,15 +290,8 @@ def getLinks(url: str, timeout = 30):
|
|
| 290 |
|
| 291 |
|
| 292 |
def getTextFromImagePDF(pdfBytes):
|
| 293 |
-
global ocr
|
| 294 |
allImages = convert_from_bytes(pdfBytes)
|
| 295 |
allImages = [np.array(image) for image in allImages]
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
result = ocr.ocr(page)
|
| 299 |
-
if result[0]:
|
| 300 |
-
retrievedText = "\n".join([result[0][x][1][0] for x in range(len(result[0]))])
|
| 301 |
-
else:
|
| 302 |
-
retrievedText = ""
|
| 303 |
-
pageWiseText.append(retrievedText)
|
| 304 |
-
return "\n\n\n".join(pageWiseText)
|
|
|
|
| 20 |
from langchain_groq import ChatGroq
|
| 21 |
from pdf2image import convert_from_bytes
|
| 22 |
import numpy as np
|
| 23 |
+
import easyocr
|
| 24 |
from bs4 import BeautifulSoup
|
| 25 |
from urllib.parse import urlparse, urljoin
|
| 26 |
from supabase import create_client
|
|
|
|
| 40 |
model_kwargs = model_kwargs,
|
| 41 |
encode_kwargs = encode_kwargs
|
| 42 |
)
|
| 43 |
+
reader = easyocr.Reader(['en'], gpu = True)
|
| 44 |
sparseEmbeddings = FastEmbedSparse(model = "Qdrant/BM25")
|
| 45 |
prompt = """
|
| 46 |
INSTRUCTIONS:
|
|
|
|
| 290 |
|
| 291 |
|
| 292 |
def getTextFromImagePDF(pdfBytes):
|
| 293 |
+
global reader
|
| 294 |
allImages = convert_from_bytes(pdfBytes)
|
| 295 |
allImages = [np.array(image) for image in allImages]
|
| 296 |
+
text = "\n\n\n".join(["\n".join([text[1] for text in reader.readtext(image, paragraph=True)]) for image in allImages])
|
| 297 |
+
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -17,8 +17,7 @@ PyPDF2
|
|
| 17 |
python-dotenv
|
| 18 |
pydantic
|
| 19 |
pandas
|
| 20 |
-
|
| 21 |
-
paddleocr
|
| 22 |
pdf2image
|
| 23 |
sentence-transformers
|
| 24 |
supabase
|
|
|
|
| 17 |
python-dotenv
|
| 18 |
pydantic
|
| 19 |
pandas
|
| 20 |
+
easyocr
|
|
|
|
| 21 |
pdf2image
|
| 22 |
sentence-transformers
|
| 23 |
supabase
|