Spaces:
Sleeping
Sleeping
from django.shortcuts import render, redirect | |
from django.http import HttpResponse, JsonResponse, StreamingHttpResponse | |
import requests | |
import uuid | |
import json | |
import os | |
from pdf2image import convert_from_path, convert_from_bytes | |
from django.views.decorators.csrf import csrf_exempt | |
from django.core.files.storage import FileSystemStorage | |
import threading | |
import random | |
import google.generativeai as genai | |
import google.ai.generativelanguage as glm | |
import io | |
import base64 | |
import os | |
from .models import UseCases, DocumentTypes | |
# Public base URL of this deployment. It is prepended to relative image paths
# sent by clients (an earlier deployment used "http://16.170.244.54").
host_url = "https://thejagstudio-absoluteai.hf.space/"

# SECURITY: API keys do not belong in source control. Each key is read from the
# environment first; the literal fallbacks only keep existing deployments
# working and should be rotated and then removed.
googleAPIKey = os.environ.get(
    "GOOGLE_VISION_API_KEY", "AIzaSyBeo4NGA__U6Xxy-aBE6yFm19pgq8TY-TM"
)
genai.configure(
    api_key=os.environ.get("GEMINI_API_KEY", "AIzaSyCg9NGsLygb0sVKpviMkgV4eMPLd9nXW7w")
)
def _vision_image(img):
    """Build the Vision ``image`` field for a base64 string or an http(s) URL."""
    # ":" is not in the base64 alphabet, so a URL scheme prefix is unambiguous.
    if img.startswith(("http://", "https://")):
        return {"source": {"imageUri": img}}
    return {"content": img}


def getAnswer(images, batch_size=16):
    """Run Google Vision document OCR over images and return the combined text.

    Args:
        images: List of images, each either a base64-encoded image string or
            an ``http(s)`` URL pointing at the image.
        batch_size: Maximum number of images sent in one ``images:annotate``
            request (the Vision API rejects requests with more than 16).

    Returns:
        A single string in which the text of every image is preceded by three
        newlines. Images without detected text contribute only the separator.
        Returns ``""`` for empty input, without contacting the API.

    Raises:
        requests.RequestException: If the HTTP request itself fails or times
            out. Malformed API responses are printed and skipped, not raised.
    """
    if not images:
        return ""

    url = "https://content-vision.googleapis.com/v1/images:annotate?alt=json&key=" + googleAPIKey
    headers = {
        "authority": "content-vision.googleapis.com",
        "accept": "*/*",
        "accept-language": "en-US,en;q=0.9,gu;q=0.8",
        "content-type": "application/json",
        "origin": "https://content-vision.googleapis.com",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
        "x-origin": "https://explorer.apis.google.com",
        "x-requested-with": "XMLHttpRequest",
    }

    page_texts = []
    for start in range(0, len(images), batch_size):
        payload = {
            "requests": [
                {
                    "image": _vision_image(img),
                    "features": [
                        {
                            "type": "DOCUMENT_TEXT_DETECTION",
                            "maxResults": 50,
                            "model": "builtin/latest",
                        }
                    ],
                }
                for img in images[start:start + batch_size]
            ]
        }
        response = requests.request(
            "POST", url, headers=headers, data=json.dumps(payload), timeout=120
        )

        # Parse the body once; an API-level error has no "responses" key.
        try:
            results = response.json()["responses"]
        except (ValueError, KeyError, TypeError) as exc:
            print(exc, response.text)
            continue

        for result in results:
            if "error" in result:
                print(result["error"])
            # A blank page has no fullTextAnnotation; keep the separator so
            # one empty page does not discard the pages that follow it.
            annotation = result.get("fullTextAnnotation") or {}
            page_texts.append("\n\n\n" + annotation.get("text", ""))

    return "".join(page_texts)
def _pdf_pages_to_base64(pdf_bytes):
    """Render every page of a PDF to PNG and return the pages base64-encoded."""
    pages = convert_from_bytes(
        pdf_bytes,
        dpi=150,
        fmt="png",
        output_file="image",
        thread_count=5,
    )
    encoded = []
    for page in pages:
        buffer = io.BytesIO()
        page.save(buffer, format="PNG")
        encoded.append(base64.b64encode(buffer.getvalue()).decode())
    return encoded


def _build_entity_tool(fields):
    """Describe a Gemini function-calling tool with one STRING property per field."""
    return {
        "function_declarations": [
            {
                "name": "entityTool",
                "description": "List of entities and value extracted from the text.",
                "parameters": {
                    "type_": "OBJECT",
                    "properties": {field: {"type_": "STRING"} for field in fields},
                    "required": [],
                },
            }
        ]
    }


def _extract_entities(response, fields):
    """Return the requested fields found in the model's ``entityTool`` call."""
    candidates = response.candidates
    if not candidates:
        return {}
    # The function call is not guaranteed to be the first part of the reply.
    for part in candidates[0].content.parts:
        call = part.function_call
        if call.name != "entityTool":
            continue
        extracted = {}
        for field in fields:
            try:
                extracted[field] = call.args[field]
            except (KeyError, TypeError):
                # The model omitted this field; leave it out of the result.
                continue
        return extracted
    return {}


def dataExtract(request, link):
    """Extract the configured fields of a document type from an uploaded PDF.

    The PDF uploaded under the ``pdf`` form key is rendered to PNG pages, the
    pages are OCR'd through ``getAnswer``, and the OCR text is sent to Gemini
    with a function-calling tool built from the document type's ``fields``.

    Args:
        request: The Django request; only POST is processed.
        link: ``url`` value identifying the ``DocumentTypes`` row to use.

    Returns:
        On success, a JSON response ``{"images": [...], "data": {...}}`` where
        ``images`` holds the base64 PNG pages and ``data`` maps field names to
        extracted values. Non-POST requests get the plain body ``"Error"``;
        an unknown ``link`` gets a 404 and a missing upload a 400, both JSON.
    """
    if request.method != "POST":
        return HttpResponse("Error")

    documentData = DocumentTypes.objects.filter(url=link).first()
    if documentData is None:
        return HttpResponse(
            json.dumps({"error": "Unknown document type"}),
            content_type="application/json",
            status=404,
        )

    pdf_file = request.FILES.get("pdf")
    if pdf_file is None:
        return HttpResponse(
            json.dumps({"error": "Missing 'pdf' file upload"}),
            content_type="application/json",
            status=400,
        )

    image_list = _pdf_pages_to_base64(pdf_file.read())
    OCRString = getAnswer(image_list)

    fields = documentData.fields
    safety_settings = [
        {"category": category, "threshold": "BLOCK_NONE"}
        for category in (
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        )
    ]
    model = genai.GenerativeModel(
        model_name='gemini-2.0-flash-lite',
        tools=_build_entity_tool(fields),
        safety_settings=safety_settings,
    )
    chat = model.start_chat()
    response = chat.send_message('PDF Data : \n\n' + OCRString)
    data = _extract_entities(response, fields)

    return HttpResponse(
        json.dumps({"images": image_list, "data": data}), content_type="application/json"
    )
def imageToText(request):
    """OCR a list of server-relative image paths supplied in a JSON POST body.

    The body must be a JSON object with an ``images`` list of paths. Each path
    is joined onto ``host_url`` and the resulting URLs are handed to
    ``getAnswer`` in batches of 10.

    Args:
        request: The Django request; only POST is processed.

    Returns:
        A JSON response ``{"text": [...], "box": []}``. ``text`` holds the OCR
        string ``getAnswer`` returned for each batch. ``box`` is kept for
        response-shape compatibility but is always empty, because ``getAnswer``
        returns text only. Non-POST requests get the plain body ``"Error"``;
        a malformed body gets a 400 JSON error.
    """
    if request.method != "POST":
        return HttpResponse("Error")

    try:
        image_paths = json.loads(request.body)["images"]
    except (ValueError, KeyError, TypeError):
        image_paths = None
    if not isinstance(image_paths, list):
        return HttpResponse(
            json.dumps({"error": "Body must be JSON with an 'images' list"}),
            content_type="application/json",
            status=400,
        )

    # host_url ends with "/", so normalise to exactly one slash at the join.
    base = host_url.rstrip("/")
    image_urls = [base + "/" + str(path).lstrip("/") for path in image_paths]

    # getAnswer returns one string per call, not a (text, boxes) pair.
    text = [
        getAnswer(image_urls[start:start + 10])
        for start in range(0, len(image_urls), 10)
    ]
    box = []
    return HttpResponse(
        json.dumps({"text": text, "box": box}), content_type="application/json"
    )
def documentAIData(request):
    """Return every document type and every use-case heading as JSON.

    The response body is ``{"usecases": [...], "docTypes": [...]}``. Each
    entry of ``docTypes`` carries the document type's ``img``, ``name`` and
    ``url`` plus the headings of its related use cases; ``usecases`` lists the
    heading of every ``UseCases`` row.
    """
    doc_types = [
        {
            "img": doc_type.img,
            "name": doc_type.name,
            "url": doc_type.url,
            "usecases": [related.heading for related in doc_type.usecases.all()],
        }
        for doc_type in DocumentTypes.objects.all()
    ]
    headings = [use_case.heading for use_case in UseCases.objects.all()]
    body = json.dumps({"usecases": headings, "docTypes": doc_types})
    return HttpResponse(body, content_type="application/json")
def docPages(request, link):
    """Return the page data of one document type as JSON.

    Args:
        request: The Django request (not inspected).
        link: ``url`` value identifying the ``DocumentTypes`` row.

    Returns:
        A JSON response with the row's ``title``, ``name``, ``subtitle``,
        ``img``, ``fields`` and ``url``, plus ``usecases`` as a list of
        ``{"heading", "paragraph"}`` objects. If no row matches ``link`` the
        response is a 404 with a JSON error body.
    """
    documentData = DocumentTypes.objects.filter(url=link).first()
    # .first() yields None when nothing matches; answer 404 instead of crashing.
    if documentData is None:
        return HttpResponse(
            json.dumps({"error": "Unknown document type"}),
            content_type="application/json",
            status=404,
        )

    data = {
        "title": documentData.title,
        "name": documentData.name,
        "subtitle": documentData.subtitle,
        "img": documentData.img,
        "usecases": [
            {"heading": usecase.heading, "paragraph": usecase.paragraph}
            for usecase in documentData.usecases.all()
        ],
        "fields": documentData.fields,
        "url": documentData.url,
    }
    return HttpResponse(json.dumps(data), content_type="application/json")
def edditor(request):
    """Copy ``link`` values from the products JSON file onto matching rows.

    Loads ``./api/nanonetProducts2.json`` and, for every ``DocumentTypes`` row,
    takes the first entry whose ``name`` equals the row's name, stores that
    entry's ``link`` as the row's ``url``, saves the row and prints a line
    about it. Rows without a matching entry are left unchanged. Always
    responds with the plain body ``"Hello World"``.
    """
    with open("./api/nanonetProducts2.json", "r") as handle:
        products = json.load(handle)

    for document in DocumentTypes.objects.all():
        # Only the first entry with the same name is applied.
        match = next(
            (item for item in products if item["name"] == document.name), None
        )
        if match is None:
            continue
        document.url = match["link"]
        document.save()
        print(document.name, "Updated", match["link"])

    return HttpResponse("Hello World")