# Notebook residue (original banner text): "Spaces: Running on T4" / "Running on T4"
import boto3 | |
import json | |
import os | |
import shutil | |
import time | |
from unstructured.partition.pdf import partition_pdf | |
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth | |
import streamlit as st | |
from PIL import Image | |
import base64 | |
import re | |
import requests | |
import utilities.invoke_models as invoke_models | |
from requests.auth import HTTPBasicAuth | |
import generate_csv_for_tables | |
#from pdf2image import convert_from_bytes,convert_from_path | |
#import langchain | |
# ---------------------------------------------------------------------------
# AWS / OpenSearch plumbing shared by the ingestion helpers below.
# Credentials come from Streamlit secrets; clients are created once at import.
# ---------------------------------------------------------------------------
region = 'us-east-1'
service = 'es'

bedrock_runtime_client = boto3.client(
    'bedrock-runtime',
    aws_access_key_id=st.secrets['user_access_key'],
    aws_secret_access_key=st.secrets['user_secret_key'],
    region_name='us-east-1',
)
textract_client = boto3.client(
    'textract',
    aws_access_key_id=st.secrets['user_access_key'],
    aws_secret_access_key=st.secrets['user_secret_key'],
    region_name='us-east-1',
)

credentials = boto3.Session().get_credentials()
auth = HTTPBasicAuth('master', st.secrets['ml_search_demo_api_access'])

# NOTE(review): the hard-coded endpoint below lives in us-west-2 while
# `region` above is us-east-1 — confirm which endpoint is intended.
ospy_client = OpenSearch(
    hosts=[{'host': 'search-opensearchservi-shjckef2t7wo-iyv6rajdgxg6jas25aupuxev6i.us-west-2.es.amazonaws.com', 'port': 443}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    pool_maxsize=20,
)

# Prompt template used to summarize each extracted table/text chunk.
summary_prompt = """You are an assistant tasked with summarizing tables and text. \
Give a detailed summary of the table or text. Table or text chunk: {element} """

# Directory one level above this file; pdfs/ and figures/ live under it.
parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1])
def generate_image_captions_(image_paths):
    """Caption a batch of images with a local vision-captioning model.

    NOTE(review): relies on module-level ``feature_extractor``, ``device``,
    ``model``, ``gen_kwargs`` and ``tokenizer``, none of which are defined in
    this file — presumably notebook leftovers; confirm before calling.

    :param image_paths: iterable of filesystem paths to image files.
    :return: list of stripped caption strings, one per input image.
    """
    rgb_images = []
    for path in image_paths:
        img = Image.open(path)
        # The captioning model expects 3-channel RGB input.
        if img.mode != "RGB":
            img = img.convert(mode="RGB")
        rgb_images.append(img)
    pixels = feature_extractor(images=rgb_images, return_tensors="pt").pixel_values
    pixels = pixels.to(device)
    generated_ids = model.generate(pixels, **gen_kwargs)
    captions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return [caption.strip() for caption in captions]
def load_docs(inp):
    """Partition a PDF into tables/text/images, summarize and embed each piece,
    and ingest the results into two OpenSearch indices (text + multimodal).

    :param inp: dict with key ``"key"`` naming a PDF under ``<parent_dirname>/pdfs``.

    Side effects: writes extracted figures under
    ``<parent_dirname>/figures/<index>/``, sets ``st.session_state.input_index``,
    and (re)creates/populates OpenSearch indices via os_ingest / os_ingest_mm.
    """
    print("input_doc")
    print(inp)
    extracted_elements_list = []
    data_dir = parent_dirname + "/pdfs"
    target_files = [os.path.join(data_dir, inp["key"])]
    # Large scanned PDFs can exceed Pillow's decompression-bomb default.
    Image.MAX_IMAGE_PIXELS = 100000000
    width = 2048
    height = 2048
    for target_file in target_files:
        # Extract tables as CSV text via Textract before layout partitioning.
        tables_textract = generate_csv_for_tables.main_(target_file)
        # Index name: lowercase alphanumeric form of the file stem.
        index_ = re.sub('[^A-Za-z0-9]+', '', (target_file.split("/")[-1].split(".")[0]).lower())
        st.session_state.input_index = index_
        if not os.path.isdir(parent_dirname + '/figures/'):
            os.mkdir(parent_dirname + '/figures/')
        image_output_dir = parent_dirname + '/figures/' + st.session_state.input_index + "/"
        # Start from a clean per-document figures directory.
        if os.path.isdir(image_output_dir):
            shutil.rmtree(image_output_dir)
        os.mkdir(image_output_dir)
        print("***")
        print(target_file)
        table_and_text_elements = partition_pdf(
            filename=target_file,
            extract_images_in_pdf=True,
            infer_table_structure=False,
            chunking_strategy="by_title",  # Uses title elements to identify sections within the document for chunking
            max_characters=4000,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2000,
            extract_image_block_output_dir=parent_dirname + '/figures/' + st.session_state.input_index + '/',
        )
        tables = []
        texts = []
        print(table_and_text_elements)
        for table in tables_textract.keys():
            print(table)
            tables.append({
                'table_name': table,
                'raw': tables_textract[table],
                'summary': invoke_models.invoke_llm_model(
                    summary_prompt.format(element=tables_textract[table]), False),
            })
            time.sleep(4)  # throttle successive LLM summarization calls
        for element in table_and_text_elements:
            # Only the chunked composite text elements are indexed as text.
            if "unstructured.documents.elements.CompositeElement" in str(type(element)):
                texts.append(str(element))
        image_captions = {}
        for image_file in os.listdir(image_output_dir):
            print("image_processing")
            photo_full_path = image_output_dir + image_file
            photo_full_path_no_format = photo_full_path.replace('.jpg', "")
            # verify() catches truncated/corrupt extractions; reopen afterwards
            # because verify() leaves the file object unusable.
            with Image.open(photo_full_path) as image:
                image.verify()
            with Image.open(photo_full_path) as image:
                image.thumbnail((width, height))
                image.save(photo_full_path_no_format + "-resized.jpg")
            with open(photo_full_path_no_format + "-resized.jpg", "rb") as read_img:
                input_encoded = base64.b64encode(read_img.read()).decode("utf8")
            image_captions[image_file] = {
                "caption": invoke_models.generate_image_captions_llm(input_encoded, "What's in this image?"),
                "encoding": input_encoded,
            }
        print("image_processing done")
        # BUG FIX: the original re-initialized extracted_elements_list here,
        # inside the per-file loop, dropping elements of earlier files.
        extracted_elements_list.append({
            'source': target_file,
            'tables': tables,
            'texts': texts,
            'images': image_captions,
        })
    documents = []
    documents_mm = []
    for extracted_element in extracted_elements_list:
        print("prepping data")
        texts = extracted_element['texts']
        tables = extracted_element['tables']
        images_data = extracted_element['images']
        src_doc = extracted_element['source']
        for text in texts:
            embedding = invoke_models.invoke_model(text)
            documents.append(prep_document(text, text, 'text', src_doc, 'none', embedding))
        for table in tables:
            table_raw = table['raw']
            table_summary = table['summary']
            embedding = invoke_models.invoke_model(table_summary)
            documents.append(prep_document(table_raw, table_summary, 'table*' + table['table_name'], src_doc, 'none', embedding))
        # BUG FIX: iterate this element's own image data (images_data) —
        # the original indexed `image_captions`, a variable leaked from the
        # last iteration of the extraction loop above.
        for file_name, img in images_data.items():
            embedding_mm = invoke_models.invoke_model_mm(img['caption'], img['encoding'])
            documents_mm.append(prep_document(img['caption'], img['caption'], 'image_' + file_name, src_doc, img['encoding'], embedding_mm))
            embedding = invoke_models.invoke_model(img['caption'])
            documents.append(prep_document(img['caption'], img['caption'], 'image_' + file_name, src_doc, 'none', embedding))
    os_ingest(index_, documents)
    os_ingest_mm(index_, documents_mm)
def prep_document(raw_element, processed_element, doc_type, src_doc, encoding, embedding):
    """Build one OpenSearch document dict for a text, table, or image element.

    :param raw_element: original element text (non-alphanumerics collapsed to spaces).
    :param processed_element: summarized/processed text (same normalization).
    :param doc_type: 'text', 'table*<table_name>', or 'image_<file_name>'.
    :param src_doc: source document path (commas replaced to keep it CSV-safe).
    :param encoding: base64 image encoding, or the string "none" for non-images.
    :param embedding: embedding vector; stored under a multimodal key when an
        image encoding is present, otherwise under the plain embedding key.
    :return: dict ready to be indexed.
    """
    non_alnum = re.compile(r"[^a-zA-Z0-9]+")
    image_name = doc_type.split("_")[1] if 'image' in doc_type else "None"
    document = {
        "processed_element": non_alnum.sub(' ', processed_element),
        "raw_element_type": doc_type.split("*")[0],
        "raw_element": non_alnum.sub(' ', raw_element),
        "src_doc": src_doc.replace(",", " "),
        "image": image_name,
    }
    if encoding != "none":
        document["image_encoding"] = encoding
        document["processed_element_embedding_bedrock-multimodal"] = embedding
    else:
        document["processed_element_embedding"] = embedding
    if 'table' in doc_type:
        document["table"] = doc_type.split("*")[1]
    return document
def os_ingest(index_, documents):
    """Recreate the text index ``index_`` and ingest every prepared document.

    The index is wired to the "rag-ingest-pipeline" default ingest pipeline and
    maps a 1536-dim dense embedding plus a sparse (rank_features) field.
    Uses the module-level ``ospy_client``.
    """
    print("ingesting data")
    # Start clean: an existing index would keep stale mappings and documents.
    if ospy_client.indices.exists(index=index_):
        ospy_client.indices.delete(index=index_)
    index_body = {
        "settings": {
            "index": {
                "knn": True,
                "default_pipeline": "rag-ingest-pipeline",
                "number_of_shards": 4
            }
        },
        "mappings": {
            "properties": {
                "processed_element": {"type": "text"},
                "raw_element": {"type": "text"},
                "processed_element_embedding": {
                    "type": "knn_vector",
                    "dimension": 1536,
                    "method": {
                        "engine": "faiss",
                        "space_type": "l2",
                        "name": "hnsw",
                        "parameters": {}
                    }
                },
                "raw_element_type": {"type": "text"},
                "processed_element_embedding_sparse": {"type": "rank_features"},
                "src_doc": {"type": "text"},
                "image": {"type": "text"}
            }
        }
    }
    ospy_client.indices.create(index_, body=index_body)
    for doc in documents:
        print("----------doc------------")
        if doc['image'] != 'None':
            print("image insert")
            print(doc['image'])
        ospy_client.index(
            index=index_,
            body=doc,
        )
def os_ingest_mm(index_, documents_mm):
    """Recreate the multimodal index ``<index_>_mm`` and ingest image documents.

    Maps a 1024-dim Bedrock multimodal embedding plus the base64 image
    encoding (binary field). Uses the module-level ``ospy_client``.
    """
    index_ = index_ + "_mm"
    # Drop any previous run's index before creating the fresh mapping.
    if ospy_client.indices.exists(index=index_):
        ospy_client.indices.delete(index=index_)
    index_body = {
        "settings": {
            "index": {
                "knn": True,
                "number_of_shards": 4
            }
        },
        "mappings": {
            "properties": {
                "processed_element": {"type": "text"},
                "raw_element": {"type": "text"},
                "processed_element_embedding_bedrock-multimodal": {
                    "type": "knn_vector",
                    "dimension": 1024,
                    "method": {
                        "engine": "faiss",
                        "space_type": "l2",
                        "name": "hnsw",
                        "parameters": {}
                    }
                },
                "image_encoding": {"type": "binary"},
                "raw_element_type": {"type": "text"},
                "src_doc": {"type": "text"},
                "image": {"type": "text"}
            }
        }
    }
    ospy_client.indices.create(index_, body=index_body)
    for doc in documents_mm:
        ospy_client.index(
            index=index_,
            body=doc,
        )