import base64
import os
import re
import shutil
import time

import boto3
import streamlit as st
from PIL import Image
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests.auth import HTTPBasicAuth
from unstructured.partition.pdf import partition_pdf

import generate_csv_for_tables
import utilities.invoke_models as invoke_models

bedrock_runtime_client = boto3.client(
    'bedrock-runtime',
    aws_access_key_id=st.secrets['user_access_key'],
    aws_secret_access_key=st.secrets['user_secret_key'],
    region_name='us-east-1',
)
textract_client = boto3.client(
    'textract',
    aws_access_key_id=st.secrets['user_access_key'],
    aws_secret_access_key=st.secrets['user_secret_key'],
    region_name='us-east-1',
)

region = 'us-east-1'
service = 'es'

# The demo domain uses basic auth rather than SigV4 request signing.
auth = HTTPBasicAuth('master', st.secrets['ml_search_demo_api_access'])

ospy_client = OpenSearch(
    hosts=[{'host': 'search-opensearchservi-shjckef2t7wo-iyv6rajdgxg6jas25aupuxev6i.us-west-2.es.amazonaws.com', 'port': 443}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    pool_maxsize=20,
)

summary_prompt = """You are an assistant tasked with summarizing tables and text. \
Give a detailed summary of the table or text. Table or text chunk: {element} """

parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1])
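# NOTE: generate_image_captions_ below is a local Hugging Face captioning
# helper that load_docs does not call -- captions come from Bedrock via
# invoke_models.generate_image_captions_llm. Running it standalone assumes a
# vision-encoder-decoder captioning model has been loaded first, for example
# (assumption, not part of this module):
#
#   import torch
#   from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
#   model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
#   feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
#   tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   gen_kwargs = {"max_length": 16, "num_beams": 4}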
def generate_image_captions_(image_paths):
    images = []
    for image_path in image_paths:
        i_image = Image.open(image_path)
        if i_image.mode != "RGB":
            i_image = i_image.convert(mode="RGB")
        images.append(i_image)

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds


def load_docs(inp):
    print("input_doc")
    print(inp)

    data_dir = parent_dirname + "/pdfs"
    target_files = [os.path.join(data_dir, inp["key"])]
    Image.MAX_IMAGE_PIXELS = 100000000
    width = 2048
    height = 2048

    for target_file in target_files:
        # Extract tables as CSVs with Amazon Textract; table structure
        # inference is disabled in partition_pdf below for this reason.
        tables_textract = generate_csv_for_tables.main_(target_file)

        # Index name: the file name, lower-cased and stripped of non-alphanumerics.
        index_ = re.sub('[^A-Za-z0-9]+', '', (target_file.split("/")[-1].split(".")[0]).lower())
        st.session_state.input_index = index_

        if not os.path.isdir(parent_dirname + '/figures/'):
            os.mkdir(parent_dirname + '/figures/')

        image_output_dir = parent_dirname + '/figures/' + st.session_state.input_index + "/"
        if os.path.isdir(image_output_dir):
            shutil.rmtree(image_output_dir)
        os.mkdir(image_output_dir)

        print("***")
        print(target_file)

        # Chunk the PDF into text sections and extract embedded images.
        table_and_text_elements = partition_pdf(
            filename=target_file,
            extract_images_in_pdf=True,
            infer_table_structure=False,   # tables are handled by Textract above
            chunking_strategy="by_title",  # uses title elements to identify sections within the document for chunking
            max_characters=4000,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2000,
            extract_image_block_output_dir=image_output_dir,
        )
        tables = []
        texts = []
        print(table_and_text_elements)

        # Summarize every Textract table with the LLM; the summary is what gets embedded.
        for table in tables_textract.keys():
            print(table)
            tables.append({
                'table_name': table,
                'raw': tables_textract[table],
                'summary': invoke_models.invoke_llm_model(summary_prompt.format(element=tables_textract[table]), False),
            })
            time.sleep(4)  # avoid Bedrock throttling

        for element in table_and_text_elements:
            if "unstructured.documents.elements.CompositeElement" in str(type(element)):
                texts.append(str(element))

        # Caption every extracted image with the multimodal LLM.
        image_captions = {}
        for image_file in os.listdir(image_output_dir):
            print("image_processing")
            photo_full_path = image_output_dir + image_file
            photo_full_path_no_format = photo_full_path.replace('.jpg', "")

            with Image.open(photo_full_path) as image:
                image.verify()  # raises if the file is corrupt
            with Image.open(photo_full_path) as image:
                image.thumbnail((width, height))
                image.save(photo_full_path_no_format + "-resized.jpg")

            with open(photo_full_path_no_format + "-resized.jpg", "rb") as read_img:
                input_encoded = base64.b64encode(read_img.read()).decode("utf8")

            image_captions[image_file] = {
                "caption": invoke_models.generate_image_captions_llm(input_encoded, "What's in this image?"),
                "encoding": input_encoded,
            }
        print("image_processing done")

        extracted_elements_list = [{
            'source': target_file,
            'tables': tables,
            'texts': texts,
            'images': image_captions,
        }]

        documents = []
        documents_mm = []

        for extracted_element in extracted_elements_list:
            print("prepping data")
            texts = extracted_element['texts']
            tables = extracted_element['tables']
            images_data = extracted_element['images']
            src_doc = extracted_element['source']

            for text in texts:
                embedding = invoke_models.invoke_model(text)
                document = prep_document(text, text, 'text', src_doc, 'none', embedding)
                documents.append(document)

            for table in tables:
                table_raw = table['raw']
                table_summary = table['summary']
                embedding = invoke_models.invoke_model(table_summary)
                document = prep_document(table_raw, table_summary, 'table*' + table['table_name'], src_doc, 'none', embedding)
                documents.append(document)

            for file_name in images_data.keys():
                # Multimodal document: caption and image embedded together.
                embedding = invoke_models.invoke_model_mm(images_data[file_name]['caption'], images_data[file_name]['encoding'])
                document = prep_document(images_data[file_name]['caption'], images_data[file_name]['caption'], 'image_' + file_name, src_doc, images_data[file_name]['encoding'], embedding)
                documents_mm.append(document)

                # Text-only document: the caption alone, for the text index.
                embedding = invoke_models.invoke_model(images_data[file_name]['caption'])
                document = prep_document(images_data[file_name]['caption'], images_data[file_name]['caption'], 'image_' + file_name, src_doc, 'none', embedding)
                documents.append(document)

        os_ingest(index_, documents)
        os_ingest_mm(index_, documents_mm)
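# Each ingested document follows the shape built by prep_document below
# (illustrative sketch; field values are hypothetical):
#
#   {
#     "processed_element": "quarterly revenue grew 12 percent ...",
#     "raw_element_type": "table",            # "text", "table" or "image_<file>"
#     "raw_element": "Q1 100 Q2 112 ...",
#     "src_doc": "/path/to/pdfs/report.pdf",
#     "image": "None",                        # image file name for image docs
#     "table": "table_1.csv",                 # only present for table docs
#     "processed_element_embedding": [...],   # or ..._bedrock-multimodal plus image_encoding
#   }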
def prep_document(raw_element, processed_element, doc_type, src_doc, encoding, embedding):
    if 'image' in doc_type:
        # Keep the full file name, even if it contains underscores.
        img_ = doc_type.split("_", 1)[1]
    else:
        img_ = "None"

    document = {
        "processed_element": re.sub(r"[^a-zA-Z0-9]+", ' ', processed_element),
        "raw_element_type": doc_type.split("*")[0],
        "raw_element": re.sub(r"[^a-zA-Z0-9]+", ' ', raw_element),
        "src_doc": src_doc.replace(",", " "),
        "image": img_,
    }

    if encoding != "none":
        document["image_encoding"] = encoding
        document["processed_element_embedding_bedrock-multimodal"] = embedding
    else:
        document["processed_element_embedding"] = embedding

    if 'table' in doc_type:
        document["table"] = doc_type.split("*")[1]

    return document


def os_ingest(index_, documents):
    print("ingesting data")
    # Recreate the index from scratch on every ingest.
    if ospy_client.indices.exists(index=index_):
        ospy_client.indices.delete(index=index_)

    index_body = {
        "settings": {
            "index": {
                "knn": True,
                "default_pipeline": "rag-ingest-pipeline",
                "number_of_shards": 4,
            }
        },
        "mappings": {
            "properties": {
                "processed_element": {"type": "text"},
                "raw_element": {"type": "text"},
                "processed_element_embedding": {
                    "type": "knn_vector",
                    "dimension": 1536,
                    "method": {
                        "engine": "faiss",
                        "space_type": "l2",
                        "name": "hnsw",
                        "parameters": {},
                    },
                },
                "raw_element_type": {"type": "text"},
                "processed_element_embedding_sparse": {"type": "rank_features"},
                "src_doc": {"type": "text"},
                "image": {"type": "text"},
            }
        },
    }
    response = ospy_client.indices.create(index_, body=index_body)

    for doc in documents:
        print("----------doc------------")
        if doc['image'] != 'None':
            print("image insert")
            print(doc['image'])
        response = ospy_client.index(
            index=index_,
            body=doc,
        )


def os_ingest_mm(index_, documents_mm):
    index_ = index_ + "_mm"
    if ospy_client.indices.exists(index=index_):
        ospy_client.indices.delete(index=index_)

    index_body = {
        "settings": {
            "index": {
                "knn": True,
                # No sparse-embedding ingest pipeline for the multimodal index.
                "number_of_shards": 4,
            }
        },
        "mappings": {
            "properties": {
                "processed_element": {"type": "text"},
                "raw_element": {"type": "text"},
                "processed_element_embedding_bedrock-multimodal": {
                    "type": "knn_vector",
                    "dimension": 1024,
                    "method": {
                        "engine": "faiss",
                        "space_type": "l2",
                        "name": "hnsw",
                        "parameters": {},
                    },
                },
                "image_encoding": {"type": "binary"},
                "raw_element_type": {"type": "text"},
                "src_doc": {"type": "text"},
                "image": {"type": "text"},
            }
        },
    }
    response = ospy_client.indices.create(index_, body=index_body)

    for doc in documents_mm:
        response = ospy_client.index(
            index=index_,
            body=doc,
        )
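# Example usage (sketch): from a Streamlit page, after saving an uploaded PDF
# under <repo>/pdfs, ingestion for a file named, say, "report.pdf" (hypothetical
# name) would be kicked off with:
#
#   load_docs({"key": "report.pdf"})
#
# which (re)creates two OpenSearch indices derived from the file name,
# e.g. "report" and "report_mm".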