import os
from typing import List

import google.generativeai as genai
import requests
from ollama import chat
from PIL import Image

from utils import encode_image


class Rag:

    def get_answer_from_gemini(self, query, imagePaths):
        print(f"Querying Gemini for query={query}, imagePaths={imagePaths}")

        try:
            # Read the key from the environment rather than hardcoding it in source.
            genai.configure(api_key=os.environ["GEMINI_API_KEY"])
            model = genai.GenerativeModel('gemini-1.5-flash')

            images = [Image.open(path) for path in imagePaths]

            chat_session = model.start_chat()
            response = chat_session.send_message([*images, query])

            answer = response.text
            print(answer)
            return answer
        except Exception as e:
            print(f"An error occurred while querying Gemini: {e}")
            return f"Error: {str(e)}"

    def get_answer_from_openai(self, query, imagesPaths):
        """
        Scuffed local HF inference, kept for reference (the required transformers
        version is incompatible with the colpali version requirement; Ollama is
        more reliable, easier to use, and web-server ready):

        print(f"Querying for query={query}, imagesPaths={imagesPaths}")

        model = AutoModel.from_pretrained(
            'openbmb/MiniCPM-o-2_6-int4',
            trust_remote_code=True,
            attn_implementation='flash_attention_2',  # sdpa or flash_attention_2
            torch_dtype=torch.bfloat16,
            init_vision=True,
        )
        model = model.eval().cuda()
        tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6-int4', trust_remote_code=True)

        image = Image.open(imagesPaths[0]).convert('RGB')
        msgs = [{'role': 'user', 'content': [image, query]}]

        answer = model.chat(
            image=None,
            msgs=msgs,
            tokenizer=tokenizer
        )
        print(answer)
        return answer
        """
        # Ollama method below.
        os.environ['OLLAMA_FLASH_ATTENTION'] = '1'  # enable flash attention in the Ollama server

        print(f"Querying Ollama for query={query}, imagesPaths={imagesPaths}")

        try:
            response = chat(
                model='minicpm-v:8b-2.6-q8_0',
                messages=[
                    {
                        'role': 'user',
                        'content': query,
                        'images': imagesPaths,
                    }
                ],
            )
            answer = response.message.content
            print(answer)
            return answer
        except Exception as e:
            print(f"An error occurred while querying Ollama: {e}")
            return None

    def __get_openai_api_payload(self, query: str, imagesPaths: List[str]):
        # Encode each image as a base64 data URL for the OpenAI-style message format.
        image_payload = []
        for imagePath in imagesPaths:
            base64_image = encode_image(imagePath)
            image_payload.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}"
                }
            })

        payload = {
            "model": "Llama3.2-vision",  # change model here as needed
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": query
                        },
                        *image_payload
                    ]
                }
            ],
            "max_tokens": 1024  # reduce token count to reduce processing time
        }
        return payload


# if __name__ == "__main__":
#     rag = Rag()
#
#     query = "Based on attached images, how many new cases were reported during second wave peak"
#     imagesPaths = ["covid_slides_page_8.png", "covid_slides_page_8.png"]
#
#     rag.get_answer_from_gemini(query, imagesPaths)
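
# Hedged sketch, not part of the original module: __get_openai_api_payload builds
# an OpenAI-style payload but nothing in this file sends it, so this shows one
# way it could be POSTed with `requests`. The URL (Ollama's OpenAI-compatible
# server on localhost:11434), the timeout, and the response parsing are
# illustrative assumptions, not confirmed by this repo.
def get_answer_from_openai_compatible_api(rag: Rag, query: str, imagesPaths: List[str]) -> str:
    # Reach the name-mangled private helper from outside the class.
    payload = rag._Rag__get_openai_api_payload(query, imagesPaths)
    response = requests.post(
        "http://localhost:11434/v1/chat/completions",  # assumed endpoint
        json=payload,
        timeout=120,
    )
    response.raise_for_status()
    # Standard OpenAI chat-completions response shape.
    return response.json()["choices"][0]["message"]["content"]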