import os
import random
import re
import io        # needed by doc_images_description_dict / pdf_images_description_df
import json
import time
import errno
import typing
from base64 import b64encode
from copy import deepcopy
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import requests
import spacy     # needed by split_nodes_sentences_df

import fitz      # PyMuPDF, for PDF page and image extraction

import PIL
from PIL import Image
from tqdm import tqdm
from IPython.display import Markdown, display

import google.generativeai as genai

import llama_index
from llama_index import Document, SimpleDirectoryReader
from llama_index.schema import MetadataMode, NodeRelationship
from llama_index.text_splitter import TokenTextSplitter


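# Model setup (a minimal sketch, not part of the original module): the helpers
# below expect configured Gemini models. The environment-variable name and
# model identifiers here are assumptions; adjust them to your deployment.
#
#   genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
#   gemini_pro = genai.GenerativeModel("gemini-pro")
#   gemini_pro_vision = genai.GenerativeModel("gemini-pro-vision")

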
def classify_image(image_path: str, model: genai.GenerativeModel) -> str:
    """
    Given an image path, classify the image as floor plan, equipment, etc.
    INPUT: image_path: the path to the image
           model: LLM model
    OUTPUT: the type of the image in a string
    """

    image_for_gemini = Image.open(image_path)

    image_description_prompt = """
    Analyze and classify the image into one of the following categories:
    floor plan, flow chart, HVAC equipment, sign, and other. Output one and
    only one category name.
    """

    model_input = [image_description_prompt, image_for_gemini]

    response = model.generate_content(model_input)

    return response.text


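# Example usage (a sketch; the model name and image path are assumptions):
#
#   vision_model = genai.GenerativeModel("gemini-pro-vision")
#   category = classify_image("images/page_3_figure_0.jpeg", vision_model)
#   print(category)  # e.g. "floor plan"

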
def combine_node_fields(row):
    """Concatenate a node's keywords, triples, answerable questions, and text."""
    result = "KEYWORDS: " + row['node_keywords'] + ";\n"
    result += "TRIPLES: " + row['node_triples'] + ";\n"
    result += "ANSWERABLE_QUESTIONS: " + row['node_answerable_questions'] + ";\n"
    result += "TEXT: " + row['node_text'] + ".\n"
    return result


def display_images(
    images: Iterable[Union[str, PIL.Image.Image]], resize_ratio: float = 0.5
) -> None:
    """
    Displays a series of images provided as paths or PIL Image objects.

    Args:
        images: An iterable of image paths or PIL Image objects.
        resize_ratio: The factor by which to resize each image (default 0.5).

    Returns:
        None (displays images using IPython or Jupyter notebook).
    """

    pil_images = []
    for image in images:
        if isinstance(image, str):
            pil_images.append(PIL.Image.open(image))
        else:
            pil_images.append(image)

    for img in pil_images:
        original_width, original_height = img.size
        new_width = int(original_width * resize_ratio)
        new_height = int(original_height * resize_ratio)
        resized_img = img.resize((new_width, new_height))
        display(resized_img)
        print("\n")


def doc_images_description_dict(fdocs: fitz.Document, fpage: fitz.Page,
                                lpage: llama_index.schema.Document,
                                image_save_dir: str,
                                image_description_prompt: str,
                                model: genai.GenerativeModel) -> List[dict]:
    """
    Extract the images of one PDF page, describe them with the model, and
    return one metadata dictionary per image.
    """

    file_name = lpage.metadata['file_name']
    page_label = lpage.metadata['page_label']

    images = fpage.get_images()

    dict_list = []

    for image_no, image in enumerate(images):

        image_dict = {}

        xref = image[0]
        pix = fitz.Pixmap(fdocs, xref)  # was an undefined name; use the fdocs parameter

        image_name = f"{image_save_dir}/{file_name}_image_{page_label}_{image_no}_{xref}.jpeg"

        pix.save(image_name)

        image_for_gemini = Image.open(io.BytesIO(pix.tobytes("jpeg")))

        model_input = [image_description_prompt, image_for_gemini]

        # Use the model passed in rather than an undefined module-level gemini_pro_model.
        response = model.generate_content(model_input)

        image_dict['doc_id'] = lpage.doc_id
        image_dict['image_id'] = image_no
        image_dict['image_name'] = image_name

        mdict = lpage.metadata

        image_dict['page_label'] = mdict['page_label']
        image_dict['file_name'] = mdict['file_name']
        image_dict['file_path'] = mdict['file_path']
        image_dict['file_type'] = mdict['file_type']

        image_dict['course_material_type'] = mdict['course_material_type']
        image_dict['course_material_week'] = mdict['course_material_week']

        image_dict['description'] = response.text

        dict_list.append(image_dict)

    return dict_list


def docs_to_df(docs: List[llama_index.schema.Document],
               gemini_pro: genai.GenerativeModel) -> pd.DataFrame:
    """
    Extract titles for docs, embed the documents and titles, and convert them to a DataFrame.
    INPUT: docs: the documents extracted from a file
           gemini_pro: genai gemini pro model
    OUTPUT: docs_df: a DataFrame containing the information of the docs extracted from the input file
    """

    docs_df = llamaindex_docs_df(docs)

    tqdm.pandas(desc="Processing rows for extracting document titles...")

    docs_df['doc_title'] = docs_df.progress_apply(
        lambda row: node_text_title(row['text'], gemini_pro), axis=1)

    doc_summary_list = []
    for _, row in tqdm(docs_df.iterrows(), total=len(docs_df)):
        try:
            doc_summary_list.append(text_summary(row['text'], gemini_pro))
        except Exception:
            # Keep row alignment even when summarization fails.
            doc_summary_list.append(None)

    docs_df['doc_summary'] = doc_summary_list

    tqdm.pandas(desc="Processing rows for embedding documents and titles...")

    docs_df['doc_embedding'] = docs_df.progress_apply(
        lambda row: text_retrieval_document_embedding(row['text'], row['doc_title']), axis=1)

    return docs_df


def extract_image_description_df(image_path: str, category: str,
                                 model: genai.GenerativeModel) -> pd.DataFrame:
    """
    Extract a description of the given image in the given category
    INPUT: image_path: the path to the image
           category: a string containing the category of the image
           model: a generative model
    OUTPUT: a DataFrame containing the metadata of the extracted image
    """

    image_for_gemini = Image.open(image_path)

    # Choose a category-specific prompt; the final else branch covers
    # tables, graphs, and anything else.
    if "floor plan" in category.lower():
        image_description_prompt = '''
        Please analyze the provided floor plan image and extract the following information
        related to rooms, locations, connections, HVAC equipment, and sensors:
        1. Room Labels/Names: Identify and list all room labels or names shown on the floor plan.
        2. Room Connectivity: Indicate how different rooms are connected (doors, hallways, openings, etc.).
        3. HVAC Equipment: Locate and list all HVAC equipment depicted on the floor plan (e.g., air handling units, ductwork, vents, thermostats, etc.).
        4. Sensor Locations: Note the locations of any sensors or control devices related to the HVAC system (e.g., temperature sensors, occupancy sensors, etc.).
        5. Zoning/Partitions: If the floor plan shows any zoning or partitions related to HVAC control, please describe them.
        6. Special Areas: Highlight any areas that may have unique HVAC requirements (e.g., server rooms, laboratories, etc.).
        Please provide the extracted information in a structured format, separating the different categories as needed. Let me know if you need any clarification or have additional requirements for the information to be extracted from the floor plan.
        '''
    elif "flow chart" in category.lower():
        image_description_prompt = '''
        Please analyze the provided HVAC flow chart image and extract the following information:

        1. System Components: Identify and list all the major HVAC components shown in the flow chart (e.g., air handling units, chillers, boilers, pumps, cooling towers, etc.).
        2. Component Connections: Describe how the different HVAC components are connected, indicating the direction of airflow, water flow, refrigerant flow, etc.
        3. System Inputs/Outputs: Note any system inputs (e.g., outside air intake) or outputs (e.g., exhaust air) shown in the flow chart.
        4. Control Points: Locate any control points, sensors, or valves that regulate the flow or operation of the system components.
        5. Subsystems/Zones: If the flow chart illustrates subsystems or zones within the overall HVAC system, please describe them and their components.
        6. Operational Modes: Identify any operational modes or sequences depicted in the flow chart (e.g., heating mode, cooling mode, economizer mode, etc.).

        Please provide the extracted information in a clear and structured format, separating the different categories as needed. If any abbreviations or symbols are used in the flow chart, please include a legend or clarify their meanings. Let me know if you need any clarification or have additional requirements for the information to be extracted.
        '''
    elif "hvac equipment" in category.lower():
        image_description_prompt = '''
        Please analyze the image I will provide, which contains HVAC (heating, ventilation, and
        air conditioning) equipment. Describe the different components you can identify, such
        as the type of equipment (furnace, air conditioner, ductwork, etc.), the apparent
        condition of the equipment, and any other relevant details you can discern from the
        image. Your analysis should help someone understand what is depicted in the HVAC system
        shown in the picture.
        '''
    else:
        image_description_prompt = '''Explain what is going on in the image.
        If it's a table, extract all elements of the table.
        If it's a graph, explain the findings in the graph.
        Do not include any numbers that are not mentioned in the image:
        '''

    dict_list = []

    path_last_sep_idx = image_path.rfind("/")
    file_name = image_path[path_last_sep_idx + 1:]
    print("Processing the image: {}".format(file_name))

    model_input = [image_description_prompt, image_for_gemini]

    response = model.generate_content(model_input)

    image_dict = {}

    image_dict['image_path'] = image_path
    image_dict['file_name'] = file_name

    try:
        image_dict['image_description'] = response.text
    except Exception:
        print("Some errors happened in the response from Gemini.")
        image_dict['image_description'] = None

    dict_list.append(image_dict)

    return pd.DataFrame(dict_list)


def get_cosine_score(
    dataframe: pd.DataFrame, column_name: str, input_text_embd: np.ndarray
) -> float:
    """
    Calculates the cosine similarity between the user query embedding and the
    dataframe embedding for a specific column.

    Args:
        dataframe: The pandas DataFrame containing the data to compare against.
        column_name: The name of the column containing the embeddings to compare with.
        input_text_embd: The NumPy array representing the user query embedding.

    Returns:
        The cosine similarity score (rounded to two decimal places) between the user query embedding and the dataframe embedding.
    """

    # The dot product equals cosine similarity only for unit-length vectors;
    # the Gemini embedding vectors are assumed to be normalized here.
    text_cosine_score = round(np.dot(dataframe[column_name], input_text_embd), 2)

    return text_cosine_score


def get_cosine_score_lists(
    dataframe: pd.DataFrame, column_name: str, query_embs: list
) -> float:
    """
    Calculates the cosine similarity between the user query embedding and the
    dataframe embedding for a specific column. Both embeddings are given as lists.

    Args:
        dataframe: The pandas DataFrame containing the data to compare against.
        column_name: The name of the column containing the embeddings to compare with.
        query_embs: The query embedding as a list of numbers.

    Returns:
        The cosine similarity score (rounded to two decimal places) between the user query embedding and the dataframe embedding.
    """

    text_cosine_score = round(np.dot(np.array(dataframe[column_name]),
                                     np.array(query_embs)), 2)
    return text_cosine_score


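# Worked example for the scoring helpers (a sketch with made-up vectors, not
# real Gemini embeddings; both helpers assume unit-normalized inputs). The
# function is applied row-wise, so it receives a pd.Series in practice:
#
#   row = pd.Series({"embedding": [0.6, 0.8]})
#   get_cosine_score_lists(row, "embedding", [0.6, 0.8])   # -> 1.0
#   get_cosine_score_lists(row, "embedding", [0.8, -0.6])  # -> 0.0 (orthogonal)

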
def get_relevant_images_from_query(
    query: str,
    images_df: pd.DataFrame,
    column_name: str = "",
    top_n: int = 3,
    embedding_size: int = 768,
    print_citation: bool = True,
) -> Dict[int, Dict[str, Any]]:
    """
    Finds the top N most similar images from a metadata DataFrame based on a text query.

    Args:
        query: The text query used for finding similar images.
        images_df: A Pandas DataFrame containing the image metadata to search.
        column_name: The column name in `images_df` containing the image-description embeddings.
        top_n: The number of most similar images to return.
        embedding_size: The dimensionality of the text embeddings (only used if text embeddings
            are stored in the column specified by `column_name`).
        print_citation: Whether to immediately print formatted citations for the matched
            images (True) or just return the dictionary (False).

    Returns:
        A dictionary containing information about the top N most similar images,
        including cosine scores, image_path, file_name, and description text.

    Raises:
        KeyError: If the specified `column_name` is not present in `images_df`.
    """

    if column_name not in images_df.columns:
        raise KeyError(f"Column '{column_name}' not found in the 'images_df'")

    query_embs = text_query_embedding(query)

    # Score every row against the query embedding.
    cosine_scores = images_df.apply(
        lambda row: get_cosine_score_lists(row, column_name, query_embs),
        axis=1,
    )

    top_n_indices = cosine_scores.nlargest(top_n).index.tolist()
    top_n_scores = cosine_scores.nlargest(top_n).values.tolist()

    final_images = {}

    for matched_no, index in enumerate(top_n_indices):

        final_images[matched_no] = {}

        final_images[matched_no]["image_path"] = images_df.iloc[index]["image_path"]

        final_images[matched_no]["cosine_score"] = top_n_scores[matched_no]

        final_images[matched_no]["file_name"] = images_df.iloc[index]["file_name"]

        final_images[matched_no]["image_description"] = images_df["image_description"][index]

        final_images[matched_no]["image_object"] = Image.open(images_df.iloc[index]['image_path'])

    if print_citation:
        print_text_to_image_citation(final_images)

    return final_images


def get_similar_text_from_query(
    query: str,
    nodes_df: pd.DataFrame,
    column_name: str = "",
    top_n: int = 3,
    embedding_size: int = 768,
    print_citation: bool = True,
) -> Dict[int, Dict[str, Any]]:
    """
    Finds the top N most similar text passages from a metadata DataFrame based on a text query.

    Args:
        query: The text query used for finding similar passages.
        nodes_df: A Pandas DataFrame containing the text metadata to search.
        column_name: The column name in `nodes_df` containing the text embeddings.
        top_n: The number of most similar text passages to return.
        embedding_size: The dimensionality of the text embeddings (only used if text embeddings
            are stored in the column specified by `column_name`).
        print_citation: Whether to immediately print formatted citations for the matched text
            passages (True) or just return the dictionary (False).

    Returns:
        A dictionary containing information about the top N most similar text passages,
        including cosine scores, page numbers, node ids, and node text.

    Raises:
        KeyError: If the specified `column_name` is not present in `nodes_df`.
    """

    if column_name not in nodes_df.columns:
        raise KeyError(f"Column '{column_name}' not found in the 'nodes_df'")

    query_embs = text_query_embedding(query)

    cosine_scores = nodes_df.apply(
        lambda row: get_cosine_score_lists(row, column_name, query_embs),
        axis=1,
    )

    top_n_indices = cosine_scores.nlargest(top_n).index.tolist()
    top_n_scores = cosine_scores.nlargest(top_n).values.tolist()

    final_text = {}

    for matched_textno, index in enumerate(top_n_indices):

        final_text[matched_textno] = {}

        final_text[matched_textno]["page_num"] = nodes_df.iloc[index]["page_label"]

        final_text[matched_textno]["cosine_score"] = top_n_scores[matched_textno]

        final_text[matched_textno]["node_id"] = nodes_df.iloc[index]["node_id"]

        final_text[matched_textno]["node_text"] = nodes_df["node_text"][index]

    if print_citation:
        print_text_to_text_citation(final_text)

    return final_text


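# Retrieval usage sketch (the column name and query are illustrative
# assumptions; nodes_df is expected to hold one embedding list per row):
#
#   hits = get_similar_text_from_query(
#       "How is the air handling unit controlled?",
#       nodes_df, column_name="node_embedding", top_n=3, print_citation=False)
#   print(hits[0]["page_num"], hits[0]["cosine_score"])

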
def llamaindex_doc_dict(doc: llama_index.schema.Document) -> dict:
    """
    Convert a LlamaIndex Document object to a dictionary.
    """

    doc_dict = {}

    doc_dict['doc_id'] = doc.doc_id

    mdict = doc.metadata

    doc_dict['page_label'] = mdict['page_label']
    doc_dict['file_name'] = mdict['file_name']
    doc_dict['file_path'] = mdict['file_path']
    doc_dict['file_type'] = mdict['file_type']

    doc_dict['file_title'] = mdict['file_title']
    doc_dict['file_date'] = mdict['file_date']
    doc_dict['file_subtitle'] = mdict['file_subtitle']
    doc_dict['table_of_content'] = mdict['table_of_content']

    doc_dict['text'] = doc.text

    return doc_dict


def llamaindex_docs_df(docs: List[llama_index.schema.Document]) -> pd.DataFrame:
    """
    Convert a list of LlamaIndex Document objects to a Pandas DataFrame, one row per document.
    """

    recs = []
    for doc in docs:
        recs.append(llamaindex_doc_dict(doc))

    return pd.DataFrame(recs)


def llamaindex_docs_from_path(path_input: str,
                              gemini_pro: genai.GenerativeModel) -> List[llama_index.schema.Document]:
    """
    Extract llama_index Documents from the file given by path_input
    INPUT: path_input: the path pointing to the file on disk
           gemini_pro: the gemini pro model for extracting document metadata
    OUTPUT: docs: llama_index Documents extracted from the file at path_input
    """

    docs = SimpleDirectoryReader(input_files=[path_input]).load_data()

    # Use up to the first two pages for metadata extraction (guards against
    # single-page files, which would previously raise an IndexError).
    first2pages = " ".join(doc.text for doc in docs[:2])

    metadata_extraction_sys_content = '''
    You are a helpful assistant focusing on extracting the metadata describing the input document.
    '''

    metadata_extraction_prompt = '''
    {}\n
    Please perform metadata extraction on the given text.
    Focus on the following metadata fields:
    title: what the document is about;
    date: when the document was created;
    subtitle: what specific content the document is about;
    table of content: section titles and their page numbers.
    Output NA if there is no value for a metadata field.
    Output the results in a dictionary.
    TEXT: ```{}```
    '''

    msg = metadata_extraction_prompt.format(metadata_extraction_sys_content, first2pages)

    response = gemini_pro.generate_content(msg)

    response_string = response.text.strip('`')

    try:
        extracted_meta_dict = json.loads(response_string)
    except json.decoder.JSONDecodeError:
        # Fall back to empty metadata when the model output is not valid JSON.
        extracted_meta_dict = {}

    for doc in tqdm(docs, total=len(docs), desc="Adding metadata to docs..."):
        # dict.get returns None for missing keys, matching the original if/else chains.
        doc.metadata['file_title'] = extracted_meta_dict.get('title')
        doc.metadata['file_date'] = extracted_meta_dict.get('date')
        doc.metadata['file_subtitle'] = extracted_meta_dict.get('subtitle')
        doc.metadata['table_of_content'] = extracted_meta_dict.get('table of content')

    return docs


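# End-to-end text-pipeline sketch for one PDF (a minimal example; the file
# path and output name are assumptions):
#
#   gemini_pro = genai.GenerativeModel("gemini-pro")
#   docs = llamaindex_docs_from_path("data/hvac_lecture_week1.pdf", gemini_pro)
#   docs_df = docs_to_df(docs, gemini_pro)   # titles, summaries, embeddings
#   docs_df.to_csv("docs_week1.csv", index=False)

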
def llamaindex_node_dict(node: llama_index.schema.TextNode) -> dict:
    """
    Convert a LlamaIndex TextNode object to a dictionary
    INPUT: node: a TextNode extracted from a parent document
    OUTPUT: a dictionary holding the node's information, including its source
            document and previous/next node relationships
    """

    node_dict = {}

    node_dict['node_id'] = node.node_id

    mdict = node.metadata

    node_dict['page_label'] = mdict['page_label']
    node_dict['file_name'] = mdict['file_name']
    node_dict['file_path'] = mdict['file_path']
    node_dict['file_type'] = mdict['file_type']

    node_dict['file_title'] = mdict['file_title']
    node_dict['file_date'] = mdict['file_date']
    node_dict['file_subtitle'] = mdict['file_subtitle']

    node_dict['node_text'] = node.text

    node_dict['start_char_idx'] = node.start_char_idx
    node_dict['end_char_idx'] = node.end_char_idx

    rdict = node.relationships

    if NodeRelationship.SOURCE in rdict.keys():
        node_dict['doc_id'] = rdict[NodeRelationship.SOURCE].node_id
    else:
        node_dict['doc_id'] = None

    if NodeRelationship.PREVIOUS in rdict.keys():
        node_dict['previous_node'] = rdict[NodeRelationship.PREVIOUS].node_id
    else:
        node_dict['previous_node'] = None

    if NodeRelationship.NEXT in rdict.keys():
        node_dict['next_node'] = rdict[NodeRelationship.NEXT].node_id
    else:
        node_dict['next_node'] = None

    return node_dict


def llamaindex_nodes_df(nodes: List[llama_index.schema.TextNode]) -> pd.DataFrame:
    """
    Convert a list of LlamaIndex TextNode objects to a Pandas DataFrame, one row per node.
    """

    recs = []
    for node in nodes:
        recs.append(llamaindex_node_dict(node))

    return pd.DataFrame(recs)


def node_text_title(text: str, model: genai.GenerativeModel) -> str:
    """
    Use gemini to generate a title for the input text.
    """

    prompt = '''
    Please summarize the given input text
    enclosed within the three backticks. Generate a short
    title for the text. Correct misspellings and syntactic errors.
    Output a short title string only.
    TEXT: ```{}```
    '''
    msg = prompt.format(text)

    response = model.generate_content(msg)

    return response.text


def pdf_extract_images(pdf_path: str, image_save_dir: str) -> None:
    """
    Given a PDF path, extract images from the PDF file and save them to disk
    INPUT: pdf_path: the path to the PDF file
           image_save_dir: the directory for storing the extracted images
    OUTPUT: None
    """

    fitz_docs = fitz.open(pdf_path)

    path_last_sep_idx = pdf_path.rfind("/")
    file_name = pdf_path[path_last_sep_idx + 1:]
    print("Processing the images from the pages of {}".format(file_name))

    for idx, fpage in tqdm(enumerate(fitz_docs), total=len(fitz_docs)):

        images = fpage.get_images()

        page_label = idx + 1

        for image_no, image in enumerate(images):

            xref = image[0]
            pix = fitz.Pixmap(fitz_docs, xref)

            image_name = f"{image_save_dir}/extracted_from_{file_name}_{page_label}_{image_no}_{xref}.jpeg"

            pix.save(image_name)


def pdf_images_description_df(pdf_path: str, docs_df_path: str, image_save_dir: str,
                              model: genai.GenerativeModel) -> pd.DataFrame:
    """
    Given a PDF path and the path to the DataFrame containing the metadata of the pages
    extracted from the PDF file, extract the metadata of images from the PDF file as a DataFrame
    INPUT: pdf_path: the path to the PDF file
           docs_df_path: the path to the DataFrame containing page metadata extracted from the PDF file
           image_save_dir: the directory for storing the extracted images
           model: a multimodal generative model (the original referenced an undefined
                  module-level gemini_pro_vision; it is now an explicit parameter)
    OUTPUT: a DataFrame containing the metadata of the extracted images
    """

    fitz_docs = fitz.open(pdf_path)

    doc_df = pd.read_csv(docs_df_path)

    image_description_prompt = """Explain what is going on in the image.
    If it's a table, extract all elements of the table.
    If it's a graph, explain the findings in the graph.
    Do not include any numbers that are not mentioned in the image:
    """

    dict_list = []

    path_last_sep_idx = pdf_path.rfind("/")
    file_name = pdf_path[path_last_sep_idx + 1:]
    print("Processing the images from the pages of {}".format(file_name))

    for idx, fpage in tqdm(enumerate(fitz_docs), total=len(fitz_docs)):

        images = fpage.get_images()

        page_label = idx + 1

        for image_no, image in enumerate(images):

            image_dict = {}

            xref = image[0]
            pix = fitz.Pixmap(fitz_docs, xref)

            image_name = f"{image_save_dir}/{file_name}_image_{page_label}_{image_no}_{xref}.jpeg"

            pix.save(image_name)

            image_for_gemini = Image.open(io.BytesIO(pix.tobytes("jpeg")))

            model_input = [image_description_prompt, image_for_gemini]

            response = model.generate_content(model_input)

            image_dict['image_id'] = image_no
            image_dict['image_name'] = image_name
            image_dict['page_label'] = page_label

            try:
                doc_page = doc_df[doc_df.page_label == page_label].iloc[0]

                image_dict['doc_id'] = doc_page['doc_id']
                image_dict['file_name'] = doc_page['file_name']
                image_dict['file_path'] = doc_page['file_path']
                image_dict['file_type'] = doc_page['file_type']
                image_dict['course_material_type'] = doc_page['course_material_type']
                image_dict['course_material_week'] = doc_page['course_material_week']

            except Exception:
                print("Some errors happened in the doc_page of the doc_df.")
                image_dict['doc_id'] = None
                image_dict['file_name'] = None
                image_dict['file_path'] = None
                image_dict['file_type'] = None
                image_dict['course_material_type'] = None
                image_dict['course_material_week'] = None

            try:
                image_dict['image_description'] = response.text
            except Exception:
                print("Some errors happened in the response from Gemini.")
                image_dict['image_description'] = None

            dict_list.append(image_dict)

            time.sleep(2)  # simple rate limiting between model calls

    return pd.DataFrame(dict_list)


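# Image-pipeline sketch (paths and model name are illustrative assumptions;
# this uses the explicit model parameter introduced above):
#
#   vision_model = genai.GenerativeModel("gemini-pro-vision")
#   images_df = pdf_images_description_df(
#       "data/hvac_lecture_week1.pdf", "docs_week1.csv", "images", vision_model)
#   images_df.to_csv("images_week1.csv", index=False)

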
class Color:
    """
    This class defines a set of color codes that can be used to print text in different colors.
    This will be used later to print citations and results to make outputs more readable.
    """

    PURPLE: str = "\033[95m"
    CYAN: str = "\033[96m"
    DARKCYAN: str = "\033[36m"
    BLUE: str = "\033[94m"
    GREEN: str = "\033[92m"
    YELLOW: str = "\033[93m"
    RED: str = "\033[91m"
    BOLD: str = "\033[1m"
    UNDERLINE: str = "\033[4m"
    END: str = "\033[0m"


def print_text_to_image_citation(
    final_images: Dict[int, Dict[str, Any]], print_top: bool = True
) -> None:
    """
    Prints a formatted citation for each matched image in a dictionary.

    Args:
        final_images: A dictionary containing information about matched images,
            with keys as image number and values as dictionaries containing
            image path, file name, cosine similarity score, and image description.
        print_top: A boolean flag indicating whether to only print the first citation (True) or all citations (False).

    Returns:
        None (prints formatted citations to the console).
    """

    color = Color()

    for imageno, image_dict in final_images.items():

        print(
            color.RED + f"Citation {imageno + 1}:",
            "Matched image path, file name and description: \n" + color.END,
        )

        print(color.BLUE + "score: " + color.END, image_dict["cosine_score"])

        print(color.BLUE + "path: " + color.END, image_dict["image_path"])

        print(color.BLUE + "file name: " + color.END, image_dict["file_name"])

        print(
            color.BLUE + "image description: " + color.END,
            image_dict["image_description"],
        )

        display_images([image_dict["image_object"]])

        if print_top and imageno == 0:
            break


def print_text_to_text_citation(
    final_text: Dict[int, Dict[str, Any]],
    print_top: bool = True,
) -> None:
    """
    Prints a formatted citation for each matched text passage in a dictionary.

    Args:
        final_text: A dictionary containing information about matched text passages,
            with keys as text number and values as dictionaries containing
            page number, cosine similarity score, node id, and node text.
        print_top: A boolean flag indicating whether to only print the first citation (True) or all citations (False).

    Returns:
        None (prints formatted citations to the console).
    """

    color = Color()

    for textno, text_dict in final_text.items():

        print(color.RED + f"Citation {textno + 1}:", "Matched text:" + color.END)

        print(color.BLUE + "score: " + color.END, text_dict["cosine_score"])

        print(color.BLUE + "page_number: " + color.END, text_dict["page_num"])

        print(color.BLUE + "node_id: " + color.END, text_dict["node_id"])
        print(color.BLUE + "node_text: " + color.END, text_dict["node_text"])
        print()

        if print_top and textno == 0:
            break


def sentence_df_triples_df(sentence_df: pd.DataFrame) -> pd.DataFrame:
    """
    Extract (subject, predicate, object) triples from the input sentence DataFrame
    INPUT: sentence_df: a DataFrame ('sent_id', 'node_id', 'course_material_type',
           'course_material_week', 'sent_text')
    OUTPUT: triple_df: a DataFrame (triple_id, sent_id, course_material_type, course_material_week,
            triples_to_process)
    """

    model = genai.GenerativeModel('gemini-pro')

    dict_list = []

    # The original guarded this loop with a counter whose condition
    # (count < len(sentence_df) + 1) was always true; it is dropped here.
    for idx, row in tqdm(sentence_df.iterrows(), total=len(sentence_df)):
        dict_list.append(sentence_triple_dict_list(row, model))

    return pd.DataFrame(dict_list)


def sentence_triple_dict_list(row: pd.Series, model) -> dict:
    """
    Extract (subject, predicate, object) triples from a row of a sentence dataframe
    INPUT: row: a row with the following columns: ('sent_id', 'node_id', 'course_material_type',
           'course_material_week', 'sent_text')
           model: llm model
    OUTPUT: a dictionary with the following keys: triple_id, sent_id,
            course_material_type, course_material_week, triples_to_process
    """

    triple_extraction_prompt = '''
    Please perform structured triple extraction on the given text enclosed within the
    three backticks.
    Convert the text into a set of (subject, predicate, object) triples.
    Treat a math expression or a block of programming statements as a single concept.
    Use the previous extraction text and results as context.
    Correct misspellings and syntactic errors.
    Don't summarize. Don't rewrite the original text. Don't decode the original text.
    Output the results as a set of ("subject":extracted subject, "predicate":extracted predicate,
    "object":extracted object). Don't add extra explanation to the results.
    TEXT: ```{}```
    '''

    asent = row['sent_text']

    msg = triple_extraction_prompt.format(asent)

    response = model.generate_content(msg)

    # Match either {...} or (...) groups in the model output.
    pattern = r'\{([^}]+)\}|\(([^)]+)\)'

    response_text = response.text

    matches = re.findall(pattern, response_text)

    # Normalize every match into a "{...}" string for downstream parsing.
    text_to_process = ["{" + match[0].strip() + "}" if match[0]
                       else "{" + match[1].strip() + "}" for match in matches if match[0] or match[1]]

    tri_dict = {}

    tri_dict['triple_id'] = row['sent_id'] + "_triples"
    tri_dict['sent_id'] = row['sent_id']
    tri_dict['course_material_type'] = row['course_material_type']
    tri_dict['course_material_week'] = row['course_material_week']

    tri_dict['triples_to_process'] = text_to_process

    return tri_dict


def split_nodes_sentences_df(nodes: List[llama_index.schema.TextNode]) -> pd.DataFrame:
    """
    Split the text of each node into sentences with spaCy.
    """

    recs = []

    # Assumes the en_core_web_sm model has been downloaded
    # (python -m spacy download en_core_web_sm).
    nlp = spacy.load('en_core_web_sm')

    for node in nodes:
        dict_list = split_nodeText_sentences_dict_list(nlp, node)
        recs.extend(dict_list)

    return pd.DataFrame(recs)


def split_nodeText_sentences_dict_list(nlp: Any, node: llama_index.schema.TextNode) -> list:
    """
    Split the text of the given TextNode into sentences
    INPUT: nlp: the spacy model
           node: a TextNode
    OUTPUT: a list of dictionaries each of which contains the information for a sentence.
    """

    dict_list = []

    node_text = node.text
    text_doc = nlp(node_text)
    text_sentences = list(text_doc.sents)

    for idx, sent in enumerate(text_sentences):

        order = idx + 1

        sent_dict = {}
        sent_dict['sent_id'] = node.node_id + "_sent" + str(order)

        sent_dict['node_id'] = node.node_id

        mdict = node.metadata

        sent_dict['course_material_type'] = mdict['course_material_type']
        sent_dict['course_material_week'] = mdict['course_material_week']

        # Store the plain string, not the spaCy Span object.
        sent_dict['sent_text'] = sent.text

        dict_list.append(sent_dict)

    return dict_list


def text_keyconcepts(text: str, model: genai.GenerativeModel) -> List[str]:
    """
    Use gemini to extract a list of key learning concepts from the input text.
    """

    prompt = '''
    You are an expert AI assistant trained on extracting key concepts from the text.
    Please analyze the following material.
    Extract the key concepts that can be used to find related materials.
    Output the results as a list of key concepts only. Only keywords in the output list.
    No definitions. Separate the keywords by comma.
    TEXT: ```{}```
    '''

    msg = prompt.format(text)

    response = model.generate_content(msg)

    input_string = response.text

    # Split on newlines and commas, stripping list-bullet dashes and whitespace.
    items_list = [item.strip('-').strip() for item in re.split(r'[\n,]', input_string) if item]

    return items_list


def text_query_embedding(query: str) -> List[float]:
    """
    Use Gemini to embed the given query with the retrieval_query task type
    INPUT: query: str
    OUTPUT: embedding as a list of numbers
    """
    embedding = genai.embed_content(model="models/embedding-001",
                                    content=query,
                                    task_type="retrieval_query")

    return embedding['embedding']


def text_questions_answered(text: str, model: genai.GenerativeModel) -> str:
    """
    Use gemini to extract a set of questions that can be answered by the input text.
    """

    prompt = '''
    You are an expert AI assistant trained on creating a list of specific,
    answerable questions that can be extracted from input text enclosed within the three backticks.
    Identify the most pertinent questions that could be asked based on its content.
    Compose these questions in a clear and concise manner, ensuring they directly
    align with the information presented in the text. Output the results in JSON format.
    TEXT: ```{}```
    '''

    msg = prompt.format(text)

    response = model.generate_content(msg)

    return response.text


def text_retrieval_document_embedding(text: str, title: str) -> List[float]:
    """
    Use Gemini to embed the given text and title with the retrieval_document task type
    INPUT: text: str
           title: str
    OUTPUT: embedding as a list of numbers
    """
    embedding = genai.embed_content(model="models/embedding-001",
                                    content=text,
                                    task_type="retrieval_document",
                                    title=title)

    return embedding['embedding']


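# Embedding usage sketch: documents are embedded with the retrieval_document
# task type (plus a title) and queries with retrieval_query, so the two sides
# of the dot-product scoring match. The text values here are illustrative.
#
#   doc_emb = text_retrieval_document_embedding("Chillers remove heat...", "Chillers")
#   query_emb = text_query_embedding("How does a chiller work?")
#   score = round(np.dot(np.array(doc_emb), np.array(query_emb)), 2)

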
def text_semantic_triples(text: str, model: genai.GenerativeModel) -> str:
    """
    Use gemini to extract a set of semantic triples from the input text.
    """

    prompt = '''
    You are an expert AI assistant trained on extracting semantic triples from the given
    text enclosed within the three backticks.
    Generate a set of (subject, predicate, object) triples for the identified relationships.
    Correct misspellings and syntactic errors.
    Don't summarize. Don't rewrite the original text. Don't decode the original text.
    Output the results as JSON format. Don't add extra explanation to the results.
    TEXT: ```{}```
    '''

    msg = prompt.format(text)

    response = model.generate_content(msg)

    return response.text


def text_summary(text: str, model: genai.GenerativeModel) -> str:
    """
    Use gemini to generate a summary of the input text.
    """

    prompt = '''
    You are an expert AI summarization assistant and ready to condense any text into a
    clear and concise overview. Please help me summarize the text within the backticks below.
    Please extract the key topics and concepts. Plus, please ensure there are no typos or
    grammatical errors in the summary. The summary will be used as surrounding context of additional
    content to answer specific questions.
    TEXT: ```{}```
    '''
    msg = prompt.format(text)

    response = model.generate_content(msg)

    return response.text