raghuv-aditya's picture
Upload 18 files
e46379d verified
raw
history blame
5.97 kB
import os
from tqdm import tqdm
import numpy as np
from transformers import ViTModel, ViTFeatureExtractor, ViTImageProcessor
from PIL import Image
import re
from fpdf import FPDF
from datetime import datetime
import fitz
import joblib
import json
model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
def create_pdf(input_text):
# Create instance of FPDF class
pdf = FPDF()
# Add a page
pdf.add_page()
# Set font
pdf.set_font("Arial", size=10)
# Split the input text into multiple lines if necessary
# This ensures that the text fits the page and multiple pages are handled
pdf.multi_cell(0, 5, txt=input_text)
# Create a unique file name with the current time
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
file_name = f"temp/PDFs/{timestamp}.pdf"
# Create output directory if it doesn't exist
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# Save the PDF
pdf.output(file_name)
# Return the file path
return file_name
def pdf_to_image(pdf_path, zoom=2.0):
# Open the PDF file
pdf_document = fitz.open(pdf_path)
# Create a list to store image paths
image_paths = []
# Create an 'Images' directory if it doesn't exist
os.makedirs("temp/Images", exist_ok=True)
# Iterate over PDF pages and convert each to an image
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num) # Load the page
# Set zoom level to improve quality
mat = fitz.Matrix(zoom, zoom) # Create a transformation matrix with the zoom level
pix = page.get_pixmap(matrix=mat) # Render the page to an image with the specified zoom
image_file = f'temp/Images/{os.path.basename(pdf_path)}_page_{page_num}.png'
pix.save(image_file) # Save the image as PNG
image_paths.append(image_file)
# Return the list containing paths of all images
return image_paths
def sanitize_text(text):
"""
Cleans and standardizes text by keeping only alphanumeric characters and spaces.
Args:
text (str): Text to sanitize.
Returns:
str: Sanitized text.
"""
if isinstance(text, str):
# Use regex to keep only alphanumeric characters and spaces
text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
# Optionally, collapse multiple spaces into a single space
text = re.sub(r'\s+', ' ', text).strip()
return text
def text_to_images(text):
text = sanitize_text(text)
pdf_path = create_pdf(text)
image_paths = pdf_to_image(pdf_path)
return image_paths
def documents_to_images(path):
document_set = []
for filename in os.listdir(path):
file_path = os.path.join(path, filename)
if os.path.isfile(file_path):
with open(file_path, "r") as f:
content = f.read()
document_set.append(content)
document_image_paths = []
for document in document_set:
image_paths = text_to_images(document)
document_image_paths.append(image_paths)
return document_image_paths
def single_unit_embedding(text):
image_paths = text_to_images(text)
temp = []
for image_path in image_paths:
image = Image.open(image_path)
inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)
vector = outputs.last_hidden_state.mean(dim=1).detach().numpy()
temp.append(vector)
return np.mean(np.array(temp), axis=0)
def single_image_embedding(image):
inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)
vector = outputs.last_hidden_state.mean(dim=1).detach().numpy()
return vector
def documents_to_vision_embeddings(documents):
document_vision_embeddings = []
for document in tqdm(documents):
vector = single_unit_embedding(document)
document_vision_embeddings.append(vector)
return document_vision_embeddings
def queries_to_vision_embeddings(queries):
query_vision_embeddings = []
for query in tqdm(queries):
vector = single_unit_embedding(query)
query_vision_embeddings.append(vector)
return query_vision_embeddings
def get_documents_from_scores(scores):
rankings = []
for score in scores:
rankings.append(score[0])
return rankings
def cosine_similarity(v1, v2):
v1 = np.array(v1)
v2 = np.array(v2)
if(np.linalg.norm(v1) != 0 and np.linalg.norm(v2) != 0):
sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
else:
sim = 0
return sim
def vision_rankings(query_embedding, document_embeddings, k):
# query_embedding = single_unit_embedding(query)
scores = []
for idx, embedding in enumerate(document_embeddings):
scores.append((idx, cosine_similarity(query_embedding[0], embedding[0])))
scores = sorted(scores, key=lambda x: x[1], reverse=True)
scores = scores[:k]
rankings = get_documents_from_scores(scores)
return rankings, scores
def vision_pipeline(query, document_embeddings_path="Retrieval/savedModels/document-vision-embeddings.json", ids_path="Retrieval/savedModels/ids.pkl", k=100):
# document_embeddings = joblib.load(document_embeddings_path)
ids = joblib.load(ids_path)
with open(document_embeddings_path, "r") as f:
document_vision_embeddings2 = json.load(f)
document_vision_embeddings = []
for embedding in tqdm(document_vision_embeddings2):
document_vision_embeddings.append(np.array(embedding))
print("loaded embeddings")
query_embedding = single_unit_embedding(query)
rankings, scores = vision_rankings(query_embedding, document_vision_embeddings, k)
rankings2 = []
for ranking in rankings:
rankings2.append(ids[ranking])
return rankings2