File size: 5,967 Bytes
e46379d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import os
from tqdm import tqdm
import numpy as np
from transformers import ViTModel, ViTFeatureExtractor, ViTImageProcessor
from PIL import Image
import re
from fpdf import FPDF
from datetime import datetime
import fitz
import joblib
import json

# Load the pretrained Vision Transformer and its matching image processor once,
# at import time. Both are module-level objects shared by the embedding
# functions in this file (single_unit_embedding / single_image_embedding).
# NOTE(review): from_pretrained presumably fetches or reads cached weights for
# this checkpoint, so importing this module is not side-effect free -- confirm.
model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')

def create_pdf(input_text):
    """
    Render text into a PDF file under ``temp/PDFs`` and return its path.

    The text is written with 10pt Arial through ``multi_cell`` using the full
    page width and a line height of 5.

    The file name combines a microsecond-resolution timestamp with a short
    random suffix. A second-resolution timestamp alone is not unique: two
    calls within the same second would write to the same path and the later
    call would overwrite the earlier PDF.

    Args:
        input_text (str): Text to place in the PDF.
    Returns:
        str: Relative path of the PDF that was written.
    """
    # Local import: only needed to make the output file name collision-proof.
    import uuid

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=10)

    # Width 0 lets the cell extend to the right margin; long text is wrapped.
    pdf.multi_cell(0, 5, txt=input_text)

    # Timestamp keeps files sortable by creation time; the random suffix
    # guarantees distinct names even for calls in the same microsecond.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    file_name = f"temp/PDFs/{timestamp}_{uuid.uuid4().hex[:8]}.pdf"

    # Create the output directory if it doesn't exist
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    pdf.output(file_name)
    return file_name

def pdf_to_image(pdf_path, zoom=2.0):
    """
    Render every page of a PDF to a PNG file under ``temp/Images``.

    Each image is named ``<pdf file name>_page_<page index>.png``, with page
    indices starting at 0.

    Args:
        pdf_path (str): Path of the PDF to render.
        zoom (float): Scale factor applied on both axes when rendering;
            values above 1 produce larger, sharper images.
    Returns:
        list[str]: Paths of the written images, in page order.
    """
    image_paths = []

    # Open the document in a context manager so its file handle is released
    # even if rendering or saving a page raises.
    with fitz.open(pdf_path) as pdf_document:
        # Create the 'Images' directory if it doesn't exist
        os.makedirs("temp/Images", exist_ok=True)

        # Both values are the same for every page, so build them once.
        mat = fitz.Matrix(zoom, zoom)
        base_name = os.path.basename(pdf_path)

        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(matrix=mat)  # render with the zoom applied

            image_file = f'temp/Images/{base_name}_page_{page_num}.png'
            pix.save(image_file)
            image_paths.append(image_file)

    return image_paths

def sanitize_text(text):
    """
    Strip everything except ASCII letters, digits and whitespace from a string,
    then squeeze whitespace runs into single spaces and trim both ends.

    Values that are not strings are handed back untouched.

    Args:
        text (str): Text to sanitize.
    Returns:
        str: Sanitized text (or the original value when it is not a string).
    """
    if not isinstance(text, str):
        return text

    # Drop every character that is neither alphanumeric nor whitespace.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse each whitespace run to one space.
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip()

def text_to_images(text):
    """
    Turn a piece of text into page images.

    The text is sanitized, written to a PDF, and the PDF is rendered page by
    page.

    Args:
        text (str): Text to render.
    Returns:
        list[str]: Paths of the rendered page images.
    """
    return pdf_to_image(create_pdf(sanitize_text(text)))

def documents_to_images(path):
    """
    Render every regular file directly inside a directory to page images.

    All files are read first (text mode, platform default encoding); only
    afterwards is each document's content converted to images. Subdirectories
    and other non-file entries are skipped.

    Args:
        path (str): Directory containing the document files.
    Returns:
        list[list[str]]: One list of image paths per document, in the order
        the files were listed.
    """
    contents = []
    for entry in os.listdir(path):
        candidate = os.path.join(path, entry)
        if not os.path.isfile(candidate):
            continue
        with open(candidate, "r") as handle:
            contents.append(handle.read())

    return [text_to_images(content) for content in contents]

def single_unit_embedding(text):
    """
    Embed a piece of text by rendering it to page images and averaging the
    vision-model embedding of every page.

    Each page is embedded with ``single_image_embedding`` so that page and
    stand-alone image embeddings are always computed the same way.

    Args:
        text (str): Text to embed.
    Returns:
        numpy.ndarray: Element-wise mean of the per-page embedding arrays.
        Presumably of shape (1, hidden_size) -- TODO confirm against the model.
    """
    # Local import: torch is only needed here to switch off gradient tracking.
    import torch

    image_paths = text_to_images(text)
    page_vectors = []
    # Inference only: no autograd graph is needed for the forward passes.
    with torch.no_grad():
        for image_path in image_paths:
            # The context manager closes the image file once it is embedded.
            with Image.open(image_path) as image:
                page_vectors.append(single_image_embedding(image))
    return np.mean(np.array(page_vectors), axis=0)

def single_image_embedding(image):
    """
    Embed one image with the module-level vision model.

    The image is preprocessed with the module-level ``processor``, passed
    through ``model``, and the last hidden state is averaged over dimension 1.

    Args:
        image: Image accepted by the processor (e.g. a PIL image).
    Returns:
        numpy.ndarray: Mean-pooled last hidden state. Presumably of shape
        (1, hidden_size) for a single image -- TODO confirm.
    """
    # Local import: torch is only needed here to switch off gradient tracking.
    import torch

    inputs = processor(images=image, return_tensors="pt")
    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    vector = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return vector

def documents_to_vision_embeddings(documents):
    """
    Compute the vision embedding of every document, showing a progress bar.

    Args:
        documents: Iterable of document texts.
    Returns:
        list: One embedding per document, in input order.
    """
    return [single_unit_embedding(document) for document in tqdm(documents)]

def queries_to_vision_embeddings(queries):
    """
    Compute the vision embedding of every query, showing a progress bar.

    Args:
        queries: Iterable of query texts.
    Returns:
        list: One embedding per query, in input order.
    """
    return [single_unit_embedding(query) for query in tqdm(queries)]

def get_documents_from_scores(scores):
    """
    Extract the first element of every score entry.

    Args:
        scores: Iterable of indexable entries, e.g. (document index, score)
            pairs.
    Returns:
        list: The first element of each entry, in the same order.
    """
    return [entry[0] for entry in scores]

def cosine_similarity(v1, v2):
    """
    Return the cosine similarity of two vectors.

    Args:
        v1: First vector (anything ``np.array`` accepts).
        v2: Second vector (anything ``np.array`` accepts).
    Returns:
        The dot product divided by the product of the norms, or 0 when either
        vector has zero norm.
    """
    a = np.array(v1)
    b = np.array(v2)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)

    # A zero-length vector has no direction; report no similarity.
    if norm_a == 0 or norm_b == 0:
        return 0
    return np.dot(a, b) / (norm_a * norm_b)

def vision_rankings(query_embedding, document_embeddings, k):
    """
    Rank documents by cosine similarity to a query embedding.

    Only the first row (index 0) of the query embedding and of each document
    embedding is compared.

    Args:
        query_embedding: Query embedding; its element 0 is the query vector.
        document_embeddings: Sequence of document embeddings; element 0 of
            each is the document vector.
        k: Number of top results to keep.
    Returns:
        tuple: ``(rankings, scores)`` where ``scores`` is the list of the k
        best ``(document index, similarity)`` pairs in descending similarity
        order and ``rankings`` holds just their document indices.
    """
    similarities = [
        (doc_idx, cosine_similarity(query_embedding[0], doc_embedding[0]))
        for doc_idx, doc_embedding in enumerate(document_embeddings)
    ]
    # Stable sort: documents with equal similarity keep their index order.
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    top_scores = similarities[:k]
    return get_documents_from_scores(top_scores), top_scores


def vision_pipeline(
    query,
    document_embeddings_path="Retrieval/savedModels/document-vision-embeddings.json",
    ids_path="Retrieval/savedModels/ids.pkl",
    k=100,
):
    """
    Retrieve the ids of the k documents whose vision embeddings are most
    similar to the query's vision embedding.

    Args:
        query (str): Query text to embed and search with.
        document_embeddings_path (str): JSON file holding the stored document
            embeddings.
        ids_path (str): File loaded with ``joblib.load`` that maps a document's
            position in the embeddings list to its id.
        k (int): Number of results to return.
    Returns:
        list: Document ids in descending similarity order.
    """
    ids = joblib.load(ids_path)

    with open(document_embeddings_path, "r") as f:
        stored_embeddings = json.load(f)

    # JSON yields nested lists; convert each one back to a NumPy array.
    document_vision_embeddings = [
        np.array(stored) for stored in tqdm(stored_embeddings)
    ]
    print("loaded embeddings")

    query_embedding = single_unit_embedding(query)
    rankings, _ = vision_rankings(query_embedding, document_vision_embeddings, k)
    return [ids[position] for position in rankings]