Nechba committed on
Commit
26f42da
·
verified ·
1 Parent(s): ccb6cf3

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +76 -0
utils.py CHANGED
@@ -32,7 +32,83 @@ def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
32
  images.append(img)
33
  return images
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def process_local_pdf(pdf_bytes: bytes):
37
  """
38
  Process a local PDF file with Gemini AI.
 
32
  images.append(img)
33
  return images
34
 
35
def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
    """Convert PDF to PIL Images using PyMuPDF (no poppler needed).

    Args:
        pdf_bytes: Raw contents of a PDF file.

    Returns:
        One "RGB" PIL image per page, in the order the pages are iterated.
    """
    images = []
    # Use the document as a context manager so it is closed even when
    # rendering one of the pages raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap()
            # Image.frombytes copies the pixel buffer, so the resulting images
            # remain valid after the document has been closed.
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images
44
+
45
def pil_image_to_bytes(pil_image: Image.Image) -> bytes:
    """Convert a PIL image to JPEG bytes.

    Args:
        pil_image: Image to encode. Images whose mode the JPEG writer cannot
            store (for example "RGBA" or "P") are converted to "RGB" first.

    Returns:
        The JPEG-encoded image data.
    """
    # Modes the JPEG encoder accepts as-is; anything else (alpha, palette, ...)
    # would make save() raise, so fall back to plain RGB for those.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if pil_image.mode not in jpeg_modes:
        pil_image = pil_image.convert("RGB")

    with io.BytesIO() as output:
        pil_image.save(output, format="JPEG")
        # Read the buffer before the context manager closes it.
        return output.getvalue()
50
+
51
def analyze_pdf_images_with_gemini(pdf_bytes):
    """Render a PDF to page images and ask Gemini to review them.

    Every page is rendered with ``pdf_to_images``, encoded as JPEG with
    ``pil_image_to_bytes`` and sent, together with a fixed review prompt, to
    the ``gemini-2.0-flash`` model in a single request.

    Args:
        pdf_bytes: Raw contents of the PDF file to analyze.

    Returns:
        The ``text`` attribute of the model response. The prompt asks the
        model to answer with CSV only, but the reply is returned unvalidated.

    Raises:
        RuntimeError: If neither ``GEMINI_API_KEY`` nor ``GOOGLE_API_KEY`` is
            set in the environment.
    """
    # Local import keeps this block self-contained; the file's import header
    # is not visible from here.
    import os

    question = """Can you let me know that signatures and initials are missing and if certain checkboxes are checked prompt me with a secondary question? and also determine seller 1 seller 2 buyer 1 buyer 2 ... , WITH MAXIMUM details like Signature line for ....
can we also add line item?
the last part would be remember certain check boxes need extra paper work and so it needs to let me know that the file has it or missing it is that possible
give me the results in CSV FORM and the columns are as follow
1. **Category:** This column groups the findings or checklist items based on their nature or the type of issue. Examples include:
* `Parties`: Information about the buyers and sellers involved.
* `Missing Item...`: Indicates required signatures, initials, or information that is absent from the specified document.
* `Checked B...`: Refers to items where a specific checkbox (likely designated as "Box B" or similar in the original form, though here it just seems to indicate *a* box was checked) has been marked, signifying a particular choice or agreement.
* `Conflict`: Highlights inconsistencies or contradictory clauses found within the documents.

2. **Location:** This column specifies exactly *where* in the transaction documents the item being discussed can be found. It typically includes:
* The name of the document (e.g., `Sale Contract`, `Addendum`, `Release Agreement`, `Exclusive Buyer Agency`, `Lead Based Paint`, `In Present Condition Addendum`).
* An internal reference, possibly to an image scan (`Image X`).
* The page number within that document (`Pg Y`).

3. **Line Item(s):** This provides more specific location detail *within* the page mentioned in the "Location" column. It can be:
* A page number (often repeating the one from "Location").
* Specific line numbers or a range of lines (e.g., `725-726`, `10 & 22`, `696; 697`).
* Sometimes blank if the item applies to the whole page or a general section.

4. **Item Type:** This column identifies the *specific element* or clause being reviewed or checked on that line. Examples:
* Identification of parties (`4 Seller 1`, `6 Buyer 2`). The numbers might be internal codes or refer to lines on the original form.
* Signature or initial fields (`Seller Signature Block`, `Bottom Right Initials`, `Buyer Signature`).
* Specific clauses or addenda being checked for presence/selection (`Home Warranty Waiver`, `Seller's Disclosure Addendum Attached`, `Inspection Clause 1`).
* Specific choices within clauses (`Brokerage Consents`, `Seller Knowledge (Presence)`).

5. **Status:** This column indicates the outcome of the review for the specific `Item Type`.
* `Identified`: The required information (like a party's name) was found.
* `Missing`: The required item (like a signature, initial, or checked box) was not found.
* `Checked`: A specific checkbox related to the `Item Type` was found to be marked (ticked).
* `Conflict Detected`: An inconsistency or contradiction was found relating to this item.

6. **Details:** This column provides specific information elaborating on the `Status`.
* If `Identified`, it shows the name or value found (e.g., `VB ONE LLC`, `Dimas Miguel Guerra Pena`).
* If `Missing`, it describes more precisely what is missing (e.g., `Entire block for Seller... is empty`, `Two initial boxes for Seller... are empty`).
* If `Checked`, it often shows the text associated with the checked box or confirms the selection (e.g., `[X] BUYER waives the opportunity...`, `[X] Seller's Disclosure and Condition...`).
* If `Conflict Detected`, it might briefly describe the nature of the conflict (e.g., referencing conflicting checked boxes).

7. **Secondary Question (if applicable):** This column raises follow-up questions, potential issues, or points needing clarification based on the finding in the row. It highlights risks or implications that need to be addressed.
GIVE ME IN CSV FORM DIRECTLY DONT WRITE ANYTHING ELSE
"""

    # SECURITY: the API key must never live in source control. Read it from
    # the environment instead. The key that used to be committed on this line
    # should be treated as compromised and revoked.
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Gemini API key not configured: set the GEMINI_API_KEY "
            "(or GOOGLE_API_KEY) environment variable."
        )

    # Initialize Gemini client
    client = genai.Client(api_key=api_key)

    # The prompt goes first, followed by one JPEG part per rendered page.
    contents = [question]
    contents.extend(
        types.Part.from_bytes(data=pil_image_to_bytes(img), mime_type="image/jpeg")
        for img in pdf_to_images(pdf_bytes)
    )

    # Generate content using Gemini
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=contents,
    )

    return response.text
111
+
112
  def process_local_pdf(pdf_bytes: bytes):
113
  """
114
  Process a local PDF file with Gemini AI.