Spaces:
Sleeping
Sleeping
import google.generativeai as genai | |
# SECURITY: an API key was previously hard-coded on this line. A key that has
# been committed to source must be treated as compromised and revoked; the
# key is now taken from the environment instead of the source file.
import os

_env_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if _env_api_key:
    # Only configure at import time when a key is actually available;
    # otherwise callers are expected to configure the client explicitly.
    genai.configure(api_key=_env_api_key)
import os | |
import pandas as pd | |
import io | |
import tempfile | |
from PyPDF2 import PdfReader | |
import re | |
import csv | |
from PIL import Image | |
import os | |
import pandas as pd | |
import io | |
import tempfile | |
from PyPDF2 import PdfReader | |
from pdf2image import convert_from_bytes | |
def configure_gemini(api_key: str):
    """Point the Gemini client at the given API key.

    Args:
        api_key: Key forwarded unchanged to ``genai.configure``.
    """
    genai.configure(api_key=api_key)
def pdf_to_images(pdf_bytes: bytes) -> list:
    """Render a PDF, supplied as raw bytes, into a list of PIL images.

    Args:
        pdf_bytes: The complete PDF document as bytes.

    Returns:
        Whatever ``convert_from_bytes`` produces for the document.
    """
    rendered_pages = convert_from_bytes(pdf_bytes)
    return rendered_pages
def analyze_single_document(
    images: list,
    prompt: str,
    model_name: str = "gemini-2.0-flash-thinking-exp-01-21",
) -> str:
    """Send a prompt plus document images to a Gemini model and return its reply.

    Args:
        images: Images of the document; they are appended after the prompt
            in the content list passed to the model.
        prompt: Instruction text placed first in the request.
        model_name: Name of the Gemini model to use. Defaults to the model
            this function has always used, so existing callers are unaffected.

    Returns:
        The ``text`` attribute of the model response (a string, not a dict).

    Errors raised by the Gemini client propagate to the caller.
    """
    model = genai.GenerativeModel(model_name)
    # The prompt goes first, followed by every image, in a single request.
    response = model.generate_content([prompt, *images])
    return response.text
def analyze_pdf_directly(pdf_bytes: bytes, prompt: str, model_name: str = "gemini-1.5-pro") -> str:
    """Analyze a PDF by uploading it to Gemini and asking the model about it.

    The bytes are written to a temporary ``.pdf`` file, uploaded through
    ``genai.upload_file`` and sent to the model together with the prompt.

    Args:
        pdf_bytes: The complete PDF document as bytes.
        prompt: Instruction text placed before the uploaded file.
        model_name: Name of the Gemini model to use.

    Returns:
        The ``text`` attribute of the model response.

    Errors raised while writing, uploading or generating propagate to the
    caller; the temporary file is removed in every case.
    """
    model = genai.GenerativeModel(model_name)
    tmp_file_path = None
    uploaded_file = None
    try:
        # delete=False so the file can be re-opened by path for the upload;
        # it is created inside the try so a failed write is still cleaned up.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            tmp_file_path = tmp_file.name
            tmp_file.write(pdf_bytes)

        uploaded_file = genai.upload_file(tmp_file_path)
        response = model.generate_content([prompt, uploaded_file])
        return response.text
    finally:
        # Best-effort removal of the uploaded copy so documents do not linger
        # in remote file storage; a cleanup failure must not mask the result.
        if uploaded_file is not None:
            try:
                genai.delete_file(uploaded_file.name)
            except Exception as cleanup_error:
                print(f"Could not delete uploaded file: {cleanup_error}")
        # Clean up the local temporary file.
        if tmp_file_path and os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)
def extract_response_text(response) -> str:
    """Extract the text content from a Gemini response-like object.

    Lookup order:
      1. ``response.text`` when that attribute exists.
      2. Otherwise the text of the first entry in
         ``response.result.candidates`` whose ``content.parts`` carry text;
         all non-empty text parts of that candidate are concatenated so a
         multi-part answer is not truncated to its first part.
      3. Otherwise ``str(response)``.

    Args:
        response: A response object, or any other value (plain strings are
            returned unchanged via ``str``).

    Returns:
        The extracted text. This is deliberately best-effort: any error
        during extraction is printed and ``str(response)`` is returned.
    """
    try:
        if hasattr(response, 'text'):
            return response.text

        result = getattr(response, 'result', None)
        for candidate in getattr(result, 'candidates', None) or []:
            content = getattr(candidate, 'content', None)
            parts = getattr(content, 'parts', None) or []
            # Skip parts without text (or with empty text) instead of
            # returning the first part blindly.
            texts = [part.text for part in parts if getattr(part, 'text', None)]
            if texts:
                return ''.join(texts)

        return str(response)
    except Exception as e:
        print(f"Error extracting response text: {str(e)}")
        return str(response)
def extract_csv_from_response(response) -> str:
    """Extract CSV data from a Gemini response.

    Strategy:
      1. Return the stripped contents of the first fenced block labelled
         ``csv`` (the label is matched case-insensitively).
      2. Otherwise collect lines starting at the first line that contains a
         comma and either ``Category,`` or ``Location,``, stopping at a
         closing code fence so the fence and any trailing commentary are not
         mistaken for CSV rows.
      3. Otherwise return the full response text.

    Args:
        response: A response object or string accepted by
            ``extract_response_text``.

    Returns:
        The extracted CSV text, or the whole response text when nothing
        CSV-like is found. On an unexpected error the error is printed and
        the raw response text (or ``str(response)``) is returned.
    """
    try:
        # Get the text content from the response
        response_text = extract_response_text(response)

        # Preferred path: an explicit ```csv ... ``` fenced block.
        fenced = re.search(r'```csv(.*?)```', response_text, re.DOTALL | re.IGNORECASE)
        if fenced:
            return fenced.group(1).strip()

        # Fallback: heuristically locate a header row and take what follows.
        lines = []
        in_csv = False
        for line in response_text.split('\n'):
            if in_csv and line.strip().startswith('```'):
                # A fence after the data marks the end of the CSV section.
                break
            if ',' in line and ('Category,' in line or 'Location,' in line):
                in_csv = True
            if in_csv:
                lines.append(line)
        if lines:
            return '\n'.join(lines)

        return response_text  # Return full response if no CSV found
    except Exception as e:
        print(f"Error extracting CSV: {str(e)}")
        return response.text if hasattr(response, 'text') else str(response)
def csv_to_dataframe(csv_data: str) -> pd.DataFrame:
    """Convert a CSV string to a pandas DataFrame, tolerating ragged rows.

    The first non-blank line is the header. Each following row is normalised
    to the header width:
      * rows with too many fields have the surplus joined (with commas) into
        the last column;
      * rows with too few fields are padded with ``None`` so they stay on the
        same parsing path as well-formed rows instead of forcing the
        type-inferring pandas fallback.

    Args:
        csv_data: CSV text. ``None``, empty and whitespace-only input yield
            an empty DataFrame.

    Returns:
        A DataFrame whose cells are strings (``None`` for padded cells). If
        the primary parser fails, ``pd.read_csv`` is tried; if that fails as
        well an empty DataFrame is returned. Errors are printed, not raised.
    """
    if not csv_data or not csv_data.strip():
        return pd.DataFrame()

    # Clean line breaks and extra spaces. Done outside the try block so the
    # fallback below can always rely on ``cleaned_data`` being defined.
    cleaned_data = "\n".join(
        line.strip() for line in csv_data.split('\n') if line.strip()
    )

    try:
        # Use CSV reader to handle irregular fields
        reader = csv.reader(io.StringIO(cleaned_data),
                            delimiter=',',
                            quotechar='"',
                            skipinitialspace=True)
        header = next(reader)
        width = len(header)

        rows = []
        for row in reader:
            if len(row) > width:
                # Combine extra fields into the last column
                row = row[:width - 1] + [','.join(row[width - 1:])]
            elif len(row) < width:
                # Pad short rows so every row matches the header width.
                row = row + [None] * (width - len(row))
            rows.append(row)

        return pd.DataFrame(rows, columns=header)
    except Exception as e:
        print(f"CSV conversion error: {str(e)}")
        try:
            # Fallback to pandas with flexible parsing
            return pd.read_csv(io.StringIO(cleaned_data),
                               on_bad_lines='warn',
                               engine='python',
                               quotechar='"',
                               skipinitialspace=True)
        except Exception as fallback_error:
            print(f"Fallback conversion failed: {str(fallback_error)}")
            return pd.DataFrame()
def save_csv(csv_data: str, filename: str) -> str:
    """Write CSV text to ``filename`` and hand the path back.

    Leading and trailing whitespace is stripped from the text before it is
    written. The file is opened as UTF-8 with ``newline=''`` so line endings
    in the data are written through untranslated.

    Args:
        csv_data: CSV text to store.
        filename: Destination path; an existing file is overwritten.

    Returns:
        The ``filename`` argument, unchanged.
    """
    with open(filename, mode='w', encoding='utf-8', newline='') as out:
        trimmed = csv_data.strip()
        out.write(trimmed)
    return filename
def get_pdf_metadata(pdf_bytes: bytes) -> dict:
    """Read basic facts about a PDF supplied as raw bytes.

    Args:
        pdf_bytes: The complete PDF document as bytes.

    Returns:
        A dict with the keys ``page_count``, ``author`` and ``title``. The
        last two are ``None`` when the reader reports no metadata.
    """
    pdf = PdfReader(io.BytesIO(pdf_bytes))
    page_count = len(pdf.pages)

    info = pdf.metadata
    if info:
        author = info.author
        title = info.title
    else:
        author = None
        title = None

    return {
        'page_count': page_count,
        'author': author,
        'title': title
    }