Spaces:

Nechba
/

cvs-generation-using-images

Sleeping

App Files Files Community

cvs-generation-using-images / utils.py

Nechba

Update utils.py

37a0914 verified 3 months ago

raw

history blame

5.35 kB

	import google.generativeai as genai
	import os

	# SECURITY: credentials must never be committed to source control. The key is
	# read from the GOOGLE_API_KEY environment variable instead; any key that was
	# previously hard-coded on this line should be treated as leaked and revoked.
	# configure_gemini() below can still be used to set a key explicitly at runtime.
	genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
	import os
	import pandas as pd
	import io
	import tempfile
	from PyPDF2 import PdfReader
	import re
	import csv
	from PIL import Image
	import os
	import pandas as pd
	import io
	import tempfile
	from PyPDF2 import PdfReader
	from pdf2image import convert_from_bytes

	def configure_gemini(api_key: str):
	    """Hand the supplied API key to the Gemini client library.

	    Args:
	        api_key: Key forwarded unchanged to ``genai.configure``.
	    """
	    genai.configure(api_key=api_key)

	def pdf_to_images(pdf_bytes: bytes) -> list:
	    """Render an in-memory PDF through ``convert_from_bytes``.

	    Args:
	        pdf_bytes: Raw bytes of the PDF document.

	    Returns:
	        Whatever ``convert_from_bytes`` produces for the document.
	    """
	    rendered_pages = convert_from_bytes(pdf_bytes)
	    return rendered_pages

	def analyze_single_document(
	    images: list,
	    prompt: str,
	    model_name: str = 'gemini-2.0-flash-thinking-exp-01-21',
	) -> str:
	    """Send a prompt plus page images to a Gemini model and return its text.

	    Args:
	        images: Images to attach after the prompt (any iterable is accepted).
	        prompt: Instruction text placed first in the request.
	        model_name: Gemini model identifier; defaults to the model this
	            function has always used, so existing callers are unaffected.

	    Returns:
	        The ``text`` attribute of the model response.
	    """
	    model = genai.GenerativeModel(model_name)
	    # Unpacking (rather than list concatenation) also accepts tuples/generators.
	    response = model.generate_content([prompt, *images])
	    return response.text
	def analyze_pdf_directly(pdf_bytes: bytes, prompt: str, model_name: str = "gemini-1.5-pro") -> str:
	    """Analyze a PDF by uploading it to Gemini alongside a prompt.

	    The bytes are written to a temporary ``.pdf`` file, that file is uploaded
	    with ``genai.upload_file``, and the model is asked to generate content
	    from ``[prompt, uploaded_file]``.

	    Args:
	        pdf_bytes: Raw bytes of the PDF document.
	        prompt: Instruction text sent with the document.
	        model_name: Gemini model identifier.

	    Returns:
	        The ``text`` attribute of the model response.
	    """
	    model = genai.GenerativeModel(model_name)

	    # delete=False because the file is closed first and then handed to
	    # upload_file by path; removal is done explicitly in the finally below.
	    tmp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
	    tmp_file_path = tmp_file.name

	    try:
	        # Writing happens inside the try so a failed write cannot leak the file.
	        with tmp_file:
	            tmp_file.write(pdf_bytes)

	        # NOTE(review): the uploaded file is never deleted from the remote
	        # service here - confirm that relying on server-side retention is intended.
	        response = model.generate_content(
	            [prompt, genai.upload_file(tmp_file_path)]
	        )
	        print(f"Response: {response}")
	        return response.text
	    finally:
	        # Always remove the temporary file; tolerate it already being gone.
	        try:
	            os.unlink(tmp_file_path)
	        except FileNotFoundError:
	            pass

	def extract_response_text(response) -> str:
	    """Pull the text payload out of a Gemini response-like object.

	    Lookup order:
	      1. ``response.text`` when that attribute exists.
	      2. The first ``part.text`` found under
	         ``response.result.candidates[*].content.parts[*]``.
	      3. ``str(response)`` when neither is available or anything raises.
	    """
	    try:
	        if hasattr(response, 'text'):
	            return response.text

	        # Without a result/candidates structure there is nothing to walk.
	        if not (hasattr(response, 'result') and hasattr(response.result, 'candidates')):
	            return str(response)

	        for cand in response.result.candidates:
	            if not hasattr(cand, 'content') or not hasattr(cand.content, 'parts'):
	                continue
	            for piece in cand.content.parts:
	                if hasattr(piece, 'text'):
	                    return piece.text

	        return str(response)
	    except Exception as e:
	        # Best effort: report the problem and fall back to the string form.
	        print(f"Error extracting response text: {str(e)}")
	        return str(response)

	def extract_csv_from_response(response) -> str:
	    """Extract CSV data from a Gemini response.

	    Strategy, in order:
	      1. Return the stripped contents of the first fenced block tagged
	         ``csv`` (tag matched case-insensitively).
	      2. Otherwise collect lines starting at the first line containing
	         ``Category,`` or ``Location,`` and stopping at a closing code fence,
	         so the fence itself and anything after it are not returned as CSV.
	      3. Otherwise return the full response text.

	    On any exception the error is printed and the raw response text (or
	    ``str(response)``) is returned.
	    """
	    try:
	        # Get the text content from the response
	        response_text = extract_response_text(response)

	        # Fenced block: ```csv ... ``` (also accepts ```CSV)
	        fenced = re.search(r'```csv(.*?)```', response_text, re.DOTALL | re.IGNORECASE)
	        if fenced:
	            return fenced.group(1).strip()

	        # Fallback: look for a recognisable header line and take what follows
	        lines = []
	        in_csv = False
	        for line in response_text.split('\n'):
	            if in_csv and line.strip().startswith('```'):
	                # Closing fence of an untagged code block: the CSV ends here.
	                break
	            if not in_csv and ('Category,' in line or 'Location,' in line):
	                in_csv = True
	            if in_csv:
	                lines.append(line)
	        if lines:
	            return '\n'.join(lines)

	        return response_text  # Return full response if no CSV found
	    except Exception as e:
	        print(f"Error extracting CSV: {str(e)}")
	        return response.text if hasattr(response, 'text') else str(response)

	def csv_to_dataframe(csv_data: str) -> pd.DataFrame:
	    """Convert a CSV string to a pandas DataFrame, tolerating ragged rows.

	    The first non-blank line is the header. Rows with more fields than the
	    header have the surplus joined (with commas) into the last column; rows
	    with fewer fields are padded with ``None``. All parsed values stay
	    strings. If this parse fails, ``pandas.read_csv`` is tried with lenient
	    settings, and an empty DataFrame is returned if that fails too.

	    Args:
	        csv_data: CSV text; ``None``, empty or whitespace-only input yields
	            an empty DataFrame.

	    Returns:
	        The parsed DataFrame (possibly empty).
	    """
	    if not csv_data or not csv_data.strip():
	        return pd.DataFrame()

	    # Drop blank lines and surrounding whitespace. Done outside the try so the
	    # fallback parser below can always see the cleaned text.
	    cleaned_data = "\n".join(
	        stripped for stripped in (line.strip() for line in csv_data.split('\n')) if stripped
	    )

	    try:
	        # Use CSV reader to handle irregular fields
	        reader = csv.reader(io.StringIO(cleaned_data),
	                            delimiter=',',
	                            quotechar='"',
	                            skipinitialspace=True)

	        header = next(reader)
	        width = len(header)
	        rows = []
	        for row in reader:
	            if len(row) > width:
	                # Combine extra fields into the last column
	                row = row[:width - 1] + [','.join(row[width - 1:])]
	            elif len(row) < width:
	                # Pad short rows; otherwise a file whose rows are all short
	                # makes the DataFrame constructor raise and silently switches
	                # to the type-inferring fallback parser.
	                row = row + [None] * (width - len(row))
	            rows.append(row)

	        return pd.DataFrame(rows, columns=header)

	    except Exception as e:
	        print(f"CSV conversion error: {str(e)}")
	        try:
	            # Fallback to pandas with flexible parsing
	            return pd.read_csv(io.StringIO(cleaned_data),
	                               on_bad_lines='warn',
	                               engine='python',
	                               quotechar='"',
	                               skipinitialspace=True)
	        except Exception as fallback_error:
	            print(f"Fallback conversion failed: {str(fallback_error)}")
	            return pd.DataFrame()


	def save_csv(csv_data: str, filename: str) -> str:
	    """Write the stripped CSV text to ``filename`` and return that path.

	    The file is opened for writing as UTF-8 with ``newline=''`` so line
	    endings in ``csv_data`` are written through untranslated.
	    """
	    with open(filename, mode='w', newline='', encoding='utf-8') as out:
	        out.write(csv_data.strip())
	    return filename

	def get_pdf_metadata(pdf_bytes: bytes) -> dict:
	    """Read page count, author and title from an in-memory PDF.

	    Args:
	        pdf_bytes: Raw bytes of the PDF document.

	    Returns:
	        A dict with keys ``page_count``, ``author`` and ``title``; author and
	        title are ``None`` when the reader reports no metadata.
	    """
	    pdf = PdfReader(io.BytesIO(pdf_bytes))
	    info = {'page_count': len(pdf.pages)}

	    doc_meta = pdf.metadata
	    if doc_meta:
	        info['author'] = doc_meta.author
	        info['title'] = doc_meta.title
	    else:
	        info['author'] = None
	        info['title'] = None
	    return info