Svngoku committed on
Commit
84661cc
·
verified ·
1 Parent(s): 8a5a9ab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -6
app.py CHANGED
@@ -17,7 +17,7 @@ from mistralai import Mistral
17
  logging.basicConfig(level=logging.INFO)
18
  logger = logging.getLogger(__name__)
19
 
20
- # Mistral OCR setup (ensure you have your API key set)
21
  api_key = os.environ.get("MISTRAL_API_KEY")
22
  if not api_key:
23
  raise ValueError("MISTRAL_API_KEY environment variable not set")
@@ -109,7 +109,7 @@ def chunk_markdown(
109
  document = Document(page_content=markdown_text, metadata={"source": "ocr_output"})
110
 
111
  separators = (
112
- ["\n\d+\.\s+", "\n\n", "\n", ".", " ", ""]
113
  if preserve_numbering
114
  else ["\n\n", "\n", ".", " ", ""]
115
  )
@@ -118,7 +118,7 @@ def chunk_markdown(
118
  chunk_size=chunk_size,
119
  chunk_overlap=chunk_overlap,
120
  length_function=len,
121
-  on=separators,
122
  keep_separator=True,
123
  add_start_index=True,
124
  is_separator_regex=preserve_numbering
@@ -154,7 +154,7 @@ def chunk_markdown(
154
  logger.error(f"Error processing markdown: {str(e)}")
155
  raise
156
 
157
- # Placeholder image generation (for chunks without images)
158
  def text_to_base64_dummy(text: str, chunk_index: int):
159
  img = Image.new('RGB', (200, 200), color='white')
160
  buffer = io.BytesIO()
@@ -184,10 +184,8 @@ def process_file_and_save(file, chunk_size, chunk_overlap, preserve_numbering, h
184
  data["chunk_id"].append(i)
185
  data["content"].append(chunk.page_content)
186
  data["metadata"].append(chunk.metadata)
187
- # Extract base64 images from markdown if present, else use placeholder
188
  img_base64 = None
189
  if "![image" in chunk.page_content:
190
- # Simple extraction (assumes one image per chunk for simplicity)
191
  start = chunk.page_content.find("data:image")
192
  if start != -1:
193
  end = chunk.page_content.find(")", start)
 
17
  logging.basicConfig(level=logging.INFO)
18
  logger = logging.getLogger(__name__)
19
 
20
+ # Mistral OCR setup
21
  api_key = os.environ.get("MISTRAL_API_KEY")
22
  if not api_key:
23
  raise ValueError("MISTRAL_API_KEY environment variable not set")
 
109
  document = Document(page_content=markdown_text, metadata={"source": "ocr_output"})
110
 
111
  separators = (
112
+ ["\n\d+\.\s+", "\n\n", "\n", ".", " ", ""]
113
  if preserve_numbering
114
  else ["\n\n", "\n", ".", " ", ""]
115
  )
 
118
  chunk_size=chunk_size,
119
  chunk_overlap=chunk_overlap,
120
  length_function=len,
121
+ separators=separators, # Fixed parameter name
122
  keep_separator=True,
123
  add_start_index=True,
124
  is_separator_regex=preserve_numbering
 
154
  logger.error(f"Error processing markdown: {str(e)}")
155
  raise
156
 
157
+ # Placeholder image generation
158
  def text_to_base64_dummy(text: str, chunk_index: int):
159
  img = Image.new('RGB', (200, 200), color='white')
160
  buffer = io.BytesIO()
 
184
  data["chunk_id"].append(i)
185
  data["content"].append(chunk.page_content)
186
  data["metadata"].append(chunk.metadata)
 
187
  img_base64 = None
188
  if "![image" in chunk.page_content:
 
189
  start = chunk.page_content.find("data:image")
190
  if start != -1:
191
  end = chunk.page_content.find(")", start)