Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -17,7 +17,7 @@ from mistralai import Mistral
|
|
17 |
logging.basicConfig(level=logging.INFO)
|
18 |
logger = logging.getLogger(__name__)
|
19 |
|
20 |
-
# Mistral OCR setup
|
21 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
22 |
if not api_key:
|
23 |
raise ValueError("MISTRAL_API_KEY environment variable not set")
|
@@ -109,7 +109,7 @@ def chunk_markdown(
|
|
109 |
document = Document(page_content=markdown_text, metadata={"source": "ocr_output"})
|
110 |
|
111 |
separators = (
|
112 |
-
["\n\d+\.\s+", "\n\n", "\n", ".", " ", ""]
|
113 |
if preserve_numbering
|
114 |
else ["\n\n", "\n", ".", " ", ""]
|
115 |
)
|
@@ -118,7 +118,7 @@ def chunk_markdown(
|
|
118 |
chunk_size=chunk_size,
|
119 |
chunk_overlap=chunk_overlap,
|
120 |
length_function=len,
|
121 |
-
|
122 |
keep_separator=True,
|
123 |
add_start_index=True,
|
124 |
is_separator_regex=preserve_numbering
|
@@ -154,7 +154,7 @@ def chunk_markdown(
|
|
154 |
logger.error(f"Error processing markdown: {str(e)}")
|
155 |
raise
|
156 |
|
157 |
-
# Placeholder image generation
|
158 |
def text_to_base64_dummy(text: str, chunk_index: int):
|
159 |
img = Image.new('RGB', (200, 200), color='white')
|
160 |
buffer = io.BytesIO()
|
@@ -184,10 +184,8 @@ def process_file_and_save(file, chunk_size, chunk_overlap, preserve_numbering, h
|
|
184 |
data["chunk_id"].append(i)
|
185 |
data["content"].append(chunk.page_content)
|
186 |
data["metadata"].append(chunk.metadata)
|
187 |
-
# Extract base64 images from markdown if present, else use placeholder
|
188 |
img_base64 = None
|
189 |
if "![image" in chunk.page_content:
|
190 |
-
# Simple extraction (assumes one image per chunk for simplicity)
|
191 |
start = chunk.page_content.find("data:image")
|
192 |
if start != -1:
|
193 |
end = chunk.page_content.find(")", start)
|
|
|
17 |
logging.basicConfig(level=logging.INFO)
|
18 |
logger = logging.getLogger(__name__)
|
19 |
|
20 |
+
# Mistral OCR setup
|
21 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
22 |
if not api_key:
|
23 |
raise ValueError("MISTRAL_API_KEY environment variable not set")
|
|
|
109 |
document = Document(page_content=markdown_text, metadata={"source": "ocr_output"})
|
110 |
|
111 |
separators = (
|
112 |
+
["\n\d+\.\s+", "\n\n", "\n", ".", " ", ""]
|
113 |
if preserve_numbering
|
114 |
else ["\n\n", "\n", ".", " ", ""]
|
115 |
)
|
|
|
118 |
chunk_size=chunk_size,
|
119 |
chunk_overlap=chunk_overlap,
|
120 |
length_function=len,
|
121 |
+
separators=separators, # Fixed parameter name
|
122 |
keep_separator=True,
|
123 |
add_start_index=True,
|
124 |
is_separator_regex=preserve_numbering
|
|
|
154 |
logger.error(f"Error processing markdown: {str(e)}")
|
155 |
raise
|
156 |
|
157 |
+
# Placeholder image generation
|
158 |
def text_to_base64_dummy(text: str, chunk_index: int):
|
159 |
img = Image.new('RGB', (200, 200), color='white')
|
160 |
buffer = io.BytesIO()
|
|
|
184 |
data["chunk_id"].append(i)
|
185 |
data["content"].append(chunk.page_content)
|
186 |
data["metadata"].append(chunk.metadata)
|
|
|
187 |
img_base64 = None
|
188 |
if "![image" in chunk.page_content:
|
|
|
189 |
start = chunk.page_content.find("data:image")
|
190 |
if start != -1:
|
191 |
end = chunk.page_content.find(")", start)
|