Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -437,7 +437,7 @@ def bot(history, choice, tts_choice, retrieval_mode, model_choice):
|
|
437 |
|
438 |
|
439 |
|
440 |
-
import re
|
441 |
|
442 |
# def clean_response(response_text):
|
443 |
# # Remove system and user tags
|
@@ -454,37 +454,57 @@ import re
|
|
454 |
|
455 |
# return cleaned_response
|
456 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
457 |
|
458 |
-
|
459 |
-
import re
|
460 |
-
|
461 |
-
def clean_response(response_text):
|
462 |
"""
|
463 |
-
|
464 |
-
|
|
|
|
|
465 |
"""
|
|
|
|
|
|
|
466 |
# Remove metadata section from the response
|
467 |
response_text = re.sub(r'Document\(metadata=.*?,page_content="', '', response_text)
|
|
|
|
|
|
|
468 |
|
469 |
-
# Replace encoded characters
|
470 |
response_text = response_text.replace('\\u2019', "'") # replace unicode apostrophe
|
471 |
response_text = response_text.replace('\\u00e8', 'è') # replace accented characters
|
472 |
response_text = response_text.replace('\\u00e0', 'à')
|
473 |
response_text = response_text.replace('\\n', '\n') # newline characters
|
474 |
response_text = response_text.replace('\\\\', '\\') # backslashes
|
475 |
|
476 |
-
# Remove any trailing document information
|
477 |
-
response_text = re.sub(r'\\.*$', '', response_text)
|
478 |
-
|
479 |
-
# Ensure proper spacing for better readability
|
480 |
-
response_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', response_text) # Add spaces between words joined together
|
481 |
|
482 |
-
#
|
483 |
response_text = response_text.strip() # Remove leading/trailing whitespace
|
484 |
response_text = re.sub(r' +', ' ', response_text) # Replace multiple spaces with a single space
|
485 |
response_text = re.sub(r'\n+', '\n', response_text) # Replace multiple newlines with a single newline
|
|
|
|
|
|
|
|
|
|
|
|
|
486 |
|
487 |
-
return response_text
|
488 |
|
489 |
|
490 |
# Define a new template specifically for GPT-4o-mini in VDB Details mode
|
|
|
437 |
|
438 |
|
439 |
|
440 |
+
# import re
|
441 |
|
442 |
# def clean_response(response_text):
|
443 |
# # Remove system and user tags
|
|
|
454 |
|
455 |
# return cleaned_response
|
456 |
|
457 |
+
def extract_metadata(response_text):
|
458 |
+
"""
|
459 |
+
Extract document metadata like document name and page number from the response.
|
460 |
+
"""
|
461 |
+
# Extract document name (source) and page number
|
462 |
+
doc_name_match = re.search(r"'source':\s?'([^']*)'", response_text)
|
463 |
+
page_number_match = re.search(r"'page':\s?(\d+)", response_text)
|
464 |
+
|
465 |
+
# Get the document name and page number from the matches
|
466 |
+
document_name = doc_name_match.group(1) if doc_name_match else "Unknown Document"
|
467 |
+
page_number = page_number_match.group(1) if page_number_match else "Unknown Page"
|
468 |
+
|
469 |
+
return document_name, page_number
|
470 |
|
471 |
+
def clean_and_format_response(response_text):
|
|
|
|
|
|
|
472 |
"""
|
473 |
+
Clean the response and format it into a structured format:
|
474 |
+
- Document Name
|
475 |
+
- Document Page No
|
476 |
+
- Response Content
|
477 |
"""
|
478 |
+
# Extract metadata (document name and page number)
|
479 |
+
document_name, page_number = extract_metadata(response_text)
|
480 |
+
|
481 |
# Remove metadata section from the response
|
482 |
response_text = re.sub(r'Document\(metadata=.*?,page_content="', '', response_text)
|
483 |
+
response_text = re.sub(r'<\|system\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
|
484 |
+
response_text = re.sub(r'<\|user\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
|
485 |
+
response_text = re.sub(r'<\|assistant\|>', '', response_text, flags=re.DOTALL)
|
486 |
|
487 |
+
# Replace encoded characters and clean the content
|
488 |
response_text = response_text.replace('\\u2019', "'") # replace unicode apostrophe
|
489 |
response_text = response_text.replace('\\u00e8', 'è') # replace accented characters
|
490 |
response_text = response_text.replace('\\u00e0', 'à')
|
491 |
response_text = response_text.replace('\\n', '\n') # newline characters
|
492 |
response_text = response_text.replace('\\\\', '\\') # backslashes
|
493 |
|
494 |
+
# Remove any trailing document information and unwanted characters
|
495 |
+
response_text = re.sub(r'\\.*$', '', response_text)
|
|
|
|
|
|
|
496 |
|
497 |
+
# Clean up spaces and new lines
|
498 |
response_text = response_text.strip() # Remove leading/trailing whitespace
|
499 |
response_text = re.sub(r' +', ' ', response_text) # Replace multiple spaces with a single space
|
500 |
response_text = re.sub(r'\n+', '\n', response_text) # Replace multiple newlines with a single newline
|
501 |
+
|
502 |
+
# Return the formatted output
|
503 |
+
return f"Document Name: {document_name}\nDocument Page No: {page_number}\nResponse:\n{response_text}"
|
504 |
+
|
505 |
+
|
506 |
+
|
507 |
|
|
|
508 |
|
509 |
|
510 |
# Define a new template specifically for GPT-4o-mini in VDB Details mode
|