Pijush2023 commited on
Commit
7357061
·
verified ·
1 Parent(s): 9ade06a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -15
app.py CHANGED
@@ -437,7 +437,7 @@ def bot(history, choice, tts_choice, retrieval_mode, model_choice):
437
 
438
 
439
 
440
- import re
441
 
442
  # def clean_response(response_text):
443
  # # Remove system and user tags
@@ -454,37 +454,57 @@ import re
454
 
455
  # return cleaned_response
456
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
 
458
-
459
- import re
460
-
461
- def clean_response(response_text):
462
  """
463
- This function removes metadata and unnecessary symbols from the document response
464
- and formats the output in a readable way.
 
 
465
  """
 
 
 
466
  # Remove metadata section from the response
467
  response_text = re.sub(r'Document\(metadata=.*?,page_content="', '', response_text)
 
 
 
468
 
469
- # Replace encoded characters
470
  response_text = response_text.replace('\\u2019', "'") # replace unicode apostrophe
471
  response_text = response_text.replace('\\u00e8', 'è') # replace accented characters
472
  response_text = response_text.replace('\\u00e0', 'à')
473
  response_text = response_text.replace('\\n', '\n') # newline characters
474
  response_text = response_text.replace('\\\\', '\\') # backslashes
475
 
476
- # Remove any trailing document information
477
- response_text = re.sub(r'\\.*$', '', response_text)
478
-
479
- # Ensure proper spacing for better readability
480
- response_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', response_text) # Add spaces between words joined together
481
 
482
- # Properly format new lines and spacing
483
  response_text = response_text.strip() # Remove leading/trailing whitespace
484
  response_text = re.sub(r' +', ' ', response_text) # Replace multiple spaces with a single space
485
  response_text = re.sub(r'\n+', '\n', response_text) # Replace multiple newlines with a single newline
 
 
 
 
 
 
486
 
487
- return response_text
488
 
489
 
490
  # Define a new template specifically for GPT-4o-mini in VDB Details mode
 
437
 
438
 
439
 
440
+ # import re
441
 
442
  # def clean_response(response_text):
443
  # # Remove system and user tags
 
454
 
455
  # return cleaned_response
456
 
457
+ def extract_metadata(response_text):
458
+ """
459
+ Extract document metadata like document name and page number from the response.
460
+ """
461
+ # Extract document name (source) and page number
462
+ doc_name_match = re.search(r"'source':\s?'([^']*)'", response_text)
463
+ page_number_match = re.search(r"'page':\s?(\d+)", response_text)
464
+
465
+ # Get the document name and page number from the matches
466
+ document_name = doc_name_match.group(1) if doc_name_match else "Unknown Document"
467
+ page_number = page_number_match.group(1) if page_number_match else "Unknown Page"
468
+
469
+ return document_name, page_number
470
 
471
+ def clean_and_format_response(response_text):
 
 
 
472
  """
473
+ Clean the response and format it into a structured format:
474
+ - Document Name
475
+ - Document Page No
476
+ - Response Content
477
  """
478
+ # Extract metadata (document name and page number)
479
+ document_name, page_number = extract_metadata(response_text)
480
+
481
  # Remove metadata section from the response
482
  response_text = re.sub(r'Document\(metadata=.*?,page_content="', '', response_text)
483
+ response_text = re.sub(r'<\|system\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
484
+ response_text = re.sub(r'<\|user\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
485
+ response_text = re.sub(r'<\|assistant\|>', '', response_text, flags=re.DOTALL)
486
 
487
+ # Replace encoded characters and clean the content
488
  response_text = response_text.replace('\\u2019', "'") # replace unicode apostrophe
489
  response_text = response_text.replace('\\u00e8', 'è') # replace accented characters
490
  response_text = response_text.replace('\\u00e0', 'à')
491
  response_text = response_text.replace('\\n', '\n') # newline characters
492
  response_text = response_text.replace('\\\\', '\\') # backslashes
493
 
494
+ # Remove any trailing document information and unwanted characters
495
+ response_text = re.sub(r'\\.*$', '', response_text)
 
 
 
496
 
497
+ # Clean up spaces and new lines
498
  response_text = response_text.strip() # Remove leading/trailing whitespace
499
  response_text = re.sub(r' +', ' ', response_text) # Replace multiple spaces with a single space
500
  response_text = re.sub(r'\n+', '\n', response_text) # Replace multiple newlines with a single newline
501
+
502
+ # Return the formatted output
503
+ return f"Document Name: {document_name}\nDocument Page No: {page_number}\nResponse:\n{response_text}"
504
+
505
+
506
+
507
 
 
508
 
509
 
510
  # Define a new template specifically for GPT-4o-mini in VDB Details mode