Pijush2023 committed on
Commit
893b484
·
verified ·
1 Parent(s): 7357061

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -58
app.py CHANGED
@@ -437,70 +437,26 @@ def bot(history, choice, tts_choice, retrieval_mode, model_choice):
437
 
438
 
439
 
440
- # import re
441
 
442
- # def clean_response(response_text):
443
- # # Remove system and user tags
444
- # response_text = re.sub(r'<\|system\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
445
- # response_text = re.sub(r'<\|user\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
446
- # response_text = re.sub(r'<\|assistant\|>', '', response_text, flags=re.DOTALL)
447
 
448
- # # Clean up the text by removing extra whitespace
449
- # cleaned_response = response_text.strip()
450
- # cleaned_response = re.sub(r'\s+', ' ', cleaned_response)
 
 
 
 
 
451
 
452
- # # Ensure the response is conversational and organized
453
- # cleaned_response = cleaned_response.replace('1.', '\n1.').replace('2.', '\n2.').replace('3.', '\n3.').replace('4.', '\n4.').replace('5.', '\n5.')
454
 
455
- # return cleaned_response
456
 
457
- def extract_metadata(response_text):
458
- """
459
- Extract document metadata like document name and page number from the response.
460
- """
461
- # Extract document name (source) and page number
462
- doc_name_match = re.search(r"'source':\s?'([^']*)'", response_text)
463
- page_number_match = re.search(r"'page':\s?(\d+)", response_text)
464
-
465
- # Get the document name and page number from the matches
466
- document_name = doc_name_match.group(1) if doc_name_match else "Unknown Document"
467
- page_number = page_number_match.group(1) if page_number_match else "Unknown Page"
468
-
469
- return document_name, page_number
470
 
471
- def clean_and_format_response(response_text):
472
- """
473
- Clean the response and format it into a structured format:
474
- - Document Name
475
- - Document Page No
476
- - Response Content
477
- """
478
- # Extract metadata (document name and page number)
479
- document_name, page_number = extract_metadata(response_text)
480
-
481
- # Remove metadata section from the response
482
- response_text = re.sub(r'Document\(metadata=.*?,page_content="', '', response_text)
483
- response_text = re.sub(r'<\|system\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
484
- response_text = re.sub(r'<\|user\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
485
- response_text = re.sub(r'<\|assistant\|>', '', response_text, flags=re.DOTALL)
486
-
487
- # Replace encoded characters and clean the content
488
- response_text = response_text.replace('\\u2019', "'") # replace unicode apostrophe
489
- response_text = response_text.replace('\\u00e8', 'è') # replace accented characters
490
- response_text = response_text.replace('\\u00e0', 'à')
491
- response_text = response_text.replace('\\n', '\n') # newline characters
492
- response_text = response_text.replace('\\\\', '\\') # backslashes
493
-
494
- # Remove any trailing document information and unwanted characters
495
- response_text = re.sub(r'\\.*$', '', response_text)
496
-
497
- # Clean up spaces and new lines
498
- response_text = response_text.strip() # Remove leading/trailing whitespace
499
- response_text = re.sub(r' +', ' ', response_text) # Replace multiple spaces with a single space
500
- response_text = re.sub(r'\n+', '\n', response_text) # Replace multiple newlines with a single newline
501
-
502
- # Return the formatted output
503
- return f"Document Name: {document_name}\nDocument Page No: {page_number}\nResponse:\n{response_text}"
504
 
505
 
506
 
 
437
 
438
 
439
 
440
+ import re
441
 
442
+ def clean_response(response_text):
443
+ # Remove system and user tags
444
+ response_text = re.sub(r'<\|system\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
445
+ response_text = re.sub(r'<\|user\|>.*?<\|end\|>', '', response_text, flags=re.DOTALL)
446
+ response_text = re.sub(r'<\|assistant\|>', '', response_text, flags=re.DOTALL)
447
 
448
+ # Clean up the text by removing extra whitespace
449
+ cleaned_response = response_text.strip()
450
+ cleaned_response = re.sub(r'\s+', ' ', cleaned_response)
451
+
452
+ # Ensure the response is conversational and organized
453
+ cleaned_response = cleaned_response.replace('1.', '\n1.').replace('2.', '\n2.').replace('3.', '\n3.').replace('4.', '\n4.').replace('5.', '\n5.')
454
+
455
+ return cleaned_response
456
 
 
 
457
 
 
458
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
 
461
 
462