Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -288,6 +288,28 @@ button:hover {
|
|
288 |
border-radius: 10px;
|
289 |
background-color: rgba(255,255,255,0.3);
|
290 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
291 |
/* Hide elements */
|
292 |
footer, .gradio-footer, .hide, [data-testid="Use via API"], [data-testid="mmsettings"],
|
293 |
#sentiment-analysis, #risk-visualization {
|
@@ -314,6 +336,13 @@ footer, .gradio-footer, .hide, [data-testid="Use via API"], [data-testid="mmsett
|
|
314 |
.dark .count-item:hover {
|
315 |
background-color: rgba(255,255,255,0.05);
|
316 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
317 |
"""
|
318 |
|
319 |
# Salesforce credentials
|
@@ -551,18 +580,59 @@ def extract_text_from_pdf(pdf_path: str) -> str:
|
|
551 |
for page in pdf.pages:
|
552 |
page_text = page.extract_text()
|
553 |
if page_text:
|
554 |
-
text += page_text
|
555 |
return text
|
556 |
except Exception as e:
|
557 |
logger.error(f"PDF text extraction failed: {str(e)}")
|
558 |
raise Exception(f"PDF text extraction failed: {str(e)}")
|
559 |
|
560 |
-
def
|
561 |
-
"""
|
562 |
-
|
|
|
|
|
563 |
for keyword in keywords:
|
564 |
-
|
565 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
566 |
|
567 |
def find_penalty_values(text: str) -> List[float]:
|
568 |
"""Find penalty amounts in the text"""
|
@@ -692,6 +762,23 @@ def format_clause_example(example: str, index: int) -> str:
|
|
692 |
</div>
|
693 |
"""
|
694 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
695 |
def analyze_pdf(file_obj) -> List:
|
696 |
"""Main analysis function for Gradio interface"""
|
697 |
try:
|
@@ -707,8 +794,6 @@ def analyze_pdf(file_obj) -> List:
|
|
707 |
text = extract_text_from_pdf(file_obj.name)
|
708 |
if not text.strip():
|
709 |
raise Exception("No text extracted from PDF. It might be a scanned document.")
|
710 |
-
# Split text into lines for line number tracking
|
711 |
-
lines = text.split('\n')
|
712 |
except Exception as e:
|
713 |
raise Exception(f"PDF text extraction failed: {str(e)}")
|
714 |
|
@@ -722,15 +807,16 @@ def analyze_pdf(file_obj) -> List:
|
|
722 |
obligation_keywords = ["shall", "must", "required to", "obligated to", "duty"]
|
723 |
delay_keywords = ["delay", "late", "overdue", "extension", "time is of the essence"]
|
724 |
|
725 |
-
|
726 |
-
|
727 |
-
|
|
|
728 |
|
729 |
-
|
|
|
|
|
730 |
|
731 |
-
|
732 |
-
total_obligations = sum(obligation_counts.values())
|
733 |
-
total_delays = sum(delay_counts.values())
|
734 |
|
735 |
# Generate warning messages with emojis
|
736 |
penalty_warning = format_warning_message(total_penalties, "penalty", "💰")
|
@@ -751,49 +837,60 @@ def analyze_pdf(file_obj) -> List:
|
|
751 |
except Exception as e:
|
752 |
raise Exception(f"Visual generation failed: {str(e)}")
|
753 |
|
754 |
-
#
|
755 |
-
|
756 |
-
|
757 |
-
|
758 |
-
|
759 |
-
|
760 |
-
|
761 |
-
|
762 |
-
|
763 |
-
|
764 |
-
|
|
|
765 |
{penalty_warning}
|
766 |
<div class='penalty-box'>
|
767 |
<div class='section-title'>💰 Penalty Clause Details</div>
|
768 |
-
{"".join(
|
769 |
-
</div>
|
770 |
-
<div class='penalty-box'>
|
771 |
-
<div class='section-title'>💰 Detailed Penalty Line References</div>
|
772 |
-
{"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--danger-color)'>•</span> {kw} (Line(s): {lines})</span></div>" for kw, lines in penalty_line_refs.items()]) or '<div class="success-box">✅ No penalty keywords found in specific lines.</div>'}
|
773 |
</div>
|
774 |
"""
|
775 |
|
776 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
777 |
{obligation_warning}
|
778 |
<div class='obligation-box'>
|
779 |
<div class='section-title'>📝 Obligation Clause Details</div>
|
780 |
-
{"".join(
|
781 |
-
</div>
|
782 |
-
<div class='obligation-box'>
|
783 |
-
<div class='section-title'>📝 Detailed Obligation Line References</div>
|
784 |
-
{"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--warning-color)'>•</span> {kw} (Line(s): {lines})</span></div>" for kw, lines in obligation_line_refs.items()]) or '<div class="success-box">✅ No obligation keywords found in specific lines.</div>'}
|
785 |
</div>
|
786 |
"""
|
787 |
|
788 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
789 |
{delay_warning}
|
790 |
<div class='delay-box'>
|
791 |
<div class='section-title'>⏱ Delay Clause Details</div>
|
792 |
-
{"".join(
|
793 |
-
</div>
|
794 |
-
<div class='delay-box'>
|
795 |
-
<div class='section-title'>⏱ Detailed Delay Line References</div>
|
796 |
-
{"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--info-color)'>•</span> {kw} (Line(s): {lines})</span></div>" for kw, lines in delay_line_refs.items()]) or '<div class="success-box">✅ No delay keywords found in specific lines.</div>'}
|
797 |
</div>
|
798 |
"""
|
799 |
|
@@ -813,10 +910,10 @@ def analyze_pdf(file_obj) -> List:
|
|
813 |
'risk_level': risk_level,
|
814 |
'record_id': record_id,
|
815 |
'penalty_examples': extracted_data,
|
816 |
-
'penalty_details': "\n".join([f"{kw}: {count}" for kw,
|
817 |
'penalty_amounts': "\n".join([f"${amt:,.2f}" for amt in penalty_values[:5]]) if penalty_values else "",
|
818 |
-
'obligation_details': "\n".join([f"{kw}: {count}" for kw,
|
819 |
-
'delay_details': "\n".join([f"{kw}: {count}" for kw,
|
820 |
}
|
821 |
|
822 |
try:
|
@@ -896,10 +993,10 @@ def analyze_pdf(file_obj) -> List:
|
|
896 |
</div>
|
897 |
""",
|
898 |
"", # Empty string for hidden risk visualization
|
899 |
-
|
900 |
f"<div class='penalty-box'><div class='section-title'>💰 Penalty Amounts Found</div>{penalty_amounts}</div>",
|
901 |
-
|
902 |
-
|
903 |
f"<div class='result-box'><div class='section-title'>📜 Extracted Data</div>{extracted_data}</div>",
|
904 |
sentiment_analysis_output,
|
905 |
temp_file_path # Return temporary file path for PDF download
|
|
|
288 |
border-radius: 10px;
|
289 |
background-color: rgba(255,255,255,0.3);
|
290 |
}
|
291 |
+
.keyword-match {
|
292 |
+
background-color: rgba(255, 255, 0, 0.3);
|
293 |
+
padding: 2px 4px;
|
294 |
+
border-radius: 3px;
|
295 |
+
font-weight: bold;
|
296 |
+
}
|
297 |
+
.match-detail {
|
298 |
+
margin-top: 5px;
|
299 |
+
padding: 8px;
|
300 |
+
background-color: rgba(0,0,0,0.05);
|
301 |
+
border-radius: 5px;
|
302 |
+
font-size: 14px;
|
303 |
+
}
|
304 |
+
.match-line {
|
305 |
+
font-family: monospace;
|
306 |
+
white-space: pre-wrap;
|
307 |
+
margin-bottom: 5px;
|
308 |
+
}
|
309 |
+
.match-context {
|
310 |
+
font-style: italic;
|
311 |
+
color: var(--secondary-color);
|
312 |
+
}
|
313 |
/* Hide elements */
|
314 |
footer, .gradio-footer, .hide, [data-testid="Use via API"], [data-testid="mmsettings"],
|
315 |
#sentiment-analysis, #risk-visualization {
|
|
|
336 |
.dark .count-item:hover {
|
337 |
background-color: rgba(255,255,255,0.05);
|
338 |
}
|
339 |
+
.dark .keyword-match {
|
340 |
+
background-color: rgba(255, 255, 0, 0.5);
|
341 |
+
color: black;
|
342 |
+
}
|
343 |
+
.dark .match-detail {
|
344 |
+
background-color: rgba(255,255,255,0.05);
|
345 |
+
}
|
346 |
"""
|
347 |
|
348 |
# Salesforce credentials
|
|
|
580 |
for page in pdf.pages:
|
581 |
page_text = page.extract_text()
|
582 |
if page_text:
|
583 |
+
text += page_text + "\n" # Add newline between pages
|
584 |
return text
|
585 |
except Exception as e:
|
586 |
logger.error(f"PDF text extraction failed: {str(e)}")
|
587 |
raise Exception(f"PDF text extraction failed: {str(e)}")
|
588 |
|
589 |
+
def find_keyword_matches(text: str, keywords: List[str]) -> Dict[str, List[Dict[str, str]]]:
    """Locate every whole-word, case-insensitive occurrence of each keyword.

    The text is split on newline characters and each line is searched
    independently, so reported line numbers are 1-based positions in
    ``text.split('\\n')``.

    Args:
        text: Document text to search.
        keywords: Words or phrases to look for. Each is matched literally
            (regex metacharacters are escaped) between word boundaries.

    Returns:
        A dict mapping each keyword to a list of match records, in document
        order. Each record has:
            ``line_number``: 1-based line index (an ``int``).
            ``full_line``: the raw, stripped line (NOT HTML-escaped).
            ``context``: an HTML snippet of up to 20 characters either side
                of the match, HTML-escaped, with the match wrapped in
                ``<span class='keyword-match'>``.
            ``match``: the matched text exactly as it appears in the line.
        Keywords with no occurrences map to an empty list.
    """
    # Local import so this block does not depend on the module's import list.
    import html

    context_margin = 20  # characters of surrounding text kept on each side
    lines = text.split('\n')
    matches = {}

    for keyword in keywords:
        # An empty keyword would compile to r'\b\b', which matches at every
        # word boundary and floods the result with zero-width "matches".
        if not keyword.strip():
            matches[keyword] = []
            continue

        pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', flags=re.IGNORECASE)
        keyword_matches = []

        for line_num, line in enumerate(lines, 1):
            for match in pattern.finditer(line):
                start = max(0, match.start() - context_margin)
                end = min(len(line), match.end() + context_margin)

                # The context is rendered as HTML, and the text comes from an
                # arbitrary document: escape each piece separately so only
                # the highlight <span> itself is interpreted as markup.
                highlighted_context = (
                    html.escape(line[start:match.start()])
                    + "<span class='keyword-match'>"
                    + html.escape(match.group())
                    + "</span>"
                    + html.escape(line[match.end():end])
                )

                keyword_matches.append({
                    'line_number': line_num,
                    'full_line': line.strip(),
                    'context': highlighted_context,
                    'match': match.group(),
                })

        matches[keyword] = keyword_matches

    return matches
|
622 |
+
|
623 |
+
def count_keywords_with_details(text: str, keywords: List[str]) -> Dict[str, Dict]:
    """Summarise keyword occurrences as a count plus the match records.

    Args:
        text: Document text to search.
        keywords: Words or phrases to look for.

    Returns:
        A dict mapping each keyword to ``{'count': <number of matches>,
        'matches': <list of match records>}``, where the records are
        whatever ``find_keyword_matches`` produced for that keyword.
    """
    found = find_keyword_matches(text, keywords)
    return {
        kw: {'count': len(found.get(kw, [])), 'matches': found.get(kw, [])}
        for kw in keywords
    }
|
636 |
|
637 |
def find_penalty_values(text: str) -> List[float]:
|
638 |
"""Find penalty amounts in the text"""
|
|
|
762 |
</div>
|
763 |
"""
|
764 |
|
765 |
+
def format_keyword_matches(matches: List[Dict[str, str]]) -> str:
    """Render match records as HTML ``match-detail`` blocks.

    Args:
        matches: Match records with ``line_number``, ``context`` and
            ``full_line`` keys. ``context`` is inserted verbatim because it
            is expected to already be an HTML snippet (it carries the
            keyword-highlight ``<span>``); ``full_line`` is treated as plain
            text and HTML-escaped here.

    Returns:
        A success-box ``<div>`` when ``matches`` is empty; otherwise the
        concatenated HTML for at most the first five matches.
    """
    # Local import so this block does not depend on the module's import list.
    import html

    if not matches:
        return "<div class='success-box'>✅ No matches found for this keyword</div>"

    max_shown = 5  # keep the rendered output short per keyword

    result = []
    for i, match in enumerate(matches[:max_shown], 1):
        # full_line is raw document text; escape it so markup-like content
        # in the source document is displayed rather than interpreted.
        safe_line = html.escape(str(match['full_line']))
        result.append(f"""
        <div class="match-detail">
            <div><strong>Match {i}:</strong> Line {match['line_number']}</div>
            <div class="match-context">Context: {match['context']}</div>
            <div class="match-line">Full line: {safe_line}</div>
        </div>
        """)

    return "".join(result)
|
781 |
+
|
782 |
def analyze_pdf(file_obj) -> List:
|
783 |
"""Main analysis function for Gradio interface"""
|
784 |
try:
|
|
|
794 |
text = extract_text_from_pdf(file_obj.name)
|
795 |
if not text.strip():
|
796 |
raise Exception("No text extracted from PDF. It might be a scanned document.")
|
|
|
|
|
797 |
except Exception as e:
|
798 |
raise Exception(f"PDF text extraction failed: {str(e)}")
|
799 |
|
|
|
807 |
obligation_keywords = ["shall", "must", "required to", "obligated to", "duty"]
|
808 |
delay_keywords = ["delay", "late", "overdue", "extension", "time is of the essence"]
|
809 |
|
810 |
+
# Get detailed keyword matches with line numbers and context
|
811 |
+
penalty_details = count_keywords_with_details(text, penalty_keywords)
|
812 |
+
obligation_details = count_keywords_with_details(text, obligation_keywords)
|
813 |
+
delay_details = count_keywords_with_details(text, delay_keywords)
|
814 |
|
815 |
+
total_penalties = sum(details['count'] for details in penalty_details.values())
|
816 |
+
total_obligations = sum(details['count'] for details in obligation_details.values())
|
817 |
+
total_delays = sum(details['count'] for details in delay_details.values())
|
818 |
|
819 |
+
penalty_values = find_penalty_values(text)
|
|
|
|
|
820 |
|
821 |
# Generate warning messages with emojis
|
822 |
penalty_warning = format_warning_message(total_penalties, "penalty", "💰")
|
|
|
837 |
except Exception as e:
|
838 |
raise Exception(f"Visual generation failed: {str(e)}")
|
839 |
|
840 |
+
# Format penalty details with match information
|
841 |
+
penalty_html = []
|
842 |
+
for keyword, details in penalty_details.items():
|
843 |
+
penalty_html.append(f"""
|
844 |
+
<div class='count-item'>
|
845 |
+
<span class='count-label'><span style='color: var(--danger-color)'>•</span> {keyword}</span>
|
846 |
+
<span class='count-value'>{details['count']}</span>
|
847 |
+
</div>
|
848 |
+
{format_keyword_matches(details['matches'])}
|
849 |
+
""")
|
850 |
+
|
851 |
+
penalty_details_html = f"""
|
852 |
{penalty_warning}
|
853 |
<div class='penalty-box'>
|
854 |
<div class='section-title'>💰 Penalty Clause Details</div>
|
855 |
+
{"".join(penalty_html)}
|
|
|
|
|
|
|
|
|
856 |
</div>
|
857 |
"""
|
858 |
|
859 |
+
# Format obligation details with match information
|
860 |
+
obligation_html = []
|
861 |
+
for keyword, details in obligation_details.items():
|
862 |
+
obligation_html.append(f"""
|
863 |
+
<div class='count-item'>
|
864 |
+
<span class='count-label'><span style='color: var(--warning-color)'>•</span> {keyword}</span>
|
865 |
+
<span class='count-value'>{details['count']}</span>
|
866 |
+
</div>
|
867 |
+
{format_keyword_matches(details['matches'])}
|
868 |
+
""")
|
869 |
+
|
870 |
+
obligation_details_html = f"""
|
871 |
{obligation_warning}
|
872 |
<div class='obligation-box'>
|
873 |
<div class='section-title'>📝 Obligation Clause Details</div>
|
874 |
+
{"".join(obligation_html)}
|
|
|
|
|
|
|
|
|
875 |
</div>
|
876 |
"""
|
877 |
|
878 |
+
# Format delay details with match information
|
879 |
+
delay_html = []
|
880 |
+
for keyword, details in delay_details.items():
|
881 |
+
delay_html.append(f"""
|
882 |
+
<div class='count-item'>
|
883 |
+
<span class='count-label'><span style='color: var(--info-color)'>•</span> {keyword}</span>
|
884 |
+
<span class='count-value'>{details['count']}</span>
|
885 |
+
</div>
|
886 |
+
{format_keyword_matches(details['matches'])}
|
887 |
+
""")
|
888 |
+
|
889 |
+
delay_details_html = f"""
|
890 |
{delay_warning}
|
891 |
<div class='delay-box'>
|
892 |
<div class='section-title'>⏱ Delay Clause Details</div>
|
893 |
+
{"".join(delay_html)}
|
|
|
|
|
|
|
|
|
894 |
</div>
|
895 |
"""
|
896 |
|
|
|
910 |
'risk_level': risk_level,
|
911 |
'record_id': record_id,
|
912 |
'penalty_examples': extracted_data,
|
913 |
+
'penalty_details': "\n".join([f"{kw}: {details['count']} matches" for kw, details in penalty_details.items()]),
|
914 |
'penalty_amounts': "\n".join([f"${amt:,.2f}" for amt in penalty_values[:5]]) if penalty_values else "",
|
915 |
+
'obligation_details': "\n".join([f"{kw}: {details['count']} matches" for kw, details in obligation_details.items()]),
|
916 |
+
'delay_details': "\n".join([f"{kw}: {details['count']} matches" for kw, details in delay_details.items()])
|
917 |
}
|
918 |
|
919 |
try:
|
|
|
993 |
</div>
|
994 |
""",
|
995 |
"", # Empty string for hidden risk visualization
|
996 |
+
penalty_details_html,
|
997 |
f"<div class='penalty-box'><div class='section-title'>💰 Penalty Amounts Found</div>{penalty_amounts}</div>",
|
998 |
+
obligation_details_html,
|
999 |
+
delay_details_html,
|
1000 |
f"<div class='result-box'><div class='section-title'>📜 Extracted Data</div>{extracted_data}</div>",
|
1001 |
sentiment_analysis_output,
|
1002 |
temp_file_path # Return temporary file path for PDF download
|