Kushalmanda committed on
Commit
5c78870
·
verified ·
1 Parent(s): 4483f37

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -49
app.py CHANGED
@@ -288,6 +288,28 @@ button:hover {
288
  border-radius: 10px;
289
  background-color: rgba(255,255,255,0.3);
290
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  /* Hide elements */
292
  footer, .gradio-footer, .hide, [data-testid="Use via API"], [data-testid="mmsettings"],
293
  #sentiment-analysis, #risk-visualization {
@@ -314,6 +336,13 @@ footer, .gradio-footer, .hide, [data-testid="Use via API"], [data-testid="mmsett
314
  .dark .count-item:hover {
315
  background-color: rgba(255,255,255,0.05);
316
  }
 
 
 
 
 
 
 
317
  """
318
 
319
  # Salesforce credentials
@@ -551,18 +580,59 @@ def extract_text_from_pdf(pdf_path: str) -> str:
551
  for page in pdf.pages:
552
  page_text = page.extract_text()
553
  if page_text:
554
- text += page_text
555
  return text
556
  except Exception as e:
557
  logger.error(f"PDF text extraction failed: {str(e)}")
558
  raise Exception(f"PDF text extraction failed: {str(e)}")
559
 
560
def count_keywords(text: str, keywords: List[str]) -> Dict[str, int]:
    """Return, for each keyword, how many whole-word matches occur in text.

    Matching is case-insensitive and each keyword is treated as a literal
    phrase (regex metacharacters are escaped) bounded by word boundaries.
    """
    def occurrences(term: str) -> int:
        # One compiled pattern per keyword; non-overlapping matches are tallied.
        finder = re.compile(r'\b' + re.escape(term) + r'\b', flags=re.IGNORECASE)
        return sum(1 for _ in finder.finditer(text))

    return {term: occurrences(term) for term in keywords}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
566
 
567
  def find_penalty_values(text: str) -> List[float]:
568
  """Find penalty amounts in the text"""
@@ -692,6 +762,23 @@ def format_clause_example(example: str, index: int) -> str:
692
  </div>
693
  """
694
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
695
  def analyze_pdf(file_obj) -> List:
696
  """Main analysis function for Gradio interface"""
697
  try:
@@ -707,8 +794,6 @@ def analyze_pdf(file_obj) -> List:
707
  text = extract_text_from_pdf(file_obj.name)
708
  if not text.strip():
709
  raise Exception("No text extracted from PDF. It might be a scanned document.")
710
- # Split text into lines for line number tracking
711
- lines = text.split('\n')
712
  except Exception as e:
713
  raise Exception(f"PDF text extraction failed: {str(e)}")
714
 
@@ -722,15 +807,16 @@ def analyze_pdf(file_obj) -> List:
722
  obligation_keywords = ["shall", "must", "required to", "obligated to", "duty"]
723
  delay_keywords = ["delay", "late", "overdue", "extension", "time is of the essence"]
724
 
725
- penalty_counts = count_keywords(text, penalty_keywords)
726
- obligation_counts = count_keywords(text, obligation_keywords)
727
- delay_counts = count_keywords(text, delay_keywords)
 
728
 
729
- penalty_values = find_penalty_values(text)
 
 
730
 
731
- total_penalties = sum(penalty_counts.values())
732
- total_obligations = sum(obligation_counts.values())
733
- total_delays = sum(delay_counts.values())
734
 
735
  # Generate warning messages with emojis
736
  penalty_warning = format_warning_message(total_penalties, "penalty", "💰")
@@ -751,49 +837,60 @@ def analyze_pdf(file_obj) -> List:
751
  except Exception as e:
752
  raise Exception(f"Visual generation failed: {str(e)}")
753
 
754
- # Precompute line numbers for each keyword to avoid f-string backslash issues
755
- def get_line_numbers(keyword, lines_list):
756
- pattern = r'\b' + re.escape(keyword) + r'\b'
757
- return ", ".join(str(i + 1) for i, line in enumerate(lines_list) if re.search(pattern, line, re.IGNORECASE)) or "None"
758
-
759
- penalty_line_refs = {kw: get_line_numbers(kw, lines) for kw in penalty_keywords if penalty_counts.get(kw, 0) > 0}
760
- obligation_line_refs = {kw: get_line_numbers(kw, lines) for kw in obligation_keywords if obligation_counts.get(kw, 0) > 0}
761
- delay_line_refs = {kw: get_line_numbers(kw, lines) for kw in delay_keywords if delay_counts.get(kw, 0) > 0}
762
-
763
- # Format details using precomputed line numbers
764
- penalty_details = f"""
 
765
  {penalty_warning}
766
  <div class='penalty-box'>
767
  <div class='section-title'>💰 Penalty Clause Details</div>
768
- {"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--danger-color)'>•</span> {kw}</span><span class='count-value'>{count}</span></div>" for kw, count in penalty_counts.items()])}
769
- </div>
770
- <div class='penalty-box'>
771
- <div class='section-title'>💰 Detailed Penalty Line References</div>
772
- {"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--danger-color)'>•</span> {kw} (Line(s): {lines})</span></div>" for kw, lines in penalty_line_refs.items()]) or '<div class="success-box">✅ No penalty keywords found in specific lines.</div>'}
773
  </div>
774
  """
775
 
776
- obligation_details = f"""
 
 
 
 
 
 
 
 
 
 
 
777
  {obligation_warning}
778
  <div class='obligation-box'>
779
  <div class='section-title'>📝 Obligation Clause Details</div>
780
- {"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--warning-color)'>•</span> {kw}</span><span class='count-value'>{count}</span></div>" for kw, count in obligation_counts.items()])}
781
- </div>
782
- <div class='obligation-box'>
783
- <div class='section-title'>📝 Detailed Obligation Line References</div>
784
- {"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--warning-color)'>•</span> {kw} (Line(s): {lines})</span></div>" for kw, lines in obligation_line_refs.items()]) or '<div class="success-box">✅ No obligation keywords found in specific lines.</div>'}
785
  </div>
786
  """
787
 
788
- delay_details = f"""
 
 
 
 
 
 
 
 
 
 
 
789
  {delay_warning}
790
  <div class='delay-box'>
791
  <div class='section-title'>⏱ Delay Clause Details</div>
792
- {"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--info-color)'>•</span> {kw}</span><span class='count-value'>{count}</span></div>" for kw, count in delay_counts.items()])}
793
- </div>
794
- <div class='delay-box'>
795
- <div class='section-title'>⏱ Detailed Delay Line References</div>
796
- {"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--info-color)'>•</span> {kw} (Line(s): {lines})</span></div>" for kw, lines in delay_line_refs.items()]) or '<div class="success-box">✅ No delay keywords found in specific lines.</div>'}
797
  </div>
798
  """
799
 
@@ -813,10 +910,10 @@ def analyze_pdf(file_obj) -> List:
813
  'risk_level': risk_level,
814
  'record_id': record_id,
815
  'penalty_examples': extracted_data,
816
- 'penalty_details': "\n".join([f"{kw}: {count}" for kw, count in penalty_counts.items()]),
817
  'penalty_amounts': "\n".join([f"${amt:,.2f}" for amt in penalty_values[:5]]) if penalty_values else "",
818
- 'obligation_details': "\n".join([f"{kw}: {count}" for kw, count in obligation_counts.items()]),
819
- 'delay_details': "\n".join([f"{kw}: {count}" for kw, count in delay_counts.items()])
820
  }
821
 
822
  try:
@@ -896,10 +993,10 @@ def analyze_pdf(file_obj) -> List:
896
  </div>
897
  """,
898
  "", # Empty string for hidden risk visualization
899
- penalty_details,
900
  f"<div class='penalty-box'><div class='section-title'>💰 Penalty Amounts Found</div>{penalty_amounts}</div>",
901
- obligation_details,
902
- delay_details,
903
  f"<div class='result-box'><div class='section-title'>📜 Extracted Data</div>{extracted_data}</div>",
904
  sentiment_analysis_output,
905
  temp_file_path # Return temporary file path for PDF download
 
288
  border-radius: 10px;
289
  background-color: rgba(255,255,255,0.3);
290
  }
291
+ .keyword-match {
292
+ background-color: rgba(255, 255, 0, 0.3);
293
+ padding: 2px 4px;
294
+ border-radius: 3px;
295
+ font-weight: bold;
296
+ }
297
+ .match-detail {
298
+ margin-top: 5px;
299
+ padding: 8px;
300
+ background-color: rgba(0,0,0,0.05);
301
+ border-radius: 5px;
302
+ font-size: 14px;
303
+ }
304
+ .match-line {
305
+ font-family: monospace;
306
+ white-space: pre-wrap;
307
+ margin-bottom: 5px;
308
+ }
309
+ .match-context {
310
+ font-style: italic;
311
+ color: var(--secondary-color);
312
+ }
313
  /* Hide elements */
314
  footer, .gradio-footer, .hide, [data-testid="Use via API"], [data-testid="mmsettings"],
315
  #sentiment-analysis, #risk-visualization {
 
336
  .dark .count-item:hover {
337
  background-color: rgba(255,255,255,0.05);
338
  }
339
+ .dark .keyword-match {
340
+ background-color: rgba(255, 255, 0, 0.5);
341
+ color: black;
342
+ }
343
+ .dark .match-detail {
344
+ background-color: rgba(255,255,255,0.05);
345
+ }
346
  """
347
 
348
  # Salesforce credentials
 
580
  for page in pdf.pages:
581
  page_text = page.extract_text()
582
  if page_text:
583
+ text += page_text + "\n" # Add newline between pages
584
  return text
585
  except Exception as e:
586
  logger.error(f"PDF text extraction failed: {str(e)}")
587
  raise Exception(f"PDF text extraction failed: {str(e)}")
588
 
589
def find_keyword_matches(text: str, keywords: List[str], context_chars: int = 20) -> Dict[str, List[dict]]:
    """Find whole-word, case-insensitive keyword matches, line by line.

    Args:
        text: Document text. It is split on newline characters and the
            resulting lines are numbered starting at 1.
        keywords: Literal phrases to look for; regex metacharacters in a
            keyword are escaped before matching.
        context_chars: Number of characters kept on each side of a match
            when building the context snippet (clamped to the line bounds).

    Returns:
        A dict with one entry per keyword. Each value is a list of match
        records with the keys:
            'line_number' (int): 1-based line index of the match.
            'full_line' (str): the matching line, stripped, NOT HTML-escaped.
            'context' (str): HTML snippet with the surrounding text
                HTML-escaped and the matched text wrapped in
                <span class='keyword-match'>.
            'match' (str): the matched text exactly as it appears in the line.
        Keywords without matches map to an empty list.
    """
    # Function-scope import so this block does not rely on a module-level one.
    import html

    lines = text.split('\n')
    matches = {}

    for keyword in keywords:
        pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', flags=re.IGNORECASE)
        keyword_matches = []

        for line_num, line in enumerate(lines, 1):
            for match in pattern.finditer(line):
                start = max(0, match.start() - context_chars)
                end = min(len(line), match.end() + context_chars)

                # The snippet is emitted as HTML, so every piece of document
                # text is escaped; only the highlight <span> itself is markup.
                before = html.escape(line[start:match.start()])
                hit = html.escape(match.group())
                after = html.escape(line[match.end():end])
                highlighted_context = (
                    f"{before}<span class='keyword-match'>{hit}</span>{after}"
                )

                keyword_matches.append({
                    'line_number': line_num,
                    'full_line': line.strip(),
                    'context': highlighted_context,
                    'match': match.group(),
                })

        matches[keyword] = keyword_matches

    return matches
622
+
623
def count_keywords_with_details(text: str, keywords: List[str]) -> Dict[str, Dict]:
    """Summarize keyword hits as a count plus the underlying match records.

    Delegates the search to find_keyword_matches and returns, for every
    keyword, a dict with 'count' (number of match records) and 'matches'
    (the list of match records itself).
    """
    found = find_keyword_matches(text, keywords)
    # Pair each keyword with its match list (empty when nothing was found).
    per_keyword = ((term, found.get(term, [])) for term in keywords)
    return {
        term: {'count': len(hits), 'matches': hits}
        for term, hits in per_keyword
    }
636
 
637
  def find_penalty_values(text: str) -> List[float]:
638
  """Find penalty amounts in the text"""
 
762
  </div>
763
  """
764
 
765
def format_keyword_matches(matches: List[dict], max_matches: int = 5) -> str:
    """Render keyword match records as HTML detail blocks.

    Args:
        matches: Match records, each providing 'line_number', 'context' and
            'full_line'. 'context' is inserted as-is because it carries the
            highlight <span> markup; 'full_line' is plain document text and
            is HTML-escaped here before being inserted.
        max_matches: Maximum number of records rendered (the first ones in
            the list are kept).

    Returns:
        Concatenated <div class="match-detail"> blocks, or a success-box
        message when matches is empty.
    """
    # Function-scope import so this block does not rely on a module-level one.
    import html

    if not matches:
        return "<div class='success-box'>✅ No matches found for this keyword</div>"

    result = []
    for i, match in enumerate(matches[:max_matches], 1):
        # Raw document text must not be interpreted as markup by the browser.
        safe_line = html.escape(str(match['full_line']))
        result.append(f"""
        <div class="match-detail">
            <div><strong>Match {i}:</strong> Line {match['line_number']}</div>
            <div class="match-context">Context: {match['context']}</div>
            <div class="match-line">Full line: {safe_line}</div>
        </div>
        """)

    return "".join(result)
781
+
782
  def analyze_pdf(file_obj) -> List:
783
  """Main analysis function for Gradio interface"""
784
  try:
 
794
  text = extract_text_from_pdf(file_obj.name)
795
  if not text.strip():
796
  raise Exception("No text extracted from PDF. It might be a scanned document.")
 
 
797
  except Exception as e:
798
  raise Exception(f"PDF text extraction failed: {str(e)}")
799
 
 
807
  obligation_keywords = ["shall", "must", "required to", "obligated to", "duty"]
808
  delay_keywords = ["delay", "late", "overdue", "extension", "time is of the essence"]
809
 
810
+ # Get detailed keyword matches with line numbers and context
811
+ penalty_details = count_keywords_with_details(text, penalty_keywords)
812
+ obligation_details = count_keywords_with_details(text, obligation_keywords)
813
+ delay_details = count_keywords_with_details(text, delay_keywords)
814
 
815
+ total_penalties = sum(details['count'] for details in penalty_details.values())
816
+ total_obligations = sum(details['count'] for details in obligation_details.values())
817
+ total_delays = sum(details['count'] for details in delay_details.values())
818
 
819
+ penalty_values = find_penalty_values(text)
 
 
820
 
821
  # Generate warning messages with emojis
822
  penalty_warning = format_warning_message(total_penalties, "penalty", "💰")
 
837
  except Exception as e:
838
  raise Exception(f"Visual generation failed: {str(e)}")
839
 
840
+ # Format penalty details with match information
841
+ penalty_html = []
842
+ for keyword, details in penalty_details.items():
843
+ penalty_html.append(f"""
844
+ <div class='count-item'>
845
+ <span class='count-label'><span style='color: var(--danger-color)'>•</span> {keyword}</span>
846
+ <span class='count-value'>{details['count']}</span>
847
+ </div>
848
+ {format_keyword_matches(details['matches'])}
849
+ """)
850
+
851
+ penalty_details_html = f"""
852
  {penalty_warning}
853
  <div class='penalty-box'>
854
  <div class='section-title'>💰 Penalty Clause Details</div>
855
+ {"".join(penalty_html)}
 
 
 
 
856
  </div>
857
  """
858
 
859
+ # Format obligation details with match information
860
+ obligation_html = []
861
+ for keyword, details in obligation_details.items():
862
+ obligation_html.append(f"""
863
+ <div class='count-item'>
864
+ <span class='count-label'><span style='color: var(--warning-color)'>•</span> {keyword}</span>
865
+ <span class='count-value'>{details['count']}</span>
866
+ </div>
867
+ {format_keyword_matches(details['matches'])}
868
+ """)
869
+
870
+ obligation_details_html = f"""
871
  {obligation_warning}
872
  <div class='obligation-box'>
873
  <div class='section-title'>📝 Obligation Clause Details</div>
874
+ {"".join(obligation_html)}
 
 
 
 
875
  </div>
876
  """
877
 
878
+ # Format delay details with match information
879
+ delay_html = []
880
+ for keyword, details in delay_details.items():
881
+ delay_html.append(f"""
882
+ <div class='count-item'>
883
+ <span class='count-label'><span style='color: var(--info-color)'>•</span> {keyword}</span>
884
+ <span class='count-value'>{details['count']}</span>
885
+ </div>
886
+ {format_keyword_matches(details['matches'])}
887
+ """)
888
+
889
+ delay_details_html = f"""
890
  {delay_warning}
891
  <div class='delay-box'>
892
  <div class='section-title'>⏱ Delay Clause Details</div>
893
+ {"".join(delay_html)}
 
 
 
 
894
  </div>
895
  """
896
 
 
910
  'risk_level': risk_level,
911
  'record_id': record_id,
912
  'penalty_examples': extracted_data,
913
+ 'penalty_details': "\n".join([f"{kw}: {details['count']} matches" for kw, details in penalty_details.items()]),
914
  'penalty_amounts': "\n".join([f"${amt:,.2f}" for amt in penalty_values[:5]]) if penalty_values else "",
915
+ 'obligation_details': "\n".join([f"{kw}: {details['count']} matches" for kw, details in obligation_details.items()]),
916
+ 'delay_details': "\n".join([f"{kw}: {details['count']} matches" for kw, details in delay_details.items()])
917
  }
918
 
919
  try:
 
993
  </div>
994
  """,
995
  "", # Empty string for hidden risk visualization
996
+ penalty_details_html,
997
  f"<div class='penalty-box'><div class='section-title'>💰 Penalty Amounts Found</div>{penalty_amounts}</div>",
998
+ obligation_details_html,
999
+ delay_details_html,
1000
  f"<div class='result-box'><div class='section-title'>📜 Extracted Data</div>{extracted_data}</div>",
1001
  sentiment_analysis_output,
1002
  temp_file_path # Return temporary file path for PDF download