contract-risk-analyzer2

Sleeping

App Files Community

Kushalmanda committed on Jun 24

Commit

5c78870

verified ·

1 Parent(s): 4483f37

Update app.py

Browse files

Files changed (1) hide show

app.py +146 -49

app.py CHANGED Viewed

@@ -288,6 +288,28 @@ button:hover {
     border-radius: 10px;
     background-color: rgba(255,255,255,0.3);
 }
 /* Hide elements */
 footer, .gradio-footer, .hide, [data-testid="Use via API"], [data-testid="mmsettings"],
 #sentiment-analysis, #risk-visualization {
@@ -314,6 +336,13 @@ footer, .gradio-footer, .hide, [data-testid="Use via API"], [data-testid="mmsett
 .dark .count-item:hover {
     background-color: rgba(255,255,255,0.05);
 }
 """
 # Salesforce credentials
@@ -551,18 +580,59 @@ def extract_text_from_pdf(pdf_path: str) -> str:
             for page in pdf.pages:
                 page_text = page.extract_text()
                 if page_text:
-                    text += page_text
         return text
     except Exception as e:
         logger.error(f"PDF text extraction failed: {str(e)}")
         raise Exception(f"PDF text extraction failed: {str(e)}")
-def count_keywords(text: str, keywords: List[str]) -> Dict[str, int]:
-    """Count occurrences of keywords in text"""
-    counts = {}
     for keyword in keywords:
-        counts[keyword] = len(re.findall(r'\b' + re.escape(keyword) + r'\b', text, flags=re.IGNORECASE))
-    return counts
 def find_penalty_values(text: str) -> List[float]:
     """Find penalty amounts in the text"""
@@ -692,6 +762,23 @@ def format_clause_example(example: str, index: int) -> str:
     </div>
     """
 def analyze_pdf(file_obj) -> List:
     """Main analysis function for Gradio interface"""
     try:
@@ -707,8 +794,6 @@ def analyze_pdf(file_obj) -> List:
             text = extract_text_from_pdf(file_obj.name)
             if not text.strip():
                 raise Exception("No text extracted from PDF. It might be a scanned document.")
-            # Split text into lines for line number tracking
-            lines = text.split('\n')
         except Exception as e:
             raise Exception(f"PDF text extraction failed: {str(e)}")
@@ -722,15 +807,16 @@ def analyze_pdf(file_obj) -> List:
         obligation_keywords = ["shall", "must", "required to", "obligated to", "duty"]
         delay_keywords = ["delay", "late", "overdue", "extension", "time is of the essence"]
-        penalty_counts = count_keywords(text, penalty_keywords)
-        obligation_counts = count_keywords(text, obligation_keywords)
-        delay_counts = count_keywords(text, delay_keywords)
-        penalty_values = find_penalty_values(text)
-        total_penalties = sum(penalty_counts.values())
-        total_obligations = sum(obligation_counts.values())
-        total_delays = sum(delay_counts.values())
         # Generate warning messages with emojis
         penalty_warning = format_warning_message(total_penalties, "penalty", "💰")
@@ -751,49 +837,60 @@ def analyze_pdf(file_obj) -> List:
         except Exception as e:
             raise Exception(f"Visual generation failed: {str(e)}")
-        # Precompute line numbers for each keyword to avoid f-string backslash issues
-        def get_line_numbers(keyword, lines_list):
-            pattern = r'\b' + re.escape(keyword) + r'\b'
-            return ", ".join(str(i + 1) for i, line in enumerate(lines_list) if re.search(pattern, line, re.IGNORECASE)) or "None"
-        penalty_line_refs = {kw: get_line_numbers(kw, lines) for kw in penalty_keywords if penalty_counts.get(kw, 0) > 0}
-        obligation_line_refs = {kw: get_line_numbers(kw, lines) for kw in obligation_keywords if obligation_counts.get(kw, 0) > 0}
-        delay_line_refs = {kw: get_line_numbers(kw, lines) for kw in delay_keywords if delay_counts.get(kw, 0) > 0}
-        # Format details using precomputed line numbers
-        penalty_details = f"""
         {penalty_warning}
         <div class='penalty-box'>
             <div class='section-title'>💰 Penalty Clause Details</div>
-            {"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--danger-color)'>•</span> {kw}</span><span class='count-value'>{count}</span></div>" for kw, count in penalty_counts.items()])}
-        </div>
-        <div class='penalty-box'>
-            <div class='section-title'>💰 Detailed Penalty Line References</div>
-            {"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--danger-color)'>•</span> {kw} (Line(s): {lines})</span></div>" for kw, lines in penalty_line_refs.items()]) or '<div class="success-box">✅ No penalty keywords found in specific lines.</div>'}
         </div>
         """
-        obligation_details = f"""
         {obligation_warning}
         <div class='obligation-box'>
             <div class='section-title'>📝 Obligation Clause Details</div>
-            {"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--warning-color)'>•</span> {kw}</span><span class='count-value'>{count}</span></div>" for kw, count in obligation_counts.items()])}
-        </div>
-        <div class='obligation-box'>
-            <div class='section-title'>📝 Detailed Obligation Line References</div>
-            {"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--warning-color)'>•</span> {kw} (Line(s): {lines})</span></div>" for kw, lines in obligation_line_refs.items()]) or '<div class="success-box">✅ No obligation keywords found in specific lines.</div>'}
         </div>
         """
-        delay_details = f"""
         {delay_warning}
         <div class='delay-box'>
             <div class='section-title'>⏱ Delay Clause Details</div>
-            {"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--info-color)'>•</span> {kw}</span><span class='count-value'>{count}</span></div>" for kw, count in delay_counts.items()])}
-        </div>
-        <div class='delay-box'>
-            <div class='section-title'>⏱ Detailed Delay Line References</div>
-            {"".join([f"<div class='count-item'><span class='count-label'><span style='color: var(--info-color)'>•</span> {kw} (Line(s): {lines})</span></div>" for kw, lines in delay_line_refs.items()]) or '<div class="success-box">✅ No delay keywords found in specific lines.</div>'}
         </div>
         """
@@ -813,10 +910,10 @@ def analyze_pdf(file_obj) -> List:
             'risk_level': risk_level,
             'record_id': record_id,
             'penalty_examples': extracted_data,
-            'penalty_details': "\n".join([f"{kw}: {count}" for kw, count in penalty_counts.items()]),
             'penalty_amounts': "\n".join([f"${amt:,.2f}" for amt in penalty_values[:5]]) if penalty_values else "",
-            'obligation_details': "\n".join([f"{kw}: {count}" for kw, count in obligation_counts.items()]),
-            'delay_details': "\n".join([f"{kw}: {count}" for kw, count in delay_counts.items()])
         }
         try:
@@ -896,10 +993,10 @@ def analyze_pdf(file_obj) -> List:
             </div>
             """,
             "",  # Empty string for hidden risk visualization
-            penalty_details,
             f"<div class='penalty-box'><div class='section-title'>💰 Penalty Amounts Found</div>{penalty_amounts}</div>",
-            obligation_details,
-            delay_details,
             f"<div class='result-box'><div class='section-title'>📜 Extracted Data</div>{extracted_data}</div>",
             sentiment_analysis_output,
             temp_file_path  # Return temporary file path for PDF download

     border-radius: 10px;
     background-color: rgba(255,255,255,0.3);
 }
+.keyword-match {
+    background-color: rgba(255, 255, 0, 0.3);
+    padding: 2px 4px;
+    border-radius: 3px;
+    font-weight: bold;
+}
+.match-detail {
+    margin-top: 5px;
+    padding: 8px;
+    background-color: rgba(0,0,0,0.05);
+    border-radius: 5px;
+    font-size: 14px;
+}
+.match-line {
+    font-family: monospace;
+    white-space: pre-wrap;
+    margin-bottom: 5px;
+}
+.match-context {
+    font-style: italic;
+    color: var(--secondary-color);
+}
 /* Hide elements */
 footer, .gradio-footer, .hide, [data-testid="Use via API"], [data-testid="mmsettings"],
 #sentiment-analysis, #risk-visualization {
 .dark .count-item:hover {
     background-color: rgba(255,255,255,0.05);
 }
+.dark .keyword-match {
+    background-color: rgba(255, 255, 0, 0.5);
+    color: black;
+}
+.dark .match-detail {
+    background-color: rgba(255,255,255,0.05);
+}
 """
 # Salesforce credentials
             for page in pdf.pages:
                 page_text = page.extract_text()
                 if page_text:
+                    text += page_text + "\n"  # Add newline between pages
         return text
     except Exception as e:
         logger.error(f"PDF text extraction failed: {str(e)}")
         raise Exception(f"PDF text extraction failed: {str(e)}")
+def find_keyword_matches(text: str, keywords: List[str]) -> Dict[str, List[Dict[str, str]]]:
+    """Find all matches for keywords in text with line numbers and context"""
+    matches = {}
+    lines = text.split('\n')
     for keyword in keywords:
+        keyword_matches = []
+        pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', flags=re.IGNORECASE)
+        for line_num, line in enumerate(lines, 1):
+            line_matches = pattern.finditer(line)
+            for match in line_matches:
+                start = max(0, match.start() - 20)
+                end = min(len(line), match.end() + 20)
+                context = line[start:end]
+                # Highlight the matched keyword in the context
+                highlighted_context = (
+                    context[:match.start()-start] +
+                    f"<span class='keyword-match'>{context[match.start()-start:match.end()-start]}</span>" +
+                    context[match.end()-start:]
+                )
+                keyword_matches.append({
+                    'line_number': line_num,
+                    'full_line': line.strip(),
+                    'context': highlighted_context,
+                    'match': match.group()
+                })
+        matches[keyword] = keyword_matches
+    return matches
+def count_keywords_with_details(text: str, keywords: List[str]) -> Dict[str, Dict]:
+    """Count keyword occurrences with detailed match information"""
+    keyword_details = {}
+    matches = find_keyword_matches(text, keywords)
+    for keyword in keywords:
+        keyword_matches = matches.get(keyword, [])
+        keyword_details[keyword] = {
+            'count': len(keyword_matches),
+            'matches': keyword_matches
+        }
+    return keyword_details
 def find_penalty_values(text: str) -> List[float]:
     """Find penalty amounts in the text"""
     </div>
     """
+def format_keyword_matches(matches: List[Dict[str, str]]) -> str:
+    """Format keyword matches with line numbers and context"""
+    if not matches:
+        return "<div class='success-box'>✅ No matches found for this keyword</div>"
+    result = []
+    for i, match in enumerate(matches[:5], 1):  # Limit to top 5 matches per keyword
+        result.append(f"""
+        <div class="match-detail">
+            <div><strong>Match {i}:</strong> Line {match['line_number']}</div>
+            <div class="match-context">Context: {match['context']}</div>
+            <div class="match-line">Full line: {match['full_line']}</div>
+        </div>
+        """)
+    return "".join(result)
 def analyze_pdf(file_obj) -> List:
     """Main analysis function for Gradio interface"""
     try:
             text = extract_text_from_pdf(file_obj.name)
             if not text.strip():
                 raise Exception("No text extracted from PDF. It might be a scanned document.")
         except Exception as e:
             raise Exception(f"PDF text extraction failed: {str(e)}")
         obligation_keywords = ["shall", "must", "required to", "obligated to", "duty"]
         delay_keywords = ["delay", "late", "overdue", "extension", "time is of the essence"]
+        # Get detailed keyword matches with line numbers and context
+        penalty_details = count_keywords_with_details(text, penalty_keywords)
+        obligation_details = count_keywords_with_details(text, obligation_keywords)
+        delay_details = count_keywords_with_details(text, delay_keywords)
+        total_penalties = sum(details['count'] for details in penalty_details.values())
+        total_obligations = sum(details['count'] for details in obligation_details.values())
+        total_delays = sum(details['count'] for details in delay_details.values())
+        penalty_values = find_penalty_values(text)
         # Generate warning messages with emojis
         penalty_warning = format_warning_message(total_penalties, "penalty", "💰")
         except Exception as e:
             raise Exception(f"Visual generation failed: {str(e)}")
+        # Format penalty details with match information
+        penalty_html = []
+        for keyword, details in penalty_details.items():
+            penalty_html.append(f"""
+            <div class='count-item'>
+                <span class='count-label'><span style='color: var(--danger-color)'>•</span> {keyword}</span>
+                <span class='count-value'>{details['count']}</span>
+            </div>
+            {format_keyword_matches(details['matches'])}
+            """)
+        penalty_details_html = f"""
         {penalty_warning}
         <div class='penalty-box'>
             <div class='section-title'>💰 Penalty Clause Details</div>
+            {"".join(penalty_html)}
         </div>
         """
+        # Format obligation details with match information
+        obligation_html = []
+        for keyword, details in obligation_details.items():
+            obligation_html.append(f"""
+            <div class='count-item'>
+                <span class='count-label'><span style='color: var(--warning-color)'>•</span> {keyword}</span>
+                <span class='count-value'>{details['count']}</span>
+            </div>
+            {format_keyword_matches(details['matches'])}
+            """)
+        obligation_details_html = f"""
         {obligation_warning}
         <div class='obligation-box'>
             <div class='section-title'>📝 Obligation Clause Details</div>
+            {"".join(obligation_html)}
         </div>
         """
+        # Format delay details with match information
+        delay_html = []
+        for keyword, details in delay_details.items():
+            delay_html.append(f"""
+            <div class='count-item'>
+                <span class='count-label'><span style='color: var(--info-color)'>•</span> {keyword}</span>
+                <span class='count-value'>{details['count']}</span>
+            </div>
+            {format_keyword_matches(details['matches'])}
+            """)
+        delay_details_html = f"""
         {delay_warning}
         <div class='delay-box'>
             <div class='section-title'>⏱ Delay Clause Details</div>
+            {"".join(delay_html)}
         </div>
         """
             'risk_level': risk_level,
             'record_id': record_id,
             'penalty_examples': extracted_data,
+            'penalty_details': "\n".join([f"{kw}: {details['count']} matches" for kw, details in penalty_details.items()]),
             'penalty_amounts': "\n".join([f"${amt:,.2f}" for amt in penalty_values[:5]]) if penalty_values else "",
+            'obligation_details': "\n".join([f"{kw}: {details['count']} matches" for kw, details in obligation_details.items()]),
+            'delay_details': "\n".join([f"{kw}: {details['count']} matches" for kw, details in delay_details.items()])
         }
         try:
             </div>
             """,
             "",  # Empty string for hidden risk visualization
+            penalty_details_html,
             f"<div class='penalty-box'><div class='section-title'>💰 Penalty Amounts Found</div>{penalty_amounts}</div>",
+            obligation_details_html,
+            delay_details_html,
             f"<div class='result-box'><div class='section-title'>📜 Extracted Data</div>{extracted_data}</div>",
             sentiment_analysis_output,
             temp_file_path  # Return temporary file path for PDF download