Spaces:

daniel-wojahn
/

ttm-webapp-hf

Sleeping

App Files Files Community

daniel-wojahn committed 7 days ago

Commit

671c107

1 Parent(s): e30d4c0

Deleted beta feature (will be moved to a separate app)

Browse files

Files changed (4) hide show

.gitignore +1 -2
app.py +14 -109
pipeline/differential_viz.py +0 -264
pipeline/structural_analysis.py +0 -332

.gitignore CHANGED Viewed

@@ -1,5 +1,4 @@
 venv
 __pycache__
 academic_article.md
-#structural_analysis.py
-#differential_viz.py

 venv
 __pycache__
 academic_article.md
+tibetan_collation_prototype

app.py CHANGED Viewed

@@ -135,17 +135,9 @@ def main_interface():
         metrics_preview = gr.Dataframe(
             label="Similarity Metrics Preview", interactive=False, visible=True
         )
-        # States to hold data for optional structural analysis
         state_text_data = gr.State()
         state_df_results = gr.State()
-        # Deferred structural analysis trigger
-        structural_btn = gr.Button(
-            "Run Structural Analysis (time-consuming)",
-            variant="secondary",
-            interactive=False,
-            elem_id="structural-btn"
-        )
         # LLM Interpretation components
         with gr.Row():
@@ -175,10 +167,10 @@ def main_interface():
         # Heatmap tabs for each metric
         heatmap_titles = {
-            "Jaccard Similarity (%)": "Jaccard Similarity (%): Higher scores (darker) mean more shared unique words.",
-            "Normalized LCS": "Normalized LCS: Higher scores (darker) mean longer shared sequences of words.",
-            "Fuzzy Similarity": "Fuzzy Similarity: Higher scores (darker) mean more similar text with fuzzy matching tolerance for variations.",
-            "Semantic Similarity": "Semantic Similarity (using word embeddings/experimental): Higher scores (darker) mean more similar meanings.",
             "Word Counts": "Word Counts: Bar chart showing the number of words in each segment after tokenization.",
         }
@@ -305,30 +297,7 @@ The structural analysis combines multiple similarity metrics to create a compreh
                         heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False, elem_classes="metric-heatmap")
             # Structural Analysis Tab
-            with gr.Tab("Structural Analysis"):
-                with gr.Accordion("ℹ️ About", open=False, elem_classes="metric-info-accordion structural-info"):
-                    if "Structural Analysis" in metric_tooltips:
-                        gr.Markdown(value=metric_tooltips["Structural Analysis"], elem_classes="metric-description")
-                    else:
-                        gr.Markdown(value="### Structural Analysis\nDescription not found.")
-                gr.Markdown("""
-                ### Structural Analysis for Tibetan Legal Manuscripts
-                This analysis identifies potential source-target relationships between text segments, helping to reconstruct stemmatic relationships.
-                Click the "Run Structural Analysis" button below after computing the basic metrics to perform this advanced analysis.
-                """)
-                # Structural analysis outputs
-                structural_heatmap = gr.Plot(label="Structural Changes Summary", show_label=False, elem_classes="structural-heatmap")
-                structural_report = gr.HTML(label="Differential Analysis Report")
-                structural_export = gr.File(label="Export Structural Analysis Report", file_types=[".html", ".md", ".json"])
-        # The outputs in process_btn.click should use the short metric names as keys for heatmap_tabs
-        # e.g., heatmap_tabs["Jaccard Similarity (%)"]
-        # Ensure the plot is part of the layout. This assumes plots are displayed sequentially
-        # within the current gr.Tab("Results"). If they are in specific TabItems, this needs adjustment.
         # For now, this modification focuses on creating the plot object and making it an output.
         # The visual placement depends on how Gradio renders children of gr.Tab or if there's another container.
@@ -365,12 +334,8 @@ The structural analysis combines multiple similarity metrics to create a compreh
             fuzzy_heatmap_res = None
             semantic_heatmap_res = None
             warning_update_res = gr.update(visible=False)
-            structural_heatmap_res = None
-            structural_report_res = None
-            structural_export_res = None
             state_text_data_res = None
             state_df_results_res = None
-            structural_btn_update_res = gr.update(interactive=False)  # Default: disabled
             # Create a ProgressiveUI instance for handling progressive updates
             progressive_ui = ProgressiveUI(
@@ -382,8 +347,7 @@ The structural analysis combines multiple similarity metrics to create a compreh
                 semantic_heatmap=heatmap_tabs["Semantic Similarity"],
                 warning_box=warning_box,
                 progress_container=progress_container,
-                heatmap_titles=heatmap_titles,
-                structural_btn=structural_btn
             )
             # Make progress container visible during analysis
@@ -402,12 +366,8 @@ The structural analysis combines multiple similarity metrics to create a compreh
                     None,  # fuzzy_heatmap
                     None,  # semantic_heatmap
                     None,  # warning update
-                    None,  # structural_heatmap
-                    None,  # structural_report
-                    None,  # structural_export
                     None,  # state_text_data
-                    None,  # state_df_results
-                    gr.update(interactive=False),  # structural_btn
                 )
             # Check file size limits (10MB per file)
@@ -423,12 +383,8 @@ The structural analysis combines multiple similarity metrics to create a compreh
                         None,  # fuzzy_heatmap
                         None,  # semantic_heatmap
                         gr.update(value=f"Error: File '{Path(file.name).name}' exceeds the 10MB size limit.", visible=True),
-                        None,  # structural_heatmap
-                        None,  # structural_report
-                        None,  # structural_export
                         None,  # state_text_data
-                        None,  # state_df_results
-                        gr.update(interactive=False),  # structural_btn
                     )
             try:
@@ -470,12 +426,8 @@ The structural analysis combines multiple similarity metrics to create a compreh
                                 None,  # fuzzy_heatmap
                                 None,  # semantic_heatmap
                                 gr.update(value=f"Error: Could not decode file '{filename}'.", visible=True),
-                                None,  # structural_heatmap
-                                None,  # structural_report
-                                None,  # structural_export
                                 None,  # state_text_data
-                                None,  # state_df_results
-                                gr.update(interactive=False),  # structural_btn
                             )
                 # Configure semantic similarity and fuzzy matching
@@ -519,8 +471,7 @@ The structural analysis combines multiple similarity metrics to create a compreh
                     warning_message = "No common chapters found or results are empty. " + (warning_raw or "")
                     metrics_preview_df_res = pd.DataFrame({"Message": [warning_message]})
                     warning_update_res = gr.update(value=warning_md or warning_message, visible=True)
-                    # keep structural disabled
-                    structural_btn_update_res = gr.update(interactive=False)
                 else:
                     # Generate visualizations
                     if progress is not None:
@@ -542,11 +493,10 @@ The structural analysis combines multiple similarity metrics to create a compreh
                             logger.warning(f"Progress update error (non-critical): {e}")
                     word_count_fig_res = generate_word_count_chart(word_counts_df_data)
-                    # Enable structural analysis button and store states for deferred run
-                    structural_btn_update_res = gr.update(interactive=True, value="Run Structural Analysis (time-consuming)")
                     state_text_data_res = text_data
                     state_df_results_res = df_results
-                    logger.info("Enabling structural analysis button")
                     # Save results to CSV
                     if progress is not None:
@@ -587,12 +537,8 @@ The structural analysis combines multiple similarity metrics to create a compreh
                 fuzzy_heatmap_res,
                 semantic_heatmap_res,
                 warning_update_res,
-                structural_heatmap_res,
-                structural_report_res,
-                structural_export_res,
                 state_text_data_res,
                 state_df_results_res,
-                structural_btn_update_res,
             )
         # Function to interpret results using LLM
@@ -641,53 +587,12 @@ The structural analysis combines multiple similarity metrics to create a compreh
                 heatmap_tabs["Fuzzy Similarity"],
                 heatmap_tabs["Semantic Similarity"],
                 warning_box,
-                structural_heatmap,
-                structural_report,
-                structural_export,
                 state_text_data,
                 state_df_results,
-                structural_btn,
             ]
         )
-        # Separate handler to run structural analysis on demand
-        def run_structural(text_data_state, df_results_state, progress=gr.Progress()):
-            if text_data_state is None or df_results_state is None:
-                return None, "<p>No initial results available. Please run the analysis first.</p>", None
-            # Progress for structural
-            try:
-                progress(0.1, desc="Generating structural analysis...")
-            except Exception:
-                pass
-            from pipeline.differential_viz import create_differential_heatmap, create_change_detection_report
-            # Create structural heatmap
-            try:
-                struct_heatmap = create_differential_heatmap(text_data_state, "all_chapters", df_results_state)
-            except Exception as e:
-                logger.warning(f"Could not generate structural heatmap: {e}")
-                struct_heatmap = None
-            # Create report
-            try:
-                struct_report = create_change_detection_report(text_data_state, "all_chapters", "html")
-            except Exception as e:
-                logger.warning(f"Could not generate structural report: {e}")
-                struct_report = "<p>Could not generate structural analysis report.</p>"
-            # Save report
-            try:
-                report_path = "structural_analysis_report.html"
-                with open(report_path, 'w', encoding='utf-8') as f:
-                    f.write(struct_report if isinstance(struct_report, str) else "")
-                struct_export = report_path
-            except Exception as e:
-                logger.warning(f"Could not save structural report: {e}")
-                struct_export = None
-            return struct_heatmap, struct_report, struct_export
-        structural_btn.click(
-            fn=run_structural,
-            inputs=[state_text_data, state_df_results],
-            outputs=[structural_heatmap, structural_report, structural_export]
-        )
         # Connect the interpret button
         interpret_btn.click(

         metrics_preview = gr.Dataframe(
             label="Similarity Metrics Preview", interactive=False, visible=True
         )
+        # States for data persistence
         state_text_data = gr.State()
         state_df_results = gr.State()
         # LLM Interpretation components
         with gr.Row():
         # Heatmap tabs for each metric
         heatmap_titles = {
+            "Jaccard Similarity (%)": "Higher scores mean more shared unique words.",
+            "Normalized LCS": "Higher scores mean longer shared sequences of words.",
+            "Fuzzy Similarity": "Higher scores mean more similar text with fuzzy matching tolerance for variations.",
+            "Semantic Similarity": "Higher scores mean more similar meanings.",
             "Word Counts": "Word Counts: Bar chart showing the number of words in each segment after tokenization.",
         }
                         heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False, elem_classes="metric-heatmap")
             # Structural Analysis Tab
+            # Structural analysis tab removed - see dedicated collation app
         # For now, this modification focuses on creating the plot object and making it an output.
         # The visual placement depends on how Gradio renders children of gr.Tab or if there's another container.
             fuzzy_heatmap_res = None
             semantic_heatmap_res = None
             warning_update_res = gr.update(visible=False)
             state_text_data_res = None
             state_df_results_res = None
             # Create a ProgressiveUI instance for handling progressive updates
             progressive_ui = ProgressiveUI(
                 semantic_heatmap=heatmap_tabs["Semantic Similarity"],
                 warning_box=warning_box,
                 progress_container=progress_container,
+                heatmap_titles=heatmap_titles
             )
             # Make progress container visible during analysis
                     None,  # fuzzy_heatmap
                     None,  # semantic_heatmap
                     None,  # warning update
                     None,  # state_text_data
+                    None  # state_df_results
                 )
             # Check file size limits (10MB per file)
                         None,  # fuzzy_heatmap
                         None,  # semantic_heatmap
                         gr.update(value=f"Error: File '{Path(file.name).name}' exceeds the 10MB size limit.", visible=True),
                         None,  # state_text_data
+                        None  # state_df_results
                     )
             try:
                                 None,  # fuzzy_heatmap
                                 None,  # semantic_heatmap
                                 gr.update(value=f"Error: Could not decode file '{filename}'.", visible=True),
                                 None,  # state_text_data
+                                None  # state_df_results
                             )
                 # Configure semantic similarity and fuzzy matching
                     warning_message = "No common chapters found or results are empty. " + (warning_raw or "")
                     metrics_preview_df_res = pd.DataFrame({"Message": [warning_message]})
                     warning_update_res = gr.update(value=warning_md or warning_message, visible=True)
+                    # No structural analysis in this app
                 else:
                     # Generate visualizations
                     if progress is not None:
                             logger.warning(f"Progress update error (non-critical): {e}")
                     word_count_fig_res = generate_word_count_chart(word_counts_df_data)
+                    # Store state data for potential future use
                     state_text_data_res = text_data
                     state_df_results_res = df_results
+                    logger.info("Analysis complete, storing state data")
                     # Save results to CSV
                     if progress is not None:
                 fuzzy_heatmap_res,
                 semantic_heatmap_res,
                 warning_update_res,
                 state_text_data_res,
                 state_df_results_res,
             )
         # Function to interpret results using LLM
                 heatmap_tabs["Fuzzy Similarity"],
                 heatmap_tabs["Semantic Similarity"],
                 warning_box,
                 state_text_data,
                 state_df_results,
             ]
         )
+        # Structural analysis functionality removed - see dedicated collation app
         # Connect the interpret button
         interpret_btn.click(

pipeline/differential_viz.py DELETED Viewed

@@ -1,264 +0,0 @@
-"""
-Differential visualization enhancements for Tibetan legal manuscript analysis.
-Provides enhanced heatmaps with structural change highlighting.
-"""
-import plotly.graph_objects as go
-from typing import Dict, List
-import pandas as pd
-from .structural_analysis import detect_structural_changes, generate_structural_alignment
-def create_differential_heatmap(texts_dict: Dict[str, str],
-                               chapter_key: str,
-                               metric_results: pd.DataFrame,
-                               highlight_threshold: float = 0.7) -> go.Figure:
-    """
-    Create enhanced heatmap with structural change highlighting.
-    Args:
-        texts_dict: Dictionary mapping text names to their content
-        chapter_key: Chapter identifier being analyzed
-        metric_results: DataFrame with similarity metrics
-        highlight_threshold: Threshold for highlighting significant changes
-    """
-    # Get unique text pairs
-    text_pairs = metric_results['Text Pair'].unique()
-    # Create enhanced heatmap data
-    enhanced_data = []
-    for pair in text_pairs:
-        texts = pair.split(' vs ')
-        if len(texts) == 2:
-            text1_name, text2_name = texts
-            # Get actual text content
-            text1_content = texts_dict.get(text1_name, '')
-            text2_content = texts_dict.get(text2_name, '')
-            # Perform structural analysis
-            changes = detect_structural_changes(text1_content, text2_content)
-            alignment = generate_structural_alignment(text1_content, text2_content)
-            # Create enhanced metrics
-            enhanced_row = {
-                'Text Pair': pair,
-                'Chapter': chapter_key,
-                'structural_changes': len(changes['insertions']) + len(changes['deletions']) + len(changes['modifications']),
-                'modification_score': len(changes['modifications']),
-                'insertion_score': len(changes['insertions']),
-                'deletion_score': len(changes['deletions']),
-                'alignment_quality': len(alignment['matches']) / max(len(alignment['segments1']) + len(alignment['segments2']), 1),
-                'significant_differences': len([c for c in changes['modifications'] if len(c['original']) > 10])
-            }
-            enhanced_data.append(enhanced_row)
-    # Create a clean table with numbers and percentages
-    summary_table = []
-    for row in enhanced_data:
-        text_pair = row['Text Pair']
-        chapter = row['Chapter']
-        # Calculate percentages
-        total_changes = row['structural_changes']
-        modifications = row['modification_score']
-        insertions_deletions = row['insertion_score'] + row['deletion_score']
-        alignment_quality = row['alignment_quality']
-        # Create summary row
-        summary_row = {
-            'Text Pair': text_pair,
-            'Chapter': chapter,
-            'Total Changes': total_changes,
-            'Modifications': modifications,
-            'Insertions/Deletions': insertions_deletions,
-            'Alignment Quality': f"{alignment_quality:.1f}%",
-            'Significant Differences': row['significant_differences']
-        }
-        summary_table.append(summary_row)
-    # Create DataFrame for table display
-    summary_df = pd.DataFrame(summary_table)
-    # Create a simple table with styling
-    fig = go.Figure(data=[go.Table(
-        header=dict(
-            values=['Text Pair', 'Chapter', 'Total Changes', 'Modifications',
-                   'Insertions/Deletions', 'Alignment Quality', 'Significant Differences'],
-            font=dict(size=12, color='white'),
-            fill_color='darkblue',
-            align='left'
-        ),
-        cells=dict(
-            values=[
-                summary_df['Text Pair'],
-                summary_df['Chapter'],
-                summary_df['Total Changes'],
-                summary_df['Modifications'],
-                summary_df['Insertions/Deletions'],
-                summary_df['Alignment Quality'],
-                summary_df['Significant Differences']
-            ],
-            font=dict(size=11),
-            align='left',
-            fill_color=['lightgrey' if i % 2 == 0 else 'white'
-                       for i in range(len(summary_df))]
-        )
-    )])
-    fig.update_layout(
-        title="Structural Analysis Summary",
-        height=400,
-        margin=dict(l=10, r=10, t=40, b=10)
-    )
-    return fig
-def create_change_detection_report(texts_dict: Dict[str, str],
-                                 chapter_key: str,
-                                 output_format: str = 'html') -> str:
-    """
-    Create detailed change detection report for a chapter.
-    Args:
-        texts_dict: Dictionary mapping text names to content
-        chapter_key: Chapter identifier
-        output_format: Format for output ('html', 'json', 'markdown')
-    """
-    from .structural_analysis import generate_differential_report
-    text_names = list(texts_dict.keys())
-    reports = []
-    for i, text1_name in enumerate(text_names):
-        for text2_name in text_names[i+1:]:
-            text1_content = texts_dict[text1_name]
-            text2_content = texts_dict[text2_name]
-            report = generate_differential_report(
-                text1_content, text2_content, text1_name, text2_name
-            )
-            reports.append(report)
-    if output_format == 'html':
-        return create_html_report(reports, chapter_key)
-    elif output_format == 'json':
-        import json
-        return json.dumps(reports, indent=2, ensure_ascii=False)
-    else:
-        return create_markdown_report(reports, chapter_key)
-def create_html_report(reports: List[Dict], chapter_key: str) -> str:
-    """Create HTML report for structural analysis."""
-    html = f"""
-    <!DOCTYPE html>
-    <html>
-    <head>
-        <title>Structural Analysis Report - Chapter {chapter_key}</title>
-        <style>
-            body {{ font-family: Arial, sans-serif; margin: 20px; }}
-            .report {{ max-width: 1200px; margin: 0 auto; }}
-            .comparison {{ border: 1px solid #ddd; margin: 20px 0; padding: 15px; }}
-            .changes {{ display: flex; gap: 20px; }}
-            .change-type {{ flex: 1; padding: 10px; border: 1px solid #eee; }}
-            .insertion {{ background-color: #e8f5e8; }}
-            .deletion {{ background-color: #ffe8e8; }}
-            .modification {{ background-color: #fff3e0; }}
-            .highlight {{ background-color: yellow; padding: 2px 4px; }}
-        </style>
-    </head>
-    <body>
-        <div class="report">
-            <h1>Structural Analysis Report - Chapter {chapter_key}</h1>
-    """
-    for report in reports:
-        html += f"""
-            <div class="comparison">
-                <h2>{report['file1']} vs {report['file2']}</h2>
-                <div class="scores">
-                    <p><strong>Structural Similarity:</strong> {report['scores']['structural_similarity']:.2f}</p>
-                    <p><strong>Alignment Score:</strong> {report['scores']['alignment_score']:.2f}</p>
-                </div>
-                <div class="changes">
-                    <div class="change-type insertion">
-                        <h3>Insertions ({len(report['changes']['insertions'])})</h3>
-                        {format_changes_html(report['changes']['insertions'])}
-                    </div>
-                    <div class="change-type deletion">
-                        <h3>Deletions ({len(report['changes']['deletions'])})</h3>
-                        {format_changes_html(report['changes']['deletions'])}
-                    </div>
-                    <div class="change-type modification">
-                        <h3>Modifications ({len(report['changes']['modifications'])})</h3>
-                        {format_changes_html(report['changes']['modifications'], is_modification=True)}
-                    </div>
-                </div>
-            </div>
-        """
-    html += """
-        </div>
-    </body>
-    </html>
-    """
-    return html
-def format_changes_html(changes: List[Dict], is_modification: bool = False) -> str:
-    """Format changes for HTML display."""
-    if not changes:
-        return "<p>No changes detected.</p>"
-    html = ""
-    for change in changes[:5]:  # Limit to first 5 for brevity
-        if is_modification:
-            html += f"""
-            <div class="change">
-                <span class="highlight">{change.get('original', '')}</span> →
-                <span class="highlight">{change.get('replacement', '')}</span>
-            </div>
-            """
-        else:
-            html += f"""
-            <div class="change">
-                <span class="highlight">{change.get('word', '')}</span>
-            </div>
-            """
-    if len(changes) > 5:
-        html += f"<p>... and {len(changes) - 5} more</p>"
-    return html
-def create_markdown_report(reports: List[Dict], chapter_key: str) -> str:
-    """Create markdown report for structural analysis."""
-    md = f"# Structural Analysis Report - Chapter {chapter_key}\n\n"
-    for report in reports:
-        md += f"## {report['file1']} vs {report['file2']}\n\n"
-        md += f"- **Structural Similarity**: {report['scores']['structural_similarity']:.2f}\n"
-        md += f"- **Alignment Score**: {report['scores']['alignment_score']:.2f}\n"
-        md += f"- **Insertions**: {len(report['changes']['insertions'])}\n"
-        md += f"- **Deletions**: {len(report['changes']['deletions'])}\n"
-        md += f"- **Modifications**: {len(report['changes']['modifications'])}\n\n"
-        if report['changes']['modifications']:
-            md += "### Significant Modifications:\n"
-            for mod in report['changes']['modifications'][:3]:
-                md += f"- **{mod.get('original', '')}** → **{mod.get('replacement', '')}**\n"
-    return md

pipeline/structural_analysis.py DELETED Viewed

@@ -1,332 +0,0 @@
-"""
-Chapter-level structural analysis for Tibetan legal manuscripts.
-Enhanced with Juxta/CollateX-inspired advanced alignment algorithms.
-"""
-import difflib
-import re
-import logging
-from thefuzz import fuzz
-from .advanced_alignment import enhanced_structural_analysis
-logger = logging.getLogger(__name__)
-def detect_structural_changes(text1: str, text2: str,
-                           min_change_length: int = 5,
-                           context_window: int = 10) -> dict:
-    """
-    Compare two Tibetan text chapters word by word and collect the differences.
-    Args:
-        text1: First text chapter
-        text2: Second text chapter
-        min_change_length: Shortest word (in characters) recorded as an
-            insertion or deletion; shorter words only advance the offsets
-        context_window: Characters of context taken on each side of a change
-    Returns:
-        Dictionary with 'insertions', 'deletions', 'modifications' and
-        'unchanged' lists. Insertions and deletions that were paired into a
-        modification are no longer listed separately.
-    """
-    # Whitespace runs collapse to single spaces, so character offsets can be
-    # derived from word lengths alone
-    normalized1 = re.sub(r'\s+', ' ', text1.strip())
-    normalized2 = re.sub(r'\s+', ' ', text2.strip())
-    word_diff = list(difflib.Differ().compare(normalized1.split(), normalized2.split()))
-    unchanged = []
-    deletions = []
-    insertions = []
-    # Character offset of the current word within each normalized text
-    offset1 = 0
-    offset2 = 0
-    for entry in word_diff:
-        marker = entry[:2]
-        token = entry[2:]
-        advance = len(token) + 1  # the word plus its separating space
-        if marker == '  ':  # Present in both texts
-            unchanged.append({
-                'word': token,
-                'position1': offset1,
-                'position2': offset2,
-                'length': len(token)
-            })
-            offset1 += advance
-            offset2 += advance
-        elif marker == '- ':  # Only in the first text
-            if len(token) >= min_change_length:
-                deletions.append({
-                    'word': token,
-                    'position': offset1,
-                    'length': len(token),
-                    'context': get_context(normalized1, offset1, context_window)
-                })
-            offset1 += advance
-        elif marker == '+ ':  # Only in the second text
-            if len(token) >= min_change_length:
-                insertions.append({
-                    'word': token,
-                    'position': offset2,
-                    'length': len(token),
-                    'context': get_context(normalized2, offset2, context_window)
-                })
-            offset2 += advance
-    # Pairing removes the matched entries from both lists in place
-    modifications = detect_modifications(deletions, insertions)
-    return {
-        'insertions': insertions,
-        'deletions': deletions,
-        'modifications': modifications,
-        'unchanged': unchanged
-    }
-def get_context(text: str, position: int, window: int) -> str:
-    """
-    Return the slice of text reaching `window` characters to either side of `position`.
-    The lower bound is clamped to 0 and the upper bound to len(text).
-    """
-    lower = position - window
-    if lower < 0:
-        lower = 0
-    upper = position + window
-    if upper > len(text):
-        upper = len(text)
-    return text[lower:upper]
-def detect_modifications(deletions: list[dict], insertions: list[dict]) -> list[dict]:
-    """
-    Pair deletions with insertions and report each pair as a modification.
-    Both input lists are mutated in place: every paired entry is removed, so
-    after the call they hold only the unpaired deletions and insertions.
-    Args:
-        deletions: Records providing 'word', 'position' and 'context'.
-        insertions: Records providing 'word', 'position' and 'context'.
-    Returns:
-        Modification records with 'original', 'replacement', 'position' (the
-        deletion's position), 'deletion_context', 'insertion_context' and
-        'similarity' (the token_set_ratio score divided by 100). Pairs found
-        in the second pass additionally carry 'fuzzy_matched': True.
-    """
-    modifications = []
-    # First pass: match by position proximity
-    # Proximity is the only pairing criterion here; the similarity score is
-    # recorded but no threshold is applied to it.
-    # NOTE(review): detect_structural_changes passes offsets into two different
-    # texts as the positions compared below - confirm that is intended.
-    for deletion in deletions[:]:  # Copy to avoid modification during iteration
-        # The insertions copy is re-taken for every deletion, so insertions
-        # consumed by an earlier pair are not offered again
-        for insertion in insertions[:]:
-            # If deletion and insertion are close (within 5 positions)
-            if abs(deletion['position'] - insertion['position']) <= 5:
-                # Calculate fuzzy similarity score
-                similarity = fuzz.token_set_ratio(deletion['word'], insertion['word']) / 100.0
-                modifications.append({
-                    'original': deletion['word'],
-                    'replacement': insertion['word'],
-                    'position': deletion['position'],
-                    'deletion_context': deletion['context'],
-                    'insertion_context': insertion['context'],
-                    'similarity': similarity
-                })
-                # Remove from original lists to avoid duplicates
-                if deletion in deletions:
-                    deletions.remove(deletion)
-                if insertion in insertions:
-                    insertions.remove(insertion)
-                # Each deletion is paired with at most one insertion
-                break
-    # Second pass: use fuzzy matching for remaining items that might be related
-    # but not positionally close
-    remaining_deletions = deletions[:]
-    for deletion in remaining_deletions:
-        if not insertions:  # No insertions left to match
-            break
-        # Find best fuzzy match among remaining insertions
-        best_match = None
-        best_score = 0
-        best_idx = -1
-        for i, insertion in enumerate(insertions):
-            score = fuzz.token_set_ratio(deletion['word'], insertion['word'])
-            if score > 60 and score > best_score:  # Threshold of 60% similarity
-                best_score = score
-                best_match = insertion
-                best_idx = i
-        if best_match:
-            modifications.append({
-                'original': deletion['word'],
-                'replacement': best_match['word'],
-                'position': deletion['position'],
-                'deletion_context': deletion['context'],
-                'insertion_context': best_match['context'],
-                'similarity': best_score / 100.0,
-                'fuzzy_matched': True
-            })
-            # Remove matched items
-            if deletion in deletions:
-                deletions.remove(deletion)
-            insertions.pop(best_idx)
-    return modifications
-def generate_structural_alignment(text1: str, text2: str) -> dict[str, list]:
-    """
-    Generate a structural alignment of two texts in the legacy dictionary format.
-    The enhanced analysis from the advanced_alignment module is tried first. If
-    it raises, a warning is logged and a basic difflib alignment over
-    punctuation-delimited segments is returned instead.
-    Returns:
-        Dictionary with 'matches', 'gaps' and 'mismatches' lists plus
-        'segments1' / 'segments2', the aligned units belonging to each text.
-        Entries produced by the enhanced path also carry a 'confidence' value.
-    """
-    try:
-        # Use enhanced alignment from advanced_alignment module
-        result = enhanced_structural_analysis(text1, text2)
-        # Convert to legacy format for backward compatibility
-        alignment = {
-            'matches': [],
-            'gaps': [],
-            'mismatches': [],
-            'segments1': [],
-            'segments2': []
-        }
-        # segments1/segments2 are filled alongside the categorised entries so
-        # that consumers which derive ratios from their lengths see the same
-        # shape as on the fallback path instead of two empty lists
-        for segment in result.get('alignment_segments', []):
-            kind = segment['type']
-            if kind == 'match':
-                alignment['matches'].append({
-                    'segments1': [segment['content1']],
-                    'segments2': [segment['content2']],
-                    'type': 'match',
-                    'confidence': segment['confidence']
-                })
-                alignment['segments1'].append(segment['content1'])
-                alignment['segments2'].append(segment['content2'])
-            elif kind == 'insertion':
-                alignment['gaps'].append({
-                    'segments': [segment['content2']],
-                    'type': 'insertion',
-                    'position': 'text2',
-                    'confidence': segment['confidence']
-                })
-                alignment['segments2'].append(segment['content2'])
-            elif kind == 'deletion':
-                alignment['gaps'].append({
-                    'segments': [segment['content1']],
-                    'type': 'deletion',
-                    'position': 'text1',
-                    'confidence': segment['confidence']
-                })
-                alignment['segments1'].append(segment['content1'])
-            elif kind in ('mismatch', 'modification'):
-                alignment['mismatches'].append({
-                    'original': [segment['content1']],
-                    'replacement': [segment['content2']],
-                    'type': 'modification',
-                    'confidence': segment['confidence']
-                })
-                alignment['segments1'].append(segment['content1'])
-                alignment['segments2'].append(segment['content2'])
-        return alignment
-    except Exception as e:
-        # Deliberate best effort: any failure of the enhanced path, including a
-        # malformed segment record, falls back to the basic alignment
-        logger.warning("Enhanced alignment failed, falling back to basic: %s", e)
-        def split_into_segments(text):
-            # Split on the punctuation marks in the character class and drop empty pieces
-            segments = re.split(r'[།༎༏༐༑༔]', text)
-            return [seg.strip() for seg in segments if seg.strip()]
-        segments1 = split_into_segments(text1)
-        segments2 = split_into_segments(text2)
-        matcher = difflib.SequenceMatcher(None, segments1, segments2)
-        alignment = {
-            'matches': [],
-            'gaps': [],
-            'mismatches': [],
-            'segments1': segments1,
-            'segments2': segments2
-        }
-        # Each opcode describes a run of segments, so one entry may span several
-        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
-            if tag == 'equal':
-                alignment['matches'].append({
-                    'segments1': segments1[i1:i2],
-                    'segments2': segments2[j1:j2],
-                    'type': 'match'
-                })
-            elif tag == 'delete':
-                alignment['gaps'].append({
-                    'segments': segments1[i1:i2],
-                    'type': 'deletion',
-                    'position': 'text1'
-                })
-            elif tag == 'insert':
-                alignment['gaps'].append({
-                    'segments': segments2[j1:j2],
-                    'type': 'insertion',
-                    'position': 'text2'
-                })
-            elif tag == 'replace':
-                alignment['mismatches'].append({
-                    'original': segments1[i1:i2],
-                    'replacement': segments2[j1:j2],
-                    'type': 'modification'
-                })
-        return alignment
-def calculate_structural_similarity_score(text1: str, text2: str) -> dict[str, float]:
-    """
-    Calculate structural similarity scores between two texts.
-    Returns:
-        Dictionary with:
-        'structural_similarity': 1 minus the number of reported changes per
-            word of the longer text, floored at 0.0
-        'alignment_score': matched segments divided by the mean segment count
-            of the two texts, capped at 1.0
-        'insertions', 'deletions', 'modifications', 'total_changes': counts
-    """
-    changes = detect_structural_changes(text1, text2)
-    alignment = generate_structural_alignment(text1, text2)
-    insertion_count = len(changes['insertions'])
-    deletion_count = len(changes['deletions'])
-    modification_count = len(changes['modifications'])
-    total_changes = insertion_count + deletion_count + modification_count
-    # Structural similarity score (inverse of changes)
-    text_length = max(len(text1.split()), len(text2.split()))
-    if text_length > 0:
-        structural_score = max(0.0, 1 - (total_changes / text_length))
-    else:
-        structural_score = 0.0
-    # Alignment-based score. A match entry can cover a run of several segments,
-    # so the matched segments are counted rather than the match entries;
-    # otherwise two identical texts of N segments would score 1/N, not 1.0.
-    total_segments = len(alignment['segments1']) + len(alignment['segments2'])
-    matched_segments = sum(len(match['segments1']) for match in alignment['matches'])
-    if total_segments > 0:
-        alignment_score = min(1.0, matched_segments / (total_segments / 2))
-    else:
-        alignment_score = 0.0
-    return {
-        'structural_similarity': structural_score,
-        'alignment_score': alignment_score,
-        'insertions': insertion_count,
-        'deletions': deletion_count,
-        'modifications': modification_count,
-        'total_changes': total_changes
-    }
-def generate_differential_report(text1: str, text2: str,
-                               file1_name: str = "Text 1",
-                               file2_name: str = "Text 2") -> dict[str, object]:
-    """
-    Generate a comprehensive differential report for two text chapters.
-    Args:
-        text1: First text chapter
-        text2: Second text chapter
-        file1_name: Label stored under 'file1'
-        file2_name: Label stored under 'file2'
-    Returns:
-        Report with 'file1', 'file2', 'changes', 'alignment', 'scores' and a
-        'summary' holding modification counts by size, a structural
-        preservation flag and a textual recommendation
-    """
-    changes = detect_structural_changes(text1, text2)
-    alignment = generate_structural_alignment(text1, text2)
-    scores = calculate_structural_similarity_score(text1, text2)
-    modifications = changes['modifications']
-    alignment_score = scores['alignment_score']
-    # A modification is significant when either side is longer than 10 characters
-    significant = sum(
-        1 for c in modifications
-        if len(c['original']) > 10 or len(c['replacement']) > 10
-    )
-    # ...and a minor variant when both sides are at most 5 characters long
-    minor = sum(
-        1 for c in modifications
-        if len(c['original']) <= 5 and len(c['replacement']) <= 5
-    )
-    if alignment_score > 0.7:
-        recommendation = 'Manuscripts are structurally similar'
-    else:
-        recommendation = 'Significant structural differences detected'
-    report = {
-        'file1': file1_name,
-        'file2': file2_name,
-        'changes': changes,
-        'alignment': alignment,
-        'scores': scores,
-        'summary': {
-            'significant_differences': significant,
-            'minor_variants': minor,
-            'structural_preservation': alignment_score > 0.8,
-            'recommendation': recommendation
-        }
-    }
-    return report