daniel-wojahn committed on
Commit
671c107
·
1 Parent(s): e30d4c0

Deleted beta feature (will be moved to a separate app)

Browse files
Files changed (4) hide show
  1. .gitignore +1 -2
  2. app.py +14 -109
  3. pipeline/differential_viz.py +0 -264
  4. pipeline/structural_analysis.py +0 -332
.gitignore CHANGED
@@ -1,5 +1,4 @@
1
  venv
2
  __pycache__
3
  academic_article.md
4
- #structural_analysis.py
5
- #differential_viz.py
 
1
  venv
2
  __pycache__
3
  academic_article.md
4
+ tibetan_collation_prototype
 
app.py CHANGED
@@ -135,17 +135,9 @@ def main_interface():
135
  metrics_preview = gr.Dataframe(
136
  label="Similarity Metrics Preview", interactive=False, visible=True
137
  )
138
- # States to hold data for optional structural analysis
139
  state_text_data = gr.State()
140
  state_df_results = gr.State()
141
-
142
- # Deferred structural analysis trigger
143
- structural_btn = gr.Button(
144
- "Run Structural Analysis (time-consuming)",
145
- variant="secondary",
146
- interactive=False,
147
- elem_id="structural-btn"
148
- )
149
 
150
  # LLM Interpretation components
151
  with gr.Row():
@@ -175,10 +167,10 @@ def main_interface():
175
 
176
  # Heatmap tabs for each metric
177
  heatmap_titles = {
178
- "Jaccard Similarity (%)": "Jaccard Similarity (%): Higher scores (darker) mean more shared unique words.",
179
- "Normalized LCS": "Normalized LCS: Higher scores (darker) mean longer shared sequences of words.",
180
- "Fuzzy Similarity": "Fuzzy Similarity: Higher scores (darker) mean more similar text with fuzzy matching tolerance for variations.",
181
- "Semantic Similarity": "Semantic Similarity (using word embeddings/experimental): Higher scores (darker) mean more similar meanings.",
182
  "Word Counts": "Word Counts: Bar chart showing the number of words in each segment after tokenization.",
183
  }
184
 
@@ -305,30 +297,7 @@ The structural analysis combines multiple similarity metrics to create a compreh
305
  heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False, elem_classes="metric-heatmap")
306
 
307
  # Structural Analysis Tab
308
- with gr.Tab("Structural Analysis"):
309
- with gr.Accordion("ℹ️ About", open=False, elem_classes="metric-info-accordion structural-info"):
310
- if "Structural Analysis" in metric_tooltips:
311
- gr.Markdown(value=metric_tooltips["Structural Analysis"], elem_classes="metric-description")
312
- else:
313
- gr.Markdown(value="### Structural Analysis\nDescription not found.")
314
-
315
- gr.Markdown("""
316
- ### Structural Analysis for Tibetan Legal Manuscripts
317
-
318
- This analysis identifies potential source-target relationships between text segments, helping to reconstruct stemmatic relationships.
319
-
320
- Click the "Run Structural Analysis" button below after computing the basic metrics to perform this advanced analysis.
321
- """)
322
-
323
- # Structural analysis outputs
324
- structural_heatmap = gr.Plot(label="Structural Changes Summary", show_label=False, elem_classes="structural-heatmap")
325
- structural_report = gr.HTML(label="Differential Analysis Report")
326
- structural_export = gr.File(label="Export Structural Analysis Report", file_types=[".html", ".md", ".json"])
327
-
328
- # The outputs in process_btn.click should use the short metric names as keys for heatmap_tabs
329
- # e.g., heatmap_tabs["Jaccard Similarity (%)"]
330
- # Ensure the plot is part of the layout. This assumes plots are displayed sequentially
331
- # within the current gr.Tab("Results"). If they are in specific TabItems, this needs adjustment.
332
  # For now, this modification focuses on creating the plot object and making it an output.
333
  # The visual placement depends on how Gradio renders children of gr.Tab or if there's another container.
334
 
@@ -365,12 +334,8 @@ The structural analysis combines multiple similarity metrics to create a compreh
365
  fuzzy_heatmap_res = None
366
  semantic_heatmap_res = None
367
  warning_update_res = gr.update(visible=False)
368
- structural_heatmap_res = None
369
- structural_report_res = None
370
- structural_export_res = None
371
  state_text_data_res = None
372
  state_df_results_res = None
373
- structural_btn_update_res = gr.update(interactive=False) # Default: disabled
374
 
375
  # Create a ProgressiveUI instance for handling progressive updates
376
  progressive_ui = ProgressiveUI(
@@ -382,8 +347,7 @@ The structural analysis combines multiple similarity metrics to create a compreh
382
  semantic_heatmap=heatmap_tabs["Semantic Similarity"],
383
  warning_box=warning_box,
384
  progress_container=progress_container,
385
- heatmap_titles=heatmap_titles,
386
- structural_btn=structural_btn
387
  )
388
 
389
  # Make progress container visible during analysis
@@ -402,12 +366,8 @@ The structural analysis combines multiple similarity metrics to create a compreh
402
  None, # fuzzy_heatmap
403
  None, # semantic_heatmap
404
  None, # warning update
405
- None, # structural_heatmap
406
- None, # structural_report
407
- None, # structural_export
408
  None, # state_text_data
409
- None, # state_df_results
410
- gr.update(interactive=False), # structural_btn
411
  )
412
 
413
  # Check file size limits (10MB per file)
@@ -423,12 +383,8 @@ The structural analysis combines multiple similarity metrics to create a compreh
423
  None, # fuzzy_heatmap
424
  None, # semantic_heatmap
425
  gr.update(value=f"Error: File '{Path(file.name).name}' exceeds the 10MB size limit.", visible=True),
426
- None, # structural_heatmap
427
- None, # structural_report
428
- None, # structural_export
429
  None, # state_text_data
430
- None, # state_df_results
431
- gr.update(interactive=False), # structural_btn
432
  )
433
 
434
  try:
@@ -470,12 +426,8 @@ The structural analysis combines multiple similarity metrics to create a compreh
470
  None, # fuzzy_heatmap
471
  None, # semantic_heatmap
472
  gr.update(value=f"Error: Could not decode file '{filename}'.", visible=True),
473
- None, # structural_heatmap
474
- None, # structural_report
475
- None, # structural_export
476
  None, # state_text_data
477
- None, # state_df_results
478
- gr.update(interactive=False), # structural_btn
479
  )
480
 
481
  # Configure semantic similarity and fuzzy matching
@@ -519,8 +471,7 @@ The structural analysis combines multiple similarity metrics to create a compreh
519
  warning_message = "No common chapters found or results are empty. " + (warning_raw or "")
520
  metrics_preview_df_res = pd.DataFrame({"Message": [warning_message]})
521
  warning_update_res = gr.update(value=warning_md or warning_message, visible=True)
522
- # keep structural disabled
523
- structural_btn_update_res = gr.update(interactive=False)
524
  else:
525
  # Generate visualizations
526
  if progress is not None:
@@ -542,11 +493,10 @@ The structural analysis combines multiple similarity metrics to create a compreh
542
  logger.warning(f"Progress update error (non-critical): {e}")
543
  word_count_fig_res = generate_word_count_chart(word_counts_df_data)
544
 
545
- # Enable structural analysis button and store states for deferred run
546
- structural_btn_update_res = gr.update(interactive=True, value="Run Structural Analysis (time-consuming)")
547
  state_text_data_res = text_data
548
  state_df_results_res = df_results
549
- logger.info("Enabling structural analysis button")
550
 
551
  # Save results to CSV
552
  if progress is not None:
@@ -587,12 +537,8 @@ The structural analysis combines multiple similarity metrics to create a compreh
587
  fuzzy_heatmap_res,
588
  semantic_heatmap_res,
589
  warning_update_res,
590
- structural_heatmap_res,
591
- structural_report_res,
592
- structural_export_res,
593
  state_text_data_res,
594
  state_df_results_res,
595
- structural_btn_update_res,
596
  )
597
 
598
  # Function to interpret results using LLM
@@ -641,53 +587,12 @@ The structural analysis combines multiple similarity metrics to create a compreh
641
  heatmap_tabs["Fuzzy Similarity"],
642
  heatmap_tabs["Semantic Similarity"],
643
  warning_box,
644
- structural_heatmap,
645
- structural_report,
646
- structural_export,
647
  state_text_data,
648
  state_df_results,
649
- structural_btn,
650
  ]
651
  )
652
 
653
- # Separate handler to run structural analysis on demand
654
- def run_structural(text_data_state, df_results_state, progress=gr.Progress()):
655
- if text_data_state is None or df_results_state is None:
656
- return None, "<p>No initial results available. Please run the analysis first.</p>", None
657
- # Progress for structural
658
- try:
659
- progress(0.1, desc="Generating structural analysis...")
660
- except Exception:
661
- pass
662
- from pipeline.differential_viz import create_differential_heatmap, create_change_detection_report
663
- # Create structural heatmap
664
- try:
665
- struct_heatmap = create_differential_heatmap(text_data_state, "all_chapters", df_results_state)
666
- except Exception as e:
667
- logger.warning(f"Could not generate structural heatmap: {e}")
668
- struct_heatmap = None
669
- # Create report
670
- try:
671
- struct_report = create_change_detection_report(text_data_state, "all_chapters", "html")
672
- except Exception as e:
673
- logger.warning(f"Could not generate structural report: {e}")
674
- struct_report = "<p>Could not generate structural analysis report.</p>"
675
- # Save report
676
- try:
677
- report_path = "structural_analysis_report.html"
678
- with open(report_path, 'w', encoding='utf-8') as f:
679
- f.write(struct_report if isinstance(struct_report, str) else "")
680
- struct_export = report_path
681
- except Exception as e:
682
- logger.warning(f"Could not save structural report: {e}")
683
- struct_export = None
684
- return struct_heatmap, struct_report, struct_export
685
-
686
- structural_btn.click(
687
- fn=run_structural,
688
- inputs=[state_text_data, state_df_results],
689
- outputs=[structural_heatmap, structural_report, structural_export]
690
- )
691
 
692
  # Connect the interpret button
693
  interpret_btn.click(
 
135
  metrics_preview = gr.Dataframe(
136
  label="Similarity Metrics Preview", interactive=False, visible=True
137
  )
138
+ # States for data persistence
139
  state_text_data = gr.State()
140
  state_df_results = gr.State()
 
 
 
 
 
 
 
 
141
 
142
  # LLM Interpretation components
143
  with gr.Row():
 
167
 
168
  # Heatmap tabs for each metric
169
  heatmap_titles = {
170
+ "Jaccard Similarity (%)": "Higher scores mean more shared unique words.",
171
+ "Normalized LCS": "Higher scores mean longer shared sequences of words.",
172
+ "Fuzzy Similarity": "Higher scores mean more similar text with fuzzy matching tolerance for variations.",
173
+ "Semantic Similarity": "Higher scores mean more similar meanings.",
174
  "Word Counts": "Word Counts: Bar chart showing the number of words in each segment after tokenization.",
175
  }
176
 
 
297
  heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False, elem_classes="metric-heatmap")
298
 
299
  # Structural Analysis Tab
300
+ # Structural analysis tab removed - see dedicated collation app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  # For now, this modification focuses on creating the plot object and making it an output.
302
  # The visual placement depends on how Gradio renders children of gr.Tab or if there's another container.
303
 
 
334
  fuzzy_heatmap_res = None
335
  semantic_heatmap_res = None
336
  warning_update_res = gr.update(visible=False)
 
 
 
337
  state_text_data_res = None
338
  state_df_results_res = None
 
339
 
340
  # Create a ProgressiveUI instance for handling progressive updates
341
  progressive_ui = ProgressiveUI(
 
347
  semantic_heatmap=heatmap_tabs["Semantic Similarity"],
348
  warning_box=warning_box,
349
  progress_container=progress_container,
350
+ heatmap_titles=heatmap_titles
 
351
  )
352
 
353
  # Make progress container visible during analysis
 
366
  None, # fuzzy_heatmap
367
  None, # semantic_heatmap
368
  None, # warning update
 
 
 
369
  None, # state_text_data
370
+ None # state_df_results
 
371
  )
372
 
373
  # Check file size limits (10MB per file)
 
383
  None, # fuzzy_heatmap
384
  None, # semantic_heatmap
385
  gr.update(value=f"Error: File '{Path(file.name).name}' exceeds the 10MB size limit.", visible=True),
 
 
 
386
  None, # state_text_data
387
+ None # state_df_results
 
388
  )
389
 
390
  try:
 
426
  None, # fuzzy_heatmap
427
  None, # semantic_heatmap
428
  gr.update(value=f"Error: Could not decode file '{filename}'.", visible=True),
 
 
 
429
  None, # state_text_data
430
+ None # state_df_results
 
431
  )
432
 
433
  # Configure semantic similarity and fuzzy matching
 
471
  warning_message = "No common chapters found or results are empty. " + (warning_raw or "")
472
  metrics_preview_df_res = pd.DataFrame({"Message": [warning_message]})
473
  warning_update_res = gr.update(value=warning_md or warning_message, visible=True)
474
+ # No structural analysis in this app
 
475
  else:
476
  # Generate visualizations
477
  if progress is not None:
 
493
  logger.warning(f"Progress update error (non-critical): {e}")
494
  word_count_fig_res = generate_word_count_chart(word_counts_df_data)
495
 
496
+ # Store state data for potential future use
 
497
  state_text_data_res = text_data
498
  state_df_results_res = df_results
499
+ logger.info("Analysis complete, storing state data")
500
 
501
  # Save results to CSV
502
  if progress is not None:
 
537
  fuzzy_heatmap_res,
538
  semantic_heatmap_res,
539
  warning_update_res,
 
 
 
540
  state_text_data_res,
541
  state_df_results_res,
 
542
  )
543
 
544
  # Function to interpret results using LLM
 
587
  heatmap_tabs["Fuzzy Similarity"],
588
  heatmap_tabs["Semantic Similarity"],
589
  warning_box,
 
 
 
590
  state_text_data,
591
  state_df_results,
 
592
  ]
593
  )
594
 
595
+ # Structural analysis functionality removed - see dedicated collation app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
 
597
  # Connect the interpret button
598
  interpret_btn.click(
pipeline/differential_viz.py DELETED
@@ -1,264 +0,0 @@
1
- """
2
- Differential visualization enhancements for Tibetan legal manuscript analysis.
3
- Provides enhanced heatmaps with structural change highlighting.
4
- """
5
-
6
- import plotly.graph_objects as go
7
- from typing import Dict, List
8
- import pandas as pd
9
- from .structural_analysis import detect_structural_changes, generate_structural_alignment
10
-
11
-
12
- def create_differential_heatmap(texts_dict: Dict[str, str],
13
- chapter_key: str,
14
- metric_results: pd.DataFrame,
15
- highlight_threshold: float = 0.7) -> go.Figure:
16
- """
17
- Create enhanced heatmap with structural change highlighting.
18
-
19
- Args:
20
- texts_dict: Dictionary mapping text names to their content
21
- chapter_key: Chapter identifier being analyzed
22
- metric_results: DataFrame with similarity metrics
23
- highlight_threshold: Threshold for highlighting significant changes
24
- """
25
-
26
- # Get unique text pairs
27
- text_pairs = metric_results['Text Pair'].unique()
28
-
29
- # Create enhanced heatmap data
30
- enhanced_data = []
31
-
32
- for pair in text_pairs:
33
- texts = pair.split(' vs ')
34
- if len(texts) == 2:
35
- text1_name, text2_name = texts
36
-
37
- # Get actual text content
38
- text1_content = texts_dict.get(text1_name, '')
39
- text2_content = texts_dict.get(text2_name, '')
40
-
41
- # Perform structural analysis
42
- changes = detect_structural_changes(text1_content, text2_content)
43
- alignment = generate_structural_alignment(text1_content, text2_content)
44
-
45
- # Create enhanced metrics
46
- enhanced_row = {
47
- 'Text Pair': pair,
48
- 'Chapter': chapter_key,
49
- 'structural_changes': len(changes['insertions']) + len(changes['deletions']) + len(changes['modifications']),
50
- 'modification_score': len(changes['modifications']),
51
- 'insertion_score': len(changes['insertions']),
52
- 'deletion_score': len(changes['deletions']),
53
- 'alignment_quality': len(alignment['matches']) / max(len(alignment['segments1']) + len(alignment['segments2']), 1),
54
- 'significant_differences': len([c for c in changes['modifications'] if len(c['original']) > 10])
55
- }
56
-
57
- enhanced_data.append(enhanced_row)
58
-
59
- # Create a clean table with numbers and percentages
60
- summary_table = []
61
-
62
- for row in enhanced_data:
63
- text_pair = row['Text Pair']
64
- chapter = row['Chapter']
65
-
66
- # Calculate percentages
67
- total_changes = row['structural_changes']
68
- modifications = row['modification_score']
69
- insertions_deletions = row['insertion_score'] + row['deletion_score']
70
- alignment_quality = row['alignment_quality']
71
-
72
- # Create summary row
73
- summary_row = {
74
- 'Text Pair': text_pair,
75
- 'Chapter': chapter,
76
- 'Total Changes': total_changes,
77
- 'Modifications': modifications,
78
- 'Insertions/Deletions': insertions_deletions,
79
- 'Alignment Quality': f"{alignment_quality:.1f}%",
80
- 'Significant Differences': row['significant_differences']
81
- }
82
-
83
- summary_table.append(summary_row)
84
-
85
- # Create DataFrame for table display
86
- summary_df = pd.DataFrame(summary_table)
87
-
88
- # Create a simple table with styling
89
- fig = go.Figure(data=[go.Table(
90
- header=dict(
91
- values=['Text Pair', 'Chapter', 'Total Changes', 'Modifications',
92
- 'Insertions/Deletions', 'Alignment Quality', 'Significant Differences'],
93
- font=dict(size=12, color='white'),
94
- fill_color='darkblue',
95
- align='left'
96
- ),
97
- cells=dict(
98
- values=[
99
- summary_df['Text Pair'],
100
- summary_df['Chapter'],
101
- summary_df['Total Changes'],
102
- summary_df['Modifications'],
103
- summary_df['Insertions/Deletions'],
104
- summary_df['Alignment Quality'],
105
- summary_df['Significant Differences']
106
- ],
107
- font=dict(size=11),
108
- align='left',
109
- fill_color=['lightgrey' if i % 2 == 0 else 'white'
110
- for i in range(len(summary_df))]
111
- )
112
- )])
113
-
114
- fig.update_layout(
115
- title="Structural Analysis Summary",
116
- height=400,
117
- margin=dict(l=10, r=10, t=40, b=10)
118
- )
119
-
120
- return fig
121
-
122
-
123
- def create_change_detection_report(texts_dict: Dict[str, str],
124
- chapter_key: str,
125
- output_format: str = 'html') -> str:
126
- """
127
- Create detailed change detection report for a chapter.
128
-
129
- Args:
130
- texts_dict: Dictionary mapping text names to content
131
- chapter_key: Chapter identifier
132
- output_format: Format for output ('html', 'json', 'markdown')
133
- """
134
-
135
- from .structural_analysis import generate_differential_report
136
-
137
- text_names = list(texts_dict.keys())
138
- reports = []
139
-
140
- for i, text1_name in enumerate(text_names):
141
- for text2_name in text_names[i+1:]:
142
- text1_content = texts_dict[text1_name]
143
- text2_content = texts_dict[text2_name]
144
-
145
- report = generate_differential_report(
146
- text1_content, text2_content, text1_name, text2_name
147
- )
148
- reports.append(report)
149
-
150
- if output_format == 'html':
151
- return create_html_report(reports, chapter_key)
152
- elif output_format == 'json':
153
- import json
154
- return json.dumps(reports, indent=2, ensure_ascii=False)
155
- else:
156
- return create_markdown_report(reports, chapter_key)
157
-
158
-
159
- def create_html_report(reports: List[Dict], chapter_key: str) -> str:
160
- """Create HTML report for structural analysis."""
161
-
162
- html = f"""
163
- <!DOCTYPE html>
164
- <html>
165
- <head>
166
- <title>Structural Analysis Report - Chapter {chapter_key}</title>
167
- <style>
168
- body {{ font-family: Arial, sans-serif; margin: 20px; }}
169
- .report {{ max-width: 1200px; margin: 0 auto; }}
170
- .comparison {{ border: 1px solid #ddd; margin: 20px 0; padding: 15px; }}
171
- .changes {{ display: flex; gap: 20px; }}
172
- .change-type {{ flex: 1; padding: 10px; border: 1px solid #eee; }}
173
- .insertion {{ background-color: #e8f5e8; }}
174
- .deletion {{ background-color: #ffe8e8; }}
175
- .modification {{ background-color: #fff3e0; }}
176
- .highlight {{ background-color: yellow; padding: 2px 4px; }}
177
- </style>
178
- </head>
179
- <body>
180
- <div class="report">
181
- <h1>Structural Analysis Report - Chapter {chapter_key}</h1>
182
- """
183
-
184
- for report in reports:
185
- html += f"""
186
- <div class="comparison">
187
- <h2>{report['file1']} vs {report['file2']}</h2>
188
- <div class="scores">
189
- <p><strong>Structural Similarity:</strong> {report['scores']['structural_similarity']:.2f}</p>
190
- <p><strong>Alignment Score:</strong> {report['scores']['alignment_score']:.2f}</p>
191
- </div>
192
-
193
- <div class="changes">
194
- <div class="change-type insertion">
195
- <h3>Insertions ({len(report['changes']['insertions'])})</h3>
196
- {format_changes_html(report['changes']['insertions'])}
197
- </div>
198
- <div class="change-type deletion">
199
- <h3>Deletions ({len(report['changes']['deletions'])})</h3>
200
- {format_changes_html(report['changes']['deletions'])}
201
- </div>
202
- <div class="change-type modification">
203
- <h3>Modifications ({len(report['changes']['modifications'])})</h3>
204
- {format_changes_html(report['changes']['modifications'], is_modification=True)}
205
- </div>
206
- </div>
207
- </div>
208
- """
209
-
210
- html += """
211
- </div>
212
- </body>
213
- </html>
214
- """
215
-
216
- return html
217
-
218
-
219
- def format_changes_html(changes: List[Dict], is_modification: bool = False) -> str:
220
- """Format changes for HTML display."""
221
- if not changes:
222
- return "<p>No changes detected.</p>"
223
-
224
- html = ""
225
- for change in changes[:5]: # Limit to first 5 for brevity
226
- if is_modification:
227
- html += f"""
228
- <div class="change">
229
- <span class="highlight">{change.get('original', '')}</span> →
230
- <span class="highlight">{change.get('replacement', '')}</span>
231
- </div>
232
- """
233
- else:
234
- html += f"""
235
- <div class="change">
236
- <span class="highlight">{change.get('word', '')}</span>
237
- </div>
238
- """
239
-
240
- if len(changes) > 5:
241
- html += f"<p>... and {len(changes) - 5} more</p>"
242
-
243
- return html
244
-
245
-
246
- def create_markdown_report(reports: List[Dict], chapter_key: str) -> str:
247
- """Create markdown report for structural analysis."""
248
-
249
- md = f"# Structural Analysis Report - Chapter {chapter_key}\n\n"
250
-
251
- for report in reports:
252
- md += f"## {report['file1']} vs {report['file2']}\n\n"
253
- md += f"- **Structural Similarity**: {report['scores']['structural_similarity']:.2f}\n"
254
- md += f"- **Alignment Score**: {report['scores']['alignment_score']:.2f}\n"
255
- md += f"- **Insertions**: {len(report['changes']['insertions'])}\n"
256
- md += f"- **Deletions**: {len(report['changes']['deletions'])}\n"
257
- md += f"- **Modifications**: {len(report['changes']['modifications'])}\n\n"
258
-
259
- if report['changes']['modifications']:
260
- md += "### Significant Modifications:\n"
261
- for mod in report['changes']['modifications'][:3]:
262
- md += f"- **{mod.get('original', '')}** → **{mod.get('replacement', '')}**\n"
263
-
264
- return md
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pipeline/structural_analysis.py DELETED
@@ -1,332 +0,0 @@
1
- """
2
- Chapter-level structural analysis for Tibetan legal manuscripts.
3
- Enhanced with Juxta/CollateX-inspired advanced alignment algorithms.
4
- """
5
-
6
- import difflib
7
- import re
8
- import logging
9
- from thefuzz import fuzz
10
- from .advanced_alignment import enhanced_structural_analysis
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
- def detect_structural_changes(text1: str, text2: str,
16
- min_change_length: int = 5,
17
- context_window: int = 10) -> dict:
18
- """
19
- Detect structural changes between two Tibetan text chapters.
20
-
21
- Args:
22
- text1: First text chapter
23
- text2: Second text chapter
24
- min_change_length: Minimum length of change to report
25
- context_window: Number of characters to include as context
26
-
27
- Returns:
28
- Dictionary with detected changes: insertions, deletions, modifications
29
- """
30
-
31
- # Clean texts for comparison
32
- def clean_text(text):
33
- # Remove extra whitespace and normalize
34
- text = re.sub(r'\s+', ' ', text.strip())
35
- return text
36
-
37
- clean1 = clean_text(text1)
38
- clean2 = clean_text(text2)
39
-
40
- # Use difflib to detect changes
41
- differ = difflib.Differ()
42
- diff = list(differ.compare(clean1.split(), clean2.split()))
43
-
44
- changes = {
45
- 'insertions': [],
46
- 'deletions': [],
47
- 'modifications': [],
48
- 'unchanged': []
49
- }
50
-
51
- # Track current position in both texts
52
- pos1 = 0
53
- pos2 = 0
54
-
55
- for i, line in enumerate(diff):
56
- if line.startswith(' '): # Unchanged
57
- word = line[2:]
58
- changes['unchanged'].append({
59
- 'word': word,
60
- 'position1': pos1,
61
- 'position2': pos2,
62
- 'length': len(word)
63
- })
64
- pos1 += len(word) + 1
65
- pos2 += len(word) + 1
66
-
67
- elif line.startswith('- '): # Deletion
68
- word = line[2:]
69
- if len(word) >= min_change_length:
70
- changes['deletions'].append({
71
- 'word': word,
72
- 'position': pos1,
73
- 'length': len(word),
74
- 'context': get_context(clean1, pos1, context_window)
75
- })
76
- pos1 += len(word) + 1
77
-
78
- elif line.startswith('+ '): # Insertion
79
- word = line[2:]
80
- if len(word) >= min_change_length:
81
- changes['insertions'].append({
82
- 'word': word,
83
- 'position': pos2,
84
- 'length': len(word),
85
- 'context': get_context(clean2, pos2, context_window)
86
- })
87
- pos2 += len(word) + 1
88
-
89
- # Detect modifications (adjacent deletions and insertions)
90
- modifications = detect_modifications(changes['deletions'], changes['insertions'])
91
- changes['modifications'] = modifications
92
-
93
- return changes
94
-
95
-
96
- def get_context(text: str, position: int, window: int) -> str:
97
- """Get context around a position in text."""
98
- start = max(0, position - window)
99
- end = min(len(text), position + window)
100
- return text[start:end]
101
-
102
-
103
- def detect_modifications(deletions: list[dict], insertions: list[dict]) -> list[dict]:
104
- """Detect modifications by pairing nearby deletions and insertions using fuzzy matching."""
105
- modifications = []
106
-
107
- # First pass: match by position proximity
108
- for deletion in deletions[:]: # Copy to avoid modification during iteration
109
- for insertion in insertions[:]:
110
- # If deletion and insertion are close (within 5 positions)
111
- if abs(deletion['position'] - insertion['position']) <= 5:
112
- # Calculate fuzzy similarity score
113
- similarity = fuzz.token_set_ratio(deletion['word'], insertion['word']) / 100.0
114
-
115
- modifications.append({
116
- 'original': deletion['word'],
117
- 'replacement': insertion['word'],
118
- 'position': deletion['position'],
119
- 'deletion_context': deletion['context'],
120
- 'insertion_context': insertion['context'],
121
- 'similarity': similarity
122
- })
123
- # Remove from original lists to avoid duplicates
124
- if deletion in deletions:
125
- deletions.remove(deletion)
126
- if insertion in insertions:
127
- insertions.remove(insertion)
128
- break
129
-
130
- # Second pass: use fuzzy matching for remaining items that might be related
131
- # but not positionally close
132
- remaining_deletions = deletions[:]
133
- for deletion in remaining_deletions:
134
- if not insertions: # No insertions left to match
135
- break
136
-
137
- # Find best fuzzy match among remaining insertions
138
- best_match = None
139
- best_score = 0
140
- best_idx = -1
141
-
142
- for i, insertion in enumerate(insertions):
143
- score = fuzz.token_set_ratio(deletion['word'], insertion['word'])
144
- if score > 60 and score > best_score: # Threshold of 60% similarity
145
- best_score = score
146
- best_match = insertion
147
- best_idx = i
148
-
149
- if best_match:
150
- modifications.append({
151
- 'original': deletion['word'],
152
- 'replacement': best_match['word'],
153
- 'position': deletion['position'],
154
- 'deletion_context': deletion['context'],
155
- 'insertion_context': best_match['context'],
156
- 'similarity': best_score / 100.0,
157
- 'fuzzy_matched': True
158
- })
159
- # Remove matched items
160
- if deletion in deletions:
161
- deletions.remove(deletion)
162
- insertions.pop(best_idx)
163
-
164
- return modifications
165
-
166
-
167
- def generate_structural_alignment(text1: str, text2: str) -> dict[str, list]:
168
- """
169
- Generate enhanced structural alignment using advanced algorithms.
170
-
171
- Returns:
172
- Dictionary with Juxta/CollateX-inspired alignment information
173
- """
174
-
175
- try:
176
- # Use enhanced alignment from advanced_alignment module
177
- result = enhanced_structural_analysis(text1, text2)
178
-
179
- # Convert to legacy format for backward compatibility
180
- alignment = {
181
- 'matches': [],
182
- 'gaps': [],
183
- 'mismatches': [],
184
- 'segments1': [],
185
- 'segments2': []
186
- }
187
-
188
- # Process alignment segments
189
- for segment in result.get('alignment_segments', []):
190
- if segment['type'] == 'match':
191
- alignment['matches'].append({
192
- 'segments1': [segment['content1']],
193
- 'segments2': [segment['content2']],
194
- 'type': 'match',
195
- 'confidence': segment['confidence']
196
- })
197
- elif segment['type'] == 'insertion':
198
- alignment['gaps'].append({
199
- 'segments': [segment['content2']],
200
- 'type': 'insertion',
201
- 'position': 'text2',
202
- 'confidence': segment['confidence']
203
- })
204
- elif segment['type'] == 'deletion':
205
- alignment['gaps'].append({
206
- 'segments': [segment['content1']],
207
- 'type': 'deletion',
208
- 'position': 'text1',
209
- 'confidence': segment['confidence']
210
- })
211
- elif segment['type'] in ['mismatch', 'modification']:
212
- alignment['mismatches'].append({
213
- 'original': [segment['content1']],
214
- 'replacement': [segment['content2']],
215
- 'type': 'modification',
216
- 'confidence': segment['confidence']
217
- })
218
-
219
- return alignment
220
-
221
- except Exception as e:
222
- logger.warning(f"Enhanced alignment failed, falling back to basic: {e}")
223
-
224
- # Fallback to basic alignment for robustness
225
- def split_into_segments(text):
226
- segments = re.split(r'[།༎༏༐༑༔]', text)
227
- return [seg.strip() for seg in segments if seg.strip()]
228
-
229
- segments1 = split_into_segments(text1)
230
- segments2 = split_into_segments(text2)
231
-
232
- matcher = difflib.SequenceMatcher(None, segments1, segments2)
233
-
234
- alignment = {
235
- 'matches': [],
236
- 'gaps': [],
237
- 'mismatches': [],
238
- 'segments1': segments1,
239
- 'segments2': segments2
240
- }
241
-
242
- for tag, i1, i2, j1, j2 in matcher.get_opcodes():
243
- if tag == 'equal':
244
- alignment['matches'].append({
245
- 'segments1': segments1[i1:i2],
246
- 'segments2': segments2[j1:j2],
247
- 'type': 'match'
248
- })
249
- elif tag == 'delete':
250
- alignment['gaps'].append({
251
- 'segments': segments1[i1:i2],
252
- 'type': 'deletion',
253
- 'position': 'text1'
254
- })
255
- elif tag == 'insert':
256
- alignment['gaps'].append({
257
- 'segments': segments2[j1:j2],
258
- 'type': 'insertion',
259
- 'position': 'text2'
260
- })
261
- elif tag == 'replace':
262
- alignment['mismatches'].append({
263
- 'original': segments1[i1:i2],
264
- 'replacement': segments2[j1:j2],
265
- 'type': 'modification'
266
- })
267
-
268
- return alignment
269
-
270
-
271
- def calculate_structural_similarity_score(text1: str, text2: str) -> dict[str, float]:
272
- """
273
- Calculate various structural similarity scores between two texts.
274
-
275
- Returns:
276
- Dictionary with multiple similarity metrics
277
- """
278
-
279
- changes = detect_structural_changes(text1, text2)
280
- alignment = generate_structural_alignment(text1, text2)
281
-
282
- # Calculate scores
283
- total_changes = len(changes['insertions']) + len(changes['deletions']) + len(changes['modifications'])
284
-
285
- # Structural similarity score (inverse of changes)
286
- text_length = max(len(text1.split()), len(text2.split()))
287
- structural_score = max(0, 1 - (total_changes / text_length)) if text_length > 0 else 0
288
-
289
- # Alignment-based score
290
- total_segments = len(alignment['segments1']) + len(alignment['segments2'])
291
- matches = len(alignment['matches'])
292
- alignment_score = matches / (total_segments / 2) if total_segments > 0 else 0
293
-
294
- return {
295
- 'structural_similarity': structural_score,
296
- 'alignment_score': alignment_score,
297
- 'insertions': len(changes['insertions']),
298
- 'deletions': len(changes['deletions']),
299
- 'modifications': len(changes['modifications']),
300
- 'total_changes': total_changes
301
- }
302
-
303
-
304
- def generate_differential_report(text1: str, text2: str,
305
- file1_name: str = "Text 1",
306
- file2_name: str = "Text 2") -> dict[str, any]:
307
- """
308
- Generate a comprehensive differential report for two text chapters.
309
-
310
- Returns:
311
- Complete report with changes, alignment, and recommendations
312
- """
313
-
314
- changes = detect_structural_changes(text1, text2)
315
- alignment = generate_structural_alignment(text1, text2)
316
- scores = calculate_structural_similarity_score(text1, text2)
317
-
318
- report = {
319
- 'file1': file1_name,
320
- 'file2': file2_name,
321
- 'changes': changes,
322
- 'alignment': alignment,
323
- 'scores': scores,
324
- 'summary': {
325
- 'significant_differences': len([c for c in changes['modifications'] if len(c['original']) > 10 or len(c['replacement']) > 10]),
326
- 'minor_variants': len([c for c in changes['modifications'] if len(c['original']) <= 5 and len(c['replacement']) <= 5]),
327
- 'structural_preservation': scores['alignment_score'] > 0.8,
328
- 'recommendation': 'Manuscripts are structurally similar' if scores['alignment_score'] > 0.7 else 'Significant structural differences detected'
329
- }
330
- }
331
-
332
- return report