Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -320,38 +320,65 @@ def analyze_subregion(state, header, region_start, region_end):
|
|
320 |
###############################################################################
|
321 |
|
322 |
def normalize_shap_lengths(shap1, shap2, num_points=1000):
|
|
|
|
|
|
|
|
|
|
|
323 |
x1 = np.linspace(0, 1, len(shap1))
|
324 |
x2 = np.linspace(0, 1, len(shap2))
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
|
|
|
|
|
|
331 |
|
332 |
def compute_shap_difference(shap1_norm, shap2_norm):
|
|
|
333 |
return shap2_norm - shap1_norm
|
334 |
|
335 |
def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
|
|
|
|
|
|
|
336 |
heatmap_data = shap_diff.reshape(1, -1)
|
337 |
extent = max(abs(np.min(shap_diff)), abs(np.max(shap_diff)))
|
338 |
-
|
339 |
fig, ax = plt.subplots(figsize=(12, 1.8))
|
|
|
340 |
cax = ax.imshow(heatmap_data, aspect='auto', cmap=cmap, vmin=-extent, vmax=extent)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
cbar = plt.colorbar(cax, orientation='horizontal', pad=0.25, aspect=40, shrink=0.8)
|
342 |
cbar.ax.tick_params(labelsize=8)
|
343 |
cbar.set_label('SHAP Difference (Seq2 - Seq1)', fontsize=9, labelpad=5)
|
|
|
344 |
ax.set_yticks([])
|
345 |
-
ax.set_xlabel('
|
346 |
ax.set_title(title, pad=10)
|
347 |
plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
|
|
|
348 |
return fig
|
349 |
|
350 |
def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
|
|
|
|
|
|
|
351 |
# Analyze first sequence
|
352 |
res1 = analyze_sequence(file1, top_kmers=10, fasta_text=fasta1, window_size=500)
|
353 |
if isinstance(res1[0], str) and "Error" in res1[0]:
|
354 |
return (f"Error in sequence 1: {res1[0]}", None, None)
|
|
|
355 |
# Analyze second sequence
|
356 |
res2 = analyze_sequence(file2, top_kmers=10, fasta_text=fasta2, window_size=500)
|
357 |
if isinstance(res2[0], str) and "Error" in res2[0]:
|
@@ -359,46 +386,52 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
|
|
359 |
|
360 |
shap1 = res1[3]["shap_means"]
|
361 |
shap2 = res2[3]["shap_means"]
|
|
|
|
|
362 |
shap1_norm, shap2_norm = normalize_shap_lengths(shap1, shap2)
|
363 |
shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
|
364 |
|
|
|
365 |
avg_diff = np.mean(shap_diff)
|
366 |
std_diff = np.std(shap_diff)
|
367 |
max_diff = np.max(shap_diff)
|
368 |
min_diff = np.min(shap_diff)
|
|
|
369 |
threshold = 0.05
|
370 |
substantial_diffs = np.abs(shap_diff) > threshold
|
371 |
frac_different = np.mean(substantial_diffs)
|
372 |
-
|
373 |
-
|
374 |
-
classification2 = res2[0].split('Classification: ')[1].split('\n')[0].strip()
|
375 |
len1_formatted = "{:,}".format(len(shap1))
|
376 |
len2_formatted = "{:,}".format(len(shap2))
|
377 |
-
|
378 |
-
|
|
|
379 |
comparison_text = (
|
380 |
"Sequence Comparison Results:\n"
|
381 |
-
f"Sequence 1: {res1[4]}\n"
|
382 |
-
f"Length: {len1_formatted} bases\n"
|
383 |
f"Classification: {classification1}\n\n"
|
384 |
-
f"Sequence 2: {res2[4]}\n"
|
385 |
-
f"Length: {len2_formatted} bases\n"
|
386 |
f"Classification: {classification2}\n\n"
|
387 |
"Comparison Statistics:\n"
|
388 |
f"Average SHAP difference: {avg_diff:.4f}\n"
|
389 |
f"Standard deviation: {std_diff:.4f}\n"
|
390 |
f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
|
391 |
f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
|
392 |
-
f"Fraction of positions with substantial differences: {
|
|
|
393 |
"Interpretation:\n"
|
394 |
-
"
|
395 |
-
"
|
|
|
396 |
)
|
397 |
-
|
|
|
398 |
heatmap_fig = plot_comparative_heatmap(shap_diff)
|
399 |
heatmap_img = fig_to_image(heatmap_fig)
|
400 |
hist_fig = plot_shap_histogram(shap_diff, title="Distribution of SHAP Differences")
|
401 |
hist_img = fig_to_image(hist_fig)
|
|
|
402 |
return comparison_text, heatmap_img, hist_img
|
403 |
|
404 |
###############################################################################
|
|
|
320 |
###############################################################################
|
321 |
|
322 |
def normalize_shap_lengths(shap1, shap2, num_points=1000):
|
323 |
+
"""
|
324 |
+
Normalize SHAP values to relative positions (0-1 scale).
|
325 |
+
Each point represents a relative position in the sequence (e.g., 0.75 = 75% through sequence).
|
326 |
+
"""
|
327 |
+
# Create relative position arrays (0 to 1)
|
328 |
x1 = np.linspace(0, 1, len(shap1))
|
329 |
x2 = np.linspace(0, 1, len(shap2))
|
330 |
+
|
331 |
+
# Create normalized positions for comparison
|
332 |
+
x_norm = np.linspace(0, 1, num_points)
|
333 |
+
|
334 |
+
# Interpolate both sequences to the normalized positions
|
335 |
+
shap1_interp = np.interp(x_norm, x1, shap1)
|
336 |
+
shap2_interp = np.interp(x_norm, x2, shap2)
|
337 |
+
|
338 |
+
return shap1_interp, shap2_interp
|
339 |
|
340 |
def compute_shap_difference(shap1_norm, shap2_norm):
|
341 |
+
"""Compute the SHAP difference between normalized sequences"""
|
342 |
return shap2_norm - shap1_norm
|
343 |
|
344 |
def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
|
345 |
+
"""
|
346 |
+
Plot heatmap using relative positions (0-100%)
|
347 |
+
"""
|
348 |
heatmap_data = shap_diff.reshape(1, -1)
|
349 |
extent = max(abs(np.min(shap_diff)), abs(np.max(shap_diff)))
|
350 |
+
|
351 |
fig, ax = plt.subplots(figsize=(12, 1.8))
|
352 |
+
cmap = get_zero_centered_cmap()
|
353 |
cax = ax.imshow(heatmap_data, aspect='auto', cmap=cmap, vmin=-extent, vmax=extent)
|
354 |
+
|
355 |
+
# Create percentage-based x-axis ticks
|
356 |
+
num_ticks = 5
|
357 |
+
tick_positions = np.linspace(0, shap_diff.shape[0]-1, num_ticks)
|
358 |
+
tick_labels = [f"{int(x*100)}%" for x in np.linspace(0, 1, num_ticks)]
|
359 |
+
ax.set_xticks(tick_positions)
|
360 |
+
ax.set_xticklabels(tick_labels)
|
361 |
+
|
362 |
cbar = plt.colorbar(cax, orientation='horizontal', pad=0.25, aspect=40, shrink=0.8)
|
363 |
cbar.ax.tick_params(labelsize=8)
|
364 |
cbar.set_label('SHAP Difference (Seq2 - Seq1)', fontsize=9, labelpad=5)
|
365 |
+
|
366 |
ax.set_yticks([])
|
367 |
+
ax.set_xlabel('Relative Position in Sequence', fontsize=10)
|
368 |
ax.set_title(title, pad=10)
|
369 |
plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
|
370 |
+
|
371 |
return fig
|
372 |
|
373 |
def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
|
374 |
+
"""
|
375 |
+
Compare two sequences using relative positions (0-1 scale)
|
376 |
+
"""
|
377 |
# Analyze first sequence
|
378 |
res1 = analyze_sequence(file1, top_kmers=10, fasta_text=fasta1, window_size=500)
|
379 |
if isinstance(res1[0], str) and "Error" in res1[0]:
|
380 |
return (f"Error in sequence 1: {res1[0]}", None, None)
|
381 |
+
|
382 |
# Analyze second sequence
|
383 |
res2 = analyze_sequence(file2, top_kmers=10, fasta_text=fasta2, window_size=500)
|
384 |
if isinstance(res2[0], str) and "Error" in res2[0]:
|
|
|
386 |
|
387 |
shap1 = res1[3]["shap_means"]
|
388 |
shap2 = res2[3]["shap_means"]
|
389 |
+
|
390 |
+
# Normalize to relative positions
|
391 |
shap1_norm, shap2_norm = normalize_shap_lengths(shap1, shap2)
|
392 |
shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
|
393 |
|
394 |
+
# Calculate statistics
|
395 |
avg_diff = np.mean(shap_diff)
|
396 |
std_diff = np.std(shap_diff)
|
397 |
max_diff = np.max(shap_diff)
|
398 |
min_diff = np.min(shap_diff)
|
399 |
+
|
400 |
threshold = 0.05
|
401 |
substantial_diffs = np.abs(shap_diff) > threshold
|
402 |
frac_different = np.mean(substantial_diffs)
|
403 |
+
|
404 |
+
# Format output text
|
|
|
405 |
len1_formatted = "{:,}".format(len(shap1))
|
406 |
len2_formatted = "{:,}".format(len(shap2))
|
407 |
+
classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
|
408 |
+
classification2 = res2[0].split('Classification: ')[1].split('\n')[0].strip()
|
409 |
+
|
410 |
comparison_text = (
|
411 |
"Sequence Comparison Results:\n"
|
412 |
+
f"Sequence 1: {res1[4]} (Length: {len1_formatted} bases)\n"
|
|
|
413 |
f"Classification: {classification1}\n\n"
|
414 |
+
f"Sequence 2: {res2[4]} (Length: {len2_formatted} bases)\n"
|
|
|
415 |
f"Classification: {classification2}\n\n"
|
416 |
"Comparison Statistics:\n"
|
417 |
f"Average SHAP difference: {avg_diff:.4f}\n"
|
418 |
f"Standard deviation: {std_diff:.4f}\n"
|
419 |
f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
|
420 |
f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
|
421 |
+
f"Fraction of positions with substantial differences: {frac_different:.2%}\n\n"
|
422 |
+
"Note: Comparisons shown at relative positions (0-100%) in each sequence\n"
|
423 |
"Interpretation:\n"
|
424 |
+
"- Red regions: Sequence 2 is more human-like at that relative position\n"
|
425 |
+
"- Blue regions: Sequence 1 is more human-like at that relative position\n"
|
426 |
+
"- White regions: Similar between sequences"
|
427 |
)
|
428 |
+
|
429 |
+
# Generate visualizations
|
430 |
heatmap_fig = plot_comparative_heatmap(shap_diff)
|
431 |
heatmap_img = fig_to_image(heatmap_fig)
|
432 |
hist_fig = plot_shap_histogram(shap_diff, title="Distribution of SHAP Differences")
|
433 |
hist_img = fig_to_image(hist_fig)
|
434 |
+
|
435 |
return comparison_text, heatmap_img, hist_img
|
436 |
|
437 |
###############################################################################
|