hiyata committed on
Commit
0c54683
·
verified ·
1 Parent(s): 7d672a0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -168
app.py CHANGED
@@ -571,6 +571,18 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
571
  # 11. GENE FEATURE ANALYSIS
572
  ###############################################################################
573
 
 
 
 
 
 
 
 
 
 
 
 
 
574
  def parse_gene_features(text: str) -> List[Dict[str, Any]]:
575
  """Parse gene features from text file in FASTA-like format"""
576
  genes = []
@@ -643,174 +655,6 @@ def compute_gene_statistics(gene_shap: np.ndarray) -> Dict[str, float]:
643
  'pos_fraction': float(np.mean(gene_shap > 0))
644
  }
645
 
646
def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_length: int) -> Image.Image:
    """
    Create a simple genome diagram using PIL, forcing a minimum color intensity
    so that small SHAP values don't appear white.

    Args:
        gene_results: One dict per gene. Each must provide 'start', 'end' and
            'avg_shap'; 'gene_name' is optional and defaults to '?'.
            The dicts are read, never modified.
        genome_length: Length of the genome axis; gene coordinates are clamped
            to the range [0, genome_length].

    Returns:
        A 1500x600 RGB image with a title, a scaled genome axis, one colored
        box per gene (red-ish for positive avg_shap, blue-ish otherwise) and a
        legend. If gene_results is empty or genome_length <= 0, a small
        800x100 image carrying an error message is returned instead.
    """
    import math
    from PIL import Image, ImageDraw, ImageFont

    # Validate inputs
    if not gene_results or genome_length <= 0:
        img = Image.new('RGB', (800, 100), color='white')
        draw = ImageDraw.Draw(img)
        draw.text((10, 40), "Error: Invalid input data", fill='black')
        return img

    # Clamp coordinates on copies so the caller's dicts stay untouched, and
    # drop genes whose clamped span is empty or inverted: newer Pillow
    # releases reject rectangles whose right edge lies left of the left edge.
    drawable_genes = []
    for gene in gene_results:
        start = max(0, int(gene['start']))
        end = min(genome_length, int(gene['end']))
        if start >= end:
            print(f"Warning: Invalid coordinates for gene {gene.get('gene_name','?')}: {start}-{end}")
            continue
        drawable_genes.append({**gene, 'start': start, 'end': end})

    # Image dimensions
    width = 1500
    height = 600
    margin = 50
    track_height = 40

    # Create image with white background
    img = Image.new('RGB', (width, height), 'white')
    draw = ImageDraw.Draw(img)

    # Try to load font, fall back to default if unavailable.
    # OSError: font file missing/unreadable; ImportError: FreeType support absent.
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
        title_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 16)
    except (OSError, ImportError):
        font = ImageFont.load_default()
        title_font = ImageFont.load_default()

    # Draw title
    draw.text((margin, margin // 2), "Genome SHAP Analysis", fill='black', font=title_font)

    # Draw genome line
    line_y = height // 2
    draw.line([(int(margin), int(line_y)), (int(width - margin), int(line_y))], fill='black', width=2)

    # Pixels per genome position
    scale = float(width - 2 * margin) / float(genome_length)

    # Aim for roughly ten scale markers, but never use a step of zero
    num_ticks = 10
    step = max(1, genome_length // num_ticks)

    # Draw scale markers
    for position in range(0, genome_length + 1, step):
        x_coord = margin + position * scale
        draw.line([
            (int(x_coord), int(line_y - 5)),
            (int(x_coord), int(line_y + 5))
        ], fill='black', width=1)
        draw.text((int(x_coord - 20), int(line_y + 10)), f"{position:,}", fill='black', font=font)

    # Draw weakest genes first so that the strongest signals end up on top
    sorted_genes = sorted(drawable_genes, key=lambda g: abs(g['avg_shap']))

    for idx, gene in enumerate(sorted_genes):
        start_x = margin + int(gene['start'] * scale)
        end_x = margin + int(gene['end'] * scale)

        # A NaN average carries no signal; draw it at minimum intensity
        # instead of letting int() raise.
        avg_shap = float(gene['avg_shap'])
        if math.isnan(avg_shap):
            avg_shap = 0.0

        # Convert shap -> color intensity, clamped between 50 and 255 so the
        # box never ends up plain white (cap before int() so inf is safe).
        intensity = max(50, int(min(255.0, abs(avg_shap) * 500)))

        if avg_shap > 0:
            # Red-ish for positive
            color = (255, 255 - intensity, 255 - intensity)
        else:
            # Blue-ish for negative or zero
            color = (255 - intensity, 255 - intensity, 255)

        # Draw gene rectangle
        draw.rectangle([
            (int(start_x), int(line_y - track_height // 2)),
            (int(end_x), int(line_y + track_height // 2))
        ], fill=color, outline='black')

        # Measure the label via the font mask, which does not depend on the
        # getsize()/textsize() helpers that vary between Pillow versions.
        label = str(gene.get('gene_name', '?'))
        label_width, label_height = font.getmask(label).size

        # Alternate label positions above/below line
        if idx % 2 == 0:
            text_y = line_y - track_height - 15
        else:
            text_y = line_y + track_height + 5

        # Horizontal label when it fits, rotated label for narrow-but-visible
        # genes, no label at all for genes 20 px wide or less.
        gene_width = end_x - start_x
        if gene_width > label_width:
            text_x = start_x + (gene_width - label_width) // 2
            draw.text((int(text_x), int(text_y)), label, fill='black', font=font)
        elif gene_width > 20:
            txt_img = Image.new('RGBA', (label_width, label_height), (255, 255, 255, 0))
            txt_draw = ImageDraw.Draw(txt_img)
            txt_draw.text((0, 0), label, font=font, fill='black')
            rotated_img = txt_img.rotate(90, expand=True)
            img.paste(rotated_img, (int(start_x), int(text_y)), rotated_img)

    # Draw legend
    legend_x = margin
    legend_y = height - margin
    draw.text((int(legend_x), int(legend_y - 60)), "SHAP Values:", fill='black', font=font)

    box_width = 20
    box_height = 20
    spacing = 15

    # (x offset, y offset, fill color, caption) for each legend entry
    legend_entries = [
        (0, -45, (255, 0, 0), "Strong human-like signal"),
        (0, -20, (255, 200, 200), "Weak human-like signal"),
        (250, -45, (200, 200, 255), "Weak non-human-like signal"),
        (250, -20, (0, 0, 255), "Strong non-human-like signal"),
    ]
    for x_offset, y_offset, fill_color, caption in legend_entries:
        box_x = legend_x + x_offset
        box_y = legend_y + y_offset
        draw.rectangle([
            (int(box_x), int(box_y)),
            (int(box_x + box_width), int(box_y + box_height))
        ], fill=fill_color, outline='black')
        draw.text((int(box_x + box_width + spacing), int(box_y)),
                  caption, fill='black', font=font)

    return img
811
-
812
-
813
-
814
  def create_simple_genome_diagram(gene_results, genome_length):
815
  from PIL import Image, ImageDraw, ImageFont
816
 
@@ -958,6 +802,121 @@ def create_simple_genome_diagram(gene_results, genome_length):
958
  return img
959
 
960
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
961
 
962
  ###############################################################################
963
  # 12. DOWNLOAD FUNCTIONS
 
571
  # 11. GENE FEATURE ANALYSIS
572
  ###############################################################################
573
 
574
+ import io
575
+ from io import BytesIO
576
+ from PIL import Image, ImageDraw, ImageFont
577
+ import numpy as np
578
+ import pandas as pd
579
+ import tempfile
580
+ import os
581
+ from typing import List, Dict, Tuple, Optional, Any
582
+ import matplotlib.pyplot as plt
583
+ from matplotlib.colors import LinearSegmentedColormap
584
+ import seaborn as sns
585
+
586
  def parse_gene_features(text: str) -> List[Dict[str, Any]]:
587
  """Parse gene features from text file in FASTA-like format"""
588
  genes = []
 
655
  'pos_fraction': float(np.mean(gene_shap > 0))
656
  }
657
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
658
  def create_simple_genome_diagram(gene_results, genome_length):
659
  from PIL import Image, ImageDraw, ImageFont
660
 
 
802
  return img
803
 
804
 
805
def analyze_gene_features(sequence_file: str,
                          features_file: str,
                          fasta_text: str = "",
                          features_text: str = "") -> Tuple[str, Optional[str], Optional[Image.Image]]:
    """Analyze SHAP values for each gene feature.

    Runs ``analyze_sequence`` on the whole sequence, slices the resulting
    ``shap_means`` by each gene's parsed location, and reports per-gene
    statistics as text, as a CSV file and as a genome diagram.

    Args:
        sequence_file: Forwarded to ``analyze_sequence``.
        features_file: Path of the gene-features file; only read when
            ``features_text`` is empty or whitespace.
        fasta_text: Forwarded to ``analyze_sequence`` as ``fasta_text``.
        features_text: Gene features given directly as text; takes precedence
            over ``features_file``.

    Returns:
        ``(results_text, csv_path, diagram)``. When the sequence analysis
        reports an error, the features cannot be read, or no gene could be
        processed, the tuple is ``(message, None, None)``. ``csv_path`` is
        ``None`` if the CSV could not be written; if the diagram cannot be
        created, a small image carrying the error message is returned.
    """
    import csv  # function-scope import: only needed for the CSV export below

    # First analyze whole sequence
    sequence_results = analyze_sequence(sequence_file, top_kmers=10, fasta_text=fasta_text)
    if isinstance(sequence_results[0], str) and "Error" in sequence_results[0]:
        return f"Error in sequence analysis: {sequence_results[0]}", None, None

    # Get SHAP values (presumably one mean value per sequence position --
    # the exact layout is defined by analyze_sequence; confirm there)
    shap_means = sequence_results[3]["shap_means"]

    # Parse gene features; a None/blank features_text falls back to the file
    try:
        if features_text and features_text.strip():
            genes = parse_gene_features(features_text)
        else:
            with open(features_file, 'r') as f:
                genes = parse_gene_features(f.read())
    except Exception as e:
        return f"Error reading features file: {str(e)}", None, None

    # Analyze each gene
    gene_results = []
    for gene in genes:
        # Resolve metadata defensively so the error handler below can always
        # name the gene, even when the record itself is malformed.
        metadata = gene.get('metadata') if isinstance(gene, dict) else None
        gene_name = metadata.get('gene', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
        try:
            location = metadata.get('location', '')
            if not location:
                continue

            start, end = parse_location(location)
            if start is None or end is None:
                continue

            # Get SHAP values for this region; a span that selects nothing
            # has no statistics to report.
            gene_shap = shap_means[start:end]
            if len(gene_shap) == 0:
                continue
            stats = compute_gene_statistics(gene_shap)

            gene_results.append({
                'gene_name': gene_name,
                'location': location,
                'start': start,
                'end': end,
                'locus_tag': metadata.get('locus_tag', ''),
                'avg_shap': stats['avg_shap'],
                'median_shap': stats['median_shap'],
                'std_shap': stats['std_shap'],
                'max_shap': stats['max_shap'],
                'min_shap': stats['min_shap'],
                'pos_fraction': stats['pos_fraction'],
                'classification': 'Human' if stats['avg_shap'] > 0 else 'Non-human',
                'confidence': abs(stats['avg_shap'])
            })

        except Exception as e:
            # Best effort: one bad gene must not abort the whole analysis
            print(f"Error processing gene {gene_name}: {str(e)}")
            continue

    if not gene_results:
        return "No valid genes could be processed", None, None

    # Sort genes by absolute SHAP value, strongest first
    sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']), reverse=True)

    # Create results text
    human_count = sum(1 for g in gene_results if g['classification'] == 'Human')
    non_human_count = sum(1 for g in gene_results if g['classification'] == 'Non-human')
    text_parts = [
        "Gene Analysis Results:\n\n",
        f"Total genes analyzed: {len(gene_results)}\n",
        f"Human-like genes: {human_count}\n",
        f"Non-human-like genes: {non_human_count}\n\n",
        "Top 10 most distinctive genes:\n",
    ]
    text_parts.extend(
        f"Gene: {gene['gene_name']}\n"
        f"Location: {gene['location']}\n"
        f"Classification: {gene['classification']} "
        f"(confidence: {gene['confidence']:.4f})\n"
        f"Average SHAP: {gene['avg_shap']:.4f}\n\n"
        for gene in sorted_genes[:10]
    )
    results_text = "".join(text_parts)

    # Create CSV content (file order, not sorted). Rows go through csv.writer
    # so fields containing commas or quotes -- e.g. locations such as
    # "join(1..5,10..20)" -- are quoted instead of shifting columns.
    csv_header = [
        "gene_name", "location", "avg_shap", "median_shap", "std_shap",
        "max_shap", "min_shap", "pos_fraction", "classification",
        "confidence", "locus_tag",
    ]
    csv_rows = [
        [
            gene['gene_name'],
            gene['location'],
            f"{gene['avg_shap']:.4f}",
            f"{gene['median_shap']:.4f}",
            f"{gene['std_shap']:.4f}",
            f"{gene['max_shap']:.4f}",
            f"{gene['min_shap']:.4f}",
            f"{gene['pos_fraction']:.4f}",
            gene['classification'],
            f"{gene['confidence']:.4f}",
            gene['locus_tag'],
        ]
        for gene in gene_results
    ]

    # Save CSV to a uniquely named temp file that outlives this call
    # (delete=False) so the returned path stays valid for the caller.
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            mode='w', newline='', encoding='utf-8',
            prefix='gene_analysis_', suffix='.csv', delete=False,
        ) as f:
            writer = csv.writer(f, lineterminator='\n')
            writer.writerow(csv_header)
            writer.writerows(csv_rows)
            temp_path = f.name
    except (OSError, csv.Error) as e:
        print(f"Error saving CSV: {str(e)}")
        temp_path = None

    # Create visualization
    try:
        diagram_img = create_simple_genome_diagram(gene_results, len(shap_means))
    except Exception as e:
        print(f"Error creating visualization: {str(e)}")
        # Create error image
        diagram_img = Image.new('RGB', (800, 100), color='white')
        draw = ImageDraw.Draw(diagram_img)
        draw.text((10, 40), f"Error creating visualization: {str(e)}", fill='black')

    return results_text, temp_path, diagram_img
920
 
921
  ###############################################################################
922
  # 12. DOWNLOAD FUNCTIONS