hiyata committed on
Commit
d5992b1
·
verified ·
1 Parent(s): 4241c24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -168
app.py CHANGED
@@ -567,16 +567,20 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
567
  # 11. GENE FEATURE ANALYSIS
568
  ###############################################################################
569
 
 
 
 
 
 
 
 
 
 
 
 
 
570
  def parse_gene_features(text: str) -> List[Dict[str, Any]]:
571
- """
572
- Parse gene features from text file in FASTA-like format
573
-
574
- Args:
575
- text (str): Input text in FASTA format with gene metadata
576
-
577
- Returns:
578
- List[Dict]: List of gene dictionaries containing sequence and metadata
579
- """
580
  genes = []
581
  current_header = None
582
  current_sequence = []
@@ -608,15 +612,7 @@ def parse_gene_features(text: str) -> List[Dict[str, Any]]:
608
  return genes
609
 
610
  def parse_gene_metadata(header: str) -> Dict[str, str]:
611
- """
612
- Extract metadata from gene header
613
-
614
- Args:
615
- header (str): Gene header line starting with '>'
616
-
617
- Returns:
618
- Dict[str, str]: Dictionary of metadata key-value pairs
619
- """
620
  metadata = {}
621
  parts = header.split()
622
 
@@ -629,18 +625,9 @@ def parse_gene_metadata(header: str) -> Dict[str, str]:
629
  return metadata
630
 
631
  def parse_location(location_str: str) -> Tuple[Optional[int], Optional[int]]:
632
- """
633
- Parse gene location string, handling both forward and complement strands
634
-
635
- Args:
636
- location_str (str): Location string (e.g., "1234..5678" or "complement(1234..5678)")
637
-
638
- Returns:
639
- Tuple[Optional[int], Optional[int]]: Start and end positions, or (None, None) if parsing fails
640
- """
641
  try:
642
- # Handle complement strand
643
- is_complement = location_str.startswith('complement(')
644
  clean_loc = location_str.replace('complement(', '').replace(')', '')
645
 
646
  # Split on '..' and convert to integers
@@ -649,43 +636,12 @@ def parse_location(location_str: str) -> Tuple[Optional[int], Optional[int]]:
649
  return start, end
650
  else:
651
  return None, None
652
-
653
  except Exception as e:
654
  print(f"Error parsing location {location_str}: {str(e)}")
655
  return None, None
656
 
657
- def save_results_to_temp(results: str, prefix: str = "analysis") -> Optional[str]:
658
- """
659
- Save results to a temporary file
660
-
661
- Args:
662
- results (str): Content to save
663
- prefix (str): Prefix for the temporary file name
664
-
665
- Returns:
666
- Optional[str]: Path to temporary file, or None if save fails
667
- """
668
- try:
669
- temp_dir = tempfile.gettempdir()
670
- temp_path = os.path.join(temp_dir, f"{prefix}_{os.urandom(4).hex()}.csv")
671
-
672
- with open(temp_path, 'w') as f:
673
- f.write(results)
674
- return temp_path
675
- except Exception as e:
676
- print(f"Error saving results: {str(e)}")
677
- return None
678
-
679
  def compute_gene_statistics(gene_shap: np.ndarray) -> Dict[str, float]:
680
- """
681
- Compute statistical measures for gene SHAP values
682
-
683
- Args:
684
- gene_shap (np.ndarray): Array of SHAP values for a gene
685
-
686
- Returns:
687
- Dict[str, float]: Dictionary of statistical measures
688
- """
689
  return {
690
  'avg_shap': float(np.mean(gene_shap)),
691
  'median_shap': float(np.median(gene_shap)),
@@ -695,25 +651,132 @@ def compute_gene_statistics(gene_shap: np.ndarray) -> Dict[str, float]:
695
  'pos_fraction': float(np.mean(gene_shap > 0))
696
  }
697
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698
  def analyze_gene_features(sequence_file: str,
699
  features_file: str,
700
  fasta_text: str = "",
701
  features_text: str = "") -> Tuple[str, Optional[str], Optional[Image.Image]]:
702
- """
703
- Analyze SHAP values for each gene feature
704
-
705
- Args:
706
- sequence_file (str): Path to FASTA file
707
- features_file (str): Path to features file
708
- fasta_text (str): FASTA content if provided as text
709
- features_text (str): Features content if provided as text
710
-
711
- Returns:
712
- Tuple[str, Optional[str], Optional[Image.Image]]:
713
- - Analysis results text
714
- - Path to CSV file
715
- - Genome diagram image
716
- """
717
  # First analyze whole sequence
718
  sequence_results = analyze_sequence(sequence_file, top_kmers=10, fasta_text=fasta_text)
719
  if isinstance(sequence_results[0], str) and "Error" in sequence_results[0]:
@@ -771,16 +834,16 @@ def analyze_gene_features(sequence_file: str,
771
  if not gene_results:
772
  return "No valid genes could be processed", None, None
773
 
 
 
 
774
  # Create results text
775
  results_text = "Gene Analysis Results:\n\n"
776
  results_text += f"Total genes analyzed: {len(gene_results)}\n"
777
  results_text += f"Human-like genes: {sum(1 for g in gene_results if g['classification'] == 'Human')}\n"
778
  results_text += f"Non-human-like genes: {sum(1 for g in gene_results if g['classification'] == 'Non-human')}\n\n"
779
 
780
- # Sort genes by absolute SHAP value for reporting
781
- sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']), reverse=True)
782
-
783
- results_text += "Top 10 genes by signal strength:\n"
784
  for gene in sorted_genes[:10]:
785
  results_text += (
786
  f"Gene: {gene['gene_name']}\n"
@@ -803,99 +866,28 @@ def analyze_gene_features(sequence_file: str,
803
  )
804
 
805
  # Save CSV to temp file
806
- csv_path = save_results_to_temp(csv_content, "gene_analysis")
807
-
808
  try:
809
- # Create genome diagram
810
- diagram_img = create_genome_diagram(gene_results, len(shap_means))
811
- except Exception as e:
812
- print(f"Error creating genome diagram: {str(e)}")
813
- diagram_img = create_error_image(str(e))
814
-
815
- return results_text, csv_path, diagram_img
816
-
817
- def create_error_image(error_message: str) -> Image.Image:
818
- """
819
- Create an error image with message
820
-
821
- Args:
822
- error_message (str): Error message to display
823
 
824
- Returns:
825
- Image.Image: Error image
826
- """
827
- img = Image.new('RGB', (800, 100), color='white')
828
- draw = ImageDraw.Draw(img)
829
- try:
830
- font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
831
- except:
832
- font = None
833
- draw.text((10, 40), f"Error creating genome diagram: {error_message}",
834
- fill='black', font=font)
835
- return img
836
-
837
- def create_genome_diagram(gene_results: List[Dict[str, Any]],
838
- genome_length: int) -> Image.Image:
839
- """
840
- Create genome diagram using BioPython
841
 
842
- Args:
843
- gene_results (List[Dict]): List of gene analysis results
844
- genome_length (int): Total length of the genome
845
-
846
- Returns:
847
- Image.Image: Genome diagram image
848
- """
849
  try:
850
- # Create diagram
851
- gd_diagram = GenomeDiagram.Diagram("Genome SHAP Analysis")
852
- gd_track = gd_diagram.new_track(1, name="Genes")
853
- gd_feature_set = gd_track.new_set()
854
-
855
- # Add features
856
- for gene in gene_results:
857
- # Create feature
858
- feature = SeqFeature(
859
- FeatureLocation(gene['start'], gene['end']),
860
- type="gene"
861
- )
862
-
863
- # Calculate color based on SHAP value
864
- if gene['avg_shap'] > 0:
865
- intensity = min(1.0, abs(gene['avg_shap']) * 2)
866
- color = colors.Color(1-intensity, 1-intensity, 1) # Red
867
- else:
868
- intensity = min(1.0, abs(gene['avg_shap']) * 2)
869
- color = colors.Color(1-intensity, 1-intensity, 1) # Blue
870
-
871
- # Add to diagram
872
- gd_feature_set.add_feature(
873
- feature,
874
- color=color,
875
- label=True,
876
- name=f"{gene['gene_name']}\n(SHAP: {gene['avg_shap']:.3f})"
877
- )
878
-
879
- # Draw diagram
880
- gd_diagram.draw(
881
- format="linear",
882
- orientation="landscape",
883
- pagesize=(15, 5),
884
- start=0,
885
- end=genome_length,
886
- fragments=1
887
- )
888
-
889
- # Save to BytesIO and convert to PIL Image
890
- buffer = io.BytesIO()
891
- gd_diagram.write(buffer, "PNG")
892
- buffer.seek(0)
893
- return Image.open(buffer)
894
-
895
  except Exception as e:
896
- print(f"Error creating genome diagram: {str(e)}")
897
- return create_error_image(str(e))
898
-
 
 
 
 
 
899
  ###############################################################################
900
  # 12. DOWNLOAD FUNCTIONS
901
  ###############################################################################
 
567
  # 11. GENE FEATURE ANALYSIS
568
  ###############################################################################
569
 
570
+ import io
571
+ from io import BytesIO
572
+ from PIL import Image, ImageDraw, ImageFont
573
+ import numpy as np
574
+ import pandas as pd
575
+ import tempfile
576
+ import os
577
+ from typing import List, Dict, Tuple, Optional, Any
578
+ import matplotlib.pyplot as plt
579
+ from matplotlib.colors import LinearSegmentedColormap
580
+ import seaborn as sns
581
+
582
  def parse_gene_features(text: str) -> List[Dict[str, Any]]:
583
+ """Parse gene features from text file in FASTA-like format"""
 
 
 
 
 
 
 
 
584
  genes = []
585
  current_header = None
586
  current_sequence = []
 
612
  return genes
613
 
614
  def parse_gene_metadata(header: str) -> Dict[str, str]:
615
+ """Extract metadata from gene header"""
 
 
 
 
 
 
 
 
616
  metadata = {}
617
  parts = header.split()
618
 
 
625
  return metadata
626
 
627
  def parse_location(location_str: str) -> Tuple[Optional[int], Optional[int]]:
628
+ """Parse gene location string, handling both forward and complement strands"""
 
 
 
 
 
 
 
 
629
  try:
630
+ # Remove 'complement(' and ')' if present
 
631
  clean_loc = location_str.replace('complement(', '').replace(')', '')
632
 
633
  # Split on '..' and convert to integers
 
636
  return start, end
637
  else:
638
  return None, None
 
639
  except Exception as e:
640
  print(f"Error parsing location {location_str}: {str(e)}")
641
  return None, None
642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643
  def compute_gene_statistics(gene_shap: np.ndarray) -> Dict[str, float]:
644
+ """Compute statistical measures for gene SHAP values"""
 
 
 
 
 
 
 
 
645
  return {
646
  'avg_shap': float(np.mean(gene_shap)),
647
  'median_shap': float(np.median(gene_shap)),
 
651
  'pos_fraction': float(np.mean(gene_shap > 0))
652
  }
653
 
654
def _shap_to_color(avg_shap: float) -> Tuple[int, int, int]:
    """Map a mean SHAP value to an RGB fill colour.

    Positive values fade from white towards red, all other values fade from
    white towards blue. The colour saturates once ``abs(avg_shap) * 500``
    reaches 255.
    """
    intensity = min(255, int(abs(avg_shap * 500)))
    fade = 255 - intensity
    if avg_shap > 0:
        return (255, fade, fade)  # Red
    return (fade, fade, 255)  # Blue


def create_simple_genome_diagram(gene_results: List[Dict[str, Any]], genome_length: int) -> Image.Image:
    """Create a simple genome diagram using PIL.

    Draws a horizontal genome axis with roughly ten scale markers, one
    coloured box per gene (red for positive mean SHAP, blue otherwise), a
    label per gene where it fits, and a colour legend.

    Args:
        gene_results: Gene dictionaries. The keys read here are ``'start'``,
            ``'end'``, ``'avg_shap'`` and ``'gene_name'``.
        genome_length: Length used to scale gene coordinates onto the axis.
            Must be positive.

    Returns:
        A 1500x600 RGB image containing the diagram.

    Raises:
        ValueError: If ``genome_length`` is not positive (the scale factor
            would otherwise divide by zero).
    """
    if genome_length <= 0:
        raise ValueError(f"genome_length must be positive, got {genome_length}")

    # Image dimensions
    width = 1500
    height = 600
    margin = 50
    track_height = 40

    # Create image with white background
    img = Image.new('RGB', (width, height), 'white')
    draw = ImageDraw.Draw(img)

    # Try to load fonts; fall back to PIL's default (font=None) if the font
    # files cannot be opened. Only OSError is caught so that unrelated
    # failures (and KeyboardInterrupt) are not silently swallowed.
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
        title_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 16)
    except OSError:
        font = None
        title_font = None

    # Draw title
    draw.text((margin, margin // 2), "Genome SHAP Analysis",
              fill='black', font=title_font or font)

    # Draw genome line
    line_y = height // 2
    draw.line([(margin, line_y), (width - margin, line_y)], fill='black', width=2)

    # Calculate scale factor (pixels per genome position)
    scale = (width - 2 * margin) / genome_length

    # Draw scale markers. The step is clamped to 1 because genome_length // 10
    # is 0 for genomes shorter than 10 positions, and range() rejects step 0.
    tick_step = max(1, genome_length // 10)
    for i in range(0, genome_length + 1, tick_step):
        x = margin + i * scale
        draw.line([(x, line_y - 5), (x, line_y + 5)], fill='black', width=1)
        draw.text((x - 20, line_y + 10), f"{i:,}", fill='black', font=font)

    # Sort ascending by absolute SHAP value so the strongest genes are drawn
    # last and end up on top of weaker, overlapping ones.
    sorted_genes = sorted(gene_results, key=lambda g: abs(g['avg_shap']))

    # enumerate() provides the drawing position directly; looking it up with
    # list.index() is quadratic and returns the wrong slot for equal dicts.
    for position, gene in enumerate(sorted_genes):
        # Calculate position; order the two edges so a reversed start/end
        # pair still yields a valid rectangle.
        start_x, end_x = sorted((margin + gene['start'] * scale,
                                 margin + gene['end'] * scale))

        # Draw gene box
        draw.rectangle([
            (start_x, line_y - track_height // 2),
            (end_x, line_y + track_height // 2)
        ], fill=_shap_to_color(gene['avg_shap']), outline='black')

        # Measure gene name
        label = f"{gene['gene_name']}"
        label_bbox = draw.textbbox((0, 0), label, font=font)
        # Image.new() needs integer dimensions, so coerce the measured width.
        label_width = int(label_bbox[2] - label_bbox[0])

        # Alternate labels above and below the track to reduce overlap
        if position % 2 == 0:
            text_y = line_y - track_height - 15
        else:
            text_y = line_y + track_height + 5

        gene_width = end_x - start_x
        if gene_width > label_width:
            # Horizontal label centred over the gene box
            text_x = start_x + (gene_width - label_width) // 2
            draw.text((text_x, text_y), label, fill='black', font=font)
        elif gene_width > 20:
            # Label is wider than the box: render it on a transparent
            # canvas, rotate it 90 degrees and paste it using its own alpha
            # channel as the mask. Genes 20px wide or narrower get no label.
            txt_img = Image.new('RGBA', (max(1, label_width), 20), (255, 255, 255, 0))
            txt_draw = ImageDraw.Draw(txt_img)
            txt_draw.text((0, 0), label, font=font, fill='black')
            txt_img = txt_img.rotate(90, expand=True)
            img.paste(txt_img, (int(start_x), text_y), txt_img)

    # Draw legend
    legend_x = margin
    legend_y = height - margin
    draw.text((legend_x, legend_y - 60), "SHAP Values:", fill='black', font=font)

    box_width = 20
    box_height = 20
    spacing = 15

    # (x offset from legend_x, y offset above legend_y, fill colour, caption)
    legend_entries = [
        (0, 45, (255, 0, 0), "Strong human-like signal"),
        (0, 20, (255, 200, 200), "Weak human-like signal"),
        (250, 45, (200, 200, 255), "Weak non-human-like signal"),
        (250, 20, (0, 0, 255), "Strong non-human-like signal"),
    ]
    for x_offset, y_offset, fill, caption in legend_entries:
        box_x = legend_x + x_offset
        box_y = legend_y - y_offset
        # Flat [x0, y0, x1, y1] is a documented coordinate form; a list
        # wrapping a single 4-tuple is not.
        draw.rectangle([box_x, box_y, box_x + box_width, box_y + box_height],
                       fill=fill, outline='black')
        draw.text((box_x + box_width + spacing, box_y),
                  caption, fill='black', font=font)

    return img
774
+
775
  def analyze_gene_features(sequence_file: str,
776
  features_file: str,
777
  fasta_text: str = "",
778
  features_text: str = "") -> Tuple[str, Optional[str], Optional[Image.Image]]:
779
+ """Analyze SHAP values for each gene feature"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
780
  # First analyze whole sequence
781
  sequence_results = analyze_sequence(sequence_file, top_kmers=10, fasta_text=fasta_text)
782
  if isinstance(sequence_results[0], str) and "Error" in sequence_results[0]:
 
834
  if not gene_results:
835
  return "No valid genes could be processed", None, None
836
 
837
+ # Sort genes by absolute SHAP value
838
+ sorted_genes = sorted(gene_results, key=lambda x: abs(x['avg_shap']), reverse=True)
839
+
840
  # Create results text
841
  results_text = "Gene Analysis Results:\n\n"
842
  results_text += f"Total genes analyzed: {len(gene_results)}\n"
843
  results_text += f"Human-like genes: {sum(1 for g in gene_results if g['classification'] == 'Human')}\n"
844
  results_text += f"Non-human-like genes: {sum(1 for g in gene_results if g['classification'] == 'Non-human')}\n\n"
845
 
846
+ results_text += "Top 10 most distinctive genes:\n"
 
 
 
847
  for gene in sorted_genes[:10]:
848
  results_text += (
849
  f"Gene: {gene['gene_name']}\n"
 
866
  )
867
 
868
  # Save CSV to temp file
 
 
869
  try:
870
+ temp_dir = tempfile.gettempdir()
871
+ temp_path = os.path.join(temp_dir, f"gene_analysis_{os.urandom(4).hex()}.csv")
 
 
 
 
 
 
 
 
 
 
 
 
872
 
873
+ with open(temp_path, 'w') as f:
874
+ f.write(csv_content)
875
+ except Exception as e:
876
+ print(f"Error saving CSV: {str(e)}")
877
+ temp_path = None
 
 
 
 
 
 
 
 
 
 
 
 
878
 
879
+ # Create visualization
 
 
 
 
 
 
880
  try:
881
+ diagram_img = create_simple_genome_diagram(gene_results, len(shap_means))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
882
  except Exception as e:
883
+ print(f"Error creating visualization: {str(e)}")
884
+ # Create error image
885
+ diagram_img = Image.new('RGB', (800, 100), color='white')
886
+ draw = ImageDraw.Draw(diagram_img)
887
+ draw.text((10, 40), f"Error creating visualization: {str(e)}", fill='black')
888
+
889
+ return results_text, temp_path, diagram_img
890
+
891
  ###############################################################################
892
  # 12. DOWNLOAD FUNCTIONS
893
  ###############################################################################