Spaces:

lyimo
/

DNA_SEQV2

Sleeping

App Files Files Community

lyimo committed on Feb 24

Commit

e0a58f9

verified ·

1 Parent(s): bb74978

Create app.py

Browse files

Files changed (1) hide show

app.py +298 -0

app.py ADDED Viewed

	@@ -0,0 +1,298 @@

+import streamlit as st
+from Bio import pairwise2
+from Bio.Seq import Seq
+import re
+from collections import defaultdict
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
# Resistance genes and mutation patterns (reference: H37Rv, NC_000962.3).
#
# Layout of each gene entry:
#   start / end  genome coordinates of the gene (1-based, inclusive)
#   description  human-readable gene product
#   drug         drug(s) whose resistance the gene is associated with
#   mutations    dict keyed by position *as a string*:
#                  key > 0   codon number counted from 'start'
#                  key <= 0  nucleotide offset relative to 'start'
#                            (promoter region, e.g. '-15')
#                each value gives the reference residue/base ('from'), the
#                resistance-associated alternatives ('to') and two labels.
#
# Codon keys must use H37Rv numbering, because codon numbers are derived from
# the H37Rv 'start' coordinate of each gene.  rpoB was previously keyed with
# the legacy E. coli numbering (531/526/516/511), which is 81 codons higher
# than the H37Rv numbering and therefore could never match.
RESISTANCE_GENES = {
    'rpoB': {
        'start': 759807,
        'end': 763325,
        'description': 'RNA polymerase β subunit',
        'drug': 'Rifampicin',
        'mutations': {
            # E. coli numbering: S531L
            '450': {'from': 'S', 'to': ['L'], 'freq': 'High', 'confidence': 'High'},
            # E. coli numbering: H526Y/D/R
            '445': {'from': 'H', 'to': ['Y', 'D', 'R'], 'freq': 'High', 'confidence': 'High'},
            # E. coli numbering: D516V/G
            '435': {'from': 'D', 'to': ['V', 'G'], 'freq': 'Moderate', 'confidence': 'High'},
            # E. coli numbering: L511P
            '430': {'from': 'L', 'to': ['P'], 'freq': 'Low', 'confidence': 'Moderate'},
        },
    },
    # NOTE(review): katG is believed to be annotated on the reverse strand of
    # H37Rv, and this table carries no strand information - confirm that codon
    # numbering is applied to the correct strand by the code consuming it.
    'katG': {
        'start': 2153889,
        'end': 2156111,
        'description': 'Catalase-peroxidase',
        'drug': 'Isoniazid',
        'mutations': {
            '315': {'from': 'S', 'to': ['T', 'N'], 'freq': 'High', 'confidence': 'High'},
            '463': {'from': 'R', 'to': ['L'], 'freq': 'Moderate', 'confidence': 'Moderate'},
        },
    },
    'inhA': {
        'start': 1674202,
        'end': 1675011,
        'description': 'Enoyl-ACP reductase',
        'drug': 'Isoniazid/Ethionamide',
        'mutations': {
            # NOTE(review): the "-15" promoter mutation is usually described
            # relative to the upstream fabG1 gene rather than to the inhA
            # start coordinate - confirm that start + (-15) is the intended
            # genome position.
            '-15': {'from': 'C', 'to': ['T'], 'freq': 'High', 'confidence': 'High'},
            '94': {'from': 'S', 'to': ['A'], 'freq': 'Moderate', 'confidence': 'High'},
        },
    },
    'gyrA': {
        'start': 7302,
        'end': 9818,
        'description': 'DNA gyrase subunit A',
        'drug': 'Fluoroquinolones',
        'mutations': {
            '90': {'from': 'A', 'to': ['V'], 'freq': 'High', 'confidence': 'High'},
            '94': {'from': 'D', 'to': ['G', 'A', 'N'], 'freq': 'High', 'confidence': 'High'},
        },
    },
}
# File reading functions
def _parse_first_fasta_record(text):
    """Return the upper-cased sequence of the first record in FASTA *text*.

    Only the first record is used: parsing stops at the second '>' header so
    that header text of later records is never mixed into the sequence.
    Blank lines and any whitespace inside sequence lines (including the '\\r'
    of CRLF line endings) are discarded.

    Raises:
        ValueError: if sequence data appears before a '>' header line, or if
            the first record contains no sequence data.
    """
    sequence_parts = []
    header_seen = False
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('>'):
            if header_seen:
                # Start of the next record: the first one is complete.
                break
            header_seen = True
            continue
        if not header_seen:
            raise ValueError("FASTA data must start with a '>' header line")
        sequence_parts.append(''.join(line.split()))
    sequence = ''.join(sequence_parts).upper()
    if not sequence:
        raise ValueError("no sequence data found in FASTA input")
    return sequence


def read_fasta_file(file_path):
    """Read the first FASTA record from a file on disk.

    Args:
        file_path: path of the FASTA file.

    Returns:
        The upper-cased sequence string, or None if the file could not be
        read or parsed (the error is reported through ``st.error``).
    """
    try:
        # utf-8-sig transparently drops a byte-order mark if one is present.
        with open(file_path, 'r', encoding='utf-8-sig') as handle:
            return _parse_first_fasta_record(handle.read())
    except (OSError, ValueError) as e:
        st.error(f"Error reading file {file_path}: {str(e)}")
        return None


def read_fasta_from_upload(uploaded_file):
    """Read the first FASTA record from a Streamlit upload.

    Args:
        uploaded_file: object exposing ``getvalue()`` that returns the raw
            file content as bytes.

    Returns:
        The upper-cased sequence string, or None if the content could not be
        decoded or parsed (the error is reported through ``st.error``).
    """
    try:
        text = uploaded_file.getvalue().decode('utf-8-sig')
        return _parse_first_fasta_record(text)
    except ValueError as e:  # includes UnicodeDecodeError
        st.error(f"Error reading uploaded file: {str(e)}")
        return None
# Region extraction function
def extract_gene_region(genome_seq, gene_start, gene_end, flank=200):
    """Slice a gene plus flanking sequence out of a genome.

    The slice is ``genome_seq[gene_start - flank : gene_end + flank]``,
    clamped to the bounds of the genome so that genes near either end do not
    produce negative or out-of-range indices.

    Args:
        genome_seq: full genome sequence.
        gene_start: start coordinate of the gene.
        gene_end: end coordinate of the gene.
        flank: number of positions to extend on each side (default 200, the
            value that was previously hard-coded).

    Returns:
        A tuple ``(extracted_seq, start)`` where ``start`` is the 0-based
        index in ``genome_seq`` at which ``extracted_seq`` begins.

    Raises:
        ValueError: if ``flank`` is negative.
    """
    if flank < 0:
        raise ValueError("flank must be non-negative")
    start = max(0, gene_start - flank)
    end = min(len(genome_seq), gene_end + flank)
    return genome_seq[start:end], start
# Codon alignment extraction
def extract_codon_alignment(ref_aligned, query_aligned, gene_start, gene_end, offset):
    """Extract codon-level amino-acid differences from aligned sequences.

    Codons are anchored at ``gene_start`` so the reading frame follows the
    gene, not the first base of the (flanked) aligned region.  Previously
    triplets were counted from the start of the region, which shifts the
    frame whenever the upstream flank is not a multiple of three.

    Coordinates: the n-th non-gap reference base (n counted from 1) is at
    genome position ``offset + n``, the same convention used for the
    nucleotide-level pass in ``find_mutations_with_context``.  ``gene_start``
    and ``gene_end`` are inclusive positions on that scale.

    Alignment columns where the reference has a gap (insertions in the query)
    are skipped; a gap in the query is replaced by 'N' before translation.
    Codons are translated as given: the sequence is not reverse-complemented,
    so genes on the opposite strand are not handled here.

    Args:
        ref_aligned: aligned reference sequence (may contain '-').
        query_aligned: aligned query sequence, same length as ``ref_aligned``.
        gene_start: first genome position of the gene.
        gene_end: last genome position of the gene.
        offset: genome position immediately before the first reference base.

    Returns:
        A list of dicts with keys 'codon_number' (1-based, counted from
        ``gene_start``), 'ref_aa' and 'query_aa', one per codon whose
        translation differs between reference and query.
    """
    codon_list = []
    ref_codon = []
    query_codon = []
    genome_pos = offset
    for ref_base, query_base in zip(ref_aligned, query_aligned):
        if ref_base == '-':
            # Insertion in the query: no reference position to attach it to.
            continue
        genome_pos += 1
        if genome_pos < gene_start:
            continue
        if genome_pos > gene_end:
            break
        if not ref_codon and (genome_pos - gene_start) % 3 != 0:
            # The region starts inside the gene and mid-codon: wait for the
            # next codon boundary instead of reading out of frame.
            continue
        ref_codon.append(ref_base)
        query_codon.append('N' if query_base == '-' else query_base)
        if len(ref_codon) < 3:
            continue
        # Identical codons translate identically, so only differing codons
        # need to be translated.
        if ref_codon != query_codon:
            ref_aa = str(Seq(''.join(ref_codon)).translate())
            query_aa = str(Seq(''.join(query_codon)).translate())
            if ref_aa != query_aa:
                codon_list.append({
                    # genome_pos is the third base of the codon here.
                    'codon_number': (genome_pos - gene_start) // 3 + 1,
                    'ref_aa': ref_aa,
                    'query_aa': query_aa
                })
        ref_codon = []
        query_codon = []
    return codon_list
# Mutation detection
def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
    """Find codon-level and nucleotide-level differences between two regions.

    The two sequences are globally aligned (match 2, mismatch -3, gap open
    -10, gap extend -0.5) and the first alignment is scanned twice: once for
    amino-acid changes inside the gene (``extract_codon_alignment``) and once
    for single-base substitutions anywhere in the region.

    Args:
        ref_seq: reference region.
        query_seq: query region.
        gene_start: first genome position of the gene.
        gene_end: last genome position of the gene.
        offset: genome position immediately before the first base of
            ``ref_seq``; the n-th reference base is reported at ``offset + n``.

    Returns:
        A dict with two lists:
          'codon_diffs' - see ``extract_codon_alignment``;
          'nt_diffs'    - dicts with 'genome_pos', 'ref_base', 'query_base'
                          for every substitution.  Columns with a gap on
                          either side are not reported.
    """
    # Nothing to align, or nothing can differ: skip the quadratic alignment.
    if not ref_seq or not query_seq or ref_seq == query_seq:
        return {'codon_diffs': [], 'nt_diffs': []}

    # Only the first alignment is used, so do not enumerate all co-optimal ones.
    alignments = pairwise2.align.globalms(
        ref_seq, query_seq,
        match=2, mismatch=-3, open=-10, extend=-0.5,
        one_alignment_only=True,
    )
    if not alignments:
        return {'codon_diffs': [], 'nt_diffs': []}

    ref_aligned, query_aligned = alignments[0][0], alignments[0][1]
    codon_diffs = extract_codon_alignment(ref_aligned, query_aligned, gene_start, gene_end, offset)

    nt_diffs = []
    ref_pos = 0
    for ref_base, query_base in zip(ref_aligned, query_aligned):
        if ref_base == '-':
            # Insertion in the query: does not consume a reference position.
            continue
        ref_pos += 1
        if query_base != '-' and ref_base != query_base:
            nt_diffs.append({
                'genome_pos': offset + ref_pos,
                'ref_base': ref_base,
                'query_base': query_base
            })
    return {'codon_diffs': codon_diffs, 'nt_diffs': nt_diffs}
# Resistance analysis
def analyze_resistance(mutation_data, gene_info):
    """Report observed differences that match a known resistance pattern.

    Each key of ``gene_info['mutations']`` is parsed as an integer.  Positive
    keys are codon numbers and are checked against
    ``mutation_data['codon_diffs']``; all other keys are nucleotide offsets
    added to ``gene_info['start']`` and checked against
    ``mutation_data['nt_diffs']``.  A difference matches when its position,
    its reference residue/base ('from') and its observed residue/base (one of
    'to') all agree with the pattern.

    Returns:
        A list of dicts with keys 'position', 'change', 'frequency' and
        'confidence', ordered by pattern and then by difference.
    """
    amino_acid_changes = mutation_data['codon_diffs']
    base_changes = mutation_data['nt_diffs']
    matches = []

    for position_label, rule in gene_info['mutations'].items():
        position = int(position_label)
        if position > 0:
            # Coding mutation: compare codon number and amino acids.
            candidates = amino_acid_changes
            target = position
            where, before, after = 'codon_number', 'ref_aa', 'query_aa'
        else:
            # Promoter mutation: compare genome position and bases.
            candidates = base_changes
            target = gene_info['start'] + position
            where, before, after = 'genome_pos', 'ref_base', 'query_base'

        for change in candidates:
            if change[where] != target:
                continue
            if change[before] != rule['from']:
                continue
            if change[after] not in rule['to']:
                continue
            matches.append({
                'position': position_label,
                'change': f"{rule['from']}{position_label}{change[after]}",
                'frequency': rule['freq'],
                'confidence': rule['confidence']
            })

    return matches
# Main Streamlit app
def _analyze_all_genes(ref_genome, query_genome, debug_mode):
    """Compare reference and query for every gene in RESISTANCE_GENES.

    Shows a progress bar and status line while running and removes both when
    done.  Returns a dict mapping gene name to
    ``{'mutation_data': ..., 'resistance': ...}``; genes whose region could
    not be extracted from either genome are reported with ``st.error`` and
    left out of the result.
    """
    progress_bar = st.progress(0)
    status_text = st.empty()
    all_results = {}
    gene_count = len(RESISTANCE_GENES)
    for done, (gene, info) in enumerate(RESISTANCE_GENES.items(), start=1):
        status_text.text(f"Analyzing {gene} ({info['drug']})...")
        if debug_mode:
            st.subheader(f"Analyzing {gene}")
            st.write(f"Gene region: {info['start']}-{info['end']}")
        ref_region, ref_start = extract_gene_region(ref_genome, info['start'], info['end'])
        # The query is sliced at the reference coordinates.
        query_region, _ = extract_gene_region(query_genome, info['start'], info['end'])
        if ref_region and query_region:
            mutation_data = find_mutations_with_context(ref_region, query_region, info['start'], info['end'], ref_start)
            resistance = analyze_resistance(mutation_data, info)
            all_results[gene] = {'mutation_data': mutation_data, 'resistance': resistance}
            if debug_mode:
                st.write(f"Codon diffs: {len(mutation_data['codon_diffs'])}", mutation_data['codon_diffs'])
                st.write(f"Nucleotide diffs: {len(mutation_data['nt_diffs'])}", mutation_data['nt_diffs'])
                st.write(f"Resistance patterns: {len(resistance)}")
        else:
            st.error(f"Failed to analyze {gene}")
        # Advance only once the gene has actually been processed.
        progress_bar.progress(done / gene_count)
    progress_bar.empty()
    status_text.empty()
    return all_results


def _summary_rows(all_results):
    """Build one summary-table row (counts per gene) for each analysed gene."""
    return [
        {
            'Gene': gene,
            'Drug': RESISTANCE_GENES[gene]['drug'],
            'Codon Diffs': len(results['mutation_data']['codon_diffs']),
            'Nucleotide Diffs': len(results['mutation_data']['nt_diffs']),
            'Resistance Mutations': len(results['resistance'])
        }
        for gene, results in all_results.items()
    ]


def _highlight_resistance(row):
    """Return per-cell CSS that marks a summary row yellow if it has resistance hits."""
    return ['background-color: yellow' if row['Resistance Mutations'] > 0 else '' for _ in row]


def _report_rows(all_results):
    """Flatten all codon diffs, nucleotide diffs and resistance hits into CSV rows."""
    report_data = []
    for gene, results in all_results.items():
        drug = RESISTANCE_GENES[gene]['drug']
        for diff in results['mutation_data']['codon_diffs']:
            report_data.append({
                'Gene': gene,
                'Drug': drug,
                'Type': 'Codon_diff',
                'Codon Number': diff['codon_number'],
                'Reference AA': diff['ref_aa'],
                'Query AA': diff['query_aa']
            })
        for diff in results['mutation_data']['nt_diffs']:
            report_data.append({
                'Gene': gene,
                'Drug': drug,
                'Type': 'Nucleotide_diff',
                'Genome Position': diff['genome_pos'],
                'Reference Base': diff['ref_base'],
                'Query Base': diff['query_base']
            })
        for res in results['resistance']:
            report_data.append({
                'Gene': gene,
                'Drug': drug,
                'Type': 'Resistance',
                'Position': res['position'],
                'Change': res['change'],
                'Frequency': res['frequency'],
                'Confidence': res['confidence']
            })
    return report_data


def main():
    """Render the Streamlit page: load genomes, run the analysis, show results.

    The reference genome is read from ``NC_000962.3.fasta`` in the working
    directory; the query genome comes from a file upload.  The analysis runs
    when a file has been uploaded and the button is pressed, and produces a
    summary table, per-gene details and a CSV download.
    """
    st.title("M. tuberculosis Drug Resistance Analysis")
    st.markdown("""
    ### Automated Drug Resistance Analysis Tool
    Upload your query genome (clinical isolate) in FASTA format to compare with the H37Rv reference.
    **Note**: Detects codon-based (e.g., rpoB S531L) and nucleotide-based (e.g., inhA -15C>T) mutations.
    """)
    debug_mode = st.checkbox("Enable debug mode")

    ref_genome = read_fasta_file("NC_000962.3.fasta")
    if not ref_genome:
        st.error("Failed to load reference genome")
        return
    st.success(f"Reference genome loaded (length: {len(ref_genome)}bp)")

    query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])
    if not (query_file and st.button("Analyze Drug Resistance")):
        return

    query_genome = read_fasta_from_upload(query_file)
    if not query_genome:
        return
    st.success(f"Query genome loaded (length: {len(query_genome)}bp)")

    all_results = _analyze_all_genes(ref_genome, query_genome, debug_mode)

    # Summary table
    summary_df = pd.DataFrame(_summary_rows(all_results))
    styled_summary = summary_df.style.apply(_highlight_resistance, axis=1)
    st.subheader("Summary of Analysis")
    st.dataframe(styled_summary)

    # Detailed results
    for gene, results in all_results.items():
        st.subheader(f"{gene} Analysis")
        info = RESISTANCE_GENES[gene]
        st.write(f"**Drug**: {info['drug']}")
        st.write(f"**Codon-level differences**: {len(results['mutation_data']['codon_diffs'])}")
        st.write(f"**Nucleotide-level differences**: {len(results['mutation_data']['nt_diffs'])}")
        if results['resistance']:
            st.warning(f"Potential resistance mutations found in {gene}")
            resistance_df = pd.DataFrame(results['resistance'])
            st.dataframe(resistance_df)
        else:
            st.info(f"No known resistance mutations found in {gene}")

    # Download button
    st.markdown("### Download Complete Analysis")
    st.write("Download a CSV file with all mutation and resistance data.")
    report_df = pd.DataFrame(_report_rows(all_results))
    csv = report_df.to_csv(index=False)
    st.download_button(
        label="Download Full Report (CSV)",
        data=csv,
        file_name="mtb_analysis_report.csv",
        mime="text/csv"
    )


if __name__ == "__main__":
    main()