|
import streamlit as st |
|
from Bio import pairwise2 |
|
from Bio.Seq import Seq |
|
import re |
|
from collections import defaultdict |
|
import pandas as pd |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
|
|
|
|
# Genes screened for drug-resistance mutations, keyed by gene name.
#
# Per gene:
#   'start' / 'end'  genome coordinates of the gene on the reference that
#                    main() loads (NC_000962.3.fasta); 'start' is also the
#                    anchor for negative (upstream) mutation positions.
#   'description'    human-readable gene product.
#   'drug'           drug shown next to the gene in the report.
#   'mutations'      known resistance patterns, keyed by position as a
#                    string. A positive key is matched against codon
#                    numbers, a zero/negative key against the genome
#                    position 'start' + key (see analyze_resistance).
#                    'from' is the reference residue/base, 'to' the list of
#                    substitutions that count as a hit; 'freq' and
#                    'confidence' are copied verbatim into the report.
#
# NOTE(review): katG is presumably annotated on the complement strand in
#   H37Rv, and nothing in this file reverse-complements a region before
#   translating it -- confirm katG codons are read in the right direction.
# NOTE(review): rpoB 531/526/516/511 look like E. coli-style codon numbers;
#   confirm they equal codon numbers counted from 'start' on this reference.
# NOTE(review): the inhA '-15' change is commonly described relative to the
#   upstream operon promoter -- confirm 'start' - 15 is the intended base.
RESISTANCE_GENES = {
    # Rifampicin
    'rpoB': {
        'start': 759807,
        'end': 763325,
        'description': 'RNA polymerase β subunit',
        'drug': 'Rifampicin',
        'mutations': {
            '531': {'from': 'S', 'to': ['L'], 'freq': 'High', 'confidence': 'High'},
            '526': {'from': 'H', 'to': ['Y', 'D', 'R'], 'freq': 'High', 'confidence': 'High'},
            '516': {'from': 'D', 'to': ['V', 'G'], 'freq': 'Moderate', 'confidence': 'High'},
            '511': {'from': 'L', 'to': ['P'], 'freq': 'Low', 'confidence': 'Moderate'},
        },
    },
    # Isoniazid
    'katG': {
        'start': 2153889,
        'end': 2156111,
        'description': 'Catalase-peroxidase',
        'drug': 'Isoniazid',
        'mutations': {
            '315': {'from': 'S', 'to': ['T', 'N'], 'freq': 'High', 'confidence': 'High'},
            '463': {'from': 'R', 'to': ['L'], 'freq': 'Moderate', 'confidence': 'Moderate'},
        },
    },
    # Isoniazid / Ethionamide
    'inhA': {
        'start': 1674202,
        'end': 1675011,
        'description': 'Enoyl-ACP reductase',
        'drug': 'Isoniazid/Ethionamide',
        'mutations': {
            # Negative key: a nucleotide position upstream of 'start'.
            '-15': {'from': 'C', 'to': ['T'], 'freq': 'High', 'confidence': 'High'},
            '94': {'from': 'S', 'to': ['A'], 'freq': 'Moderate', 'confidence': 'High'},
        },
    },
    # Fluoroquinolones
    'gyrA': {
        'start': 7302,
        'end': 9818,
        'description': 'DNA gyrase subunit A',
        'drug': 'Fluoroquinolones',
        'mutations': {
            '90': {'from': 'A', 'to': ['V'], 'freq': 'High', 'confidence': 'High'},
            '94': {'from': 'D', 'to': ['G', 'A', 'N'], 'freq': 'High', 'confidence': 'High'},
        },
    },
}
|
|
|
|
|
def read_fasta_file(file_path):
    """Read a single-record FASTA file from disk and return its sequence.

    The first line is treated as the header and discarded. All remaining
    lines are concatenated, every whitespace character is removed, and the
    result is upper-cased.

    Args:
        file_path: Path of the FASTA file to open (text mode).

    Returns:
        The upper-cased sequence string, or None when the file cannot be
        read or contains no sequence after the header. In the failure case
        the problem is reported to the user through st.error.
    """
    try:
        with open(file_path, 'r') as handle:
            lines = handle.read().strip().splitlines()
        # str.split() with no argument splits on any whitespace, so tabs and
        # stray carriage returns are dropped along with plain spaces.
        sequence = ''.join(''.join(lines[1:]).split())
        if not sequence:
            # Header-only or empty file: fail with a readable message rather
            # than an opaque index error.
            raise ValueError("no sequence data found after the FASTA header")
        return sequence.upper()
    except Exception as e:
        # UI boundary: any failure is shown to the user and signalled as None.
        st.error(f"Error reading file {file_path}: {str(e)}")
        return None
|
|
|
def read_fasta_from_upload(uploaded_file):
    """Read a single-record FASTA sequence from a Streamlit upload.

    The uploaded bytes are decoded as UTF-8. The first line is treated as
    the header and discarded; the remaining lines are concatenated with all
    whitespace removed and upper-cased.

    Args:
        uploaded_file: Object exposing getvalue() that returns the file's
            raw bytes (as returned by st.file_uploader).

    Returns:
        The upper-cased sequence string, or None when the upload cannot be
        decoded or contains no sequence after the header. In the failure
        case the problem is reported to the user through st.error.
    """
    try:
        content = uploaded_file.getvalue().decode('utf-8').strip()
        # The bytes are decoded by hand, so no newline translation has taken
        # place: splitlines() handles "\n", "\r\n" and "\r" alike, which
        # keeps carriage returns from Windows-style files out of the
        # sequence.
        lines = content.splitlines()
        # str.split() with no argument also removes tabs and other stray
        # whitespace, not just plain spaces.
        sequence = ''.join(''.join(lines[1:]).split())
        if not sequence:
            # Header-only or empty upload: fail with a readable message
            # rather than an opaque index error.
            raise ValueError("no sequence data found after the FASTA header")
        return sequence.upper()
    except Exception as e:
        # UI boundary: any failure is shown to the user and signalled as None.
        st.error(f"Error reading uploaded file: {str(e)}")
        return None
|
|
|
|
|
def extract_gene_region(genome_seq, gene_start, gene_end):
    """Slice a gene plus flanking context out of a genome sequence.

    The slice runs from gene_start - 200 to gene_end + 200, clamped to the
    bounds of genome_seq, so the aligner sees some sequence on either side
    of the gene.

    Args:
        genome_seq: Full genome sequence (any sliceable sequence).
        gene_start: Gene start coordinate.
        gene_end: Gene end coordinate.

    Returns:
        A (region, region_start) tuple: the extracted slice and the index
        used as its lower bound in genome_seq.
    """
    padding = 200

    lower = gene_start - padding
    if lower < 0:
        lower = 0  # gene sits near the beginning of the genome

    upper = gene_end + padding
    genome_length = len(genome_seq)
    if upper > genome_length:
        upper = genome_length  # gene sits near the end of the genome

    return genome_seq[lower:upper], lower
|
|
|
|
|
def extract_codon_alignment(ref_aligned, query_aligned, gene_start, gene_end, offset):
    """Report codons whose translation differs between reference and query.

    The two strings are walked column by column. Columns where the
    reference has a gap ('-') are insertions in the query and are skipped,
    so codon numbering always follows the reference. A gap in the query is
    substituted with 'N' before translation.

    The reading frame is anchored at gene_start: the reference base at
    genome position gene_start is the first base of codon 1, regardless of
    how much flanking sequence precedes the gene in the aligned region.
    Bases outside gene_start..gene_end never contribute to a codon, and a
    trailing partial codon is ignored.

    Args:
        ref_aligned: Gapped reference sequence from the alignment.
        query_aligned: Gapped query sequence, same length as ref_aligned.
        gene_start: Genome position of the gene's first base, in the same
            numbering as offset + 1 for the first reference base.
        gene_end: Genome position of the gene's last base (inclusive).
        offset: Number of genome bases preceding the aligned region; the
            first ungapped reference base is at genome position offset + 1.

    Returns:
        A list of dicts with keys 'codon_number' (1-based within the gene),
        'ref_aa' and 'query_aa', one per codon whose amino acid changed.
    """
    codon_list = []
    ref_pos = 0  # count of ungapped reference bases consumed so far
    ref_codon = []
    query_codon = []

    for ref_base, query_base in zip(ref_aligned, query_aligned):
        if ref_base == '-':
            # Insertion relative to the reference: no reference coordinate.
            continue
        ref_pos += 1
        genome_pos = offset + ref_pos

        if genome_pos < gene_start:
            # Upstream flank: skipping it is what keeps the codon frame
            # aligned to the gene start instead of the region start.
            continue
        if genome_pos > gene_end:
            break

        ref_codon.append(ref_base)
        query_codon.append('N' if query_base == '-' else query_base)
        if len(ref_codon) < 3:
            continue

        # Identical codons translate identically, so only translate when
        # the nucleotides actually differ.
        if ref_codon != query_codon:
            ref_aa = str(Seq(''.join(ref_codon)).translate())
            query_aa = str(Seq(''.join(query_codon)).translate())
            if ref_aa != query_aa:
                # genome_pos is the codon's third base here.
                codon_number = (genome_pos - gene_start) // 3 + 1
                codon_list.append({
                    'codon_number': codon_number,
                    'ref_aa': ref_aa,
                    'query_aa': query_aa,
                })
        ref_codon = []
        query_codon = []

    return codon_list
|
|
|
|
|
def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
    """Align a query region to the reference region and list the differences.

    A global alignment is computed with pairwise2 (match +2, mismatch -3,
    gap open -10, gap extend -0.5) and only the first alignment returned is
    used.

    Args:
        ref_seq: Reference region sequence.
        query_seq: Query region sequence.
        gene_start: Gene start coordinate, forwarded to
            extract_codon_alignment.
        gene_end: Gene end coordinate, forwarded to extract_codon_alignment.
        offset: Value added to the 1-based index of each ungapped reference
            base to obtain its reported genome position.

    Returns:
        A dict with two lists:
        'codon_diffs' -- the result of extract_codon_alignment;
        'nt_diffs' -- one dict ('genome_pos', 'ref_base', 'query_base') per
        substituted base. Columns with a gap on either side are not
        reported. Both lists are empty when no alignment is produced.
    """
    hits = pairwise2.align.globalms(ref_seq, query_seq, match=2, mismatch=-3, open=-10, extend=-0.5)
    if not hits:
        return {'codon_diffs': [], 'nt_diffs': []}

    best = hits[0]
    aligned_ref = best[0]
    aligned_query = best[1]

    codon_changes = extract_codon_alignment(aligned_ref, aligned_query, gene_start, gene_end, offset)

    substitutions = []
    bases_seen = 0  # ungapped reference bases consumed so far
    for r_base, q_base in zip(aligned_ref, aligned_query):
        if r_base == '-':
            # Insertion in the query: does not advance the reference.
            continue
        bases_seen += 1
        if q_base == '-' or q_base == r_base:
            # Deletion in the query, or a matching base: nothing to record.
            continue
        substitutions.append({
            'genome_pos': offset + bases_seen,
            'ref_base': r_base,
            'query_base': q_base,
        })

    return {'codon_diffs': codon_changes, 'nt_diffs': substitutions}
|
|
|
|
|
def analyze_resistance(mutation_data, gene_info):
    """Pick out the observed differences that match known resistance patterns.

    Each key of gene_info['mutations'] is parsed as an integer. A positive
    key is compared with the 'codon_number' of every codon difference; any
    other key is compared with the 'genome_pos' of every nucleotide
    difference, using gene_info['start'] + key as the target position. A
    difference is a hit when its position matches, its reference
    residue/base equals the pattern's 'from', and its query residue/base is
    listed in the pattern's 'to'.

    Args:
        mutation_data: Dict with 'codon_diffs' and 'nt_diffs' lists, as
            produced by find_mutations_with_context.
        gene_info: One gene entry with 'start' and 'mutations'.

    Returns:
        A list of dicts with keys 'position', 'change', 'frequency' and
        'confidence', ordered by pattern and then by observed difference.
    """
    codon_changes = mutation_data['codon_diffs']
    base_changes = mutation_data['nt_diffs']
    hits = []

    for label, known in gene_info['mutations'].items():
        position = int(label)

        # Choose which list of differences to scan and which fields to read.
        if position > 0:
            candidates = codon_changes
            coord_key, ref_key, alt_key = 'codon_number', 'ref_aa', 'query_aa'
            target = position
        else:
            candidates = base_changes
            coord_key, ref_key, alt_key = 'genome_pos', 'ref_base', 'query_base'
            target = gene_info['start'] + position

        for observed in candidates:
            if observed[coord_key] != target:
                continue
            if observed[ref_key] != known['from']:
                continue
            if observed[alt_key] not in known['to']:
                continue
            hits.append({
                'position': label,
                'change': f"{known['from']}{label}{observed[alt_key]}",
                'frequency': known['freq'],
                'confidence': known['confidence'],
            })

    return hits
|
|
|
|
|
def main():
    """Render the Streamlit page and run the resistance analysis.

    Flow: load the reference genome from NC_000962.3.fasta, wait for a query
    FASTA upload and a click on the analyze button, then for every gene in
    RESISTANCE_GENES extract the region from both genomes, align them and
    match the differences against the known resistance patterns. The page
    then shows a summary table, a per-gene section and a CSV download of
    every codon difference, nucleotide difference and resistance hit.
    """
    st.title("M. tuberculosis Drug Resistance Analysis")
    st.markdown("""
    ### Automated Drug Resistance Analysis Tool
    Upload your query genome (clinical isolate) in FASTA format to compare with the H37Rv reference.
    **Note**: Detects codon-based (e.g., rpoB S531L) and nucleotide-based (e.g., inhA -15C>T) mutations.
    """)

    debug_mode = st.checkbox("Enable debug mode")

    # --- Reference genome -------------------------------------------------
    ref_genome = read_fasta_file("NC_000962.3.fasta")
    if not ref_genome:
        st.error("Failed to load reference genome")
        return
    st.success(f"Reference genome loaded (length: {len(ref_genome)}bp)")

    # --- Query upload -----------------------------------------------------
    query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])
    # The button is only rendered once a file has been uploaded.
    if not (query_file and st.button("Analyze Drug Resistance")):
        return

    query_genome = read_fasta_from_upload(query_file)
    if not query_genome:
        return
    st.success(f"Query genome loaded (length: {len(query_genome)}bp)")

    # --- Per-gene analysis ------------------------------------------------
    progress_bar = st.progress(0)
    status_text = st.empty()
    all_results = {}
    gene_count = len(RESISTANCE_GENES)

    for step, (gene, info) in enumerate(RESISTANCE_GENES.items(), start=1):
        status_text.text(f"Analyzing {gene} ({info['drug']})...")
        progress_bar.progress(step / gene_count)

        gene_start = info['start']
        gene_end = info['end']
        if debug_mode:
            st.subheader(f"Analyzing {gene}")
            st.write(f"Gene region: {gene_start}-{gene_end}")

        # The same coordinates are used to cut both genomes.
        ref_region, ref_start = extract_gene_region(ref_genome, gene_start, gene_end)
        query_region = extract_gene_region(query_genome, gene_start, gene_end)[0]

        if not (ref_region and query_region):
            st.error(f"Failed to analyze {gene}")
            continue

        mutation_data = find_mutations_with_context(ref_region, query_region, gene_start, gene_end, ref_start)
        resistance = analyze_resistance(mutation_data, info)
        all_results[gene] = {'mutation_data': mutation_data, 'resistance': resistance}

        if debug_mode:
            st.write(f"Codon diffs: {len(mutation_data['codon_diffs'])}", mutation_data['codon_diffs'])
            st.write(f"Nucleotide diffs: {len(mutation_data['nt_diffs'])}", mutation_data['nt_diffs'])
            st.write(f"Resistance patterns: {len(resistance)}")

    progress_bar.empty()
    status_text.empty()

    # --- Summary table ----------------------------------------------------
    summary_rows = []
    for gene, results in all_results.items():
        found = results['mutation_data']
        summary_rows.append({
            'Gene': gene,
            'Drug': RESISTANCE_GENES[gene]['drug'],
            'Codon Diffs': len(found['codon_diffs']),
            'Nucleotide Diffs': len(found['nt_diffs']),
            'Resistance Mutations': len(results['resistance']),
        })
    summary_df = pd.DataFrame(summary_rows)

    def highlight_resistance(row):
        # Shade every cell of a row whose gene has at least one hit.
        cell_style = 'background-color: yellow' if row['Resistance Mutations'] > 0 else ''
        return [cell_style for _ in row]

    styled_summary = summary_df.style.apply(highlight_resistance, axis=1)
    st.subheader("Summary of Analysis")
    st.dataframe(styled_summary)

    # --- Per-gene detail --------------------------------------------------
    for gene, results in all_results.items():
        info = RESISTANCE_GENES[gene]
        found = results['mutation_data']
        st.subheader(f"{gene} Analysis")
        st.write(f"**Drug**: {info['drug']}")
        st.write(f"**Codon-level differences**: {len(found['codon_diffs'])}")
        st.write(f"**Nucleotide-level differences**: {len(found['nt_diffs'])}")
        if not results['resistance']:
            st.info(f"No known resistance mutations found in {gene}")
        else:
            st.warning(f"Potential resistance mutations found in {gene}")
            st.dataframe(pd.DataFrame(results['resistance']))

    # --- CSV report -------------------------------------------------------
    st.markdown("### Download Complete Analysis")
    st.write("Download a CSV file with all mutation and resistance data.")

    report_rows = []
    for gene, results in all_results.items():
        # Columns shared by every row of this gene; they come first so the
        # CSV column order is Gene, Drug, Type, ...
        shared = {'Gene': gene, 'Drug': RESISTANCE_GENES[gene]['drug']}
        found = results['mutation_data']
        report_rows.extend(
            {
                **shared,
                'Type': 'Codon_diff',
                'Codon Number': diff['codon_number'],
                'Reference AA': diff['ref_aa'],
                'Query AA': diff['query_aa'],
            }
            for diff in found['codon_diffs']
        )
        report_rows.extend(
            {
                **shared,
                'Type': 'Nucleotide_diff',
                'Genome Position': diff['genome_pos'],
                'Reference Base': diff['ref_base'],
                'Query Base': diff['query_base'],
            }
            for diff in found['nt_diffs']
        )
        report_rows.extend(
            {
                **shared,
                'Type': 'Resistance',
                'Position': hit['position'],
                'Change': hit['change'],
                'Frequency': hit['frequency'],
                'Confidence': hit['confidence'],
            }
            for hit in results['resistance']
        )

    csv = pd.DataFrame(report_rows).to_csv(index=False)
    st.download_button(
        label="Download Full Report (CSV)",
        data=csv,
        file_name="mtb_analysis_report.csv",
        mime="text/csv"
    )
|
|
|
# Build the page only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()