Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,145 +1,146 @@
|
|
1 |
-
# β
Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback
|
2 |
-
|
3 |
-
import gradio as gr
|
4 |
-
from collections import Counter
|
5 |
-
import csv
|
6 |
-
import os
|
7 |
-
from functools import lru_cache
|
8 |
-
from mtdna_classifier import classify_sample_location
|
9 |
-
import subprocess
|
10 |
-
|
11 |
-
import shutil
|
12 |
-
|
13 |
-
if not shutil.which("esummary"):
|
14 |
-
subprocess.run(
|
15 |
-
'yes | sh -c "$(wget -q https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh -O -)"',
|
16 |
-
shell=True
|
17 |
-
)
|
18 |
-
os.environ["PATH"] += f":{os.environ['HOME']}/edirect"
|
19 |
-
print("EDirect esummary found at:", shutil.which("esummary"))
|
20 |
-
|
21 |
-
|
22 |
-
@lru_cache(maxsize=128)
|
23 |
-
def classify_sample_location_cached(accession):
|
24 |
-
return classify_sample_location(accession)
|
25 |
-
|
26 |
-
# Count and suggest final location
|
27 |
-
def compute_final_suggested_location(rows):
|
28 |
-
candidates = [
|
29 |
-
row.get("Predicted Location", "").strip()
|
30 |
-
for row in rows
|
31 |
-
if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found"]
|
32 |
-
] + [
|
33 |
-
row.get("Inferred Region", "").strip()
|
34 |
-
for row in rows
|
35 |
-
if row.get("Inferred Region", "").strip().lower() not in ["","unknown"]
|
36 |
-
]
|
37 |
-
|
38 |
-
if not candidates:
|
39 |
-
return Counter(), ("Unknown", 0)
|
40 |
-
|
41 |
-
counts = Counter(candidates)
|
42 |
-
top_location, count = counts.most_common(1)[0]
|
43 |
-
return counts, (top_location, count)
|
44 |
-
|
45 |
-
# Store feedback (with required fields)
|
46 |
-
def store_feedback_to_drive(accession, answer1, answer2, contact=""):
|
47 |
-
if not answer1.strip() or not answer2.strip():
|
48 |
-
return "β οΈ Please answer both questions before submitting."
|
49 |
-
|
50 |
-
feedback_file = "data/user_fb/feedback_mtdna.csv"
|
51 |
-
header = ["accession", "helpful", "improvement", "contact"]
|
52 |
-
row = [accession, answer1, answer2, contact]
|
53 |
-
file_exists = os.path.isfile(feedback_file)
|
54 |
-
with open(feedback_file, "a", newline="") as f:
|
55 |
-
writer = csv.writer(f)
|
56 |
-
if not file_exists:
|
57 |
-
writer.writerow(header)
|
58 |
-
writer.writerow(row)
|
59 |
-
return "β
Feedback submitted. Thank you!"
|
60 |
-
|
61 |
-
def summarize_results(accession):
|
62 |
-
try:
|
63 |
-
output = classify_sample_location_cached(accession)
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
"
|
89 |
-
"
|
90 |
-
"
|
91 |
-
"
|
92 |
-
"
|
93 |
-
"
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
summary_lines
|
102 |
-
summary_lines
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
gr.Markdown("
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
gr.Markdown("
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
run_button.click(fn=
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
|
|
|
1 |
+
# ✅ Optimized mtDNA MVP UI with Faster Pipeline & Required Feedback
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
from collections import Counter
|
5 |
+
import csv
|
6 |
+
import os
|
7 |
+
from functools import lru_cache
|
8 |
+
from mtdna_classifier import classify_sample_location
|
9 |
+
import subprocess
|
10 |
+
|
11 |
+
import shutil
|
12 |
+
|
13 |
+
# Bootstrap NCBI EDirect when the `esummary` command is not already on PATH.
if not shutil.which("esummary"):
    # `yes` feeds affirmative answers to whatever the installer asks on stdin.
    # NOTE(review): this downloads and executes a remote script at import
    # time -- confirm that is acceptable for the deployment environment.
    install = subprocess.run(
        'yes | sh -c "$(wget -q https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh -O -)"',
        shell=True
    )
    if install.returncode != 0:
        # Surface a failed install instead of continuing silently.
        print("EDirect installer exited with status", install.returncode)

    # expanduser("~") honours $HOME when it is set but does not raise when it
    # is missing; os.pathsep / .get() avoid a KeyError on an unset PATH and a
    # hard-coded ':' separator.
    edirect_dir = os.path.join(os.path.expanduser("~"), "edirect")
    current_path = os.environ.get("PATH", "")
    os.environ["PATH"] = (
        f"{current_path}{os.pathsep}{edirect_dir}" if current_path else edirect_dir
    )
print("EDirect esummary found at:", shutil.which("esummary"))
|
20 |
+
|
21 |
+
|
22 |
+
@lru_cache(maxsize=128)
def classify_sample_location_cached(accession):
    """Memoized front for ``classify_sample_location``.

    The LRU cache keeps results for up to 128 distinct ``accession`` values,
    so a repeated lookup of the same accession returns the stored result
    instead of invoking the classifier again.
    """
    result = classify_sample_location(accession)
    return result
|
25 |
+
|
26 |
+
# Count and suggest final location
|
27 |
+
def _collect_locations(rows, key, ignored):
    """Return the stripped, non-placeholder values stored under ``key`` in ``rows``."""
    collected = []
    for row in rows:
        raw = row.get(key)
        # A missing key or an explicit None both count as "no value".
        text = "" if raw is None else str(raw).strip()
        if text and text.lower() not in ignored:
            collected.append(text)
    return collected


def compute_final_suggested_location(rows):
    """Count usable location values across result rows and pick the most frequent.

    Args:
        rows: iterable of dicts. The "Predicted Location" and
            "Inferred Region" entries of each dict are considered; empty
            values and the placeholders "sample id not found" / "unknown"
            (case-insensitive) are ignored. ``None`` or non-string values
            are tolerated.

    Returns:
        A tuple ``(counts, (top_location, count))`` where ``counts`` is a
        ``Counter`` of every usable value. When nothing usable is found the
        result is ``(Counter(), ("Unknown", 0))``.
    """
    # Materialize once so a one-shot iterable can be scanned for both columns.
    rows = list(rows)

    # Predicted locations are gathered before inferred regions; Counter keeps
    # insertion order, so ties in most_common() resolve in that order.
    candidates = (
        _collect_locations(rows, "Predicted Location", ("sample id not found",))
        + _collect_locations(rows, "Inferred Region", ("unknown",))
    )

    if not candidates:
        return Counter(), ("Unknown", 0)

    counts = Counter(candidates)
    top_location, count = counts.most_common(1)[0]
    return counts, (top_location, count)
|
44 |
+
|
45 |
+
# Store feedback (with required fields)
|
46 |
+
def store_feedback_to_drive(accession, answer1, answer2, contact=""):
    """Append one feedback record to the local feedback CSV.

    Both answers are required: if either is missing or only whitespace,
    nothing is written and a warning message is returned instead.

    Args:
        accession: accession the feedback refers to.
        answer1: answer to the first question (stored in the "helpful" column).
        answer2: answer to the second question (stored in the "improvement" column).
        contact: optional contact details.

    Returns:
        A status message for display in the UI.
    """
    # NOTE(review): the emoji in the two messages were restored from a
    # mis-decoded copy of this file -- confirm the glyphs.
    # `or ""` guards against None answers, which have no .strip().
    if not (answer1 or "").strip() or not (answer2 or "").strip():
        return "⚠️ Please answer both questions before submitting."

    feedback_file = "data/user_fb/feedback_mtdna.csv"
    header = ["accession", "helpful", "improvement", "contact"]
    row = [accession, answer1, answer2, contact]

    # open(..., "a") creates the file but not its parent directory.
    os.makedirs(os.path.dirname(feedback_file), exist_ok=True)

    # An existing but empty file still needs its header row.
    needs_header = (
        not os.path.isfile(feedback_file) or os.path.getsize(feedback_file) == 0
    )
    # Explicit UTF-8 so free-text feedback does not depend on the host locale.
    with open(feedback_file, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(header)
        writer.writerow(row)
    return "✅ Feedback submitted. Thank you!"
|
60 |
+
|
61 |
+
def _build_result_row(sample_id_label, technique, content):
    """Build one display row (an ordered dict of the table columns) for a technique."""
    is_haplogroup = technique == "haplogroup"
    source = content.get("source", "")
    # `or ""` keeps the slice safe when the snippet is present but None.
    context = (content.get("context_snippet") or "")[:300]
    # Key order is significant: callers turn the values into a table row.
    return {
        "Sample ID": sample_id_label,
        "Technique": technique,
        "Source": f"The region of haplogroup is inferred\nby using this source: {source}" if is_haplogroup else source,
        "Predicted Location": "" if is_haplogroup else content.get("predicted_location", ""),
        "Haplogroup": content.get("haplogroup", "") if is_haplogroup else "",
        "Inferred Region": content.get("inferred_location", "") if is_haplogroup else "",
        "Context Snippet": context,
    }


def summarize_results(accession):
    """Classify an accession and format the result as table rows plus a Markdown summary.

    Args:
        accession: accession number as typed by the user; surrounding
            whitespace is ignored.

    Returns:
        ``(rows, summary)``. ``rows`` is a list of lists with the columns
        Sample ID, Technique, Source, Predicted Location, Haplogroup,
        Inferred Region, Context Snippet. ``summary`` is a Markdown string
        with per-location counts and the final suggested location. If the
        classifier raises, or its result does not contain the accession,
        ``rows`` is empty and ``summary`` carries the error message.
    """
    # NOTE(review): the emoji in the messages below were restored from a
    # mis-decoded copy of this file -- confirm the glyphs.
    if isinstance(accession, str):
        # Pasted accessions often carry stray whitespace that would miss the lookup.
        accession = accession.strip()

    try:
        output = classify_sample_location_cached(accession)
        print(output)
    except Exception as e:
        # UI boundary: report the failure instead of crashing the handler.
        return [], f"❌ Error: {e}"

    # A None / non-mapping result is treated the same as a missing accession.
    if not isinstance(output, dict) or accession not in output:
        return [], "❌ Accession not found in results."

    # The first key other than the accession is shown as its isolate.
    isolate = next((k for k in output if k != accession), None)
    row_score = []
    rows = []

    for key in [accession, isolate]:
        if key not in output:
            continue
        sample_id_label = f"{key} ({'accession number' if key == accession else 'isolate of accession'})"
        for techniques in output[key].values():
            for technique, content in techniques.items():
                row = _build_result_row(sample_id_label, technique, content)
                row_score.append(row)
                rows.append(list(row.values()))

    location_counts, (final_location, count) = compute_final_suggested_location(row_score)
    summary_lines = ["### 🧠 Location Frequency Summary", "After counting all predicted and inferred locations:\n"]
    summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
    summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
    summary = "\n".join(summary_lines)

    return rows, summary
|
107 |
+
# NOTE(review): runs a full classification for the example accession at
# import time, before the UI starts -- looks like a smoke test / cache
# warm-up; confirm it is intended.
print(summarize_results("KU131308"))
# Gradio UI
# NOTE(review): the emoji in the labels below were restored from a
# mis-decoded copy of this file -- confirm the glyphs.
with gr.Blocks() as interface:
    gr.Markdown("# 🧬 mtDNA Location Classifier (MVP)")
    gr.Markdown("Enter an accession number to infer geographic origin. You'll see predictions, confidence scores, and can submit feedback.")

    with gr.Row():
        accession = gr.Textbox(label="Enter Accession Number (e.g., KU131308)")
        run_button = gr.Button("🚀 Submit and Classify")
        reset_button = gr.Button("🔄 Reset")

    # Hidden until a classification is started; used as the "please wait" banner.
    status = gr.Markdown(visible=False)
    headers = ["Sample ID", "Technique", "Source", "Predicted Location", "Haplogroup", "Inferred Region", "Context Snippet"]
    output_table = gr.Dataframe(headers=headers, interactive=False)
    output_summary = gr.Markdown()

    gr.Markdown("---")
    gr.Markdown("### 💬 Feedback (required)")
    q1 = gr.Textbox(label="1️⃣ Was the inferred location accurate or helpful? Please explain.")
    q2 = gr.Textbox(label="2️⃣ What would improve your experience with this tool?")
    contact = gr.Textbox(label="📧 Your email or institution (optional)")
    submit_feedback = gr.Button("✅ Submit Feedback")
    feedback_status = gr.Markdown()

    def classify_with_loading(accession):
        """Show the "please wait" banner."""
        return gr.update(value="⏳ Please wait... processing...", visible=True)

    def classify_main(accession):
        """Run the classification, fill the table and summary, hide the banner."""
        table, summary = summarize_results(accession)
        return table, summary, gr.update(visible=False)

    def reset_fields():
        """Clear every input and output and hide the banner.

        The eight return values line up with the ``outputs`` list wired to
        ``reset_button`` below.
        """
        return "", "", "", "", "", [], "", gr.update(visible=False)

    # Chain the two steps so the banner is always shown before the
    # classification runs and hidden after it finishes; two independent
    # click handlers give no ordering guarantee.
    run_button.click(
        fn=classify_with_loading, inputs=accession, outputs=status
    ).then(
        fn=classify_main, inputs=accession, outputs=[output_table, output_summary, status]
    )
    submit_feedback.click(fn=store_feedback_to_drive, inputs=[accession, q1, q2, contact], outputs=feedback_status)
    reset_button.click(fn=reset_fields, inputs=[], outputs=[accession, q1, q2, contact, feedback_status, output_table, output_summary, status])

interface.launch(share=True)
|