Files changed (1) hide show
  1. app.py +161 -81
app.py CHANGED
@@ -1,8 +1,6 @@
1
  # --- Imports ---
2
- import spaces
3
  import gradio as gr
4
  from transformers import pipeline
5
- import os
6
 
7
  # --- Load Model ---
8
  pipe = pipeline(model="InstaDeepAI/ChatNT", trust_remote_code=True)
@@ -10,84 +8,112 @@ pipe = pipeline(model="InstaDeepAI/ChatNT", trust_remote_code=True)
10
  # --- Logs ---
11
  log_file = "logs.txt"
12
 
13
- class Log:
14
- def __init__(self, log_file):
15
- self.log_file = log_file
16
 
17
- def __call__(self):
18
- if not os.path.exists(self.log_file):
19
- return ""
20
- with open(self.log_file, "r") as f:
21
- return f.read()
22
-
23
- # --- Main Function ---
24
- @spaces.GPU
25
- def run_chatnt(dna_text, fasta_file, custom_question):
26
  with open(log_file, "a") as log:
27
- log.write("Request started\n\n")
 
28
 
29
- # Read DNA sequence from text field or file
 
 
 
 
 
 
 
30
  dna_sequence = ""
 
 
 
 
31
  if dna_text and dna_text.strip():
32
  dna_sequence = dna_text.strip().replace("\n", "")
33
- elif fasta_file is not None:
34
- with open(fasta_file.name, "r") as f:
35
- file_content = f.read()
36
- lines = file_content.splitlines()
37
- sequence = ""
38
- for line in lines:
39
- line = line.strip()
40
- if not line or line.startswith(">"):
41
- continue
42
- sequence += line
43
- dna_sequence = sequence
44
-
45
- dna_sequences = []
46
- if dna_sequence:
47
- dna_sequences.append(dna_sequence)
48
 
49
- with open(log_file, "a") as log:
50
- log.write(f"DNA sequences found: {dna_sequences}\n")
 
 
51
 
52
- # Check DNA sequences count
53
- if len(dna_sequences) > 1:
54
- return "You must use only one DNA sequence."
55
 
56
- if not custom_question or custom_question.strip() == "":
57
- return "Please provide a question."
 
58
 
59
- # Build prompt
60
- num_placeholders = custom_question.count("<DNA>")
 
 
 
61
 
62
- if len(dna_sequences) == 0:
63
- english_sequence = custom_question
64
- else:
65
- if num_placeholders == 0:
66
- return "Your question must include the <DNA> token at the position where the DNA sequence should be inserted."
67
- elif num_placeholders == 1:
68
- english_sequence = custom_question
69
- else:
70
- return "You can only provide one DNA sequence, so you must use exactly one <DNA> placeholder."
71
 
72
- with open(log_file, "a") as log:
73
- log.write(f"Initial user question: {custom_question}\n")
74
- log.write(f"Full english prompt: {english_sequence}\n")
75
- log.write("Calling model\n")
76
-
77
- output = pipe(
78
- inputs={
79
- "english_sequence": english_sequence,
80
- "dna_sequences": dna_sequences
81
- }
82
- )
83
 
84
- if len(dna_sequences) == 0:
85
- return f"{output}\n\nNote: Careful, you did not provide any DNA sequence."
 
86
 
87
- with open(log_file, "a") as log:
88
- log.write(f"Output: {output}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- return output
91
 
92
  # --- Gradio Interface ---
93
  css = """
@@ -96,50 +122,104 @@ css = """
96
  footer { display: none !important; }
97
  """
98
 
 
 
 
99
  with gr.Blocks(css=css) as demo:
100
  gr.Markdown("# 🧬 ChatNT: A Multimodal Conversational Agent for DNA, RNA and Protein Tasks")
 
 
 
 
 
 
101
 
102
  with gr.Row():
103
  with gr.Column(scale=1):
104
  dna_text = gr.Textbox(
105
  label="Paste your DNA sequence",
106
- placeholder="ATGCATGCATGC...",
107
  lines=4
108
  )
109
 
110
  fasta_file = gr.File(
111
  label="Or upload your FASTA file",
112
- file_types=[".fasta", ".fa", ".txt"]
 
113
  )
114
 
115
  custom_question = gr.Textbox(
116
  label="English Question",
117
- placeholder="e.g., Does this sequence <DNA> contain a donor splice site?"
118
  )
119
 
 
120
  submit_btn = gr.Button("Run Query", variant="primary")
121
 
122
- with gr.Row():
123
- output = gr.Textbox(label="Answer", lines=6)
 
 
 
 
 
 
 
124
 
125
  submit_btn.click(
126
  run_chatnt,
127
  inputs=[dna_text, fasta_file, custom_question],
128
- outputs=output,
129
  )
130
 
131
- gr.Markdown("""
132
- **Note:**
133
- ✅ You must use **exactly one DNA sequence** (either paste it or upload a file).
134
- Your question must include the `<DNA>` token **exactly once** at the position where the DNA will be inserted.
135
- Example: *"Does this sequence `<DNA>` contain a donor splice site?"*
136
- """)
 
 
 
 
 
 
 
 
 
 
 
137
 
138
- with gr.Accordion("Logs", open=True):
139
- log_display = Log(log_file)
140
- gr.Markdown(log_display)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
- # --- Launch ---
143
  if __name__ == "__main__":
144
  demo.queue()
145
  demo.launch(debug=True, show_error=True)
 
1
  # --- Imports ---
 
2
  import gradio as gr
3
  from transformers import pipeline
 
4
 
5
  # --- Load Model ---
6
  pipe = pipeline(model="InstaDeepAI/ChatNT", trust_remote_code=True)
 
8
  # --- Logs ---
9
  log_file = "logs.txt"
10
 
 
 
 
11
 
12
def log_message(message: str) -> None:
    """Append ``message`` as a single line to the application log file.

    The destination is the module-level ``log_file`` path. The file is opened
    in append mode with an explicit UTF-8 encoding so that messages containing
    non-ASCII characters are written identically on every platform instead of
    depending on the locale's default encoding.

    Args:
        message: Text to record; a trailing newline is added automatically.
    """
    with open(log_file, "a", encoding="utf-8") as log:
        log.write(f"{message}\n")
15
+
16
 
17
+ # --- Utilities ---
18
def read_dna_sequence(dna_text, fasta_file):
    """Extract one upper-cased DNA sequence from pasted text or a FASTA upload.

    Args:
        dna_text: Sequence pasted by the user; may be ``None`` or empty.
        fasta_file: Uploaded FASTA file, or ``None``. Accepts either an object
            exposing the file path as ``.name`` or a plain path string. When
            provided, it takes precedence over ``dna_text``.

    Returns:
        A ``(dna_sequence, warning, error)`` tuple of strings.
        ``dna_sequence`` is empty when nothing usable was supplied or when
        ``error`` is set. ``warning`` holds newline-separated, non-fatal
        notes. ``error`` is empty on success.
    """
    warnings = []

    # Pasted text: remove *all* whitespace (newlines, "\r", spaces, tabs).
    dna_sequence = "".join(dna_text.split()) if dna_text else ""

    # An uploaded FASTA file overrides the pasted text.
    if fasta_file is not None:
        if dna_sequence:
            warnings.append(
                "Warning: Both pasted DNA and FASTA file provided. Using file only."
            )

        # Support both file-like wrappers (path in ``.name``) and plain paths.
        path = getattr(fasta_file, "name", fasta_file)
        try:
            # "utf-8-sig" transparently drops a leading byte-order mark.
            with open(path, "r", encoding="utf-8-sig") as f:
                content = f.read()
        except (OSError, ValueError, TypeError):
            # ValueError also covers UnicodeDecodeError. Never fall back to the
            # pasted text here: the user asked for the file.
            return "", "\n".join(warnings), "Could not read the FASTA file."

        # Strip before inspecting lines so indented headers and
        # whitespace-only lines are handled like their trimmed forms.
        lines = [line.strip() for line in content.splitlines()]
        lines = [line for line in lines if line]

        if not lines or not lines[0].startswith(">"):
            return (
                "",
                "\n".join(warnings),
                "Invalid FASTA: must start with '>' header line.",
            )

        header_count = sum(1 for line in lines if line.startswith(">"))
        if header_count > 1:
            warnings.append(
                "Warning: FASTA file contains multiple records. "
                "Their sequences were concatenated."
            )

        dna_sequence = "".join(
            "".join(line.split()) for line in lines if not line.startswith(">")
        )

    # Compare against the upper-cased form rather than using ``isupper()``,
    # which is False for strings without any cased character.
    upper_sequence = dna_sequence.upper()
    if upper_sequence != dna_sequence:
        dna_sequence = upper_sequence
        warnings.append("Note: DNA sequence was converted to uppercase.")

    return dna_sequence, "\n".join(warnings), ""
62
+
63
+
64
def validate_inputs(dna_sequence, custom_question):
    """Check that the question and the DNA sequence are mutually consistent.

    Args:
        dna_sequence: The sequence to send to the model, or an empty value
            when the user supplied none.
        custom_question: The user's question; may be ``None`` or blank.

    Returns:
        A ``(valid, error)`` tuple. ``valid`` is ``True`` with an empty
        ``error`` when the inputs can be sent to the model; otherwise
        ``valid`` is ``False`` and ``error`` is the message to show the user.
    """
    # Reject a missing/blank question before counting tokens: calling
    # ``.count`` on ``None`` would raise instead of reporting the problem.
    if not custom_question or not custom_question.strip():
        return False, "Please provide a question."

    placeholders = custom_question.count("<DNA>")

    if dna_sequence and placeholders == 0:
        log_message("Error: DNA sequence provided but no <DNA> token.")
        return False, "Your question must contain the <DNA> token if you provide a DNA sequence."

    if not dna_sequence and placeholders == 1:
        log_message("Error: <DNA> token but no sequence.")
        return False, "You must provide a DNA sequence if you use the <DNA> token."

    if placeholders > 1:
        return False, "Only one <DNA> token is allowed."

    return True, ""
87
+
88
+
89
+ # --- Main Inference ---
90
def run_chatnt(dna_text, fasta_file, custom_question):
    """Resolve the DNA input, validate the request and query the model.

    Args:
        dna_text: DNA sequence pasted by the user.
        fasta_file: Uploaded FASTA file, or ``None``.
        custom_question: The English question forwarded to the model.

    Returns:
        A ``(result, feedback)`` pair. On a read or validation failure
        ``result`` is ``""`` and ``feedback`` is the error message; otherwise
        ``result`` is whatever ``pipe`` returned and ``feedback`` is the
        warning from reading the sequence (empty when there is none).
    """
    sequence, notice, read_error = read_dna_sequence(dna_text, fasta_file)
    if read_error:
        return "", read_error

    ok, problem = validate_inputs(sequence, custom_question)
    if not ok:
        return "", problem

    # The model takes a list of sequences; send none when nothing was given.
    dna_sequences = []
    if sequence:
        dna_sequences.append(sequence)

    payload = {
        "english_sequence": custom_question,
        "dna_sequences": dna_sequences,
    }
    answer = pipe(inputs=payload)

    feedback = notice if notice else ""
    return answer, feedback
116
 
 
117
 
118
  # --- Gradio Interface ---
119
  css = """
 
122
  footer { display: none !important; }
123
  """
124
 
125
# Values loaded into the form by the "Use Example" button.
example_dna = "ATGCATGCATGCATGC"
example_question = "Does this sequence <DNA> contain a donor splice site?"

with gr.Blocks(css=css) as demo:
    gr.Markdown("# 🧬 ChatNT: A Multimodal Conversational Agent for DNA, RNA and Protein Tasks")
    # Markdown inline links are written [text](url); curly braces around the
    # URL are rendered as literal text instead of a link.
    gr.Markdown(
        "[ChatNT](https://www.nature.com/articles/s42256-025-01047-1) is the first multimodal conversational agent designed with a deep understanding of biological sequences (DNA, RNA, proteins). It enables users — even those with no coding background — to interact with biological data through natural language and it generalizes across multiple biological tasks and modalities.\n"
        "This Hugging Face Space is powered by a [ZeroGPU](https://huggingface.co/docs/hub/en/spaces-zerogpu), which is free but **limited to 5 minutes per day per user**.\n"
    )

    gr.Image("https://media.springernature.com/w440/springer-static/cover-hires/journal/42256/7/6")

    with gr.Row():
        # Left column: user inputs and action buttons.
        with gr.Column(scale=1):
            dna_text = gr.Textbox(
                label="Paste your DNA sequence",
                placeholder="ATGCATGC...",
                lines=4
            )

            fasta_file = gr.File(
                label="Or upload your FASTA file",
                file_types=[".fasta", ".fa", ".txt"],
                height=50
            )

            custom_question = gr.Textbox(
                label="English Question",
                placeholder="Does this sequence <DNA> contain a donor splice site?"
            )

            use_example = gr.Button("Use Example")
            submit_btn = gr.Button("Run Query", variant="primary")

        # Right column: model answer plus warnings/errors from the run.
        with gr.Column(scale=1):
            output = gr.Textbox(
                label="Model Answer",
                lines=12
            )
            error_box = gr.Textbox(
                label="Execution Feedback",
                lines=4
            )

    # run_chatnt returns (answer, feedback), matching the two output boxes.
    submit_btn.click(
        run_chatnt,
        inputs=[dna_text, fasta_file, custom_question],
        outputs=[output, error_box],
    )

    # Fill the form with the example and clear any uploaded file.
    use_example.click(
        lambda: (example_dna, None, example_question),
        inputs=[],
        outputs=[dna_text, fasta_file, custom_question]
    )

    gr.Markdown(
        """
    You must use **exactly one `<DNA>` token** if you want the model to see your sequence. It is also possible to use the model without any DNA sequence (in this case, the `<DNA>` token must not be present in the question).
    You can either paste a sequence or upload a FASTA file.

    ---

    ### ✅ Good queries
    - "Does this sequence `<DNA>` contain a donor splice site?"
    - "Is it possible for you to identify whether there's a substantial presence of H3 histone protein occupancy in the nucleotide sequence `<DNA>` in yeast?"
    - "Determine the degradation rate of the mouse RNA sequence `<DNA>` within the -5 to 5 range."

    ### What will not work properly
    - "What is the length of this sequence `<DNA>`?"

    For more examples, you can refer to the [training dataset](https://huggingface.co/datasets/InstaDeepAI/ChatNT_training_data).
    """
    )

    gr.Markdown("""
    ### 📚 Citation

    If you use **ChatNT**, please cite:

    ```bibtex
    @article{deAlmeida2025,
      title     = {A multimodal conversational agent for DNA, RNA and protein tasks},
      author    = {de Almeida, Bernardo P. and Richard, Guillaume and Dalla-Torre, Hugo and Blum, Christopher and Hexemer, Lorenz and Pandey, Priyanka and Laurent, Stefan and Rajesh, Chandana and Lopez, Marie and Laterre, Alexandre and Lang, Maren and Şahin, Uğur and Beguir, Karim and Pierrot, Thomas},
      journal   = {Nature Machine Intelligence},
      year      = {2025},
      volume    = {7},
      number    = {6},
      pages     = {928--941},
      doi       = {10.1038/s42256-025-01047-1},
      url       = {https://doi.org/10.1038/s42256-025-01047-1},
      issn      = {2522-5839}
    }
    ```
    """,
        show_copy_button=True
    )


# --- Launch ---
if __name__ == "__main__":
    demo.queue()
    demo.launch(debug=True, show_error=True)