lkjjj26 committed on
Commit
d8b5f68
·
1 Parent(s): e2ddb45

upload files

Browse files
Files changed (3) hide show
  1. Dockerfile +3 -2
  2. app.py +386 -142
  3. requirements.txt +7 -4
Dockerfile CHANGED
@@ -8,6 +8,7 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
8
 
9
  COPY . .
10
 
11
- EXPOSE 7860
 
 
12
 
13
- CMD ["shiny", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]
 
8
 
9
  COPY . .
10
 
11
+ EXPOSE 8000
12
+
13
+ CMD ["python", "app.py"]
14
 
 
app.py CHANGED
@@ -1,162 +1,406 @@
1
- import faicons as fa
2
- import plotly.express as px
 
 
 
 
 
 
 
 
 
 
 
3
 
4
- # Load data and compute static values
5
- from shared import app_dir, tips
6
- from shinywidgets import render_plotly
7
 
8
- from shiny import reactive, render
9
- from shiny.express import input, ui
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- bill_rng = (min(tips.total_bill), max(tips.total_bill))
 
 
 
 
12
 
13
- # Add page title and sidebar
14
- ui.page_opts(title="Restaurant tipping", fillable=True)
 
 
 
 
15
 
16
- with ui.sidebar(open="desktop"):
17
- ui.input_slider(
18
- "total_bill",
19
- "Bill amount",
20
- min=bill_rng[0],
21
- max=bill_rng[1],
22
- value=bill_rng,
23
- pre="$",
24
- )
25
- ui.input_checkbox_group(
26
- "time",
27
- "Food service",
28
- ["Lunch", "Dinner"],
29
- selected=["Lunch", "Dinner"],
30
- inline=True,
31
- )
32
- ui.input_action_button("reset", "Reset filter")
33
-
34
- # Add main content
35
- ICONS = {
36
- "user": fa.icon_svg("user", "regular"),
37
- "wallet": fa.icon_svg("wallet"),
38
- "currency-dollar": fa.icon_svg("dollar-sign"),
39
- "ellipsis": fa.icon_svg("ellipsis"),
40
- }
41
-
42
- with ui.layout_columns(fill=False):
43
- with ui.value_box(showcase=ICONS["user"]):
44
- "Total tippers"
45
-
46
- @render.express
47
- def total_tippers():
48
- tips_data().shape[0]
49
-
50
- with ui.value_box(showcase=ICONS["wallet"]):
51
- "Average tip"
52
-
53
- @render.express
54
- def average_tip():
55
- d = tips_data()
56
- if d.shape[0] > 0:
57
- perc = d.tip / d.total_bill
58
- f"{perc.mean():.1%}"
59
-
60
- with ui.value_box(showcase=ICONS["currency-dollar"]):
61
- "Average bill"
62
-
63
- @render.express
64
- def average_bill():
65
- d = tips_data()
66
- if d.shape[0] > 0:
67
- bill = d.total_bill.mean()
68
- f"${bill:.2f}"
69
-
70
-
71
- with ui.layout_columns(col_widths=[6, 6, 12]):
72
- with ui.card(full_screen=True):
73
- ui.card_header("Tips data")
74
-
75
- @render.data_frame
76
- def table():
77
- return render.DataGrid(tips_data())
78
-
79
- with ui.card(full_screen=True):
80
- with ui.card_header(class_="d-flex justify-content-between align-items-center"):
81
- "Total bill vs tip"
82
- with ui.popover(title="Add a color variable", placement="top"):
83
- ICONS["ellipsis"]
84
- ui.input_radio_buttons(
85
- "scatter_color",
86
- None,
87
- ["none", "sex", "smoker", "day", "time"],
88
- inline=True,
89
- )
90
 
91
- @render_plotly
92
- def scatterplot():
93
- color = input.scatter_color()
94
- return px.scatter(
95
- tips_data(),
96
- x="total_bill",
97
- y="tip",
98
- color=None if color == "none" else color,
99
- trendline="lowess",
100
- )
101
 
102
- with ui.card(full_screen=True):
103
- with ui.card_header(class_="d-flex justify-content-between align-items-center"):
104
- "Tip percentages"
105
- with ui.popover(title="Add a color variable"):
106
- ICONS["ellipsis"]
107
- ui.input_radio_buttons(
108
- "tip_perc_y",
109
- "Split by:",
110
- ["sex", "smoker", "day", "time"],
111
- selected="day",
112
- inline=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
- @render_plotly
116
- def tip_perc():
117
- from ridgeplot import ridgeplot
118
-
119
- dat = tips_data()
120
- dat["percent"] = dat.tip / dat.total_bill
121
- yvar = input.tip_perc_y()
122
- uvals = dat[yvar].unique()
123
 
124
- samples = [[dat.percent[dat[yvar] == val]] for val in uvals]
125
 
126
- plt = ridgeplot(
127
- samples=samples,
128
- labels=uvals,
129
- bandwidth=0.01,
130
- colorscale="viridis",
131
- colormode="row-index",
132
- )
133
 
134
- plt.update_layout(
135
- legend=dict(
136
- orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5
137
- )
138
- )
139
 
140
- return plt
 
 
 
 
 
 
141
 
 
142
 
143
- ui.include_css(app_dir / "styles.css")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
- # --------------------------------------------------------
146
- # Reactive calculations and effects
147
- # --------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
- @reactive.calc
151
- def tips_data():
152
- bill = input.total_bill()
153
- idx1 = tips.total_bill.between(bill[0], bill[1])
154
- idx2 = tips.time.isin(input.time())
155
- return tips[idx1 & idx2]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
 
157
 
158
- @reactive.effect
159
- @reactive.event(input.reset)
160
- def _():
161
- ui.update_slider("total_bill", value=bill_rng)
162
- ui.update_checkbox_group("time", selected=["Lunch", "Dinner"])
 
1
+ from transformers import pipeline
2
+ from rcsbsearchapi import TextQuery, AttributeQuery, Query
3
+ from rcsbsearchapi.search import Sort, SequenceQuery
4
+ import os
5
+ from dotenv import load_dotenv
6
+ from shiny import App, render, ui, reactive
7
+ import pandas as pd
8
+ import warnings
9
+ import re
10
+ from UniprotKB_P_Sequence_RCSB_API_test import ProteinQuery, ProteinSearchEngine
11
+ import plotly.graph_objects as go
12
+ from shinywidgets import output_widget, render_widget
13
+ warnings.filterwarnings('ignore')
14
 
15
+ # Load environment variables from .env file
16
+ load_dotenv()
 
17
 
18
class PDBSearchAssistant:
    """Turn a free-text request into an RCSB PDB search and execute it.

    A HuggingFace ``text2text-generation`` pipeline is prompted to extract a
    resolution cutoff, a sequence, a PDB ID and an experimental method from
    the user's query.  Those values, combined with keyword heuristics applied
    to the raw query, are converted into ``rcsbsearchapi`` queries which are
    AND-ed together and executed.
    """

    # Shortest sequence accepted for a sequence search.
    MIN_SEQUENCE_LENGTH = 25

    # One-letter codes accepted when looking for a sequence in the query.
    AMINO_ACIDS = frozenset('ACDEFGHIKLMNPQRSTVWY')

    # Words that mark a query as resolution-related.  The value is the
    # comparison direction the word implies, or None when it implies none.
    RESOLUTION_TERMS = {
        'better': 'less',
        'best': 'less',
        'highest': 'less',
        'good': 'less',
        'fine': 'less',
        'worse': 'greater',
        'worst': 'greater',
        'lowest': 'greater',
        'poor': 'greater',
        'resolution': None,
        'å': None,
        'angstrom': None,
        'angstroms': None,
        'than': None,
        'under': 'less',
        'below': 'less',
        'above': 'greater',
        'over': 'greater',
    }

    # "Label: value" pairs in the model output.  The lookahead lets the
    # fields be separated by newlines or simply follow each other on one line.
    _FIELD_RE = re.compile(
        r'(Resolution|Sequence|PDB_ID|Method):\s*(.*?)\s*'
        r'(?=(?:Resolution|Sequence|PDB_ID|Method):|$)',
        re.DOTALL,
    )

    # A number that carries an explicit angstrom unit (lower-cased input).
    _UNIT_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?\s*(?:å|angstroms?)\b')

    # A free-standing number; digits glued to letters or hyphens (e.g. the
    # "1" in "hiv-1") are part of a name and must survive cleaning.
    _STANDALONE_NUMBER_RE = re.compile(r'(?<![\w.-])\d+(?:\.\d+)?(?![\w-])')

    # Whole-word matcher for every resolution term.
    _TERM_RE = re.compile(
        r'\b(?:' + '|'.join(map(re.escape, RESOLUTION_TERMS)) + r')\b'
    )

    def __init__(self, model_name="google/flan-t5-large"):
        """Load the text2text pipeline and prepare the extraction prompt.

        Args:
            model_name: HuggingFace model identifier passed to ``pipeline``.
        """
        # Set up HuggingFace pipeline with better model
        self.pipe = pipeline(
            "text2text-generation",
            model=model_name,
            max_new_tokens=512,
            temperature=0.3,
            torch_dtype="auto",
            device="cpu"
        )

        self.prompt_template = """
        Extract specific search parameters from the query, if present:
        1. Resolution cutoff (in Å)
        2. Sequence information
        3. Specific PDB ID
        4. Experimental method (X-RAY, EM, NMR)

        Format:
        Resolution: [maximum resolution in Å, if mentioned]
        Sequence: [any sequence mentioned]
        PDB_ID: [specific PDB ID if mentioned]
        Method: [experimental method if mentioned]

        Examples:
        Query: "Find X-ray structures better than 2.5Å resolution"
        Resolution: 2.5
        Sequence: none
        PDB_ID: none
        Method: X-RAY

        Query: "Show me NMR structures of kinases"
        Resolution: none
        Sequence: none
        PDB_ID: none
        Method: NMR

        Now analyze:
        Query: {query}
        """

    @classmethod
    def _detect_resolution(cls, query):
        """Return ``(has_resolution_query, direction)`` for the raw query.

        The query counts as resolution-related when it contains one of
        ``RESOLUTION_TERMS`` as a whole word, or a number with an angstrom
        unit.  ``direction`` is ``"less"`` unless a term implying
        ``"greater"`` is present; as in the term table's order, the last
        matching directional term wins.
        """
        query_lower = query.lower()
        has_resolution_query = bool(cls._UNIT_NUMBER_RE.search(query_lower))
        direction = "less"
        for term, term_direction in cls.RESOLUTION_TERMS.items():
            # Whole-word match so "than" does not fire inside "ethanol".
            if re.search(rf'\b{re.escape(term)}\b', query_lower):
                has_resolution_query = True
                if term_direction:
                    direction = term_direction
        return has_resolution_query, direction

    @classmethod
    def _parse_llm_response(cls, response, has_resolution_query):
        """Extract the four labelled fields from the model output.

        Returns a dict with keys ``resolution`` (float), ``sequence``,
        ``pdb_id`` and ``method`` (upper-cased); a field is None when it is
        absent or reported as ``none`` / ``n/a``.  The resolution is only
        accepted when the query itself was resolution-related.
        """
        params = {'resolution': None, 'sequence': None,
                  'pdb_id': None, 'method': None}
        for label, value in cls._FIELD_RE.findall(response):
            if value.lower() in ('', 'none', 'n/a'):
                continue
            if label == 'Resolution':
                number = re.search(r'\d+(?:\.\d+)?', value)
                if number and has_resolution_query:
                    params['resolution'] = float(number.group())
            elif label == 'Method':
                params['method'] = value.upper()
            elif label == 'Sequence':
                params['sequence'] = value
            else:
                params['pdb_id'] = value
        return params

    @classmethod
    def _find_sequence_in_query(cls, query):
        """Return the first word that looks like a protein sequence, or None.

        A word qualifies when it has at least ``MIN_SEQUENCE_LENGTH``
        characters, consists only of amino-acid letters, and more than 80%
        of its characters are upper case.
        """
        for word in query.split():
            if (len(word) >= cls.MIN_SEQUENCE_LENGTH
                    and all(c in cls.AMINO_ACIDS for c in word.upper())
                    and sum(c.isupper() for c in word) / len(word) > 0.8):
                return word
        return None

    @classmethod
    def _clean_text_query(cls, query, has_resolution_query):
        """Lower-case the query and strip resolution numbers and terms.

        Nothing is stripped unless the query was resolution-related.  Only
        whole words and free-standing numbers are removed, so identifiers
        such as "hiv-1" and words such as "recovered" stay intact.
        """
        cleaned = query.lower()
        if has_resolution_query:
            cleaned = cls._UNIT_NUMBER_RE.sub(' ', cleaned)
            cleaned = cls._STANDALONE_NUMBER_RE.sub(' ', cleaned)
            cleaned = cls._TERM_RE.sub(' ', cleaned)
        # Collapse the gaps left by the removals.
        return ' '.join(cleaned.split())

    def search_pdb(self, query):
        """Search the PDB for structures matching a natural-language query.

        Args:
            query: Free-text search request typed by the user.

        Returns:
            A list of ``{'PDB ID': <identifier>}`` dicts.  The list is empty
            when no query could be built or when any step fails; failures
            are printed rather than raised.
        """
        try:
            # Get search parameters from LLM
            formatted_prompt = self.prompt_template.format(query=query)
            response = self.pipe(formatted_prompt)[0]['generated_text']
            print("Generated parameters:", response)

            has_resolution_query, resolution_direction = (
                self._detect_resolution(query)
            )
            params = self._parse_llm_response(response, has_resolution_query)
            resolution_limit = params['resolution']
            pdb_id = params['pdb_id']
            method = params['method']

            # A sequence typed in the query takes precedence over the
            # model's extraction.
            sequence = self._find_sequence_in_query(query) or params['sequence']
            if sequence and len(sequence) < self.MIN_SEQUENCE_LENGTH:
                print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
                # Dropping it here lets the text search below still run.
                sequence = None

            # Build search query
            queries = []

            if sequence:
                print(f"Adding sequence search with identity 100% for sequence: {sequence}")
                queries.append(SequenceQuery(
                    sequence,
                    identity_cutoff=1.0,  # 100% identity
                    evalue_cutoff=1,
                    sequence_type="protein"
                ))
            else:
                clean_query = self._clean_text_query(query, has_resolution_query)
                print("Cleaned query:", clean_query)

                # Add text search if query is not empty
                if clean_query:
                    queries.append(AttributeQuery(
                        attribute="struct.title",
                        operator="contains_phrase",
                        value=clean_query
                    ))

            # Add resolution filter if specified
            if resolution_limit and has_resolution_query:
                operator = "less_or_equal" if resolution_direction == "less" else "greater_or_equal"
                print(f"Adding resolution filter: {operator} {resolution_limit}Å")
                queries.append(AttributeQuery(
                    attribute="rcsb_entry_info.resolution_combined",
                    operator=operator,
                    value=resolution_limit
                ))

            # Add PDB ID search if specified
            if pdb_id:
                print(f"Searching for specific PDB ID: {pdb_id}")
                id_query = AttributeQuery(
                    attribute="rcsb_id",
                    operator="exact_match",
                    value=pdb_id.upper()
                )
                queries = [id_query]  # Override other queries for direct PDB ID search

            # Add experimental method filter if specified
            if method:
                # NOTE(review): the prompt asks for short names (X-RAY, EM,
                # NMR) while exptl.method is matched exactly; confirm these
                # short names are values RCSB actually stores there.
                print(f"Adding experimental method filter: {method}")
                queries.append(AttributeQuery(
                    attribute="exptl.method",
                    operator="exact_match",
                    value=method
                ))

            if not queries:
                return []

            # Combine queries with AND operator
            final_query = queries[0]
            for q in queries[1:]:
                final_query = final_query & q

            print("Final query:", final_query)

            # Execute search
            session = final_query.exec()
            results = []

            try:
                for entry in session:
                    # Entries may be plain ID strings or objects exposing
                    # the ID as ``identifier``.
                    identifier = entry if isinstance(entry, str) else entry.identifier
                    results.append({'PDB ID': identifier})
            except Exception as e:
                # Keep whatever was collected before the failure.
                print(f"Error processing results: {str(e)}")

            print(f"Found {len(results)} structures")
            return results

        except Exception as e:
            print(f"Error during search: {str(e)}")
            print(f"Error type: {type(e)}")
            return []
260
 
261
def pdbsummary(name):
    """Return a numbered plain-text summary of structures found for ``name``.

    Runs a ``ProteinQuery`` (maximum resolution 5.0) through a fresh
    ``ProteinSearchEngine`` and formats, for every hit, its PDB ID,
    resolution, method, title, release date, sequence length and sequence.
    An empty string is returned when the search yields nothing.
    """
    engine = ProteinSearchEngine()
    protein_query = ProteinQuery(name, max_resolution=5.0)
    hits = engine.search(protein_query)

    # Collect one formatted entry per hit and join once at the end.
    entries = []
    for rank, hit in enumerate(hits, 1):
        entries.append(
            f"\n{rank}. PDB ID : {hit.pdb_id}\n"
            f"\nResolution : {hit.resolution:.2f} A \n"
            f"Method : {hit.method}\n Title : {hit.title}\n"
            f"Release Date : {hit.release_date}\n Sequence length: {len(hit.sequence)} aa\n"
            f" Sequence:\n {hit.sequence}\n"
        )
    return "".join(entries)
281
 
282
def table_column_widths(df):
    """Return, per column, the length of its longest cell rendered as text.

    Every cell is converted with ``str`` before measuring, so numeric and
    ``None`` cells are handled as well as strings.
    """
    return [max(len(str(value)) for value in df[col]) for col in df.columns]


def create_interactive_table(df):
    """Render a DataFrame as a Plotly table figure.

    Args:
        df: DataFrame whose columns become table columns.

    Returns:
        A ``go.Figure`` holding a single ``go.Table``; an empty figure when
        ``df`` has no data.
    """
    if df.empty:
        return go.Figure()

    # Create interactive table
    table = go.Figure(data=[go.Table(
        header=dict(
            values=list(df.columns),
            fill_color='paleturquoise',
            align='left',
            font=dict(size=14),
        ),
        cells=dict(
            values=[df[col] for col in df.columns],
            align='left',
            font=dict(size=13),
            height=30
        ),
        # Relative widths follow the longest cell text of each column.
        columnwidth=table_column_widths(df)
    )])

    # Update table layout
    table.update_layout(
        margin=dict(l=0, r=0, t=0, b=0),
        height=400,
        autosize=True
    )

    return table
311
 
312
# Shiny UI: a query box with example queries, a search button, a summary of
# the applied search, and the results table with a download button.
app_ui = ui.page_fluid(
    ui.tags.head(
        # Link styling for anchors rendered inside elements of class "table".
        ui.tags.style("""
            .table a {
                color: #0d6efd;
                text-decoration: none;
            }
            .table a:hover {
                color: #0a58ca;
                text-decoration: underline;
            }
        """)
    ),
    ui.h2("Advanced PDB Structure Search Tool"),
    ui.row(
        ui.column(12,
            ui.input_text("query", "Search Query",
                          value="Human insulin"),
        )
    ),
    ui.row(
        ui.column(12,
            ui.p("Example queries:"),
            ui.tags.ul(
                ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
                ui.tags.li("Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"),
            ),
        )
    ),
    ui.row(
        ui.column(12,
            ui.input_action_button("search", "Search", class_="btn-primary"),
        )
    ),
    ui.row(
        ui.column(12,
            ui.h4("Search Parameters:"),
            ui.output_text("search_conditions"),
        )
    ),
    ui.row(
        ui.column(12,
            # NOTE(review): nothing visible in this file limits the results
            # to ten entries; confirm the heading matches what is shown.
            ui.h4("Top 10 Results:"),
            output_widget("results_table"),
            ui.download_button("download", "Download Results")
        )
    )
)
362
 
363
def server(input, output, session):
    """Shiny server function: run searches and feed the three outputs.

    Clicking "search" runs ``PDBSearchAssistant.search_pdb`` on the current
    query text and stores the result list in a reactive value; the table,
    the summary text and the CSV download all read from that value.
    """
    # Local import: ``req`` is not among the names imported at file level.
    from shiny import req

    # NOTE(review): a new assistant (and therefore a new model pipeline) is
    # created for every session; consider sharing one instance.
    assistant = PDBSearchAssistant()
    results_store = reactive.Value([])

    @reactive.Effect
    @reactive.event(input.search)
    def _():
        results_store.set(assistant.search_pdb(query=input.query()))

    # Registered once at session start rather than inside the click effect,
    # so it is not redefined on every search.
    @output
    @render_widget
    def results_table():
        # Render nothing until the first search has been requested.
        req(input.search())

        # Convert results to DataFrame and add hyperlinks
        df = pd.DataFrame(results_store.get())
        if not df.empty:
            df['PDB ID'] = df['PDB ID'].apply(
                lambda x: f'<a href="https://www.rcsb.org/3d-view/{x}" target="_blank">{x}</a>'
            )
        # Results seem to come back ordered by ID, not by top rank.
        return create_interactive_table(df)

    @output
    @render.text
    def search_conditions():
        results = results_store.get()
        # Read the query without depending on it, so the text only changes
        # together with the results it describes.
        with reactive.isolate():
            query_text = input.query()
        return f"""
        Applied Search Conditions:
        - Query: {query_text}
        - Total structures found: {len(results)}
        """

    @output
    @render.download(filename="pdb_search_results.csv")
    def download():
        df = pd.DataFrame(results_store.get())
        # Yield the content: a returned string would be taken as a file path.
        yield df.to_csv(index=False)
400
 
401
app = App(app_ui, server)

if __name__ == "__main__":
    import nest_asyncio
    nest_asyncio.apply()
    # Listen on all interfaces: the image starts this script directly
    # (CMD ["python", "app.py"]) and exposes port 8000, and a server bound
    # only to the loopback address is unreachable from outside the container.
    app.run(host="0.0.0.0", port=8000)
 
requirements.txt CHANGED
@@ -1,6 +1,9 @@
1
- faicons
 
 
2
  shiny
3
- shinywidgets
4
- plotly
5
  pandas
6
- ridgeplot
 
 
 
 
1
+ transformers
2
+ rcsbsearchapi
3
+ python-dotenv
4
  shiny
 
 
5
  pandas
6
+ plotly
7
+ shinywidgets
8
+ nest-asyncio
9
+ torch