Spaces:
Sleeping
Sleeping
File size: 15,114 Bytes
8fbb714 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 |
DEFAULT_MODEL_ID = "Meta-Llama-3-70B-Instruct"
DEFAULT_INTERFACE_MODEL_ID = "NumbersStation/nsql-llama-2-7B"
DEFAULT_KIND = "json"
DEFAULT_TEMPERATURE = 0.6
DEFAULT_TOP_P = 0.95
DEFAULT_FEW_SHOT_NUM = 3
DEFAULT_FEW_SHOT_SELECTION = "random"
DEFAULT_SAVE_INTERVAL = 3
DEFAULT_RES_DIR = "data/results"
DEFAULT_LOG_DIR = "logs"
DEFAULT_TABLES_DIR = "data/databases"
COOCCURRENCE_QUERY = """
WITH concept_pairs AS (
SELECT p1.concept AS concept1, p2.concept AS concept2, p1.paper_id, p1.tag_type
FROM predictions p1
JOIN predictions p2 ON p1.paper_id = p2.paper_id AND p1.concept < p2.concept
WHERE p1.tag_type = p2.tag_type
)
SELECT concept1, concept2, tag_type, COUNT(DISTINCT paper_id) AS co_occurrences
FROM concept_pairs
GROUP BY concept1, concept2, tag_type
HAVING co_occurrences > 5
ORDER BY co_occurrences DESC;
"""
canned_queries = [
(
"Modalities in Physics and Astronomy papers",
"""
SELECT DISTINCT LOWER(concept) AS concept
FROM predictions
JOIN (
SELECT paper_id, url
FROM papers
WHERE primary_category LIKE '%physics.space-ph%'
OR primary_category LIKE '%astro-ph.%'
) AS paper_ids
ON predictions.paper_id = paper_ids.paper_id
WHERE predictions.tag_type = 'modality'
""",
),
(
"Datasets in Evolutionary Biology that use PDEs",
"""
WITH pde_predictions AS (
SELECT paper_id, concept AS pde_concept, tag_type AS pde_tag_type
FROM predictions
WHERE tag_type IN ('method', 'model')
AND (
LOWER(concept) LIKE '%pde%'
OR LOWER(concept) LIKE '%partial differential equation%'
)
)
SELECT DISTINCT
papers.paper_id,
papers.url,
LOWER(p_dataset.concept) AS dataset,
pde_predictions.pde_concept AS pde_related_concept,
pde_predictions.pde_tag_type AS pde_related_type
FROM papers
JOIN pde_predictions ON papers.paper_id = pde_predictions.paper_id
LEFT JOIN predictions p_dataset ON papers.paper_id = p_dataset.paper_id
WHERE papers.primary_category LIKE '%q-bio.PE%'
AND (p_dataset.tag_type = 'dataset' OR p_dataset.tag_type IS NULL)
ORDER BY papers.paper_id, dataset, pde_related_concept;
""",
),
(
"Trends in objects of study in Cosmology since 2019",
"""
SELECT
substr(papers.updated_on, 2, 4) as year,
predictions.concept as object,
COUNT(DISTINCT papers.paper_id) as paper_count
FROM
papers
JOIN
predictions ON papers.paper_id = predictions.paper_id
WHERE
predictions.tag_type = 'object'
AND CAST(SUBSTR(papers.updated_on, 2, 4) AS INTEGER) >= 2019
GROUP BY
year, object
ORDER BY
year DESC, paper_count DESC;
""",
),
(
"New datasets in fluid dynamics since 2020",
"""
WITH ranked_datasets AS (
SELECT
p.paper_id,
p.url,
pred.concept AS dataset,
p.updated_on,
ROW_NUMBER() OVER (PARTITION BY pred.concept ORDER BY p.updated_on ASC) AS rn
FROM
papers p
JOIN
predictions pred ON p.paper_id = pred.paper_id
WHERE
pred.tag_type = 'dataset'
AND p.primary_category LIKE '%physics.flu-dyn%'
AND CAST(SUBSTR(p.updated_on, 2, 4) AS INTEGER) >= 2020
)
SELECT
paper_id,
url,
dataset,
updated_on
FROM
ranked_datasets
WHERE
rn = 1
ORDER BY
updated_on ASC
""",
),
(
"Evolutionary biology datasets that use spatiotemporal dynamics",
"""
WITH evo_bio_papers AS (
SELECT paper_id
FROM papers
WHERE primary_category LIKE '%q-bio.PE%'
),
spatiotemporal_keywords AS (
SELECT 'spatio-temporal' AS keyword
UNION SELECT 'spatiotemporal'
UNION SELECT 'spatio-temporal'
UNION SELECT 'spatial and temporal'
UNION SELECT 'space-time'
UNION SELECT 'geographic distribution'
UNION SELECT 'phylogeograph'
UNION SELECT 'biogeograph'
UNION SELECT 'dispersal'
UNION SELECT 'migration'
UNION SELECT 'range expansion'
UNION SELECT 'population dynamics'
)
SELECT DISTINCT
p.paper_id,
p.updated_on,
p.abstract,
d.concept AS dataset,
GROUP_CONCAT(DISTINCT stk.keyword) AS spatiotemporal_keywords_found
FROM
evo_bio_papers ebp
JOIN
papers p ON ebp.paper_id = p.paper_id
JOIN
predictions d ON p.paper_id = d.paper_id
JOIN
predictions st ON p.paper_id = st.paper_id
JOIN
spatiotemporal_keywords stk
WHERE
d.tag_type = 'dataset'
AND st.tag_type = 'modality'
AND LOWER(st.concept) LIKE '%' || stk.keyword || '%'
GROUP BY
p.paper_id, p.updated_on, p.abstract, d.concept
ORDER BY
p.updated_on DESC
""",
),
(
"What percentage of papers use only galaxy or spectra, or both or neither?",
"""
WITH paper_modalities AS (
SELECT
p.paper_id,
MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_galaxy_images,
MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra
FROM
papers p
LEFT JOIN
predictions pred ON p.paper_id = pred.paper_id
WHERE
p.primary_category LIKE '%astro-ph%'
AND pred.tag_type = 'modality'
GROUP BY
p.paper_id
),
categorized_papers AS (
SELECT
CASE
WHEN uses_galaxy_images = 1 AND uses_spectra = 1 THEN 'Both'
WHEN uses_galaxy_images = 1 THEN 'Only Galaxy Images'
WHEN uses_spectra = 1 THEN 'Only Spectra'
ELSE 'Neither'
END AS category,
COUNT(*) AS paper_count
FROM
paper_modalities
GROUP BY
CASE
WHEN uses_galaxy_images = 1 AND uses_spectra = 1 THEN 'Both'
WHEN uses_galaxy_images = 1 THEN 'Only Galaxy Images'
WHEN uses_spectra = 1 THEN 'Only Spectra'
ELSE 'Neither'
END
)
SELECT
category,
paper_count,
ROUND(CAST(paper_count AS FLOAT) / (SELECT SUM(paper_count) FROM categorized_papers) * 100, 2) AS percentage
FROM
categorized_papers
ORDER BY
paper_count DESC
""",
),
(
"What are all the next highest data modalities after images and spectra?",
"""
SELECT
LOWER(concept) AS modality,
COUNT(DISTINCT paper_id) AS usage_count
FROM
predictions
WHERE
tag_type = 'modality'
AND LOWER(concept) NOT LIKE '%imag%'
AND LOWER(concept) NOT LIKE '%spectr%'
GROUP BY
LOWER(concept)
ORDER BY
usage_count DESC
""",
),
(
"If we include the next biggest data modality, how much does coverage change?",
"""
WITH modality_counts AS (
SELECT
LOWER(concept) AS modality,
COUNT(DISTINCT paper_id) AS usage_count
FROM
predictions
WHERE
tag_type = 'modality'
AND LOWER(concept) NOT LIKE '%imag%'
AND LOWER(concept) NOT LIKE '%spectr%'
GROUP BY
LOWER(concept)
ORDER BY
usage_count DESC
LIMIT 1
),
paper_modalities AS (
SELECT
p.paper_id,
MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_galaxy_images,
MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra,
MAX(CASE WHEN LOWER(pred.concept) LIKE (SELECT '%' || modality || '%' FROM modality_counts) THEN 1 ELSE 0 END) AS uses_third_modality
FROM
papers p
LEFT JOIN
predictions pred ON p.paper_id = pred.paper_id
WHERE
p.primary_category LIKE '%astro-ph%'
AND pred.tag_type = 'modality'
GROUP BY
p.paper_id
),
coverage_before AS (
SELECT
SUM(CASE WHEN uses_galaxy_images = 1 OR uses_spectra = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
),
coverage_after AS (
SELECT
SUM(CASE WHEN uses_galaxy_images = 1 OR uses_spectra = 1 OR uses_third_modality = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
)
SELECT
(SELECT modality FROM modality_counts) AS third_modality,
ROUND(CAST(covered_papers AS FLOAT) / total_papers * 100, 2) AS coverage_before_percent,
ROUND(CAST((SELECT covered_papers FROM coverage_after) AS FLOAT) / total_papers * 100, 2) AS coverage_after_percent,
ROUND(CAST((SELECT covered_papers FROM coverage_after) AS FLOAT) / total_papers * 100, 2) -
ROUND(CAST(covered_papers AS FLOAT) / total_papers * 100, 2) AS coverage_increase_percent
FROM
coverage_before
""",
),
(
"Coverage if we select the next 5 highest modalities?",
"""
WITH ranked_modalities AS (
SELECT
LOWER(concept) AS modality,
COUNT(DISTINCT paper_id) AS usage_count,
ROW_NUMBER() OVER (ORDER BY COUNT(DISTINCT paper_id) DESC) AS rank
FROM
predictions
WHERE
tag_type = 'modality'
AND LOWER(concept) NOT LIKE '%imag%'
AND LOWER(concept) NOT LIKE '%spectr%'
GROUP BY
LOWER(concept)
),
paper_modalities AS (
SELECT
p.paper_id,
MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_images,
MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra,
MAX(CASE WHEN rm.rank = 1 THEN 1 ELSE 0 END) AS uses_modality_1,
MAX(CASE WHEN rm.rank = 2 THEN 1 ELSE 0 END) AS uses_modality_2,
MAX(CASE WHEN rm.rank = 3 THEN 1 ELSE 0 END) AS uses_modality_3,
MAX(CASE WHEN rm.rank = 4 THEN 1 ELSE 0 END) AS uses_modality_4,
MAX(CASE WHEN rm.rank = 5 THEN 1 ELSE 0 END) AS uses_modality_5
FROM
papers p
LEFT JOIN
predictions pred ON p.paper_id = pred.paper_id
LEFT JOIN
ranked_modalities rm ON LOWER(pred.concept) = rm.modality
WHERE
p.primary_category LIKE '%astro-ph%'
AND pred.tag_type = 'modality'
GROUP BY
p.paper_id
),
cumulative_coverage AS (
SELECT
'Images and Spectra' AS modalities,
0 AS added_modality_rank,
SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
UNION ALL
SELECT
'Images, Spectra, and Modality 1' AS modalities,
1 AS added_modality_rank,
SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
UNION ALL
SELECT
'Images, Spectra, Modality 1, and 2' AS modalities,
2 AS added_modality_rank,
SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
UNION ALL
SELECT
'Images, Spectra, Modality 1, 2, and 3' AS modalities,
3 AS added_modality_rank,
SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
UNION ALL
SELECT
'Images, Spectra, Modality 1, 2, 3, and 4' AS modalities,
4 AS added_modality_rank,
SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 OR uses_modality_4 = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
UNION ALL
SELECT
'Images, Spectra, Modality 1, 2, 3, 4, and 5' AS modalities,
5 AS added_modality_rank,
SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 OR uses_modality_4 = 1 OR uses_modality_5 = 1 THEN 1 ELSE 0 END) AS covered_papers,
COUNT(*) AS total_papers
FROM
paper_modalities
)
SELECT
cc.modalities,
COALESCE(rm.modality, 'N/A') AS added_modality,
rm.usage_count AS added_modality_usage,
ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2) AS coverage_percent,
ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2) -
LAG(ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2), 1, 0) OVER (ORDER BY cc.added_modality_rank) AS coverage_increase_percent
FROM
cumulative_coverage cc
LEFT JOIN
ranked_modalities rm ON cc.added_modality_rank = rm.rank
ORDER BY
cc.added_modality_rank
""",
),
(
"List all papers",
"SELECT paper_id, abstract AS abstract_preview, authors, primary_category FROM papers",
),
(
"Count papers by category",
"SELECT primary_category, COUNT(*) as paper_count FROM papers GROUP BY primary_category ORDER BY paper_count DESC",
),
(
"Top authors with most papers",
"""
WITH author_papers AS (
SELECT json_each.value AS author
FROM papers, json_each(papers.authors)
)
SELECT author, COUNT(*) as paper_count
FROM author_papers
GROUP BY author
ORDER BY paper_count DESC
""",
),
(
"Papers with 'quantum' in abstract",
"SELECT paper_id, abstract AS abstract_preview FROM papers WHERE abstract LIKE '%quantum%'",
),
(
"Most common concepts",
"SELECT concept, COUNT(*) as concept_count FROM predictions GROUP BY concept ORDER BY concept_count DESC",
),
(
"Papers with multiple authors",
"""
SELECT paper_id, json_array_length(authors) as author_count, authors
FROM papers
WHERE json_array_length(authors) > 1
ORDER BY author_count DESC
""",
),
]
|