antonlabate committed
Commit d758c99 · 1 Parent(s): 13c08c5
This view is limited to 50 files because it contains too many changes; see the raw diff for the complete change set.
Files changed (50):
  1. gap-text2sql-main/CODE_OF_CONDUCT.md +4 -0
  2. gap-text2sql-main/CONTRIBUTING.md +59 -0
  3. gap-text2sql-main/LICENSE +175 -0
  4. gap-text2sql-main/NOTICE +1 -0
  5. gap-text2sql-main/README.md +202 -0
  6. gap-text2sql-main/data/preprocessed_data/bart_parser_pretrain_label_mapping.json +11 -0
  7. gap-text2sql-main/mrat-sql-gap/.gitignore +133 -0
  8. gap-text2sql-main/mrat-sql-gap/.ptignore +19 -0
  9. gap-text2sql-main/mrat-sql-gap/BART_large.sh +33 -0
  10. gap-text2sql-main/mrat-sql-gap/BERTimbau-base.sh +34 -0
  11. gap-text2sql-main/mrat-sql-gap/BERTimbau-large.sh +33 -0
  12. gap-text2sql-main/mrat-sql-gap/crash_on_ipy.py +19 -0
  13. gap-text2sql-main/mrat-sql-gap/data/spider/generate.sh +24 -0
  14. gap-text2sql-main/mrat-sql-gap/data/spider/train_spider.json.patch +49 -0
  15. gap-text2sql-main/mrat-sql-gap/data/sqlite_files/singer/singer.sqlite +0 -0
  16. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/BART-large-en/gap-bart.jsonnet +106 -0
  17. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/BART-large-en/nl2code-base.libsonnet +109 -0
  18. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/BERTimbau-base/nl2code-base.libsonnet +109 -0
  19. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/BERTimbau-base/nl2code-bertimbau-base.jsonnet +110 -0
  20. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/BERTimbau-large/nl2code-base.libsonnet +109 -0
  21. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/BERTimbau-large/nl2code-bertimbau-large.jsonnet +110 -0
  22. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-FIT-en-enr-enb/T5-v1_1.jsonnet +106 -0
  23. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-FIT-en-enr-enb/nl2code-base.libsonnet +109 -0
  24. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-FIT-en-extra-3enr-1en/T5-v1_1.jsonnet +106 -0
  25. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-FIT-en-extra-3enr-1en/nl2code-base.libsonnet +109 -0
  26. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-FIT-en/T5-v1_1.jsonnet +106 -0
  27. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-FIT-en/nl2code-base.libsonnet +109 -0
  28. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-en/T5-v1_1.jsonnet +106 -0
  29. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-en/nl2code-base.libsonnet +109 -0
  30. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-en-pt-es-fr/gap-bart.jsonnet +103 -0
  31. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-en-pt-es-fr/nl2code-base.libsonnet +109 -0
  32. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-en/gap-bart.jsonnet +103 -0
  33. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-en/nl2code-base.libsonnet +109 -0
  34. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-pt-en/gap-bart.jsonnet +103 -0
  35. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-pt-en/nl2code-base.libsonnet +109 -0
  36. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-pt/gap-bart.jsonnet +103 -0
  37. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-pt/nl2code-base.libsonnet +109 -0
  38. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-FIT-en-pt-es-fr-extra-3enr-3ptr-3esr-3frr/mT5.jsonnet +106 -0
  39. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-FIT-en-pt-es-fr-extra-3enr-3ptr-3esr-3frr/nl2code-base.libsonnet +109 -0
  40. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-FIT-en-pt-es-fr/mT5.jsonnet +106 -0
  41. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-FIT-en-pt-es-fr/nl2code-base.libsonnet +109 -0
  42. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-en-pt-es-fr-enr-enb/mT5.jsonnet +106 -0
  43. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-en-pt-es-fr-enr-enb/nl2code-base.libsonnet +109 -0
  44. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-en-pt-es-fr/gap-mT5.jsonnet +106 -0
  45. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-en-pt-es-fr/nl2code-base.libsonnet +109 -0
  46. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-en/gap-mT5.jsonnet +106 -0
  47. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-en/nl2code-base.libsonnet +109 -0
  48. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/spider-BART-large-en-train_en-eval.jsonnet +33 -0
  49. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/spider-BERTimbau-base-pt.jsonnet +33 -0
  50. gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/spider-BERTimbau-large-pt.jsonnet +33 -0
gap-text2sql-main/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,4 @@
+ ## Code of Conduct
+ This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
+ For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
+ [email protected] with any additional questions or comments.
gap-text2sql-main/CONTRIBUTING.md ADDED
@@ -0,0 +1,59 @@
+ # Contributing Guidelines
+
+ Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
+ documentation, we greatly value feedback and contributions from our community.
+
+ Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
+ information to effectively respond to your bug report or contribution.
+
+
+ ## Reporting Bugs/Feature Requests
+
+ We welcome you to use the GitHub issue tracker to report bugs or suggest features.
+
+ When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
+ reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
+
+ * A reproducible test case or series of steps
+ * The version of our code being used
+ * Any modifications you've made relevant to the bug
+ * Anything unusual about your environment or deployment
+
+
+ ## Contributing via Pull Requests
+ Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
+
+ 1. You are working against the latest source on the *main* branch.
+ 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
+ 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
+
+ To send us a pull request, please:
+
+ 1. Fork the repository.
+ 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
+ 3. Ensure local tests pass.
+ 4. Commit to your fork using clear commit messages.
+ 5. Send us a pull request, answering any default questions in the pull request interface.
+ 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
+
+ GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
+ [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
+
+
+ ## Finding contributions to work on
+ Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
+
+
+ ## Code of Conduct
+ This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
+ For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
+ [email protected] with any additional questions or comments.
+
+
+ ## Security issue notifications
+ If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
+
+
+ ## Licensing
+
+ See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
gap-text2sql-main/LICENSE ADDED
@@ -0,0 +1,175 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
gap-text2sql-main/NOTICE ADDED
@@ -0,0 +1 @@
+ Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
gap-text2sql-main/README.md ADDED
@@ -0,0 +1,202 @@
+ # mRAT-SQL+GAP - Multilingual version of the RAT-SQL+GAP
+ Code and model from our BRACIS 2021 [paper published in Springer Lecture Notes in Computer Science](https://link.springer.com/chapter/10.1007%2F978-3-030-91699-2_35); [here is the pre-print on arXiv](https://arxiv.org/abs/2110.03546).
+
+ Based on RAT-SQL+GAP: [GitHub](https://github.com/awslabs/gap-text2sql). Paper: [AAAI 2021 paper](https://arxiv.org/abs/2012.10309)
+
+
+ ## Abstract
+
+ mRAT-SQL+GAP is a multilingual version of RAT-SQL+GAP, which starts with the Portuguese language. The code, the dataset, and the results are available here.
+
+
+ ## Directory Structure
+ Go to the directory where you want to install the structure:
+ ```bash
+ git clone https://github.com/C4AI/gap-text2sql
+ cd gap-text2sql/mrat-sql-gap
+ ```
+
+ ## Conda mtext2sql Environment Setup
+ ```bash
+ conda create --name mtext2sql python=3.7
+ conda activate mtext2sql
+ conda install pytorch=1.5 cudatoolkit=10.2 -c pytorch
+ pip install gdown
+ conda install -c conda-forge jsonnet
+ pip install -r requirements.txt
+ python -c "import nltk; nltk.download('stopwords'); nltk.download('punkt')"
+ conda install jupyter notebook
+ conda install -c conda-forge jupyter_contrib_nbextensions
+ ```
+
+
+ ## Setup Script
+ Just run the script below; it will copy the datasets.
+ The original version of the Spider dataset is distributed under the [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/legalcode) license.
+ The modified versions (translated to Portuguese, Spanish, and French, plus double-size (English and Portuguese) and quad-size (English, Portuguese, Spanish, and French)) of train_spider.json, train_others.json, and dev.json are distributed under the [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/legalcode) license, respecting ShareAlike.
+
+ ```bash
+ chmod +x setup.sh
+ ./setup.sh
+ ```
+
+ ## Specific setup
+ The models and checkpoints include big files (gigabytes), so if you have enough disk space you can run all the shell scripts. To understand how things work, run just BART_large.sh first, then run the others.
+ ```bash
+ ./BART_large.sh
+ ./mBART50MtoM-large.sh
+ ./mT5_large.sh
+ ./BERTimbau-base.sh
+ ./BERTimbau-large.sh
+ ```
+
+ ## Environment Test
+ Now the environment is ready for training (fine-tuning) and inference. Training is very slow: more than 60 hours for BART, BERTimbau, and mBART50, and more than 28 hours for mT5. Therefore I recommend testing the environment with inference first.
+
+ ### Preprocess dataset
+ This preprocessing step is necessary both for inference and for training. It will take some time, maybe 40 minutes.
+ I will use the script for BART, but you can use the others; look at the directory experiments/spider-configs.
+ ```bash
+ python run.py preprocess experiments/spider-configs/spider-BART-large-en-train_en-eval.jsonnet
+ ```
+ You can see the processed files in the path:
+ `data/spider-en/nl2code-1115,output_from=true,fs=2,emb=bart,cvlink`
+
+ ## Inference
+ I will use the script for BART again.
+ Note: we are making inferences using the already-trained checkpoint (directory logdir), as defined in
+ `experiments/spider-configs/spider-BART-large-en-train_en-eval.jsonnet`:
+ `logdir: "logdir/BART-large-en-train",` and
+ `eval_steps: [40300],`
+ ```bash
+ python run.py eval experiments/spider-configs/spider-BART-large-en-train_en-eval.jsonnet
+ ```
+
+ You then get the inference results and evaluation results in the paths:
+
+ `ie_dirs/BART-large-en-train/bart-large-en_run_1_true_1-step41000.infer`
+
+ and
+
+ `ie_dirs/BART-large-en-train/bart-large-en_run_1_true_1-step41000.eval`.
+
+ ## Training
+ Execute this only if it is really necessary, i.e. if you want to fine-tune the model; it will take a long time. But if you have a good machine available and want to see different checkpoints in the logdir, do it.
+
+ ```bash
+ python run.py train experiments/spider-configs/spider-BART-large-en-train_en-eval.jsonnet
+ ```
+ You then get the training checkpoints in the path:
+ `logdir/BART-large-en-train`
+
+
+ ## Checkpoints and Inferences
+
+ The checkpoints are available here (ESM - Exact Set Matching Accuracy):
+
+ Paper mRAT-SQL+GAP - Multilingual version of the RAT-SQL+GAP
+ * BART-large trained in English [ESM all: 0.718]
+   * Checkpoint: [40300](https://drive.google.com/file/d/1F4R-WkJKtJ4lFni3q4lBug6tzSo0H5Qe/view?usp=sharing)
+   * Inference - English: [ESM all: 0.718 - Baseline](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/BART-large-en-train/spider_eval_match_ratsqlgap-BART-ori-step40300.txt)
+ * BERTimbau-base trained in Portuguese
+   * Checkpoint: [24100](https://drive.google.com/file/d/1gIZS0RuIxdjmm7sNbA3R6p6--9iMJmW8/view?usp=sharing)
+   * Inference - Portuguese: [ESM all: 0.417](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/BERTimbau-base-pt-train/spider_eval_match_ratsqlgap-bertimbau-base-step24100.txt)
+ * mBART50MtoM-large trained in English
+   * Checkpoint: [23100](https://drive.google.com/file/d/16mQf1gMTVGkvONUGpzELzkjCFX5M74cO/view?usp=sharing)
+   * Inference - English: [ESM all: 0.651](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mBART50MtoM-large-en-train/spider_eval_match_ratsqlgap-mBART50MtoM-large-en-ori-data-step23100.txt)
+ * mBART50MtoM-large trained in Portuguese
+   * Checkpoint: [39100](https://drive.google.com/file/d/1fWPH4bG9-UjW-p6OgmpINWLLsnOopWLh/view?usp=sharing)
+   * Inference - Portuguese: [ESM all: 0.588](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mBART50MtoM-large-pt-train/spider_eval_match_ratsqlgap-mBART50MtoM-largeSimplemmaPtEn-step39100.txt)
+ * mBART50MtoM-large trained in English and Portuguese (together)
+   * Checkpoint: [41000](https://drive.google.com/file/d/1szb44h_2t3fK2Vc02PdaAjDqnkWqM-0U/view?usp=sharing)
+   * Inference - English: [ESM all: 0.664](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mBART50MtoM-large-en-pt-train/41000/spider_eval_match_ratsqlgap-mBART50MtoM-largeSimplemmaPtEn-pt-en-Eval-en-step41000.txt)
+   * Inference - Portuguese: [ESM all: 0.595 Best inferences in Portuguese](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mBART50MtoM-large-en-pt-train/41000/spider_eval_match_ratsqlgap-mBART50MtoM-largeSimplemmaPtEn-pt-en-Eval-pt-step41000.txt)
+   * Checkpoint: [21100](https://drive.google.com/file/d/1MeLkvGf9-5it1JXnUvU9AmXVnnbAAfP0/view?usp=sharing)
+   * Inference - English: [ESM all: 0.678 Best inferences in English](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mBART50MtoM-large-en-pt-train/21100/spider_eval_match_ratsqlgap-mBART50MtoM-largeSimplemmaPtEn-pt-en-Eval-en-step21100.txt)
+   * Inference - Portuguese: [ESM all: 0.581](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mBART50MtoM-large-en-pt-train/21100/spider_eval_match_ratsqlgap-mBART50MtoM-largeSimplemmaPtEn-pt-en-Eval-pt-step21100.txt)
+
+ Future work of the paper mRAT-SQL+GAP
+ * BERTimbau-large trained in Portuguese
+   * Checkpoint: [40100](https://drive.google.com/file/d/1q1NOxisOcIdkMftzGPVxBDn989LDDG3X/view?usp=sharing)
+   * Inference - Portuguese: [ESM all: 0.418](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/BERTimbau-large-pt-train/spider_eval_match_ratsqlgap-BERTimbau-large-pt-train-Eval-pt-step40100.txt)
+ * mBART50MtoM-large trained in English, Portuguese, Spanish, and French (together) - best inferences only
+   * Checkpoint: [39100](https://drive.google.com/file/d/18nioEDEpZf-6CNH_sU3IMZxsSNts_a4y/view?usp=sharing)
+   * Inference - English: [ESM all: 0.696](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mBART50MtoM-large-en-pt-es-fr-train/39100/spider_eval_match_ratsqlgap-mBART50MtoM-large-en-pt-es-fr-train_en-eval-step39100.txt)
+   * Checkpoint: [42100](https://drive.google.com/file/d/1AmJjyVHiP9V-FzW9Q1sXge4YMWAP-srg/view?usp=sharing)
+   * Inference - Portuguese: [ESM all pt: 0.626](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mBART50MtoM-large-en-pt-es-fr-train/42100/spider_eval_match_ratsqlgap-mBART50MtoM-large-en-pt-es-fr-train_pt-eval-step42100.txt)
+   * Inference - Spanish: [ESM all: 0.628](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mBART50MtoM-large-en-pt-es-fr-train/42100/spider_eval_match_ratsqlgap-mBART50MtoM-large-en-pt-es-fr-train_es-eval-step42100.txt)
+   * Checkpoint: [44100](https://drive.google.com/file/d/1P0F218tNkW42Pb7okn3uFyTT5sy4zGZR/view?usp=sharing)
+   * Inference - French: [ESM all: 0.649](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mBART50MtoM-large-en-pt-es-fr-train/44100/spider_eval_match_ratsqlgap-mBART50MtoM-large-en-pt-es-fr-train_fr-eval-step44100.txt)
+
+
+ Paper mRAT-SQL-FIT
+
+ * mT5-large trained in English, 51K steps
+   * Checkpoint: [50100](https://drive.google.com/file/d/1BZ519XxYtXpxxO1iiBy8kSLG4eq34yEX/view?usp=sharing)
+   * Inference - English: [ESM all: 0.684](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-en-train/spider_eval_match_ratsqlgap-mT5-large-en-train_en-eval-step50100.txt)
+
+ * mT5-large trained in English, Portuguese, Spanish, and French (together), 51K steps - best inferences only
+   * Checkpoint: [51100](https://drive.google.com/file/d/1GSQX0dJlsipQPBvYrBqY0SMFhFGCTW0E/view?usp=sharing)
+   * Inference - English: [ESM all: 0.715](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-en-pt-es-fr-51Ksteps-train/51100/spider_eval_match_ratsqlgap-mT5-large-NoGAP-en-pt-es-fr-train_en-eval-step51100.txt)
+   * Checkpoint: [42100](https://drive.google.com/file/d/1qx2QeZhoygCstZP_QgtLkTlVfBVHzZin/view?usp=sharing)
+   * Inference - Portuguese: [ESM all: 0.680](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-en-pt-es-fr-51Ksteps-train/42100/spider_eval_match_ratsqlgap-mT5-large-NoGAP-en-pt-es-fr-train_pt-eval-step42100.txt)
+   * Checkpoint: [50100](https://drive.google.com/file/d/1eDm7SHz2il1RYryGLSYVGxt-ozHugJcf/view?usp=sharing)
+   * Inference - Spanish: [ESM all: 0.660](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-en-pt-es-fr-51Ksteps-train/50100/spider_eval_match_ratsqlgap-mT5-large-NoGAP-en-pt-es-fr-train_es-eval-step50100.txt)
+   * Inference - French: [ESM all: 0.672](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-en-pt-es-fr-51Ksteps-train/50100/spider_eval_match_ratsqlgap-mT5-large-NoGAP-en-pt-es-fr-train_fr-eval-step50100.txt)
+
+ * mT5-large trained in English, Portuguese, Spanish, and French (together), 120K steps - best inferences only
+   * Checkpoint: [77500](https://drive.google.com/file/d/1eUYr_i5O9U1ldm_pBdGozmiRt_42BCh8/view?usp=sharing)
+   * Inference - English: [ESM all: 0.718](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-en-pt-es-fr-120Ksteps-train//77500/spider_eval_match_ratsqlgap-mT5-large-NoGAP-en-pt-es-fr-train_Div-en-eval-step77500.txt)
+   * Checkpoint: [85500](https://drive.google.com/file/d/1n55OlnyE3RDQtUXMHPwC99Za0xfQavrK/view?usp=sharing)
+   * Inference - Portuguese: [ESM all: 0.675](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-en-pt-es-fr-120Ksteps-train//85500/spider_eval_match_ratsqlgap-mT5-large-NoGAP-en-pt-es-fr-train_Div-pt-eval-step85500.txt)
+   * Checkpoint: [76500](https://drive.google.com/file/d/1Qs-f2gIgWTJWiWrYGsiULxTBwwpgGatc/view?usp=sharing)
+   * Inference - Spanish: [ESM all: 0.675](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-en-pt-es-fr-120Ksteps-train//76500/spider_eval_match_ratsqlgap-mT5-large-NoGAP-en-pt-es-fr-train_Div-es-eval-step76500.txt)
+   * Checkpoint: [67500](https://drive.google.com/file/d/1cpTEXMhJXVbJfDc8sW1nfSX91p5VSJtn/view?usp=sharing)
+   * Inference - French: [ESM all: 0.681](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-en-pt-es-fr-120Ksteps-train//67500/spider_eval_match_ratsqlgap-mT5-large-NoGAP-en-pt-es-fr-train_Div-fr-eval-step67500.txt)
+
+ * mT5-large trained in English, Portuguese, Spanish, and French (together), FIT, 120K steps - best inferences only
+   * Checkpoint: [105100](https://drive.google.com/file/d/1h0knsFfD6XCXxoEVSFR_I1WdYvMOkcvA/view?usp=sharing)
+   * Inference - English (simplemma.load_data('en','pt','es','fr')): [ESM all: 0.735 Best inferences in English](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-FIT-en-pt-es-fr-120Ksteps-train//105100/spider_eval_match_ratsqlgap-mT5-large-NoGAP-120Ksteps-FIT-en-pt-es-fr_Div-en-eval-step105100.txt)
+   * Inference - English (simplemma.load_data('en')): [ESM all: 0.736 Best inferences in English](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-FIT-en-pt-es-fr-120Ksteps-train//105100/0.736/spider_eval_match_ratsqlgap-mT5-large-NoGAP-120Ksteps-FIT-en-pt-es-fr_Div-en-eval-step105100.txt)
+   * Checkpoint: [102100](https://drive.google.com/file/d/1VCfLnQgZsrb8lJFkhxzPoyfqc7dEY_K-/view?usp=sharing)
+   * Inference - Portuguese: [ESM all: 0.687](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-FIT-en-pt-es-fr-120Ksteps-train//102100/spider_eval_match_ratsqlgap-mT5-large-NoGAP-120Ksteps-FIT-en-pt-es-fr_Div-pt-eval-step102100.txt)
+   * Checkpoint: [114100](https://drive.google.com/file/d/13DIB5p97bUnquLpd-dO0-Q1bA9LWjIXD/view?usp=sharing)
+   * Inference - Spanish: [ESM all: 0.689](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-FIT-en-pt-es-fr-120Ksteps-train//114100/spider_eval_match_ratsqlgap-mT5-large-NoGAP-120Ksteps-FIT-en-pt-es-fr_Div-es-eval-step114100.txt)
+   * Inference - French: [ESM all: 0.698](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-FIT-en-pt-es-fr-120Ksteps-train//114100/spider_eval_match_ratsqlgap-mT5-large-NoGAP-120Ksteps-FIT-en-pt-es-fr_Div-fr-eval-step114100.txt)
+
+ * mT5-large trained in English, Portuguese, Spanish, and French (together), 2048 tokens, 480K steps - inference in English only
+   * Checkpoint: [290100](https://drive.google.com/file/d/19Uvfw7QL-8i3yKvybRe9ADLDuSa-MeJv/view?usp=sharing)
+   * Inference - English: [ESM all: 0.697](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-en-pt-es-fr-2048TKs-480Ksteps-train/290100/spider_eval_match_ratsqlgap-mT5-large-2048TKs-en-pt-es-fr_Div-en-eval-step290100.txt)
+
+
+ Other Best Results
+ * T5-v1_1-large trained in English, FIT, 150K steps
+   * Checkpoint: [150300](https://drive.google.com/file/d/14iAERUfhNdU7Gdx9gD9HGuGrnumHaPtq/view?usp=sharing)
+   * Inference - English: [ESM all: 0.736](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/T5-v1_1-large-FIT-en-150Ksteps-train/spider_eval_match_ratsqlgap-T5-v1_1-large-NoGAP-150Ksteps-FIT-en_One-en-eval-step150300.txt)
+
+ * mT5-large trained in English, Portuguese, Spanish, and French (together) + non-linear data augmentation by rules for extra questions (3enr-3ptr-3esr-3frr), FIT, 150K steps
+   * Checkpoint: [128100](https://drive.google.com/file/d/1OjBr9CR0B9feRuk5-Wjh5VpAosB15uNR/view?usp=sharing)
+   * Inference - English: [ESM all: 0.726](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-FIT-extra-150Ksteps-train/128100/spider_eval_match_ratsqlgap-mT5-large-NoGAP-150Ksteps-FIT-en-pt-es-fr-extra-3enr-3ptr-3esr-3frr_Div-en-eval-step128100.txt)
+   * Checkpoint: [125100](https://drive.google.com/file/d/1bLkTK7qJmwQatK_r6tjSmo6fYYtR4oJ3/view?usp=sharing)
+   * Inference - Portuguese: [ESM all: 0.698](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-FIT-extra-150Ksteps-train/125100/spider_eval_match_ratsqlgap-mT5-large-NoGAP-150Ksteps-FIT-en-pt-es-fr-extra-3enr-3ptr-3esr-3frr_Div-pt-eval-step125100.txt)
+   * Inference - French: [ESM all: 0.700](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-FIT-extra-150Ksteps-train/125100/spider_eval_match_ratsqlgap-mT5-large-NoGAP-150Ksteps-FIT-en-pt-es-fr-extra-3enr-3ptr-3esr-3frr_Div-fr-eval-step125100.txt)
+   * Checkpoint: [136100](https://drive.google.com/file/d/1zAcHQS0iNOXRm4iaiqybuAFETfitwBnR/view?usp=sharing)
+   * Inference - Spanish: [ESM all: 0.691](https://github.com/C4AI/gap-text2sql/blob/main/mrat-sql-gap/inference-results/mT5-large-FIT-extra-150Ksteps-train/136100/spider_eval_match_ratsqlgap-mT5-large-NoGAP-150Ksteps-FIT-en-pt-es-fr-extra-3enr-3ptr-3esr-3frr_Div-es-eval-step136100.txt)
+
+ ## Results
+
+ All intermediate files of the results are in the directory [inference-results](https://github.com/C4AI/gap-text2sql/tree/main/mrat-sql-gap/inference-results).
+
+ ## Security
+
+ See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
+
+ ## License
+
+ This project is licensed under the Apache-2.0 License.
gap-text2sql-main/data/preprocessed_data/bart_parser_pretrain_label_mapping.json ADDED
@@ -0,0 +1,11 @@
+ {
+     "keyword": ["<s>", "<pad>", "</s>", "select", "where", "_value_", "_values_", "group", "by", "order", "limit", "from", "join", "on",
+                 "count", "distinct", "and", "desc", "avg", "having", "max", "in", "<", "sum", "intersect", "not", "min",
+                 "except", "all", "or", "asc", "like", "!=", "union", "between", "into",
+                 "when", "else", "case", "then", "true", "false", "end", "as", "left", "right", "natural", "full", "convert", "cast",
+                 "is", "null", "<=>", "/", "(", ".", "$", "=>", "_", "]", "[", ">", "#", "!", ",", "*", "&", "|", "?", "~", "-", "<=", "'",
+                 ")", "}", "+", "\"", "{", "=", "^", "@", "<", ">="],
+     "label_padding_token": "<pad>",
+     "label_eos_token": "</s>",
+     "label_bos_token": "<s>"
+ }
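The label mapping above is plain JSON, so it is easy to sanity-check. A minimal sketch (an illustrative snippet, not part of the repository; it assumes you run it from the directory containing gap-text2sql-main):

```python
import json

# Load the keyword/label mapping used for the BART parser pretraining.
path = ("gap-text2sql-main/data/preprocessed_data/"
        "bart_parser_pretrain_label_mapping.json")
with open(path) as f:
    mapping = json.load(f)

print(len(mapping["keyword"]))         # number of SQL keywords and symbols
print(mapping["label_padding_token"])  # <pad>
print("select" in mapping["keyword"])  # True
```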
gap-text2sql-main/mrat-sql-gap/.gitignore ADDED
@@ -0,0 +1,133 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ *.DS_Store
+
+
+ ### Project data ###
+
+ .idea/
+ .vscode/
+ .vector_cache/
+ third_party/stanford-corenlp-full-2018-10-05/
+
+ ### Logs & Experiment dirs ###
+
+ logdirs/
+ pt-exps/
+ philly-exps/
+ ie_dirs/
+ dumps/
+ att_dirs/
gap-text2sql-main/mrat-sql-gap/.ptignore ADDED
@@ -0,0 +1,19 @@
+ .vector_cache/
+ .vscode/
+ .idea/
+ data/
+ venv/
+ build/
+
+ notebooks/
+ third_party/stanford-corenlp-full-2018-10-05/
+ third_party/syntaxSQL/
+
+ *.egg
+ *.egg-info
+
+ logdirs/
+ pt-exps/
+ philly-exps/
+ ie_dirs/
+ experiments/
gap-text2sql-main/mrat-sql-gap/BART_large.sh ADDED
@@ -0,0 +1,33 @@
+ #!/bin/bash
+ echo "Folders structure preparation"
+ Name="BART-large"
+
+ #mkdir logdir/${Name}-en-train
+ #mkdir ie_dirs/${Name}-en-train
+ #--
+ mkdir models/${Name}
+ mkdir models/${Name}/pretrained_checkpoint
+
+ echo "Model Download - ATTENTION: REVIEW THE FILE SIZES"
+ cd models/${Name}/pretrained_checkpoint
+
+ curl https://gap-text2sql-public.s3.amazonaws.com/checkpoint-artifacts/pretrained-checkpoint -o pytorch_model.bin
+ cd ..
+ cd ..
+ cd ..
+
+ echo "Download Checkpoint"
+ cd logdir/${Name}-en-train
+ mkdir bs=12,lr=1.0e-04,bert_lr=1.0e-05,end_lr=0e0,att=1
+ cd bs=12,lr=1.0e-04,bert_lr=1.0e-05,end_lr=0e0,att=1
+ gdown --id 1F4R-WkJKtJ4lFni3q4lBug6tzSo0H5Qe
+ curl https://gap-text2sql-public.s3.amazonaws.com/checkpoint-artifacts/gap-finetuned-checkpoint -o model_checkpoint-00041000
+ cd ..
+ cd ..
+ cd ..
gap-text2sql-main/mrat-sql-gap/BERTimbau-base.sh ADDED
@@ -0,0 +1,34 @@
+ #!/bin/bash
+ echo "Folders structure preparation"
+ Name="BERTimbau-base"
+
+ #mkdir logdir/${Name}-pt-train
+ #mkdir ie_dirs/${Name}-pt-train
+ #--
+ #mkdir models/${Name}
+ #mkdir models/${Name}/pretrained_checkpoint
+
+ #echo "Download Pretrained Model"
+ #cd models/${Name}/pretrained_checkpoint
+ #curl https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/added_tokens.json -o added_tokens.json
+ #curl https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/config.json -o config.json
+ #curl https://cdn-lfs.huggingface.co/neuralmind/bert-base-portuguese-cased/96d2144445b6ba3530c27e38e7e27139fd0b0a5e36d9ca66f4155da7c5f199b0 -o flax_model.msgpack
+ #curl https://cdn-lfs.huggingface.co/neuralmind/bert-base-portuguese-cased/cb1693767adef60abf23d9fde3996f0c1e6310afad103a2db94ad44854568955 -o pytorch_model.bin
+ #curl https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/special_tokens_map.json -o special_tokens_map.json
+ #curl https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/tokenizer_config.json -o tokenizer_config.json
+ #curl https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/vocab.txt -o vocab.txt
+ #cd ..
+ #cd ..
+ #cd ..
+
+ echo "Download Checkpoint"
+ cd logdir/${Name}-pt-train
+ mkdir bs=6,lr=7.4e-04,bert_lr=3.0e-06,end_lr=0e0,att=1
+ cd bs=6,lr=7.4e-04,bert_lr=3.0e-06,end_lr=0e0,att=1
+ gdown --id 1gIZS0RuIxdjmm7sNbA3R6p6--9iMJmW8
+ cd ..
+ cd ..
+ cd ..
gap-text2sql-main/mrat-sql-gap/BERTimbau-large.sh ADDED
@@ -0,0 +1,33 @@
+ #!/bin/bash
+ echo "Folders structure preparation"
+ Name="BERTimbau-large"
+
+ #mkdir logdir/${Name}-pt-train
+ #mkdir ie_dirs/${Name}-pt-train
+ #--
+ #mkdir models/${Name}
+ #mkdir models/${Name}/pretrained_checkpoint
+
+ #echo "Download Pretrained Model"
+ #cd models/${Name}/pretrained_checkpoint
+ #curl https://huggingface.co/neuralmind/bert-large-portuguese-cased/resolve/main/added_tokens.json -o added_tokens.json
+ #curl https://huggingface.co/neuralmind/bert-large-portuguese-cased/resolve/main/config.json -o config.json
+ #curl https://cdn-lfs.huggingface.co/neuralmind/bert-large-portuguese-cased/9af4f60f0bdd71e483baf8a1dd3e3dc509ceeaa7dd2007ed63f110b5c990e6e6 -o flax_model.msgpack
+ #curl https://cdn-lfs.huggingface.co/neuralmind/bert-large-portuguese-cased/48f211712fdad2263e35c368b0ec79ad635c2df0acb275152e0f7cbd165bb7ca -o pytorch_model.bin
+ #curl https://huggingface.co/neuralmind/bert-large-portuguese-cased/resolve/main/special_tokens_map.json -o special_tokens_map.json
+ #curl https://huggingface.co/neuralmind/bert-large-portuguese-cased/resolve/main/tokenizer_config.json -o tokenizer_config.json
+ #curl https://huggingface.co/neuralmind/bert-large-portuguese-cased/resolve/main/vocab.txt -o vocab.txt
+ #cd ..
+ #cd ..
+ #cd ..
+
+ echo "Download Checkpoint"
+ cd logdir/${Name}-pt-train
+ mkdir bs=6,lr=7.4e-04,bert_lr=3.0e-06,end_lr=0e0,att=1
+ cd bs=6,lr=7.4e-04,bert_lr=3.0e-06,end_lr=0e0,att=1
+ gdown --id 1q1NOxisOcIdkMftzGPVxBDn989LDDG3X
+ cd ..
+ cd ..
+ cd ..
gap-text2sql-main/mrat-sql-gap/crash_on_ipy.py ADDED
@@ -0,0 +1,19 @@
+ import sys
+
+ class ExceptionHook:
+     instance = None
+
+     def __call__(self, type, value, tb):
+         if self.instance is None:
+             if hasattr(sys, 'ps1') or not sys.stderr.isatty():
+                 sys.__excepthook__(type, value, tb)
+             else:
+                 import traceback
+                 # from IPython.core import ultratb
+                 # self.instance = ultratb.FormattedTB(mode='Plain',
+                 #     color_scheme='Linux', call_pdb=1)
+                 import pudb
+                 traceback.print_exception(type, value, tb)
+                 pudb.post_mortem(tb)
+
+ sys.excepthook = ExceptionHook()
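crash_on_ipy.py installs a custom `sys.excepthook`: when a script crashes in a non-interactive run with a terminal attached, it prints the traceback and drops into a pudb post-mortem session instead of just exiting. A hypothetical usage sketch (assumes pudb is installed and crash_on_ipy.py is importable):

```python
# Importing the module is enough: the import statement installs the hook.
import crash_on_ipy  # noqa: F401

def buggy():
    return 1 / 0  # ZeroDivisionError

# The uncaught exception triggers the hook: the traceback is printed
# and a pudb post-mortem debugger opens at the crash site.
buggy()
```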
gap-text2sql-main/mrat-sql-gap/data/spider/generate.sh ADDED
@@ -0,0 +1,24 @@
+ #!/bin/bash
+ if [ "$#" -ne 1 ]; then
+     echo "Please specify directory containing Spider files."
+     exit 1
+ fi
+
+ BASE=$(realpath $(dirname $0))
+
+ # Re-generate 'sql' to fix bad parsing
+ cp $1/tables.json ${BASE}
+ for input in train_others train_spider dev; do
+     echo Processing $input
+     cp $1/${input}.json ${BASE}
+     if [[ -e ${BASE}/${input}.json.patch ]]; then
+         pushd ${BASE} >& /dev/null
+         patch < ${input}.json.patch
+         popd >& /dev/null
+     fi
+     python -m seq2struct.datasets.spider_lib.preprocess.parse_raw_json \
+         --tables ${BASE}/tables.json \
+         --input ${BASE}/${input}.json \
+         --output ${BASE}/${input}.json
+     echo
+ done
gap-text2sql-main/mrat-sql-gap/data/spider/train_spider.json.patch ADDED
@@ -0,0 +1,49 @@
+ --- train_spider.json 2018-09-23 20:27:49.000000000 -0700
+ +++ train_spider_fixed.json 2019-02-05 19:09:31.000000000 -0800
+ @@ -429190,10 +429190,10 @@
+ },
+ {
+ "db_id": "assets_maintenance",
+ - "query": "SELECT T1.company_name FROM Third_Party_Companies AS T1 JOIN Maintenance_Contracts AS T2 ON T1.company_id = T2.maintenance_contract_company_id JOIN Ref_Company_Types AS T3 ON T1.company_type_code = T3.company_type_code ORDER BY T2.contract_end_date DESC LIMIT 1",
+ + "query": "SELECT T1.company_type FROM Third_Party_Companies AS T1 JOIN Maintenance_Contracts AS T2 ON T1.company_id = T2.maintenance_contract_company_id ORDER BY T2.contract_end_date DESC LIMIT 1",
+ "query_toks": [
+ "SELECT",
+ - "T1.company_name",
+ + "T1.company_type",
+ "FROM",
+ "Third_Party_Companies",
+ "AS",
+ @@ -429206,14 +429206,6 @@
+ "T1.company_id",
+ "=",
+ "T2.maintenance_contract_company_id",
+ - "JOIN",
+ - "Ref_Company_Types",
+ - "AS",
+ - "T3",
+ - "ON",
+ - "T1.company_type_code",
+ - "=",
+ - "T3.company_type_code",
+ "ORDER",
+ "BY",
+ "T2.contract_end_date",
+ @@ -429242,18 +429234,6 @@
+ "t2",
+ ".",
+ "maintenance_contract_company_id",
+ - "join",
+ - "ref_company_types",
+ - "as",
+ - "t3",
+ - "on",
+ - "t1",
+ - ".",
+ - "company_type_code",
+ - "=",
+ - "t3",
+ - ".",
+ - "company_type_code",
+ "order",
+ "by",
+ "t2",
gap-text2sql-main/mrat-sql-gap/data/sqlite_files/singer/singer.sqlite ADDED
Binary file (20.5 kB).
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/BART-large-en/gap-bart.jsonnet ADDED
@@ -0,0 +1,106 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-en/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-bart',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             bart_version: args.bart_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + "database",
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             bart_version: args.bart_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'BART-large-nl2code-1115,output_from=%s,fs=%d,emb=bart,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'BART-large-nl2code-1115,output_from=%s,fs=%d,emb=bart,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             bart_version:: null,
+             pretrained_checkpoint:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
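The `model_name` field above is what produces the long directory names used by the download scripts (e.g. `bs=12,lr=1.0e-04,bert_lr=1.0e-05,end_lr=0e0,att=1` in BART_large.sh): jsonnet's `%` operator follows Python-style formatting. A sketch of the same computation in Python, with illustrative argument values (the `args` dict here is hypothetical):

```python
# Hypothetical args chosen to reproduce the directory name in BART_large.sh.
args = {"bs": 12, "lr": 1.0e-4, "bert_lr": 1.0e-5, "end_lr": 0, "att": 1}

lr_s = "%0.1e" % args["lr"]            # '1.0e-04'
bert_lr_s = "%0.1e" % args["bert_lr"]  # '1.0e-05'
end_lr_s = "0e0" if args["end_lr"] == 0 else "%0.1e" % args["end_lr"]

model_name = ("bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d"
              % {**args, "lr": lr_s, "bert_lr": bert_lr_s, "end_lr": end_lr_s})
print(model_name)  # bs=12,lr=1.0e-04,bert_lr=1.0e-05,end_lr=0e0,att=1
```

This is why the checkpoint scripts mkdir and cd into exactly that directory name before downloading: presumably the training code joins the logdir with this formatted model_name.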
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/BART-large-en/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 40k
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-en/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 51000,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
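For intuition about the `lr_scheduler` block just above (warmup for max_steps/20 steps, then polynomial decay with power 0.5), here is a sketch of the schedule's usual shape. The exact formula is an assumption; the authoritative implementation lives in the repository's training code.

```python
def warmup_polynomial_lr(step, max_steps=51000, start_lr=1e-3, end_lr=0.0, power=0.5):
    """Assumed shape of the 'warmup_polynomial' schedule configured above."""
    num_warmup_steps = max_steps / 20           # 2550 steps of warmup
    decay_steps = max_steps - num_warmup_steps  # 48450 steps of decay
    if step < num_warmup_steps:
        # Linear warmup from 0 up to start_lr.
        return start_lr * step / num_warmup_steps
    # Polynomial (square-root-shaped, since power=0.5) decay down to end_lr.
    progress = (step - num_warmup_steps) / decay_steps
    return (start_lr - end_lr) * (1 - progress) ** power + end_lr

print(warmup_polynomial_lr(2550))   # 0.001 (peak, end of warmup)
print(warmup_polynomial_lr(51000))  # 0.0   (fully decayed)
```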
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/BERTimbau-base/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 40k
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-pt/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'BERTimbau-base_nl2code,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 40000,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/BERTimbau-base/nl2code-bertimbau-base.jsonnet ADDED
@@ -0,0 +1,110 @@
+ local _base = import 'nl2code-base.libsonnet';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _base(output_from=_output_from, data_path=args.data_path) + {
+     local data_path = args.data_path,
+
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 768,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-bert',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             bert_version: args.bert_version,
+             bert_token_type: args.bert_token_type,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: data_path + "database",
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             bert_version: args.bert_version,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: data_path + 'BERTimbau-base_nl2code,output_from=%s,fs=%d,emb=bert,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: data_path + 'BERTimbau-base_nl2code,output_from=%s,fs=%d,emb=bert,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             bert_version:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         }
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+
+         max_steps: args.max_steps,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+     log: {
+         reopen_to_flush: true,
+     }
+ }
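The file above leans on three pieces of Jsonnet inheritance that recur in every experiment config in this commit: applying `+` to the imported base function's result, `+:` to merge into an inherited object rather than replace it, and `::` to re-declare an inherited field as hidden so it drops out of the manifested JSON. A minimal sketch with made-up field values, assuming nothing beyond core Jsonnet semantics:

```jsonnet
// Sketch of the override pattern: base fields are merged, replaced, or hidden.
local base = { encoder: { name: 'spiderv2', dropout: 0.2, word_emb_size: 300 } };
base + {
  encoder+: {              // `+:` merges with base.encoder instead of replacing it
    name: 'spider-bert',   // plain `:` overwrites the inherited value
    dropout:: null,        // `::` hides the field; it no longer manifests
    word_emb_size:: null,
  },
}
// manifests as: { "encoder": { "name": "spider-bert" } }
```

This is why the BERT/T5/BART configs can null out the GloVe-era fields of the base (`word_emb`, `min_freq`, `question_encoder`, ...) instead of rewriting the whole base object.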
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/BERTimbau-large/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 44.1k
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-pt/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'BERTimbau-large_nl2code,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 44100,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/BERTimbau-large/nl2code-bertimbau-large.jsonnet ADDED
@@ -0,0 +1,110 @@
+ local _base = import 'nl2code-base.libsonnet';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _base(output_from=_output_from, data_path=args.data_path) + {
+     local data_path = args.data_path,
+
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-bert',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             bert_version: args.bert_version,
+             bert_token_type: args.bert_token_type,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: data_path + "database",
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             bert_version: args.bert_version,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: data_path + 'BERTimbau-large_nl2code,output_from=%s,fs=%d,emb=bert,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: data_path + 'BERTimbau-large_nl2code,output_from=%s,fs=%d,emb=bert,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             bert_version:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         }
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+
+         max_steps: args.max_steps,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+     log: {
+         reopen_to_flush: true,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-FIT-en-enr-enb/T5-v1_1.jsonnet ADDED
@@ -0,0 +1,106 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-FIT-en-enr-enb/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-t5',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + "database",
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'T5-v1_1-large-NoGAP-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'T5-v1_1-large-NoGAP-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             t5_version:: null,
+             pretrained_checkpoint:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
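`model_name` above is built with Jsonnet's `%` operator, which behaves like Python's: given an object on the right, `%(key)d`-style placeholders are looked up by name, and unused keys are ignored. A small sketch with illustrative argument values (the values are placeholders, not taken from any experiment file):

```jsonnet
// Sketch of the model_name construction; args values are placeholders.
local args = { bs: 12, att: 1, extra: 'ignored' };
'bs=%(bs)d,att=%(att)d' % args   // -> "bs=12,att=1"
```

The config first overlays the pre-formatted strings (`lr_s`, `bert_lr_s`, `end_lr_s`) onto `args` so that the numeric learning rates render in a fixed `%0.1e` form in the run name.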
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-FIT-en-enr-enb/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 170.3k
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-FIT-en-enr-enb/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 170300,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-FIT-en-extra-3enr-1en/T5-v1_1.jsonnet ADDED
@@ -0,0 +1,106 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-FIT-en-extra-3enr-1enb/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-t5',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + "database",
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'T5-v1_1-large-NoGAP-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'T5-v1_1-large-NoGAP-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             t5_version:: null,
+             pretrained_checkpoint:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-FIT-en-extra-3enr-1en/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 170.3k
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-FIT-en-extra-3enr-1enb/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 170300,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-FIT-en/T5-v1_1.jsonnet ADDED
@@ -0,0 +1,106 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-FIT-en/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-t5',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + "database",
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'T5-v1_1-large-NoGAP-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'T5-v1_1-large-NoGAP-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             t5_version:: null,
+             pretrained_checkpoint:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-FIT-en/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 170.3k
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-FIT-en/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 170300,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-en/T5-v1_1.jsonnet ADDED
@@ -0,0 +1,106 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-en/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-t5',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + "database",
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'T5-v1_1-large-NoGAP-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'T5-v1_1-large-NoGAP-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             t5_version:: null,
+             pretrained_checkpoint:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/T5-v1_1-large-170Ksteps-en/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 170.3k
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-en/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 170300,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-en-pt-es-fr/gap-bart.jsonnet ADDED
@@ -0,0 +1,103 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-en-pt-es-fr/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-bart',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             bart_version: args.bart_version,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + "database",
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             bart_version: args.bart_version,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'mBART50MtoM-large-nl2code-1115,output_from=%s,fs=%d,emb=bart,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'mBART50MtoM-large-nl2code-1115,output_from=%s,fs=%d,emb=bart,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             bart_version:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
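Each of these experiment files evaluates to a Jsonnet function of `args`, so it can also be materialized outside the training harness. A hedged sketch of importing and applying the function directly; the argument set below is inferred from the `args.*` references in the file above, and every value is a placeholder, not a documented default:

```jsonnet
// Sketch: applying the function config directly. Keys are inferred from the
// args.* references in gap-bart.jsonnet; values are placeholders only.
local config = import 'gap-bart.jsonnet';
config({
  bs: 6, num_batch_accumulated: 4, att: 0,
  lr: 7.4e-4, bert_lr: 3.0e-6, end_lr: 0,
  num_layers: 8, sc_link: true, cv_link: true,
  summarize_header: 'avg', use_column_type: false,
  bart_version: 'facebook/mbart-large-50-many-to-many-mmt',
  decoder_hidden_size: 512,
  use_align_mat: true, use_align_loss: true,
  end_with_from: true, clause_order: null,
})
```

In practice these values come from the experiment runner; the inline form is only for illustration.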
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-en-pt-es-fr/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 44.1k
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-en-pt-es-fr/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 44100,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-en/gap-bart.jsonnet ADDED
@@ -0,0 +1,103 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-en/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-bart',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             bart_version: args.bart_version,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + "database",
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             bart_version: args.bart_version,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'mBART50MtoM-large-large-nl2code-1115,output_from=%s,fs=%d,emb=bart,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'mBART50MtoM-large-large-nl2code-1115,output_from=%s,fs=%d,emb=bart,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             bart_version:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-en/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 41k
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-en/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 41000,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-pt-en/gap-bart.jsonnet ADDED
@@ -0,0 +1,103 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-en-pt/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-bart',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             bart_version: args.bart_version,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + "database",
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             bart_version: args.bart_version,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'mBART50MtoM-large-nl2code-1115,output_from=%s,fs=%d,emb=bart,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'mBART50MtoM-large-nl2code-1115,output_from=%s,fs=%d,emb=bart,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             bart_version:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-pt-en/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 41k
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-en-pt/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 41000,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-pt/gap-bart.jsonnet ADDED
@@ -0,0 +1,103 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-pt/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-bart',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             bart_version: args.bart_version,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + 'database',
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             bart_version: args.bart_version,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'mBART50MtoM-large-nl2code-1115,output_from=%s,fs=%d,emb=bart,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'mBART50MtoM-large-nl2code-1115,output_from=%s,fs=%d,emb=bart,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             bart_version:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
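The repeated `field:: null` entries above do real work: overriding an inherited visible field with a hidden (`::`) one removes it from the manifested JSON, which is how this config strips the GloVe-era keys (`word_emb`, `min_freq`, `max_count`, `count_tokens_in_word_emb_for_vocab`) inherited from nl2code-base.libsonnet. A minimal sketch of the mechanism, with toy field names, runnable with the `jsonnet` CLI:

    local base = { word_emb: { name: 'glove' }, min_freq: 50 };

    # The hidden override wins over the inherited visible field and is
    # omitted from the output, so this manifests as {"min_freq": 5}.
    base + { word_emb:: null, min_freq: 5 }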
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mBART50MtoM-large-pt/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 40k
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-pt/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 41000,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
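For readers new to Jsonnet: `$.train.max_steps` above is late-bound to the final merged object, so when a model config such as gap-bart.jsonnet layers `train+:` on top of this base, the warmup length follows the overridden step count automatically. A toy sketch with hypothetical values:

    local base = {
        train: { max_steps: 41000 },
        lr_scheduler: { num_warmup_steps: $.train.max_steps / 20 },
    };

    # '$' resolves against the merged object, so this manifests as
    # {"lr_scheduler": {"num_warmup_steps": 4050}, "train": {"max_steps": 81000}}.
    base + { train+: { max_steps: 81000 } }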
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-FIT-en-pt-es-fr-extra-3enr-3ptr-3esr-3frr/mT5.jsonnet ADDED
@@ -0,0 +1,106 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-FIT-en-pt-es-fr-extra-3enr-3ptr-3esr-3frr/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-t5',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + 'database',
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'mT5-large-NoGAP-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'mT5-large-NoGAP-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             t5_version:: null,
+             pretrained_checkpoint:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
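The `model_name` template above relies on Jsonnet's Python-style `%` operator accepting an object of named keys on the right-hand side; `args + { lr: lr_s, ... }` simply swaps in the pre-formatted strings. A standalone example with hypothetical values:

    # Manifests as "bs=12,lr=1.0e-04,bert_lr=1.0e-05,end_lr=0e0,att=1".
    'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % {
        bs: 12, lr: '1.0e-04', bert_lr: '1.0e-05', end_lr: '0e0', att: 1,
    }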
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-FIT-en-pt-es-fr-extra-3enr-3ptr-3esr-3frr/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 150300
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-FIT-en-pt-es-fr-extra-3enr-3ptr-3esr-3frr/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 150300,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-FIT-en-pt-es-fr/mT5.jsonnet ADDED
@@ -0,0 +1,106 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-FIT-en-pt-es-fr/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-t5',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + 'database',
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'mT5-large-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'mT5-large-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             t5_version:: null,
+             pretrained_checkpoint:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-FIT-en-pt-es-fr/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 120300
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-FIT-en-pt-es-fr/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 120300,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-en-pt-es-fr-enr-enb/mT5.jsonnet ADDED
@@ -0,0 +1,106 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-en-pt-es-fr-enr-enb/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-t5',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + 'database',
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'mT5-large-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'mT5-large-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             t5_version:: null,
+             pretrained_checkpoint:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
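A subtlety in the `lr_scheduler+:` override above: the base defines `decay_steps: $.train.max_steps - self.num_warmup_steps`, and `self` is late-bound, so replacing `num_warmup_steps` (1/8 of `max_steps` here versus the base's 1/20) also shrinks the decay window to 7/8 of `max_steps` without `decay_steps` ever being restated. Toy sketch:

    local base = {
        max_steps:: 40000,
        num_warmup_steps: self.max_steps / 20,
        decay_steps: self.max_steps - self.num_warmup_steps,
    };

    # decay_steps picks up the overridden warmup via late-bound 'self':
    # manifests as {"decay_steps": 35000, "num_warmup_steps": 5000}.
    base + { num_warmup_steps: self.max_steps / 8 }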
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-en-pt-es-fr-enr-enb/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 51300
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-en-pt-es-fr-enr-enb/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 51300,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-en-pt-es-fr/gap-mT5.jsonnet ADDED
@@ -0,0 +1,106 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-en-pt-es-fr/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-t5',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + 'database',
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'mT5-large-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'mT5-large-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             t5_version:: null,
+             pretrained_checkpoint:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
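Note that `encoder_preproc.save_path` and `decoder_preproc.save_path` spell out the same template twice; hoisting the string into a `local` would keep the two from drifting apart. With `_output_from = true` and `_fs = 2`,

    'mT5-large-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [true, 2]

manifests as 'mT5-large-nl2code-1115,output_from=true,fs=2,emb=t5,cvlink', since Jsonnet renders booleans in lowercase.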
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-en-pt-es-fr/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 51300
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-en-pt-es-fr/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 51300,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-en/gap-mT5.jsonnet ADDED
@@ -0,0 +1,106 @@
+ local _0428_base = import 'nl2code-base.libsonnet';
+ local _data_path = 'data/spider-en/';
+ local _output_from = true;
+ local _fs = 2;
+
+ function(args) _0428_base(output_from=_output_from, data_path=_data_path) + {
+     local lr_s = '%0.1e' % args.lr,
+     local bert_lr_s = '%0.1e' % args.bert_lr,
+     local end_lr_s = if args.end_lr == 0 then '0e0' else '%0.1e' % args.end_lr,
+
+     local base_bert_enc_size = 1024,
+     local enc_size = base_bert_enc_size,
+
+     model_name: 'bs=%(bs)d,lr=%(lr)s,bert_lr=%(bert_lr)s,end_lr=%(end_lr)s,att=%(att)d' % (args + {
+         lr: lr_s,
+         bert_lr: bert_lr_s,
+         end_lr: end_lr_s,
+     }),
+
+     model+: {
+         encoder+: {
+             name: 'spider-t5',
+             batch_encs_update:: null,
+             question_encoder:: null,
+             column_encoder:: null,
+             table_encoder:: null,
+             dropout:: null,
+             update_config+: {
+                 name: 'relational_transformer',
+                 num_layers: args.num_layers,
+                 num_heads: 8,
+                 sc_link: args.sc_link,
+                 cv_link: args.cv_link,
+             },
+             summarize_header: args.summarize_header,
+             use_column_type: args.use_column_type,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             top_k_learnable:: null,
+             word_emb_size:: null,
+         },
+         encoder_preproc+: {
+             word_emb:: null,
+             min_freq:: null,
+             max_count:: null,
+             db_path: _data_path + 'database',
+             compute_sc_link: args.sc_link,
+             compute_cv_link: args.cv_link,
+             fix_issue_16_primary_keys: true,
+             t5_version: args.t5_version,
+             pretrained_checkpoint: args.pretrained_checkpoint,
+             count_tokens_in_word_emb_for_vocab:: null,
+             save_path: _data_path + 'mT5-large-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+         },
+         decoder_preproc+: {
+             grammar+: {
+                 end_with_from: args.end_with_from,
+                 clause_order: args.clause_order,
+                 infer_from_conditions: true,
+                 factorize_sketch: _fs,
+             },
+             save_path: _data_path + 'mT5-large-nl2code-1115,output_from=%s,fs=%d,emb=t5,cvlink' % [_output_from, _fs],
+
+             compute_sc_link:: null,
+             compute_cv_link:: null,
+             db_path:: null,
+             fix_issue_16_primary_keys:: null,
+             t5_version:: null,
+             pretrained_checkpoint:: null,
+         },
+         decoder+: {
+             name: 'NL2Code',
+             dropout: 0.20687225956012834,
+             desc_attn: 'mha',
+             enc_recurrent_size: enc_size,
+             recurrent_size: args.decoder_hidden_size,
+             loss_type: 'softmax',
+             use_align_mat: args.use_align_mat,
+             use_align_loss: args.use_align_loss,
+         },
+     },
+
+     train+: {
+         batch_size: args.bs,
+         num_batch_accumulated: args.num_batch_accumulated,
+         clip_grad: 1,
+
+         model_seed: args.att,
+         data_seed: args.att,
+         init_seed: args.att,
+     },
+
+     optimizer: {
+         name: 'bertAdamw',
+         lr: 0.0,
+         bert_lr: 0.0,
+     },
+
+     lr_scheduler+: {
+         name: 'bert_warmup_polynomial_group',
+         start_lrs: [args.lr, args.bert_lr],
+         end_lr: args.end_lr,
+         num_warmup_steps: $.train.max_steps / 8,
+     },
+
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/mT5-large-en/nl2code-base.libsonnet ADDED
@@ -0,0 +1,109 @@
+ # Model details:
+ # - NL2Code
+ # - Pretrained, fixed word embeddings
+ #   - glove-42B
+ #   - min_freq 50
+ # - Spiderv2 encoder
+ #   - question_encoder ['emb', 'bilstm']
+ #   - column_encoder ['emb', 'bilstm-summarize']
+ #   - table_encoder ['emb', 'bilstm-summarize']
+ #   - upd_steps 4
+ # - Optimization
+ #   - max_steps 51100
+ #   - batch_size 10
+ #   - Adam with lr 1e-3
+
+ function(output_from, data_path='data/spider-en/') {
+     local PREFIX = data_path,
+
+     data: {
+         train: {
+             name: 'spider',
+             paths: [
+                 PREFIX + 'train_%s.json' % [s]
+                 for s in ['spider', 'others']],
+             tables_paths: [
+                 PREFIX + 'tables.json',
+             ],
+             db_path: PREFIX + 'database',
+         },
+         val: {
+             name: 'spider',
+             paths: [PREFIX + 'dev.json'],
+             tables_paths: [PREFIX + 'tables.json'],
+             db_path: PREFIX + 'database',
+         },
+     },
+
+     model: {
+         name: 'EncDec',
+         encoder: {
+             name: 'spiderv2',
+             dropout: 0.2,
+             word_emb_size: 300,
+             question_encoder: ['emb', 'bilstm'],
+             column_encoder: ['emb', 'bilstm-summarize'],
+             table_encoder: ['emb', 'bilstm-summarize'],
+             update_config: {
+                 name: 'relational_transformer',
+                 num_layers: 4,
+                 num_heads: 8,
+             },
+         },
+         decoder: {
+             name: 'NL2Code',
+             dropout: 0.2,
+             desc_attn: 'mha',
+         },
+         encoder_preproc: {
+             word_emb: {
+                 name: 'glove',
+                 kind: '42B',
+             },
+             count_tokens_in_word_emb_for_vocab: false,
+             min_freq: 50,
+             max_count: 5000,
+             include_table_name_in_column: false,
+
+             save_path: PREFIX + 'nl2code-0401,output_from=%s,emb=glove-42B,min_freq=50/' % [output_from],
+         },
+         decoder_preproc: self.encoder_preproc {
+             grammar: {
+                 name: 'spider',
+                 output_from: output_from,
+                 use_table_pointer: output_from,
+                 include_literals: false,
+             },
+             use_seq_elem_rules: true,
+
+             word_emb:: null,
+             include_table_name_in_column:: null,
+             count_tokens_in_word_emb_for_vocab:: null,
+         },
+     },
+
+     train: {
+         batch_size: 10,
+         eval_batch_size: 50,
+
+         keep_every_n: 1000,
+         eval_every_n: 100,
+         save_every_n: 100,
+         report_every_n: 10,
+
+         max_steps: 51100,
+         num_eval_items: 50,
+     },
+     optimizer: {
+         name: 'adam',
+         lr: 0.0,
+     },
+     lr_scheduler: {
+         name: 'warmup_polynomial',
+         num_warmup_steps: $.train.max_steps / 20,
+         start_lr: 1e-3,
+         end_lr: 0,
+         decay_steps: $.train.max_steps - self.num_warmup_steps,
+         power: 0.5,
+     }
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/spider-BART-large-en-train_en-eval.jsonnet ADDED
@@ -0,0 +1,33 @@
+ {
+     local exp_id = 1,
+     logdir: "logdir/BART-large-en-train",
+     model_config: "experiments/spider-configs/BART-large-en/gap-bart.jsonnet",
+     model_config_args: {
+         bs: 12,
+         num_batch_accumulated: 2,
+         bart_version: "facebook/bart-large",
+         pretrained_checkpoint: "models/BART-large/pretrained_checkpoint/pytorch_model.bin",
+         summarize_header: "avg",
+         use_column_type: false,
+         num_layers: 8,
+         lr: 1e-4,
+         bert_lr: 1e-5,
+         att: 1,
+         end_lr: 0,
+         sc_link: true,
+         cv_link: true,
+         use_align_mat: true,
+         use_align_loss: true,
+         bart_token_type: true,
+         decoder_hidden_size: 512,
+         end_with_from: true,  # equivalent to "SWGOIF" if true
+         clause_order: null,  # a string like "SWGOIF"; takes priority over end_with_from
+     },
+
+     eval_name: "bart-large-en_run_%d_%s_%d" % [exp_id, self.eval_use_heuristic, self.eval_beam_size],
+     eval_output: "ie_dirs/BART-large-en-train_en-eval",
+     eval_beam_size: 1,
+     eval_use_heuristic: true,
+     eval_steps: [40300],
+     eval_section: "val",
+ }
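`eval_name` reaches sibling fields through `self`, so with `exp_id = 1`, `eval_use_heuristic = true`, and `eval_beam_size = 1` it manifests as "bart-large-en_run_1_true_1" (booleans again render in lowercase). Minimal sketch of the pattern:

    {
        local exp_id = 1,
        eval_beam_size: 1,
        eval_use_heuristic: true,
        eval_name: 'run_%d_%s_%d' % [exp_id, self.eval_use_heuristic, self.eval_beam_size],
    }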
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/spider-BERTimbau-base-pt.jsonnet ADDED
@@ -0,0 +1,33 @@
+ {
+     logdir: "logdir/BERTimbau-base-pt-train",
+     model_config: "experiments/spider-configs/BERTimbau-base/nl2code-bertimbau-base.jsonnet",
+     model_config_args: {
+         data_path: 'data/spider-pt/',
+         bs: 6,
+         num_batch_accumulated: 4,
+         bert_version: "neuralmind/bert-base-portuguese-cased",
+         summarize_header: "avg",
+         use_column_type: false,
+         max_steps: 81000,
+         num_layers: 8,
+         lr: 7.44e-4,
+         bert_lr: 3e-6,
+         att: 1,
+         end_lr: 0,
+         sc_link: true,
+         cv_link: true,
+         use_align_mat: true,
+         use_align_loss: true,
+         bert_token_type: true,
+         decoder_hidden_size: 512,
+         end_with_from: true,  # equivalent to "SWGOIF" if true
+         clause_order: null,  # a string like "SWGOIF"; takes priority over end_with_from
+     },
+
+     eval_name: "bertimbau-base-pt-eval_%s_%d" % [self.eval_use_heuristic, self.eval_beam_size],
+     eval_output: "ie_dirs/BERTimbau-base-pt-train",
+     eval_beam_size: 1,
+     eval_use_heuristic: true,
+     eval_steps: [24100],
+     eval_section: "val",
+ }
gap-text2sql-main/mrat-sql-gap/experiments/spider-configs/spider-BERTimbau-large-pt.jsonnet ADDED
@@ -0,0 +1,33 @@
+ {
+     logdir: "logdir/BERTimbau-large-pt-train",
+     model_config: "experiments/spider-configs/BERTimbau-large/nl2code-bertimbau-large.jsonnet",
+     model_config_args: {
+         data_path: 'data/spider-pt/',
+         bs: 6,
+         num_batch_accumulated: 4,
+         bert_version: "neuralmind/bert-large-portuguese-cased",
+         summarize_header: "avg",
+         use_column_type: false,
+         max_steps: 81000,
+         num_layers: 8,
+         lr: 7.44e-4,
+         bert_lr: 3e-6,
+         att: 1,
+         end_lr: 0,
+         sc_link: true,
+         cv_link: true,
+         use_align_mat: true,
+         use_align_loss: true,
+         bert_token_type: true,
+         decoder_hidden_size: 512,
+         end_with_from: true,  # equivalent to "SWGOIF" if true
+         clause_order: null,  # a string like "SWGOIF"; takes priority over end_with_from
+     },
+
+     eval_name: "bertimbau-large-pt-eval_%s_%d" % [self.eval_use_heuristic, self.eval_beam_size],
+     eval_output: "ie_dirs/BERTimbau-large-pt-train",
+     eval_beam_size: 1,
+     eval_use_heuristic: true,
+     eval_steps: [40100],
+     eval_section: "val",
+ }
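A practical reading of the two BERTimbau configs: `bs: 6` with `num_batch_accumulated: 4` presumably gives an effective batch of 24 examples per optimizer step, and the much smaller `bert_lr` (3e-6) relative to `lr` (7.44e-4) lets the randomly initialized RAT layers and decoder train faster while protecting the pretrained BERTimbau weights. Toy check of the effective batch, with a hypothetical derived field:

    {
        bs: 6,
        num_batch_accumulated: 4,
        effective_batch:: self.bs * self.num_batch_accumulated,  # 24
    }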