Synced repo using 'sync_with_huggingface' GitHub Action
Files changed:
- .gitattributes +1 -0
- Dockerfile +27 -0
- LICENSE +201 -0
- __pycache__/custom_prompt_template.cpython-311.pyc +0 -0
- __pycache__/custom_prompt_template.cpython-39.pyc +0 -0
- app.py +451 -0
- custom_prompt_template.py +43 -0
- data-downloader/download_eval_data.sh +68 -0
- data-downloader/download_instructions_data.sh +120 -0
- olive_farm.png +3 -0
- open_instruct/get_data_stats.py +121 -0
- open_instruct/reformat_data.py +551 -0
- requirements.txt +6 -0
- web-app.py +67 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+olive_farm.png filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,27 @@
FROM python:3.8-slim-buster

WORKDIR /app

COPY ./requirements.txt /app/requirements.txt
# COPY ./packages.txt /app/packages.txt

# RUN apt-get update && xargs -r -a /app/packages.txt apt-get install -y && rm -rf /var/lib/apt/lists/*
RUN pip3 install --no-cache-dir -r /app/requirements.txt

# User
RUN useradd -m -u 1000 user
USER user
ENV HOME /home/user
ENV PATH $HOME/.local/bin:$PATH

WORKDIR $HOME
RUN mkdir app
WORKDIR $HOME/app
COPY . $HOME/app

EXPOSE 8501
CMD streamlit run app.py \
    --server.headless true \
    --server.enableCORS false \
    --server.enableXsrfProtection false \
    --server.fileWatcherType none
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
__pycache__/custom_prompt_template.cpython-311.pyc
ADDED
Binary file (2.41 kB).
__pycache__/custom_prompt_template.cpython-39.pyc
ADDED
Binary file (1.33 kB).
app.py
ADDED
@@ -0,0 +1,451 @@
import streamlit as st
import requests
import justext
import pdfplumber
import docx2txt
import json
import ast
import os
import re
import openai

from custom_prompt_template import InstructionGenerationTemplate, AnswerGenerationTemplate


st.set_page_config(page_title="LLM Instruction Generator")

st.sidebar.success("Select a page above")


# stoplist for Odia, which justext does not ship with
def odia_stoplist():
    odia_stopwords = [
        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
    ]
    return frozenset(odia_stopwords)


# extract the main text content from a URL using justext
def extract_data_from_url(url, language):
    try:
        response = requests.get(url)

        if response.status_code == 200:
            response.raise_for_status()
            page = response.content
            para = ""
            if language == "English":
                paragraphs = justext.justext(page, justext.get_stoplist("English"))
            elif language == "Hindi":
                paragraphs = justext.justext(page, justext.get_stoplist("Hindi"), 70, 140, 0.0, 0.02, 0.5, 150, False)
            elif language == "Odia":
                paragraphs = justext.justext(
                    page, odia_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False
                )

            for paragraph in paragraphs:
                if not paragraph.is_boilerplate:
                    para = para + "\n" + paragraph.text
            # return the extracted data, i.e. para, as a string
            if para == "":
                st.error("Unable to extract data from the URL")
                return None
            else:
                return para
        else:
            st.error("Request failed")
            return None
    except Exception as err:
        st.error(err)
        return None


# extract text content from uploaded documents
def extract_data_from_documents(documents):
    data = ""
    if documents is not None:
        for document in documents:
            document_details = {
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size,
            }
            st.write(document_details)

            # Extract content from a txt file
            if document.type == "text/plain":
                # Read as bytes, decode as UTF-8
                data += str(document.read(), "utf-8")

            # Extract content from a pdf file using pdfplumber
            elif document.type == "application/pdf":
                try:
                    with pdfplumber.open(document) as pdf:
                        all_text = ""
                        for page in pdf.pages:
                            text = page.extract_text()
                            all_text += text + "\n"
                        data += all_text
                except Exception:
                    st.write("None")

            # Extract content from a docx file
            elif (
                document.type
                == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            ):
                data += docx2txt.process(document)

        # return the extracted data
        return data
    else:
        st.error("Error: An error occurred while fetching content.")
        return None


# Check the inputs for language, promptType, noOfQuestions, and instructionFormat
def valid_drop_down(language, promptType, noOfQuestions, instructionFormat):
    langFlag = False
    promptFlag = False
    noOfQuestionFlag = False
    instructionFormatFlag = False

    if language:
        langFlag = True
    if promptType:
        promptFlag = True
    if noOfQuestions:
        noOfQuestionFlag = True
    if instructionFormat:
        instructionFormatFlag = True
    # all of these inputs are compulsory, so return True only if every one is set
    return langFlag and promptFlag and noOfQuestionFlag and instructionFormatFlag


def main():
    # setting up the initial session_state
    if "extract_button" not in st.session_state:
        st.session_state.extract_button = False
    if "submit" not in st.session_state:
        st.session_state.submit = False
    if "generated" not in st.session_state:
        st.session_state.generated = False
    if "selected" not in st.session_state:
        st.session_state.selected = False
    if "answered" not in st.session_state:
        st.session_state.answered = False

    st.subheader("LLM Instructions")

    # form to get the inputs
    with st.form(key="form1"):
        st.write("#")

        # dropdown for language
        language = st.selectbox("Select a language", ("", "English", "Hindi", "Odia"))

        # dropdown for prompt type
        promptType = st.selectbox(
            "Select the Prompt type", ("", "Input text", "Url", "Document")
        )
        # input for the number of questions
        noOfQuestions = st.number_input(
            "Number of questions to generate:", min_value=1, max_value=20, value=10
        )

        # dropdown for instruction format
        instructionFormat = st.selectbox(
            "Format of instruction:", ("Imperative sentence", "Question")
        )

        # text input for the OpenAI key
        openAiKey = st.text_input(label="Input the OpenAI key")
        st.session_state["openAiKey"] = openAiKey

        st.write("##")

        # form submit button and setting up the session_state
        if st.form_submit_button():
            st.session_state.submit = True

    if st.session_state.submit:
        # expander with the prompt form to extract the data
        with st.expander(label="prompt"):
            with st.form(key="form2"):
                # calling the function inside if to check for valid dropdown inputs
                if valid_drop_down(
                    language, promptType, noOfQuestions, instructionFormat
                ):
                    if promptType == "Input text":
                        inputText = st.text_area(
                            label="For Instructions",
                            placeholder="Please enter your text here",
                        )
                    elif promptType == "Url":
                        url = st.text_input(
                            label="For URL", placeholder="Please enter your text here"
                        )
                    elif promptType == "Document":
                        documents = st.file_uploader(
                            label="For Documents ( pdf / txt / docx )",
                            type=["pdf", "txt", "docx"],
                            accept_multiple_files=True,
                        )

                if st.form_submit_button():
                    st.session_state.extract_button = True

        # extracting data
        if st.session_state.extract_button:
            if promptType == "Input text":
                extractedData = inputText

            elif promptType == "Url":
                extractedURLData = extract_data_from_url(url, language)
                if extractedURLData is not None:
                    extractedData = extractedURLData
                    st.text_area("Extracted Text:", value=extractedData, height=200)
                else:
                    extractedData = False

            elif promptType == "Document":
                if not documents:
                    documents = None
                else:
                    for doc in documents:
                        if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
                            # the document is not one of the supported types
                            st.error("Unsupported file: " + doc.name)

                extractedDocumentData = extract_data_from_documents(documents)
                extractedData = extractedDocumentData

            # if the values are extracted, run the custom prompt by creating an instance
            if extractedData:

                # ----------------------------- RUNNING THE PROMPT -----------------------------
                st.session_state["extractedData"] = extractedData

                if "Initial" not in st.session_state:
                    st.session_state.Initial = True

                if st.session_state.Initial:
                    # running the prompt form here
                    openai.api_key = st.session_state["openAiKey"]
                    my_prompt_template = InstructionGenerationTemplate()

                    # providing the rules for the instructions to be generated
                    additional_rules = """
                    - You do not need to provide a response to the generated examples.
                    - You must return the response in the specified language.
                    - Each generated instruction can be either an imperative sentence or a question.
                    """

                    if st.button("Generate Instructions"):
                        prompt = my_prompt_template.format(
                            num_questions=noOfQuestions,
                            context=extractedData,
                            instruction_format=instructionFormat,
                            lang=language,
                            additional_rules=additional_rules,
                        )
                        response = openai.ChatCompletion.create(
                            model="gpt-3.5-turbo",
                            messages=[
                                {"role": "system", "content": prompt},
                            ])

                        if "result" not in st.session_state:
                            content = response.choices[0].message.content
                            # content looks like "\n1. question one\n2. question two"
                            responses_list = content.split('\n')
                            responses_list = [re.sub(r'^\s*\d+\.\s*', '', resp) for resp in responses_list if resp]
                            st.session_state["result"] = responses_list
                            st.session_state.generated = True
                            st.session_state.Initial = False

                if st.session_state.generated:
                    # displaying the generated instructions
                    st.write("Generated Instructions")
                    result = st.session_state["result"]
                    result_dict = {i + 1: value for i, value in enumerate(result)}
                    selected_items = [f" {value} " for key, value in result_dict.items() if st.checkbox(f"Q{key} : {value}")]
                    # Display the selected items as a list
                    if selected_items:
                        st.write("Selected Items:")
                        st.write(selected_items)
                        st.session_state["selected_items"] = selected_items
                        st.session_state.selected = True
                    else:
                        st.write("No items selected.")

                # ------------------- RUNNING THE PROMPT FOR ANSWER GENERATION -------------------

                if st.session_state.selected:

                    if "Initial2" not in st.session_state:
                        st.session_state.Initial2 = True

                    if st.session_state.Initial2:
                        # running the prompt form here
                        openai.api_key = st.session_state["openAiKey"]
                        my_prompt_template2 = AnswerGenerationTemplate()

                        # providing the rules for the answers to be generated
                        additional_rules = """
                        Enumerate the answers and don't provide any additional tags.
                        """

                        question = st.session_state["selected_items"]
                        if st.button("Generate Answers"):
                            prompt = my_prompt_template2.format(
                                questions=question,
                                additional_rules=additional_rules,
                            )
                            response = openai.ChatCompletion.create(
                                model="gpt-3.5-turbo",
                                messages=[
                                    {"role": "system", "content": prompt},
                                ])

                            if "answers" not in st.session_state:
                                content = response.choices[0].message.content
                                # content looks like "\n1. Answer1.\n2. Answer2"
                                responses_list = content.split('\n')
                                responses_list = [re.sub(r'^\s*\d+\.\s*', '', resp) for resp in responses_list if resp]
                                st.session_state["answers"] = responses_list
                                st.session_state.answered = True
                                st.session_state.Initial2 = False

                    if st.session_state.answered:
                        # displaying the generated answers
                        questions = st.session_state["selected_items"]
                        answers = st.session_state["answers"]
                        answers_dict = {i + 1: value for i, value in enumerate(answers)}
                        st.write(answers_dict)

                        st.write("Generated Questions and Answers")
                        # Create a list of dictionaries pairing each question with its answer
                        jsonl_data = [{"Question": question, "Answer": answers_dict.get(i, 'No answer found')} for i, question in enumerate(questions, start=1)]

                        st.write(jsonl_data)
                        jsonl_string = '\n'.join(json.dumps(item, ensure_ascii=False) for item in jsonl_data)

                        if st.download_button(label="Save as jsonl", data=jsonl_string, mime="application/json"):
                            st.success("Successfully saved")

    if st.button("Clear"):
        st.session_state.extract_button = False
        st.session_state.submit = False
        st.session_state.generated = False
        st.session_state.selected = False
        st.session_state.answered = False

        if "Initial" in st.session_state:
            st.session_state.Initial = True
        if "Initial2" in st.session_state:
            st.session_state.Initial2 = True

        if "openAiKey" in st.session_state:
            del st.session_state["openAiKey"]
        if "extractedData" in st.session_state:
            del st.session_state["extractedData"]
        if "result" in st.session_state:
            del st.session_state["result"]
        if "selected_items" in st.session_state:
            del st.session_state["selected_items"]
        if "answers" in st.session_state:
            del st.session_state["answers"]
        st.experimental_rerun()


if __name__ == "__main__":
    main()
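app.py turns the model's numbered output ("1. ...", "2. ...") into a plain list with a split plus a regex. A minimal standalone sketch of that post-processing step, using a hypothetical sample response:

import re
from typing import List


def split_enumerated_response(content: str) -> List[str]:
    """Split a numbered model reply into bare strings: drop empty lines,
    then strip any leading 'N. ' enumeration marker, as app.py does."""
    lines = [line for line in content.split("\n") if line]
    return [re.sub(r"^\s*\d+\.\s*", "", line) for line in lines]


if __name__ == "__main__":
    sample = "\n1. Describe how olives are cured.\n2. What climate suits olive trees?"
    print(split_enumerated_response(sample))
    # -> ['Describe how olives are cured.', 'What climate suits olive trees?']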
custom_prompt_template.py
ADDED
@@ -0,0 +1,43 @@
from typing import List
import langchain


class InstructionGenerationTemplate(langchain.prompts.PromptTemplate):
    """A custom prompt template for generating instructions."""

    input_variables: List[str] = ["num_questions", "context", "instruction_format", "lang", "additional_rules"]

    template = """
You are a highly intelligent language model trained to assist with a variety of language tasks. Your task here is to generate {num_questions} diverse questions or instructions based on the context provided below:

Context:
{context}

Please follow these rules:
{additional_rules}

Please generate the instructions in the {instruction_format} format and in {lang} language. Remember to adhere to the rules mentioned above.
"""

    template_format = "f-string"

    def format(self, **kwargs):
        """Format the prompt."""
        return self.template.format(**kwargs)


class AnswerGenerationTemplate(langchain.prompts.PromptTemplate):
    """A custom prompt template for generating answers to questions."""

    input_variables: List[str] = ["questions", "additional_rules"]

    template = """
You are a highly intelligent language model tasked with providing answers to the following questions:

Questions:
{questions}

Please follow these rules:
{additional_rules}
"""

    template_format = "f-string"

    def format(self, **kwargs):
        """Format the prompt."""
        return self.template.format(**kwargs)
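For reference, a minimal sketch of how these templates are consumed, mirroring the calls in app.py; the context string, rule text, and question are hypothetical placeholders:

from custom_prompt_template import InstructionGenerationTemplate, AnswerGenerationTemplate

instruction_template = InstructionGenerationTemplate()
prompt = instruction_template.format(
    num_questions=3,
    context="Olive trees are cultivated for table olives and oil.",  # hypothetical sample context
    instruction_format="Question",
    lang="English",
    additional_rules="- Keep each question under 20 words.",  # hypothetical rule
)
print(prompt)

answer_template = AnswerGenerationTemplate()
print(answer_template.format(
    questions=["What climate suits olive trees?"],  # hypothetical question list
    additional_rules="Enumerate the answers.",
))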
data-downloader/download_eval_data.sh
ADDED
@@ -0,0 +1,68 @@
mkdir -p data/downloads
mkdir -p data/eval

# MMLU dataset
wget -O data/downloads/mmlu_data.tar https://people.eecs.berkeley.edu/~hendrycks/data.tar
mkdir -p data/downloads/mmlu_data
tar -xvf data/downloads/mmlu_data.tar -C data/downloads/mmlu_data
mv data/downloads/mmlu_data/data data/eval/mmlu && rm -r data/downloads/mmlu_data data/downloads/mmlu_data.tar


# Big-Bench-Hard dataset
wget -O data/downloads/bbh_data.zip https://github.com/suzgunmirac/BIG-Bench-Hard/archive/refs/heads/main.zip
mkdir -p data/downloads/bbh
unzip data/downloads/bbh_data.zip -d data/downloads/bbh
mv data/downloads/bbh/BIG-Bench-Hard-main/ data/eval/bbh && rm -r data/downloads/bbh data/downloads/bbh_data.zip


# Super-NaturalInstructions dataset
wget -O data/downloads/superni_data.zip https://github.com/allenai/natural-instructions/archive/refs/heads/master.zip
mkdir -p data/downloads/superni
unzip data/downloads/superni_data.zip -d data/downloads/superni
mv data/downloads/superni/natural-instructions-master/ data/eval/superni && rm -r data/downloads/superni data/downloads/superni_data.zip


# TyDiQA-GoldP dataset
mkdir -p data/eval/tydiqa
wget -P data/eval/tydiqa/ https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-dev.json
wget -P data/eval/tydiqa/ https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-train.json


# XOR-QA dataset
wget -P data/eval/xorqa/ https://raw.githubusercontent.com/mia-workshop/MIA-Shared-Task-2022/main/data/eval/mia_2022_dev_xorqa.jsonl
wget -P data/eval/xorqa/ https://github.com/mia-workshop/MIA-Shared-Task-2022/raw/main/data/train/mia_2022_train_data.jsonl.zip
unzip data/eval/xorqa/mia_2022_train_data.jsonl.zip -d data/eval/xorqa/ && rm data/eval/xorqa/mia_2022_train_data.jsonl.zip


# GSM dataset
wget -P data/eval/gsm/ https://github.com/openai/grade-school-math/raw/master/grade_school_math/data/test.jsonl


# Multilingual GSM dataset
wget -O data/downloads/url-nlp.zip https://github.com/google-research/url-nlp/archive/refs/heads/main.zip
mkdir -p data/downloads/url-nlp
unzip data/downloads/url-nlp.zip -d data/downloads/url-nlp
mv data/downloads/url-nlp/url-nlp-main/mgsm data/eval/mgsm && rm -r data/downloads/url-nlp data/downloads/url-nlp.zip


# Codex HumanEval
wget -P data/eval/codex_humaneval https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz


# TruthfulQA
wget -P data/eval/truthfulqa https://github.com/sylinrl/TruthfulQA/raw/main/TruthfulQA.csv


# Self-instruct eval, Vicuna eval, and Koala eval for creative instructions/tasks
mkdir -p data/eval/creative_tasks
wget -O data/eval/creative_tasks/self_instruct_test.jsonl https://github.com/yizhongw/self-instruct/raw/main/human_eval/user_oriented_instructions.jsonl
wget -O data/eval/creative_tasks/vicuna_test.jsonl https://github.com/lm-sys/FastChat/raw/main/fastchat/eval/table/question.jsonl
wget -O data/eval/creative_tasks/koala_test.jsonl https://github.com/arnav-gudibande/koala-test-set/raw/main/koala_test_set.jsonl


# Toxigen data
mkdir -p data/eval/toxigen
for minority_group in asian black chinese jewish latino lgbtq mental_disability mexican middle_east muslim native_american physical_disability trans women
do
    wget -O data/eval/toxigen/hate_${minority_group}.txt https://raw.githubusercontent.com/microsoft/TOXIGEN/main/prompts/hate_${minority_group}_1k.txt
done
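As a quick sanity check after the downloads, a short sketch that counts instances in the GSM test split; it assumes the data/eval layout created by the script above and the "question"/"answer" fields of the GSM8K release:

import json

# Path created by the "# GSM dataset" step above.
path = "data/eval/gsm/test.jsonl"

with open(path) as f:
    instances = [json.loads(line) for line in f]  # one JSON object per line

print(len(instances), "GSM test instances")
print(instances[0]["question"][:80])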
data-downloader/download_instructions_data.sh
ADDED
@@ -0,0 +1,120 @@
# check if there is $HF_TOKEN in the environment variables
if [ -z "$HF_TOKEN" ]
then
    echo "Warning: the HuggingFace dataset LIMA is gated and requires approved access."
    echo "Warning: Please request access at https://huggingface.co/datasets/GAIR/lima and set the HF_TOKEN environment variable before running this script."
    exit 1
fi

echo "Downloading Super-NaturalInstructions dataset..."
wget -P data/raw_train/super_ni/ https://github.com/allenai/natural-instructions/archive/refs/heads/master.zip
unzip data/raw_train/super_ni/master.zip -d data/raw_train/super_ni/ && rm data/raw_train/super_ni/master.zip
mv data/raw_train/super_ni/natural-instructions-master/* data/raw_train/super_ni/ && rm -r data/raw_train/super_ni/natural-instructions-master


echo "Downloading the flan_v2 chain-of-thought submix..."
wget -P data/raw_train/cot/ https://beaker.org/api/v3/datasets/01GXZ52K2Q932H6KZY499A7FE8/files/cot_zsopt.jsonl
wget -P data/raw_train/cot/ https://beaker.org/api/v3/datasets/01GXZ51ZV283RAZW7J3ECM4S58/files/cot_fsopt.jsonl


echo "Downloading the flan_v2 collection; here we subsampled only 100K instances..."
wget -P data/raw_train/flan_v2/ https://beaker.org/api/v3/datasets/01GZTTS2EJFPA83PXS4FQCS1SA/files/flan_v2_resampled_100k.jsonl


echo "Downloading self-instruct data..."
wget -P data/raw_train/self_instruct/ https://raw.githubusercontent.com/yizhongw/self-instruct/main/data/gpt3_generations/batch_221203/all_instances_82K.jsonl


echo "Downloading unnatural-instructions data..."
wget -P data/raw_train/unnatural_instructions/ https://github.com/orhonovich/unnatural-instructions/raw/main/data/core_data.zip
unzip data/raw_train/unnatural_instructions/core_data.zip -d data/raw_train/unnatural_instructions/


echo "Downloading Stanford Alpaca data..."
wget -P data/raw_train/stanford_alpaca/ https://github.com/tatsu-lab/stanford_alpaca/raw/main/alpaca_data.json


echo "Downloading the dolly dataset..."
wget -P data/raw_train/dolly/ https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl


echo "Downloading the OpenAssistant data (oasst1)..."
wget -P data/raw_train/oasst1/ https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_ready.trees.jsonl.gz
gzip -d data/raw_train/oasst1/2023-04-12_oasst_ready.trees.jsonl.gz


echo "Downloading the code alpaca dataset..."
wget -P data/raw_train/code_alpaca/ https://github.com/sahil280114/codealpaca/raw/master/data/code_alpaca_20k.json


echo "Downloading the gpt4-llm dataset..."
wget -P data/raw_train/gpt4_alpaca/ https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/raw/main/data/alpaca_gpt4_data.json
wget -P data/raw_train/gpt4_alpaca/ https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/raw/main/data/alpaca_gpt4_data_zh.json


echo "Downloading the baize dataset..."
wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/alpaca_chat_data.json
wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/medical_chat_data.json
wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/quora_chat_data.json
wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/stackoverflow_chat_data.json


echo "Downloading ShareGPT dataset..."
wget -P data/raw_train/sharegpt/ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json
wget -P data/raw_train/sharegpt/ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part2_html_cleaned.json
echo "Splitting the ShareGPT dataset..."
python scripts/split_sharegpt_conversations.py \
    --in-files data/raw_train/sharegpt/sg_90k_part1_html_cleaned.json data/raw_train/sharegpt/sg_90k_part2_html_cleaned.json \
    --out-file data/raw_train/sharegpt/sharegpt_html_cleaned_and_split.json \
    --model-name-or-path ../hf_llama_models/7B/


echo "Downloading LIMA dataset..."
wget --header="Authorization: Bearer $HF_TOKEN" -P data/raw_train/lima/ https://huggingface.co/datasets/GAIR/lima/raw/main/train.jsonl


echo "Downloading WizardLM dataset..."
wget -P data/raw_train/wizardlm/ https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k/resolve/main/WizardLM_evol_instruct_V2_143k.json


echo "Downloading the OpenOrca dataset..."
wget -P data/raw_train/open_orca/ https://huggingface.co/datasets/Open-Orca/OpenOrca/resolve/main/1M-GPT4-Augmented.parquet
wget -P data/raw_train/open_orca/ https://huggingface.co/datasets/Open-Orca/OpenOrca/resolve/main/3_5M-GPT3_5-Augmented.parquet


echo "Reformatting the datasets..."
python open_instruct/reformat_datasets.py --raw_data_dir data/raw_train/ --output_dir data/processed/


echo "Creating Tulu data mixtures..."
mkdir -p data/processed/tulu/
cat data/processed/flan_v2/flan_v2_data.jsonl \
    data/processed/cot/cot_data.jsonl \
    data/processed/dolly/dolly_data.jsonl \
    data/processed/oasst1/oasst1_data.jsonl \
    data/processed/gpt4_alpaca/gpt4_alpaca_data.jsonl \
    data/processed/code_alpaca/code_alpaca_data.jsonl \
    data/processed/sharegpt/sharegpt_data.jsonl \
    > data/processed/tulu/tulu_v1_mix.jsonl

cat data/processed/flan_v2/flan_v2_data.jsonl \
    data/processed/cot/cot_data.jsonl \
    data/processed/dolly/dolly_data.jsonl \
    data/processed/oasst1/oasst1_data.jsonl \
    > data/processed/tulu/tulu_v1_human_mix.jsonl

cat data/processed/flan_v2/flan_v2_data.jsonl \
    data/processed/cot/cot_data.jsonl \
    data/processed/oasst1/oasst1_data.jsonl \
    data/processed/lima/lima_data.jsonl \
    data/processed/code_alpaca/code_alpaca_data.jsonl \
    data/processed/sharegpt/sharegpt_data.jsonl \
    data/processed/wizardlm/wizardlm_data.jsonl \
    data/processed/open_orca/open_orca_data.jsonl \
    > data/processed/tulu/tulu_v2_mix.jsonl

cat data/processed/flan_v2/flan_v2_data.jsonl \
    data/processed/cot/cot_data.jsonl \
    data/processed/oasst1/oasst1_data.jsonl \
    data/processed/lima/lima_data.jsonl \
    > data/processed/tulu/tulu_v2_human_mix.jsonl
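The Tulu mixtures above are plain line-level concatenations of per-dataset jsonl files; a Python equivalent of the final cat block, assuming the same data/processed layout, looks like this:

import os

# Same file list as the tulu_v2_human_mix cat block above.
sources = [
    "data/processed/flan_v2/flan_v2_data.jsonl",
    "data/processed/cot/cot_data.jsonl",
    "data/processed/oasst1/oasst1_data.jsonl",
    "data/processed/lima/lima_data.jsonl",
]

os.makedirs("data/processed/tulu", exist_ok=True)
with open("data/processed/tulu/tulu_v2_human_mix.jsonl", "w") as fout:
    for source in sources:
        with open(source) as fin:
            for line in fin:  # jsonl: one instance per line, so plain concatenation works
                fout.write(line)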
olive_farm.png
ADDED
Binary image file tracked with Git LFS.
open_instruct/get_data_stats.py
ADDED
@@ -0,0 +1,121 @@
import json
import os
import sys
import tqdm
import pandas as pd
import numpy as np
import argparse
from datasets import load_dataset
from transformers import AutoTokenizer


def get_statistics_for_messages_data(data_path):
    # load dataset
    dataset = load_dataset("json", data_files={"train": data_path})
    # tokenize dataset
    tokenizer = AutoTokenizer.from_pretrained("/net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/7B", use_fast=False)
    # get statistics
    num_instances = len(dataset["train"])
    num_of_turns = [len(instance["messages"]) for instance in dataset["train"]]
    user_prompt_lengths = []
    assistant_response_lengths = []
    instance_lengths = []
    for instance in tqdm.tqdm(dataset["train"], desc="Processing instances"):
        instance_length = 0
        for message in instance["messages"]:
            if message["role"] == "user":
                user_prompt_lengths.append(len(tokenizer(message["content"], truncation=False, add_special_tokens=False)["input_ids"]))
                instance_length += user_prompt_lengths[-1]
            elif message["role"] == "assistant":
                assistant_response_lengths.append(len(tokenizer(message["content"], truncation=False, add_special_tokens=False)["input_ids"]))
                instance_length += assistant_response_lengths[-1]
        instance_lengths.append(instance_length)

    top_100_longest_instances = np.argsort(instance_lengths)[-100:][::-1].tolist()
    top_100_longest_instances = [dataset["train"][i]["id"] for i in top_100_longest_instances]

    result = {
        "num_instances": num_instances,
        "turns_summary": pd.Series(num_of_turns).describe(),
        "user_prompt_lengths_summary": pd.Series(user_prompt_lengths).describe(),
        "assistant_response_lengths_summary": pd.Series(assistant_response_lengths).describe(),
        "total_lengths_summary": pd.Series(instance_lengths).describe(),
        "num_instances_with_total_length_gt_512": np.sum(np.array(instance_lengths) > 512),
        "num_instances_with_total_length_gt_768": np.sum(np.array(instance_lengths) > 768),
        "num_instances_with_total_length_gt_1024": np.sum(np.array(instance_lengths) > 1024),
        "num_instances_with_total_length_gt_1536": np.sum(np.array(instance_lengths) > 1536),
        "num_instances_with_total_length_gt_2048": np.sum(np.array(instance_lengths) > 2048),
        "num_instances_with_total_length_gt_4096": np.sum(np.array(instance_lengths) > 4096),
        "top_100_longest_instances": top_100_longest_instances,
    }

    # convert everything to dict or scalar
    for key, value in result.items():
        if isinstance(value, pd.Series):
            result[key] = value.to_dict()
        elif isinstance(value, np.ndarray):
            result[key] = value.tolist()
        elif isinstance(value, np.int64):
            result[key] = int(value)

    return result


def get_statistics_for_prompt_completion_data(data_path):
    # load dataset
    dataset = load_dataset("json", data_files={"train": data_path})
    prompts = [instance["prompt"] for instance in dataset["train"]]
    completions = [instance["completion"] for instance in dataset["train"]]
    # tokenize dataset
    tokenizer = AutoTokenizer.from_pretrained("/net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/7B")
    tokenized_prompts = tokenizer(prompts, truncation=False, add_special_tokens=False)
    tokenized_completions = tokenizer(completions, truncation=False, add_special_tokens=False)
    # get statistics
    num_instances = len(dataset["train"])
    prompt_lengths = [len(tokenized_prompts["input_ids"][i]) for i in range(num_instances)]
    completion_lengths = [len(tokenized_completions["input_ids"][i]) for i in range(num_instances)]
    prompt_completion_lengths = [prompt_lengths[i] + completion_lengths[i] for i in range(num_instances)]

    result = {
        "num_instances": num_instances,
        "prompt_lengths_summary": pd.Series(prompt_lengths).describe(),
        "completion_lengths_summary": pd.Series(completion_lengths).describe(),
        "prompt_completion_lengths_summary": pd.Series(prompt_completion_lengths).describe(),
        "num_instances_with_prompt_length_gt_512": np.sum(np.array(prompt_lengths) > 512),
        "num_instances_with_completion_length_gt_512": np.sum(np.array(completion_lengths) > 512),
        "num_instances_with_prompt_completion_length_gt_512": np.sum(np.array(prompt_completion_lengths) > 512),
        "num_instances_with_completion_length_gt_768": np.sum(np.array(completion_lengths) > 768),
        "num_instances_with_prompt_completion_length_gt_1024": np.sum(np.array(prompt_completion_lengths) > 1024),
    }

    # convert everything to dict or scalar
    for key, value in result.items():
        if isinstance(value, pd.Series):
            result[key] = value.to_dict()
        elif isinstance(value, np.ndarray):
            result[key] = value.tolist()
        elif isinstance(value, np.int64):
            result[key] = int(value)

    return result


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True)
    parser.add_argument("--save_path", type=str, help="Path to save the statistics.")
    args = parser.parse_args()

    with open(args.data_path, "r") as f:
        sample = json.loads(f.readline())
    if "prompt" in sample:
        statistics = get_statistics_for_prompt_completion_data(args.data_path)
    elif "messages" in sample:
        statistics = get_statistics_for_messages_data(args.data_path)
    else:
        raise ValueError("Invalid data format - the data should be either prompt completion data or messages data.")

    print(json.dumps(statistics, indent=4))

    if args.save_path is not None:
        with open(args.save_path, "w") as f:
            json.dump(statistics, f, indent=4)
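The script dispatches on the keys of the first record ("prompt"/"completion" vs. "messages"). A minimal sketch of a messages-format input it would accept; the file name and contents are hypothetical, and note that the tokenizer path hard-coded above is cluster-specific:

import json

# One record in the "messages" schema that get_statistics_for_messages_data
# expects: an "id" plus alternating user/assistant turns.
record = {
    "dataset": "demo",
    "id": "demo_0",
    "messages": [
        {"role": "user", "content": "Name one use of olives."},
        {"role": "assistant", "content": "Olives are pressed to make olive oil."},
    ],
}

with open("demo_messages.jsonl", "w") as f:  # hypothetical file name
    f.write(json.dumps(record) + "\n")

# Then: python open_instruct/get_data_stats.py --data_path demo_messages.jsonl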
open_instruct/reformat_data.py
ADDED
@@ -0,0 +1,551 @@
#!/usr/bin/env python
# coding=utf-8
'''
This script is used to reformat the downloaded datasets into the format that can be used by the model.
Here we use jsonl for the converted data. Each line in the jsonl file is a json object formatted as follows:
{
    "dataset": "dataset_name",
    "id": "unique_id",
    "messages": [
        {"role": "system", "content": "message_text"}, # optional
        {"role": "user", "content": "message_text"},
        {"role": "assistant", "content": "message_text"},
        {"role": "user", "content": "message_text"},
        {"role": "assistant", "content": "message_text"},
        ...
    ],
}
'''
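
# For illustration, a single converted line could look like this (values are hypothetical):
# {"dataset": "dolly", "id": "dolly_0", "messages": [{"role": "user", "content": "What is 2+2?"}, {"role": "assistant", "content": "4"}]}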

import json
import random
import re
import os
import pandas as pd
import argparse
from instruction_encode_templates import encode_instruction_example, encode_few_shot_example


def convert_super_ni_data(data_dir, output_dir, zero_shot_examples_per_task=60, few_shot_examples_per_task=20, n_few_shot=2):
    os.makedirs(output_dir, exist_ok=True)
    train_tasks = []
    with open(os.path.join(data_dir, "splits", "xlingual", "train_tasks.txt"), "r") as fin:
        for line in fin:
            if "_mmmlu_" not in line:  # skip mmlu to avoid test leakage
                train_tasks.append(line.strip())
    with open(os.path.join(output_dir, "super_ni_data.jsonl"), "w") as fout:
        for task in train_tasks:
            with open(os.path.join(data_dir, "tasks", f"{task}.json"), "r") as fin:
                task_data = json.load(fin)
            instruction = task_data["Definition"][0]
            if zero_shot_examples_per_task + few_shot_examples_per_task < len(task_data["Instances"]):
                instances = random.sample(task_data["Instances"], k=zero_shot_examples_per_task + few_shot_examples_per_task)
            else:
                instances = task_data["Instances"]
            for instance in instances[:zero_shot_examples_per_task]:
                encoded_example = encode_instruction_example(
                    instruction=instruction,
                    input=instance["input"],
                    output=instance["output"][0],
                    random_template=True,
                    eos_token=None
                )
                fout.write(json.dumps({
                    "dataset": "super_ni",
                    "id": f"super_ni_{instance['id']}",
                    "messages": [
                        {"role": "user", "content": encoded_example["prompt"]},
                        {"role": "assistant", "content": encoded_example["completion"]},
                    ]
                }) + "\n")
            for instance in instances[zero_shot_examples_per_task:]:
                if n_few_shot < len(task_data["Positive Examples"]):
                    examplars = random.sample(task_data["Positive Examples"], k=n_few_shot)
                else:
                    examplars = task_data["Positive Examples"]
                encoded_example = encode_few_shot_example(
                    instruction=instruction,
                    examplars=examplars,
                    input=instance["input"],
                    output=instance["output"][0],
                    eos_token=None
                )
                fout.write(json.dumps({
                    "dataset": "super_ni",
                    "id": f"super_ni_{instance['id']}",
                    "messages": [
                        {"role": "user", "content": encoded_example["prompt"]},
                        {"role": "assistant", "content": encoded_example["completion"]},
                    ]
                }) + "\n")

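# Note: encode_instruction_example and encode_few_shot_example (imported above) are assumed here to
# return a dict with "prompt" and "completion" keys, as they are used throughout this file; the
# actual template wording lives in instruction_encode_templates.py, which is not part of this diff.
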
def convert_cot_data(data_dir, output_dir, num_zero_shot_examples=50000, num_few_shot_examples=50000):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    if num_zero_shot_examples > 0:  # was gated on num_few_shot_examples, which skipped zero-shot data whenever few-shot was disabled
        with open(os.path.join(data_dir, "cot_zsopt.jsonl"), "r") as fin:
            zero_shot_examples = [json.loads(line) for line in fin]
            if num_zero_shot_examples < len(zero_shot_examples):
                zero_shot_examples = random.sample(zero_shot_examples, k=num_zero_shot_examples)
            examples.extend(zero_shot_examples)
    if num_few_shot_examples > 0:
        with open(os.path.join(data_dir, "cot_fsopt.jsonl"), "r") as fin:
            few_shot_examples = [json.loads(line) for line in fin]
            if num_few_shot_examples < len(few_shot_examples):
                few_shot_examples = random.sample(few_shot_examples, k=num_few_shot_examples)
            examples.extend(few_shot_examples)
    output_path = os.path.join(output_dir, "cot_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            prompt = example["inputs"]
            # make sure the prompt ends with a newline, unless it already does or ends with a colon
            if not prompt.endswith("\n") and not prompt.rstrip().endswith(":"):
                prompt += "\n"
            completion = example["targets"]
            fout.write(json.dumps({
                "dataset": "cot",
                "id": f"cot_{idx}",
                "messages": [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": completion},
                ]
            }) + "\n")

def convert_flan_v2_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "flan_v2_resampled_100k.jsonl"), "r") as fin:
        for line in fin:
            examples.append(json.loads(line))
    output_path = os.path.join(output_dir, "flan_v2_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            prompt = example["inputs"]
            if not prompt.endswith("\n") and not prompt.rstrip().endswith(":"):
                prompt += "\n"
            completion = example["targets"]
            fout.write(json.dumps({
                "dataset": "flan_v2",
                "id": f"flan_v2_{idx}",
                "messages": [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": completion},
                ]
            }) + "\n")


def convert_dolly_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "databricks-dolly-15k.jsonl"), "r") as fin:
        for line in fin:
            examples.append(json.loads(line))
    output_path = os.path.join(output_dir, "dolly_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["context"],
                output=example["response"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "dolly",
                "id": f"dolly_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ]
            }) + "\n")


def convert_self_instruct_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "all_instances_82K.jsonl"), "r") as fin:
        for line in fin:
            examples.append(json.loads(line))
    output_path = os.path.join(output_dir, "self_instruct_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "self_instruct",
                "id": f"self_instruct_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ]
            }) + "\n")


def convert_unnatural_instructions_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    instance_cnt = 0
    with open(os.path.join(data_dir, "core_data.jsonl"), "r") as fin, open(os.path.join(output_dir, "unnatural_instructions_data.jsonl"), "w") as fout:
        for line in fin:
            task_data = json.loads(line)
            instruction = task_data["instruction"]
            for instance in task_data["instances"]:
                if instance["constraints"] and instance["constraints"].lower() not in ["none", "none."]:
                    instance_instruction = instruction + "\n" + instance["constraints"]
                else:
                    instance_instruction = instruction
                encoded_example = encode_instruction_example(
                    instruction=instance_instruction,
                    input=instance["input"],
                    output=instance["output"],
                    random_template=True,
                    eos_token=None
                )
                fout.write(json.dumps({
                    "dataset": "unnatural_instructions",
                    "id": f"unnatural_instructions_{instance_cnt}",
                    "messages": [
                        {"role": "user", "content": encoded_example["prompt"]},
                        {"role": "assistant", "content": encoded_example["completion"]},
                    ]
                }) + "\n")
                instance_cnt += 1

def convert_stanford_alpaca_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "alpaca_data.json"), "r") as fin:
        examples.extend(json.load(fin))
    output_path = os.path.join(output_dir, "stanford_alpaca_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "stanford_alpaca",
                "id": f"stanford_alpaca_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ]
            }) + "\n")


def convert_code_alpaca_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "code_alpaca_20k.json"), "r") as fin:
        examples.extend(json.load(fin))
    output_path = os.path.join(output_dir, "code_alpaca_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "code_alpaca",
                "id": f"code_alpaca_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ]
            }) + "\n")


def convert_gpt4_alpaca_data(data_dir, output_dir, load_en=True, load_zh=False):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    if load_en:
        with open(os.path.join(data_dir, "alpaca_gpt4_data.json"), "r") as fin:
            examples.extend(json.load(fin))
    if load_zh:
        with open(os.path.join(data_dir, "alpaca_gpt4_data_zh.json"), "r") as fin:
            examples.extend(json.load(fin))
    output_path = os.path.join(output_dir, "gpt4_alpaca_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "gpt4_alpaca",
                "id": f"gpt4_alpaca_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ]
            }) + "\n")

def convert_sharegpt_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "sharegpt_html_cleaned_and_split.json"), "r") as fin:
        examples.extend(json.load(fin))

    output_path = os.path.join(output_dir, "sharegpt_data.jsonl")
    with open(output_path, "w") as fout:
        invalid_cnt = 0
        for idx, example in enumerate(examples):
            messages = []
            valid = True
            for message in example["conversations"]:
                if message["from"] in ["human", "user"]:
                    messages.append({
                        "role": "user",
                        "content": message["value"]
                    })
                elif message["from"] in ["gpt", "chatgpt"]:
                    messages.append({
                        "role": "assistant",
                        "content": message["value"]
                    })
                elif message["from"] in ["system", "bing"]:
                    # drop conversations containing system or bing turns, which don't fit the user/assistant format
                    valid = False
                    invalid_cnt += 1
                    break
                else:
                    raise ValueError(f"Unknown message sender: {message['from']}")
            if messages and valid:
                fout.write(json.dumps({
                    "dataset": "sharegpt",
                    "id": f"sharegpt_{example['id']}",
                    "messages": messages
                }) + "\n")
    print(f"# of invalid examples in sharegpt data: {invalid_cnt}")

+
def convert_baize_data(data_dir, output_dir):
|
342 |
+
os.makedirs(output_dir, exist_ok=True)
|
343 |
+
examples = []
|
344 |
+
for source in ["alpaca", "medical", "quora", "stackoverflow"]:
|
345 |
+
with open(os.path.join(data_dir, f"{source}_chat_data.json"), "r") as fin:
|
346 |
+
examples.extend(json.load(fin))
|
347 |
+
|
348 |
+
output_path = os.path.join(output_dir, "baize_data.jsonl")
|
349 |
+
with open(output_path, "w") as fout:
|
350 |
+
for idx, example in enumerate(examples):
|
351 |
+
# split example["input"] by [|Human|] and [|AI|]
|
352 |
+
messages = []
|
353 |
+
rounds = example["input"].split("[|Human|]")[1:]
|
354 |
+
for round in rounds:
|
355 |
+
if not round.strip() or "[|AI|]" not in round:
|
356 |
+
continue
|
357 |
+
human, assistant = round.split("[|AI|]")
|
358 |
+
messages.append({
|
359 |
+
"role": "user",
|
360 |
+
"content": human.strip()
|
361 |
+
})
|
362 |
+
messages.append({
|
363 |
+
"role": "assistant",
|
364 |
+
"content": assistant.strip()
|
365 |
+
})
|
366 |
+
fout.write(json.dumps({
|
367 |
+
"dataset": "baize",
|
368 |
+
"id": f"baize_{idx}",
|
369 |
+
"messages": messages
|
370 |
+
}) + "\n")
|
371 |
+
|
372 |
+
|
def convert_oasst1_data(data_dir, output_dir):
    '''
    For OASST1, because it's in a tree structure, where every user input might get multiple replies,
    we have to save every path from the root node to an assistant leaf reply.
    This results in some of the messages being duplicated among different paths (instances).
    Be careful when using this dataset for training. Ideally, you should only minimize the loss of the last message in each path.
    '''
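    # A hypothetical tree, to make the flattening concrete:
    #   user "Hi" -> assistant "Hello" -> user "Bye" -> assistant "Goodbye"
    #             -> assistant "Hey there"
    # dfs() below emits one instance per assistant leaf: ["Hi", "Hello", "Bye", "Goodbye"]
    # and ["Hi", "Hey there"]; the shared "Hi" turn is duplicated across both instances.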
    os.makedirs(output_dir, exist_ok=True)
    conversations = []
    with open(os.path.join(data_dir, "2023-04-12_oasst_ready.trees.jsonl"), "r") as fin:
        for line in fin:
            conversations.append(json.loads(line))

    output_path = os.path.join(output_dir, "oasst1_data.jsonl")

    # we filter out the sequences that mention the creator information
    filter_strings = [
        "LAION",
        "Open Assistant",
        "OpenAssistant",
    ]

    # traverse the conversation tree and collect all valid sequences
    def dfs(reply, messages, valid_sequences):
        if any(filter_string in reply["text"] for filter_string in filter_strings):
            return
        if reply["role"] == "assistant":
            messages.append(
                {"role": "assistant", "content": reply["text"]}
            )
            if not reply["replies"]:  # leaf node
                valid_sequences.append(messages[:])
            else:
                for child in reply["replies"]:
                    dfs(child, messages, valid_sequences)
            messages.pop()
        elif reply["role"] == "prompter":
            messages.append(
                {"role": "user", "content": reply["text"]}
            )
            for child in reply["replies"]:
                dfs(child, messages, valid_sequences)
            messages.pop()
        else:
            raise ValueError(f"Unknown role: {reply['role']}")

    with open(output_path, "w") as fout:
        example_cnt = 0
        for conversation in conversations:
            valid_sequences = []
            dfs(conversation["prompt"], [], valid_sequences)
            for sequence in valid_sequences:
                fout.write(json.dumps({
                    "dataset": "oasst1",
                    "id": f"oasst1_{example_cnt}",
                    "messages": sequence
                }) + "\n")
                example_cnt += 1

def convert_lima_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "train.jsonl"), "r") as fin:
        for line in fin:
            examples.append(json.loads(line))
    output_path = os.path.join(output_dir, "lima_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            messages = []
            if len(example["conversations"]) % 2 != 0:
                print(f"Warning: example {idx} in LIMA has an odd number of messages. Cutting off the last message.")
                example["conversations"] = example["conversations"][:-1]

            for i in range(0, len(example["conversations"]), 2):
                messages.append({
                    "role": "user",
                    "content": example["conversations"][i]
                })
                messages.append({
                    "role": "assistant",
                    "content": example["conversations"][i+1]
                })
            fout.write(json.dumps({
                "dataset": "lima",
                "id": f"lima_{idx}",
                "messages": messages,
            }) + "\n")

def convert_wizardlm_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(data_dir, "WizardLM_evol_instruct_V2_143k.json"), "r") as fin:
        examples = json.load(fin)

    output_path = os.path.join(output_dir, "wizardlm_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            messages = []
            assert len(example["conversations"]) % 2 == 0
            for i in range(0, len(example["conversations"]), 2):
                assert example["conversations"][i]["from"] == "human"
                assert example["conversations"][i+1]["from"] == "gpt"
                messages.append({
                    "role": "user",
                    "content": example["conversations"][i]["value"]
                })
                messages.append({
                    "role": "assistant",
                    "content": example["conversations"][i+1]["value"]
                })
            fout.write(json.dumps({
                "dataset": "wizardlm",
                "id": f"wizardlm_{example['idx']}",
                "messages": messages,
            }) + "\n")


def convert_open_orca_data(data_dir, output_dir, num_gpt4_examples=100000, num_gpt35_examples=0):
    os.makedirs(output_dir, exist_ok=True)
    examples = []

    df = pd.read_parquet(os.path.join(data_dir, "1M-GPT4-Augmented.parquet"))
    gpt4_examples = [row.to_dict() for _, row in df.iterrows()]
    random.shuffle(gpt4_examples)
    examples.extend(gpt4_examples[:num_gpt4_examples])

    df = pd.read_parquet(os.path.join(data_dir, "3_5M-GPT3_5-Augmented.parquet"))
    gpt35_examples = [row.to_dict() for _, row in df.iterrows()]
    random.shuffle(gpt35_examples)
    examples.extend(gpt35_examples[:num_gpt35_examples])

    output_path = os.path.join(output_dir, "open_orca_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            messages = [
                {"role": "system", "content": example["system_prompt"]},
                {"role": "user", "content": example["question"]},
                {"role": "assistant", "content": example["response"]}
            ]
            fout.write(json.dumps({
                "dataset": "open_orca",
                "id": f"open_orca_{example['id']}",
                "messages": messages,
            }) + "\n")

if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--raw_data_dir", type=str, default="data/downloads")
    arg_parser.add_argument("--output_dir", type=str, default="data/processed")
    arg_parser.add_argument("--seed", type=int, default=42)
    args = arg_parser.parse_args()
    random.seed(args.seed)

    # get the subfolder names in raw_data_dir
    subfolders = [f for f in os.listdir(args.raw_data_dir) if os.path.isdir(os.path.join(args.raw_data_dir, f))]

    # all supported datasets, derived from the convert_*_data function names above
    supported_datasets = []
    all_funcs = [func_name for func_name in globals() if callable(globals()[func_name])]
    for func_name in all_funcs:
        if re.match(r"convert_.+_data", func_name):
            supported_datasets.append(func_name[len("convert_"):-len("_data")])

    # check if the subfolder names are supported datasets
    valid_subfolders = []
    for subfolder in subfolders:
        if subfolder not in supported_datasets:
            print(f"Warning: {subfolder} in the raw data folder is not a supported dataset. We will skip it.")
        else:
            valid_subfolders.append(subfolder)

    # prepare data for each dataset
    for subfolder in valid_subfolders:
        print(f"Processing {subfolder} data...")
        globals()[f"convert_{subfolder}_data"](os.path.join(args.raw_data_dir, subfolder), os.path.join(args.output_dir, subfolder))
requirements.txt
ADDED
@@ -0,0 +1,6 @@
streamlit
pdfplumber
docx2txt
justext
openai
langchain
web-app.py
ADDED
@@ -0,0 +1,67 @@
import streamlit as st
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI as OpenAI_llm
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
# from langchain.chains.qa_with_sources import load_qa_with_sources_chain, BaseCombineDocumentsChain
import os
import chromadb
import tempfile
import requests
import openai
from bs4 import BeautifulSoup
from urllib.parse import urlparse

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


def assistant(url):
    question = st.text_input("Ask your Question")

    if st.button("Submit", type="primary"):
        ABS_PATH: str = os.path.dirname(os.path.abspath(__file__))
        DB_DIR: str = os.path.join(ABS_PATH, "db")

        # load the web page and split it into chunks
        loader = WebBaseLoader(url)
        data = loader.load()

        text_splitter = CharacterTextSplitter(separator='\n', chunk_size=1000, chunk_overlap=0)
        docs = text_splitter.split_documents(data)

        # embed the chunks and build an in-memory FAISS index
        openai_embeddings = OpenAIEmbeddings()
        # client = chromadb.PersistentClient(path=DB_DIR)
        vectordb = FAISS.from_documents(documents=docs, embedding=openai_embeddings)
        # vectordb.persist()

        retriever = vectordb.as_retriever()

        # answer the question with retrieval-augmented QA over the indexed page
        llm = ChatOpenAI(model_name='gpt-3.5-turbo')
        qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

        response = qa(question)
        st.write(response)


st.title('Chat with Website')

url = st.text_input('Enter Your URL here:')

if url:
    assistant(url)
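The app reads OPENAI_API_KEY from the environment, so a local run of this file would be along the lines of (the key value is a placeholder):

    OPENAI_API_KEY=<your-key> streamlit run web-app.py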