Synced repo using 'sync_with_huggingface' GitHub Action
Files changed:
- .gitattributes +1 -0
- Dockerfile +27 -0
- LICENSE +201 -0
- __pycache__/custom_prompt_template.cpython-311.pyc +0 -0
- __pycache__/custom_prompt_template.cpython-39.pyc +0 -0
- app.py +451 -0
- custom_prompt_template.py +43 -0
- data-downloader/download_eval_data.sh +68 -0
- data-downloader/download_instructions_data.sh +120 -0
- olive_farm.png +3 -0
- open_instruct/get_data_stats.py +121 -0
- open_instruct/reformat_data.py +551 -0
- requirements.txt +6 -0
- web-app.py +67 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+olive_farm.png filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,27 @@
FROM python:3.8-slim-buster

WORKDIR /app

COPY ./requirements.txt /app/requirements.txt
# COPY ./packages.txt /app/packages.txt

# RUN apt-get update && xargs -r -a /app/packages.txt apt-get install -y && rm -rf /var/lib/apt/lists/*
RUN pip3 install --no-cache-dir -r /app/requirements.txt

# User
RUN useradd -m -u 1000 user
USER user
ENV HOME /home/user
ENV PATH $HOME/.local/bin:$PATH

WORKDIR $HOME
RUN mkdir app
WORKDIR $HOME/app
COPY . $HOME/app

EXPOSE 8501
CMD streamlit run app.py \
    --server.headless true \
    --server.enableCORS false \
    --server.enableXsrfProtection false \
    --server.fileWatcherType none
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
__pycache__/custom_prompt_template.cpython-311.pyc
ADDED
Binary file (2.41 kB).
__pycache__/custom_prompt_template.cpython-39.pyc
ADDED
Binary file (1.33 kB).
app.py
ADDED
@@ -0,0 +1,451 @@
import streamlit as st
import requests
import justext
import pdfplumber
import docx2txt
import json
import ast
import os
import re
import openai

from custom_prompt_template import InstructionGenerationTemplate, AnswerGenerationTemplate


st.set_page_config(page_title="LLM Instruction Generator")

st.sidebar.success("Select a page above")


# stoplist for Odia, which justext does not ship with
def odia_stoplist():
    odia_stopwords = [
        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
    ]
    return frozenset(odia_stopwords)


# extract the main text content from a URL using justext
def extract_data_from_url(url, language):
    try:
        response = requests.get(url)

        if response.status_code == 200:
            response.raise_for_status()
            page = response.content
            para = ""
            if language == "English":
                paragraphs = justext.justext(page, justext.get_stoplist("English"))
            elif language == "Hindi":
                paragraphs = justext.justext(page, justext.get_stoplist("Hindi"), 70, 140, 0.0, 0.02, 0.5, 150, False)
            elif language == "Odia":
                paragraphs = justext.justext(
                    page, odia_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False
                )

            for paragraph in paragraphs:
                if not paragraph.is_boilerplate:
                    para = para + "\n" + paragraph.text
            # return the extracted data, i.e. para, as a string
            if para == "":
                st.error("Unable to extract data from the URL")
                return None
            else:
                return para
        else:
            st.error("Request failed")
            return None
    except Exception as err:
        st.error(err)
        return None


# extract text content from uploaded documents
def extract_data_from_documents(documents):
    data = ""
    if documents is not None:
        for document in documents:
            document_details = {
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size,
            }
            st.write(document_details)

            # Extract content from a txt file
            if document.type == "text/plain":
                # Read as bytes, decode as UTF-8
                data += str(document.read(), "utf-8")

            # Extract content from a pdf file using pdfplumber
            elif document.type == "application/pdf":
                try:
                    with pdfplumber.open(document) as pdf:
                        all_text = ""
                        for page in pdf.pages:
                            text = page.extract_text()
                            all_text += text + "\n"
                        data += all_text
                except Exception:
                    st.write("None")

            # Extract content from a docx file
            elif (
                document.type
                == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            ):
                data += docx2txt.process(document)

        # return the extracted data
        return data
    else:
        st.error("Error: An error occurred while fetching content.")
        return None


# Check the inputs for language, promptType, noOfQuestions, and instructionFormat
def valid_drop_down(language, promptType, noOfQuestions, instructionFormat):
    langFlag = False
    promptFlag = False
    noOfQuestionFlag = False
    instructionFormatFlag = False

    if language:
        langFlag = True
    if promptType:
        promptFlag = True
    if noOfQuestions:
        noOfQuestionFlag = True
    if instructionFormat:
        instructionFormatFlag = True
    # all of these inputs are compulsory, so return True only if every one is set
    return langFlag and promptFlag and noOfQuestionFlag and instructionFormatFlag


def main():
    # setting up the initial session_state
    if "extract_button" not in st.session_state:
        st.session_state.extract_button = False
    if "submit" not in st.session_state:
        st.session_state.submit = False
    if "generated" not in st.session_state:
        st.session_state.generated = False
    if "selected" not in st.session_state:
        st.session_state.selected = False
    if "answered" not in st.session_state:
        st.session_state.answered = False

    st.subheader("LLM Instructions")

    # form to get the inputs
    with st.form(key="form1"):
        st.write("#")

        # dropdown for language
        language = st.selectbox("Select a language", ("", "English", "Hindi", "Odia"))

        # dropdown for prompt type
        promptType = st.selectbox(
            "Select the Prompt type", ("", "Input text", "Url", "Document")
        )
        # input for the number of questions
        noOfQuestions = st.number_input(
            "Number of questions to generate:", min_value=1, max_value=20, value=10
        )

        # dropdown for instruction format
        instructionFormat = st.selectbox(
            "Format of instruction:", ("Imperative sentence", "Question")
        )

        # text input for the OpenAI key
        openAiKey = st.text_input(label="Input the OpenAI key")
        st.session_state["openAiKey"] = openAiKey

        st.write("##")

        # form submit button and setting up the session_state
        if st.form_submit_button():
            st.session_state.submit = True

    if st.session_state.submit:
        # expander with the prompt form to extract the data
        with st.expander(label="prompt"):
            with st.form(key="form2"):
                # calling the function inside if to check for valid dropdown inputs
                if valid_drop_down(
                    language, promptType, noOfQuestions, instructionFormat
                ):
                    if promptType == "Input text":
                        inputText = st.text_area(
                            label="For Instructions",
                            placeholder="Please enter your text here",
                        )
                    elif promptType == "Url":
                        url = st.text_input(
                            label="For URL", placeholder="Please enter your text here"
                        )
                    elif promptType == "Document":
                        documents = st.file_uploader(
                            label="For Documents ( pdf / txt / docx )",
                            type=["pdf", "txt", "docx"],
                            accept_multiple_files=True,
                        )

                if st.form_submit_button():
                    st.session_state.extract_button = True

        # extracting data
        if st.session_state.extract_button:
            if promptType == "Input text":
                extractedData = inputText

            elif promptType == "Url":
                extractedURLData = extract_data_from_url(url, language)
                if extractedURLData is not None:
                    extractedData = extractedURLData
                    st.text_area("Extracted Text:", value=extractedData, height=200)
                else:
                    extractedData = False

            elif promptType == "Document":
                if not documents:
                    documents = None
                else:
                    for doc in documents:
                        if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
                            # the document is not one of the supported types
                            st.error("Unsupported file: " + doc.name)

                extractedDocumentData = extract_data_from_documents(documents)
                extractedData = extractedDocumentData

            # if the values are extracted, run the custom prompt by creating an instance
            if extractedData:

                # ----------------------------- RUNNING THE PROMPT -----------------------------
                st.session_state["extractedData"] = extractedData

                if "Initial" not in st.session_state:
                    st.session_state.Initial = True

                if st.session_state.Initial:
                    # running the prompt form here
                    openai.api_key = st.session_state["openAiKey"]
                    my_prompt_template = InstructionGenerationTemplate()

                    # providing the rules for the instructions to be generated
                    additional_rules = """
                    - You do not need to provide a response to the generated examples.
                    - You must return the response in the specified language.
                    - Each generated instruction can be either an imperative sentence or a question.
                    """

                    if st.button("Generate Instructions"):
                        prompt = my_prompt_template.format(
                            num_questions=noOfQuestions,
                            context=extractedData,
                            instruction_format=instructionFormat,
                            lang=language,
                            additional_rules=additional_rules,
                        )
                        response = openai.ChatCompletion.create(
                            model="gpt-3.5-turbo",
                            messages=[
                                {"role": "system", "content": prompt},
                            ])

                        if "result" not in st.session_state:
                            content = response.choices[0].message.content
                            # content looks like "\n1. question one\n2. question two"
                            responses_list = content.split('\n')
                            responses_list = [re.sub(r'^\s*\d+\.\s*', '', resp) for resp in responses_list if resp]
                            st.session_state["result"] = responses_list
                            st.session_state.generated = True
                            st.session_state.Initial = False

                if st.session_state.generated:
                    # displaying the generated instructions
                    st.write("Generated Instructions")
                    result = st.session_state["result"]
                    result_dict = {i + 1: value for i, value in enumerate(result)}
                    selected_items = [f" {value} " for key, value in result_dict.items() if st.checkbox(f"Q{key} : {value}")]
                    # Display the selected items as a list
                    if selected_items:
                        st.write("Selected Items:")
                        st.write(selected_items)
                        st.session_state["selected_items"] = selected_items
                        st.session_state.selected = True
                    else:
                        st.write("No items selected.")

                # ------------------- RUNNING THE PROMPT FOR ANSWER GENERATION -------------------

                if st.session_state.selected:

                    if "Initial2" not in st.session_state:
                        st.session_state.Initial2 = True

                    if st.session_state.Initial2:
                        # running the prompt form here
                        openai.api_key = st.session_state["openAiKey"]
                        my_prompt_template2 = AnswerGenerationTemplate()

                        # providing the rules for the answers to be generated
                        additional_rules = """
                        Enumerate the answers and don't provide any additional tags.
                        """

                        question = st.session_state["selected_items"]
                        if st.button("Generate Answers"):
                            prompt = my_prompt_template2.format(
                                questions=question,
                                additional_rules=additional_rules,
                            )
                            response = openai.ChatCompletion.create(
                                model="gpt-3.5-turbo",
                                messages=[
                                    {"role": "system", "content": prompt},
                                ])

                            if "answers" not in st.session_state:
                                content = response.choices[0].message.content
                                # content looks like "\n1. Answer1.\n2. Answer2"
                                responses_list = content.split('\n')
                                responses_list = [re.sub(r'^\s*\d+\.\s*', '', resp) for resp in responses_list if resp]
                                st.session_state["answers"] = responses_list
                                st.session_state.answered = True
                                st.session_state.Initial2 = False

                    if st.session_state.answered:
                        # displaying the generated answers
                        questions = st.session_state["selected_items"]
                        answers = st.session_state["answers"]
                        answers_dict = {i + 1: value for i, value in enumerate(answers)}
                        st.write(answers_dict)

                        st.write("Generated Questions and Answers")
                        # Create a list of dictionaries pairing each question with its answer
                        jsonl_data = [{"Question": question, "Answer": answers_dict.get(i, 'No answer found')} for i, question in enumerate(questions, start=1)]

                        st.write(jsonl_data)
                        jsonl_string = '\n'.join(json.dumps(item, ensure_ascii=False) for item in jsonl_data)

                        if st.download_button(label="Save as jsonl", data=jsonl_string, mime="application/json"):
                            st.success("Successfully saved")

    if st.button("Clear"):
        st.session_state.extract_button = False
        st.session_state.submit = False
        st.session_state.generated = False
        st.session_state.selected = False
        st.session_state.answered = False

        if "Initial" in st.session_state:
            st.session_state.Initial = True
        if "Initial2" in st.session_state:
            st.session_state.Initial2 = True

        if "openAiKey" in st.session_state:
            del st.session_state["openAiKey"]
        if "extractedData" in st.session_state:
            del st.session_state["extractedData"]
        if "result" in st.session_state:
            del st.session_state["result"]
        if "selected_items" in st.session_state:
            del st.session_state["selected_items"]
        if "answers" in st.session_state:
            del st.session_state["answers"]
        st.experimental_rerun()


if __name__ == "__main__":
    main()
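app.py turns the model's numbered output ("1. ...", "2. ...") into a plain list with a split plus a regex. A minimal standalone sketch of that post-processing step, using a hypothetical sample response:

import re
from typing import List


def split_enumerated_response(content: str) -> List[str]:
    """Split a numbered model reply into bare strings: drop empty lines,
    then strip any leading 'N. ' enumeration marker, as app.py does."""
    lines = [line for line in content.split("\n") if line]
    return [re.sub(r"^\s*\d+\.\s*", "", line) for line in lines]


if __name__ == "__main__":
    sample = "\n1. Describe how olives are cured.\n2. What climate suits olive trees?"
    print(split_enumerated_response(sample))
    # -> ['Describe how olives are cured.', 'What climate suits olive trees?']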
custom_prompt_template.py
ADDED
@@ -0,0 +1,43 @@
from typing import List
import langchain


class InstructionGenerationTemplate(langchain.prompts.PromptTemplate):
    """A custom prompt template for generating instructions."""

    input_variables: List[str] = ["num_questions", "context", "instruction_format", "lang", "additional_rules"]

    template = """
You are a highly intelligent language model trained to assist with a variety of language tasks. Your task here is to generate {num_questions} diverse questions or instructions based on the context provided below:

Context:
{context}

Please follow these rules:
{additional_rules}

Please generate the instructions in the {instruction_format} format and in {lang} language. Remember to adhere to the rules mentioned above.
"""

    template_format = "f-string"

    def format(self, **kwargs):
        """Format the prompt."""
        return self.template.format(**kwargs)


class AnswerGenerationTemplate(langchain.prompts.PromptTemplate):
    """A custom prompt template for generating answers to questions."""

    input_variables: List[str] = ["questions", "additional_rules"]

    template = """
You are a highly intelligent language model tasked with providing answers to the following questions:

Questions:
{questions}

Please follow these rules:
{additional_rules}
"""

    template_format = "f-string"

    def format(self, **kwargs):
        """Format the prompt."""
        return self.template.format(**kwargs)
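For reference, a minimal sketch of how these templates are consumed, mirroring the calls in app.py; the context string, rule text, and question are hypothetical placeholders:

from custom_prompt_template import InstructionGenerationTemplate, AnswerGenerationTemplate

instruction_template = InstructionGenerationTemplate()
prompt = instruction_template.format(
    num_questions=3,
    context="Olive trees are cultivated for table olives and oil.",  # hypothetical sample context
    instruction_format="Question",
    lang="English",
    additional_rules="- Keep each question under 20 words.",  # hypothetical rule
)
print(prompt)

answer_template = AnswerGenerationTemplate()
print(answer_template.format(
    questions=["What climate suits olive trees?"],  # hypothetical question list
    additional_rules="Enumerate the answers.",
))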
data-downloader/download_eval_data.sh
ADDED
@@ -0,0 +1,68 @@
mkdir -p data/downloads
mkdir -p data/eval

# MMLU dataset
wget -O data/downloads/mmlu_data.tar https://people.eecs.berkeley.edu/~hendrycks/data.tar
mkdir -p data/downloads/mmlu_data
tar -xvf data/downloads/mmlu_data.tar -C data/downloads/mmlu_data
mv data/downloads/mmlu_data/data data/eval/mmlu && rm -r data/downloads/mmlu_data data/downloads/mmlu_data.tar


# Big-Bench-Hard dataset
wget -O data/downloads/bbh_data.zip https://github.com/suzgunmirac/BIG-Bench-Hard/archive/refs/heads/main.zip
mkdir -p data/downloads/bbh
unzip data/downloads/bbh_data.zip -d data/downloads/bbh
mv data/downloads/bbh/BIG-Bench-Hard-main/ data/eval/bbh && rm -r data/downloads/bbh data/downloads/bbh_data.zip


# Super-NaturalInstructions dataset
wget -O data/downloads/superni_data.zip https://github.com/allenai/natural-instructions/archive/refs/heads/master.zip
mkdir -p data/downloads/superni
unzip data/downloads/superni_data.zip -d data/downloads/superni
mv data/downloads/superni/natural-instructions-master/ data/eval/superni && rm -r data/downloads/superni data/downloads/superni_data.zip


# TyDiQA-GoldP dataset
mkdir -p data/eval/tydiqa
wget -P data/eval/tydiqa/ https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-dev.json
wget -P data/eval/tydiqa/ https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-train.json


# XOR-QA dataset
wget -P data/eval/xorqa/ https://raw.githubusercontent.com/mia-workshop/MIA-Shared-Task-2022/main/data/eval/mia_2022_dev_xorqa.jsonl
wget -P data/eval/xorqa/ https://github.com/mia-workshop/MIA-Shared-Task-2022/raw/main/data/train/mia_2022_train_data.jsonl.zip
unzip data/eval/xorqa/mia_2022_train_data.jsonl.zip -d data/eval/xorqa/ && rm data/eval/xorqa/mia_2022_train_data.jsonl.zip


# GSM dataset
wget -P data/eval/gsm/ https://github.com/openai/grade-school-math/raw/master/grade_school_math/data/test.jsonl


# Multilingual GSM dataset
wget -O data/downloads/url-nlp.zip https://github.com/google-research/url-nlp/archive/refs/heads/main.zip
mkdir -p data/downloads/url-nlp
unzip data/downloads/url-nlp.zip -d data/downloads/url-nlp
mv data/downloads/url-nlp/url-nlp-main/mgsm data/eval/mgsm && rm -r data/downloads/url-nlp data/downloads/url-nlp.zip


# Codex HumanEval
wget -P data/eval/codex_humaneval https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz


# TruthfulQA
wget -P data/eval/truthfulqa https://github.com/sylinrl/TruthfulQA/raw/main/TruthfulQA.csv


# Self-instruct eval, Vicuna eval, and Koala eval for creative instructions/tasks
mkdir -p data/eval/creative_tasks
wget -O data/eval/creative_tasks/self_instruct_test.jsonl https://github.com/yizhongw/self-instruct/raw/main/human_eval/user_oriented_instructions.jsonl
wget -O data/eval/creative_tasks/vicuna_test.jsonl https://github.com/lm-sys/FastChat/raw/main/fastchat/eval/table/question.jsonl
wget -O data/eval/creative_tasks/koala_test.jsonl https://github.com/arnav-gudibande/koala-test-set/raw/main/koala_test_set.jsonl


# Toxigen data
mkdir -p data/eval/toxigen
for minority_group in asian black chinese jewish latino lgbtq mental_disability mexican middle_east muslim native_american physical_disability trans women
do
    wget -O data/eval/toxigen/hate_${minority_group}.txt https://raw.githubusercontent.com/microsoft/TOXIGEN/main/prompts/hate_${minority_group}_1k.txt
done
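As a quick sanity check after the downloads, a short sketch that counts instances in the GSM test split; it assumes the data/eval layout created by the script above and the "question"/"answer" fields of the GSM8K release:

import json

# Path created by the "# GSM dataset" step above.
path = "data/eval/gsm/test.jsonl"

with open(path) as f:
    instances = [json.loads(line) for line in f]  # one JSON object per line

print(len(instances), "GSM test instances")
print(instances[0]["question"][:80])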
data-downloader/download_instructions_data.sh
ADDED
@@ -0,0 +1,120 @@
# check if there is $HF_TOKEN in the environment variables
if [ -z "$HF_TOKEN" ]
then
    echo "Warning: the HuggingFace dataset LIMA is gated and requires approved access."
    echo "Warning: Please request access at https://huggingface.co/datasets/GAIR/lima and set the HF_TOKEN environment variable before running this script."
    exit 1
fi

echo "Downloading Super-NaturalInstructions dataset..."
wget -P data/raw_train/super_ni/ https://github.com/allenai/natural-instructions/archive/refs/heads/master.zip
unzip data/raw_train/super_ni/master.zip -d data/raw_train/super_ni/ && rm data/raw_train/super_ni/master.zip
mv data/raw_train/super_ni/natural-instructions-master/* data/raw_train/super_ni/ && rm -r data/raw_train/super_ni/natural-instructions-master


echo "Downloading the flan_v2 chain-of-thought submix..."
wget -P data/raw_train/cot/ https://beaker.org/api/v3/datasets/01GXZ52K2Q932H6KZY499A7FE8/files/cot_zsopt.jsonl
wget -P data/raw_train/cot/ https://beaker.org/api/v3/datasets/01GXZ51ZV283RAZW7J3ECM4S58/files/cot_fsopt.jsonl


echo "Downloading the flan_v2 collection; here we subsampled only 100K instances..."
wget -P data/raw_train/flan_v2/ https://beaker.org/api/v3/datasets/01GZTTS2EJFPA83PXS4FQCS1SA/files/flan_v2_resampled_100k.jsonl


echo "Downloading self-instruct data..."
wget -P data/raw_train/self_instruct/ https://raw.githubusercontent.com/yizhongw/self-instruct/main/data/gpt3_generations/batch_221203/all_instances_82K.jsonl


echo "Downloading unnatural-instructions data..."
wget -P data/raw_train/unnatural_instructions/ https://github.com/orhonovich/unnatural-instructions/raw/main/data/core_data.zip
unzip data/raw_train/unnatural_instructions/core_data.zip -d data/raw_train/unnatural_instructions/


echo "Downloading Stanford Alpaca data..."
wget -P data/raw_train/stanford_alpaca/ https://github.com/tatsu-lab/stanford_alpaca/raw/main/alpaca_data.json


echo "Downloading the dolly dataset..."
wget -P data/raw_train/dolly/ https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl


echo "Downloading the OpenAssistant data (oasst1)..."
wget -P data/raw_train/oasst1/ https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_ready.trees.jsonl.gz
gzip -d data/raw_train/oasst1/2023-04-12_oasst_ready.trees.jsonl.gz


echo "Downloading the code alpaca dataset..."
wget -P data/raw_train/code_alpaca/ https://github.com/sahil280114/codealpaca/raw/master/data/code_alpaca_20k.json


echo "Downloading the gpt4-llm dataset..."
wget -P data/raw_train/gpt4_alpaca/ https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/raw/main/data/alpaca_gpt4_data.json
wget -P data/raw_train/gpt4_alpaca/ https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/raw/main/data/alpaca_gpt4_data_zh.json


echo "Downloading the baize dataset..."
wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/alpaca_chat_data.json
wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/medical_chat_data.json
wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/quora_chat_data.json
wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/stackoverflow_chat_data.json


echo "Downloading ShareGPT dataset..."
wget -P data/raw_train/sharegpt/ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json
wget -P data/raw_train/sharegpt/ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part2_html_cleaned.json
echo "Splitting the ShareGPT dataset..."
python scripts/split_sharegpt_conversations.py \
    --in-files data/raw_train/sharegpt/sg_90k_part1_html_cleaned.json data/raw_train/sharegpt/sg_90k_part2_html_cleaned.json \
    --out-file data/raw_train/sharegpt/sharegpt_html_cleaned_and_split.json \
    --model-name-or-path ../hf_llama_models/7B/


echo "Downloading LIMA dataset..."
wget --header="Authorization: Bearer $HF_TOKEN" -P data/raw_train/lima/ https://huggingface.co/datasets/GAIR/lima/raw/main/train.jsonl


echo "Downloading WizardLM dataset..."
wget -P data/raw_train/wizardlm/ https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k/resolve/main/WizardLM_evol_instruct_V2_143k.json


echo "Downloading the OpenOrca dataset..."
wget -P data/raw_train/open_orca/ https://huggingface.co/datasets/Open-Orca/OpenOrca/resolve/main/1M-GPT4-Augmented.parquet
wget -P data/raw_train/open_orca/ https://huggingface.co/datasets/Open-Orca/OpenOrca/resolve/main/3_5M-GPT3_5-Augmented.parquet


echo "Reformatting the datasets..."
python open_instruct/reformat_datasets.py --raw_data_dir data/raw_train/ --output_dir data/processed/


echo "Creating Tulu data mixtures..."
mkdir -p data/processed/tulu/
cat data/processed/flan_v2/flan_v2_data.jsonl \
    data/processed/cot/cot_data.jsonl \
    data/processed/dolly/dolly_data.jsonl \
    data/processed/oasst1/oasst1_data.jsonl \
    data/processed/gpt4_alpaca/gpt4_alpaca_data.jsonl \
    data/processed/code_alpaca/code_alpaca_data.jsonl \
    data/processed/sharegpt/sharegpt_data.jsonl \
    > data/processed/tulu/tulu_v1_mix.jsonl

cat data/processed/flan_v2/flan_v2_data.jsonl \
    data/processed/cot/cot_data.jsonl \
    data/processed/dolly/dolly_data.jsonl \
    data/processed/oasst1/oasst1_data.jsonl \
    > data/processed/tulu/tulu_v1_human_mix.jsonl

cat data/processed/flan_v2/flan_v2_data.jsonl \
    data/processed/cot/cot_data.jsonl \
    data/processed/oasst1/oasst1_data.jsonl \
    data/processed/lima/lima_data.jsonl \
    data/processed/code_alpaca/code_alpaca_data.jsonl \
    data/processed/sharegpt/sharegpt_data.jsonl \
    data/processed/wizardlm/wizardlm_data.jsonl \
    data/processed/open_orca/open_orca_data.jsonl \
    > data/processed/tulu/tulu_v2_mix.jsonl

cat data/processed/flan_v2/flan_v2_data.jsonl \
    data/processed/cot/cot_data.jsonl \
    data/processed/oasst1/oasst1_data.jsonl \
    data/processed/lima/lima_data.jsonl \
    > data/processed/tulu/tulu_v2_human_mix.jsonl
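The Tulu mixtures above are plain line-level concatenations of per-dataset jsonl files; a Python equivalent of the final cat block, assuming the same data/processed layout, looks like this:

import os

# Same file list as the tulu_v2_human_mix cat block above.
sources = [
    "data/processed/flan_v2/flan_v2_data.jsonl",
    "data/processed/cot/cot_data.jsonl",
    "data/processed/oasst1/oasst1_data.jsonl",
    "data/processed/lima/lima_data.jsonl",
]

os.makedirs("data/processed/tulu", exist_ok=True)
with open("data/processed/tulu/tulu_v2_human_mix.jsonl", "w") as fout:
    for source in sources:
        with open(source) as fin:
            for line in fin:  # jsonl: one instance per line, so plain concatenation works
                fout.write(line)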
olive_farm.png
ADDED
Binary image file tracked with Git LFS.
open_instruct/get_data_stats.py
ADDED
@@ -0,0 +1,121 @@
import json
import os
import sys
import tqdm
import pandas as pd
import numpy as np
import argparse
from datasets import load_dataset
from transformers import AutoTokenizer


def get_statistics_for_messages_data(data_path):
    # load dataset
    dataset = load_dataset("json", data_files={"train": data_path})
    # tokenize dataset
    tokenizer = AutoTokenizer.from_pretrained("/net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/7B", use_fast=False)
    # get statistics
    num_instances = len(dataset["train"])
    num_of_turns = [len(instance["messages"]) for instance in dataset["train"]]
    user_prompt_lengths = []
    assistant_response_lengths = []
    instance_lengths = []
    for instance in tqdm.tqdm(dataset["train"], desc="Processing instances"):
        instance_length = 0
        for message in instance["messages"]:
            if message["role"] == "user":
                user_prompt_lengths.append(len(tokenizer(message["content"], truncation=False, add_special_tokens=False)["input_ids"]))
                instance_length += user_prompt_lengths[-1]
            elif message["role"] == "assistant":
                assistant_response_lengths.append(len(tokenizer(message["content"], truncation=False, add_special_tokens=False)["input_ids"]))
                instance_length += assistant_response_lengths[-1]
        instance_lengths.append(instance_length)

    top_100_longest_instances = np.argsort(instance_lengths)[-100:][::-1].tolist()
    top_100_longest_instances = [dataset["train"][i]["id"] for i in top_100_longest_instances]

    result = {
        "num_instances": num_instances,
        "turns_summary": pd.Series(num_of_turns).describe(),
        "user_prompt_lengths_summary": pd.Series(user_prompt_lengths).describe(),
        "assistant_response_lengths_summary": pd.Series(assistant_response_lengths).describe(),
        "total_lengths_summary": pd.Series(instance_lengths).describe(),
        "num_instances_with_total_length_gt_512": np.sum(np.array(instance_lengths) > 512),
        "num_instances_with_total_length_gt_768": np.sum(np.array(instance_lengths) > 768),
        "num_instances_with_total_length_gt_1024": np.sum(np.array(instance_lengths) > 1024),
        "num_instances_with_total_length_gt_1536": np.sum(np.array(instance_lengths) > 1536),
        "num_instances_with_total_length_gt_2048": np.sum(np.array(instance_lengths) > 2048),
        "num_instances_with_total_length_gt_4096": np.sum(np.array(instance_lengths) > 4096),
        "top_100_longest_instances": top_100_longest_instances,
    }

    # convert everything to dict or scalar
    for key, value in result.items():
        if isinstance(value, pd.Series):
            result[key] = value.to_dict()
        elif isinstance(value, np.ndarray):
            result[key] = value.tolist()
        elif isinstance(value, np.int64):
            result[key] = int(value)

    return result


def get_statistics_for_prompt_completion_data(data_path):
    # load dataset
    dataset = load_dataset("json", data_files={"train": data_path})
    prompts = [instance["prompt"] for instance in dataset["train"]]
    completions = [instance["completion"] for instance in dataset["train"]]
    # tokenize dataset
    tokenizer = AutoTokenizer.from_pretrained("/net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/7B")
    tokenized_prompts = tokenizer(prompts, truncation=False, add_special_tokens=False)
    tokenized_completions = tokenizer(completions, truncation=False, add_special_tokens=False)
    # get statistics
    num_instances = len(dataset["train"])
    prompt_lengths = [len(tokenized_prompts["input_ids"][i]) for i in range(num_instances)]
    completion_lengths = [len(tokenized_completions["input_ids"][i]) for i in range(num_instances)]
    prompt_completion_lengths = [prompt_lengths[i] + completion_lengths[i] for i in range(num_instances)]

    result = {
        "num_instances": num_instances,
        "prompt_lengths_summary": pd.Series(prompt_lengths).describe(),
        "completion_lengths_summary": pd.Series(completion_lengths).describe(),
        "prompt_completion_lengths_summary": pd.Series(prompt_completion_lengths).describe(),
        "num_instances_with_prompt_length_gt_512": np.sum(np.array(prompt_lengths) > 512),
        "num_instances_with_completion_length_gt_512": np.sum(np.array(completion_lengths) > 512),
        "num_instances_with_prompt_completion_length_gt_512": np.sum(np.array(prompt_completion_lengths) > 512),
        "num_instances_with_completion_length_gt_768": np.sum(np.array(completion_lengths) > 768),
        "num_instances_with_prompt_completion_length_gt_1024": np.sum(np.array(prompt_completion_lengths) > 1024),
    }

    # convert everything to dict or scalar
    for key, value in result.items():
        if isinstance(value, pd.Series):
            result[key] = value.to_dict()
        elif isinstance(value, np.ndarray):
            result[key] = value.tolist()
        elif isinstance(value, np.int64):
            result[key] = int(value)

    return result


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True)
    parser.add_argument("--save_path", type=str, help="Path to save the statistics.")
    args = parser.parse_args()

    with open(args.data_path, "r") as f:
        sample = json.loads(f.readline())
    if "prompt" in sample:
        statistics = get_statistics_for_prompt_completion_data(args.data_path)
    elif "messages" in sample:
        statistics = get_statistics_for_messages_data(args.data_path)
    else:
        raise ValueError("Invalid data format - the data should be either prompt completion data or messages data.")

    print(json.dumps(statistics, indent=4))

    if args.save_path is not None:
        with open(args.save_path, "w") as f:
            json.dump(statistics, f, indent=4)
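The script dispatches on the keys of the first record ("prompt"/"completion" vs. "messages"). A minimal sketch of a messages-format input it would accept; the file name and contents are hypothetical, and note that the tokenizer path hard-coded above is cluster-specific:

import json

# One record in the "messages" schema that get_statistics_for_messages_data
# expects: an "id" plus alternating user/assistant turns.
record = {
    "dataset": "demo",
    "id": "demo_0",
    "messages": [
        {"role": "user", "content": "Name one use of olives."},
        {"role": "assistant", "content": "Olives are pressed to make olive oil."},
    ],
}

with open("demo_messages.jsonl", "w") as f:  # hypothetical file name
    f.write(json.dumps(record) + "\n")

# Then: python open_instruct/get_data_stats.py --data_path demo_messages.jsonl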
open_instruct/reformat_data.py
ADDED
@@ -0,0 +1,551 @@
#!/usr/bin/env python
# coding=utf-8
'''
This script is used to reformat the downloaded datasets into the format that can be used by the model.
Here we use jsonl for the converted data. Each line in the jsonl file is a json object formatted as follows:
{
    "dataset": "dataset_name",
    "id": "unique_id",
    "messages": [
        {"role": "system", "content": "message_text"}, # optional
        {"role": "user", "content": "message_text"},
        {"role": "assistant", "content": "message_text"},
        {"role": "user", "content": "message_text"},
        {"role": "assistant", "content": "message_text"},
        ...
    ],
}
'''
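
# For illustration, a single converted line could look like this (values are hypothetical):
# {"dataset": "dolly", "id": "dolly_0", "messages": [{"role": "user", "content": "What is 2+2?"}, {"role": "assistant", "content": "4"}]}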

import json
import random
import re
import os
import pandas as pd
import argparse
from instruction_encode_templates import encode_instruction_example, encode_few_shot_example


def convert_super_ni_data(data_dir, output_dir, zero_shot_examples_per_task=60, few_shot_examples_per_task=20, n_few_shot=2):
    os.makedirs(output_dir, exist_ok=True)
    train_tasks = []
    with open(os.path.join(data_dir, "splits", "xlingual", "train_tasks.txt"), "r") as fin:
        for line in fin:
            if "_mmmlu_" not in line:  # skip mmlu to avoid test leakage
                train_tasks.append(line.strip())
    with open(os.path.join(output_dir, "super_ni_data.jsonl"), "w") as fout:
        for task in train_tasks:
            with open(os.path.join(data_dir, "tasks", f"{task}.json"), "r") as fin:
                task_data = json.load(fin)
            instruction = task_data["Definition"][0]
            if zero_shot_examples_per_task + few_shot_examples_per_task < len(task_data["Instances"]):
                instances = random.sample(task_data["Instances"], k=zero_shot_examples_per_task + few_shot_examples_per_task)
            else:
                instances = task_data["Instances"]
            for instance in instances[:zero_shot_examples_per_task]:
                encoded_example = encode_instruction_example(
                    instruction=instruction,
                    input=instance["input"],
                    output=instance["output"][0],
                    random_template=True,
                    eos_token=None
                )
                fout.write(json.dumps({
                    "dataset": "super_ni",
                    "id": f"super_ni_{instance['id']}",
                    "messages": [
                        {"role": "user", "content": encoded_example["prompt"]},
                        {"role": "assistant", "content": encoded_example["completion"]},
                    ]
                }) + "\n")
            for instance in instances[zero_shot_examples_per_task:]:
                if n_few_shot < len(task_data["Positive Examples"]):
                    examplars = random.sample(task_data["Positive Examples"], k=n_few_shot)
                else:
                    examplars = task_data["Positive Examples"]
                encoded_example = encode_few_shot_example(
                    instruction=instruction,
                    examplars=examplars,
                    input=instance["input"],
                    output=instance["output"][0],
                    eos_token=None
                )
                fout.write(json.dumps({
                    "dataset": "super_ni",
                    "id": f"super_ni_{instance['id']}",
                    "messages": [
                        {"role": "user", "content": encoded_example["prompt"]},
                        {"role": "assistant", "content": encoded_example["completion"]},
                    ]
                }) + "\n")

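# Note: encode_instruction_example and encode_few_shot_example (imported above) are assumed here to
# return a dict with "prompt" and "completion" keys, as they are used throughout this file; the
# actual template wording lives in instruction_encode_templates.py, which is not part of this diff.
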
def convert_cot_data(data_dir, output_dir, num_zero_shot_examples=50000, num_few_shot_examples=50000):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    if num_zero_shot_examples > 0:  # was gated on num_few_shot_examples, which skipped zero-shot data whenever few-shot was disabled
        with open(os.path.join(data_dir, "cot_zsopt.jsonl"), "r") as fin:
            zero_shot_examples = [json.loads(line) for line in fin]
            if num_zero_shot_examples < len(zero_shot_examples):
                zero_shot_examples = random.sample(zero_shot_examples, k=num_zero_shot_examples)
            examples.extend(zero_shot_examples)
    if num_few_shot_examples > 0:
        with open(os.path.join(data_dir, "cot_fsopt.jsonl"), "r") as fin:
            few_shot_examples = [json.loads(line) for line in fin]
            if num_few_shot_examples < len(few_shot_examples):
                few_shot_examples = random.sample(few_shot_examples, k=num_few_shot_examples)
            examples.extend(few_shot_examples)
    output_path = os.path.join(output_dir, "cot_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            prompt = example["inputs"]
            # make sure the prompt ends with a newline, unless it already does or ends with a colon
            if not prompt.endswith("\n") and not prompt.rstrip().endswith(":"):
                prompt += "\n"
            completion = example["targets"]
            fout.write(json.dumps({
                "dataset": "cot",
                "id": f"cot_{idx}",
                "messages": [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": completion},
                ]
            }) + "\n")

def convert_flan_v2_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "flan_v2_resampled_100k.jsonl"), "r") as fin:
        for line in fin:
            examples.append(json.loads(line))
    output_path = os.path.join(output_dir, "flan_v2_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            prompt = example["inputs"]
            if not prompt.endswith("\n") and not prompt.rstrip().endswith(":"):
                prompt += "\n"
            completion = example["targets"]
            fout.write(json.dumps({
                "dataset": "flan_v2",
                "id": f"flan_v2_{idx}",
                "messages": [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": completion},
                ]
            }) + "\n")


def convert_dolly_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "databricks-dolly-15k.jsonl"), "r") as fin:
        for line in fin:
            examples.append(json.loads(line))
    output_path = os.path.join(output_dir, "dolly_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["context"],
                output=example["response"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "dolly",
                "id": f"dolly_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ]
            }) + "\n")


def convert_self_instruct_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "all_instances_82K.jsonl"), "r") as fin:
        for line in fin:
            examples.append(json.loads(line))
    output_path = os.path.join(output_dir, "self_instruct_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "self_instruct",
                "id": f"self_instruct_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ]
            }) + "\n")


def convert_unnatural_instructions_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    instance_cnt = 0
    with open(os.path.join(data_dir, "core_data.jsonl"), "r") as fin, open(os.path.join(output_dir, "unnatural_instructions_data.jsonl"), "w") as fout:
        for line in fin:
            task_data = json.loads(line)
            instruction = task_data["instruction"]
            for instance in task_data["instances"]:
                if instance["constraints"] and instance["constraints"].lower() not in ["none", "none."]:
                    instance_instruction = instruction + "\n" + instance["constraints"]
                else:
                    instance_instruction = instruction
                encoded_example = encode_instruction_example(
                    instruction=instance_instruction,
                    input=instance["input"],
                    output=instance["output"],
                    random_template=True,
                    eos_token=None
                )
                fout.write(json.dumps({
                    "dataset": "unnatural_instructions",
                    "id": f"unnatural_instructions_{instance_cnt}",
                    "messages": [
                        {"role": "user", "content": encoded_example["prompt"]},
                        {"role": "assistant", "content": encoded_example["completion"]},
                    ]
                }) + "\n")
                instance_cnt += 1

def convert_stanford_alpaca_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "alpaca_data.json"), "r") as fin:
        examples.extend(json.load(fin))
    output_path = os.path.join(output_dir, "stanford_alpaca_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "stanford_alpaca",
                "id": f"stanford_alpaca_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ]
            }) + "\n")


def convert_code_alpaca_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "code_alpaca_20k.json"), "r") as fin:
        examples.extend(json.load(fin))
    output_path = os.path.join(output_dir, "code_alpaca_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "code_alpaca",
                "id": f"code_alpaca_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ]
            }) + "\n")


def convert_gpt4_alpaca_data(data_dir, output_dir, load_en=True, load_zh=False):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    if load_en:
        with open(os.path.join(data_dir, "alpaca_gpt4_data.json"), "r") as fin:
            examples.extend(json.load(fin))
    if load_zh:
        with open(os.path.join(data_dir, "alpaca_gpt4_data_zh.json"), "r") as fin:
            examples.extend(json.load(fin))
    output_path = os.path.join(output_dir, "gpt4_alpaca_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "gpt4_alpaca",
                "id": f"gpt4_alpaca_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ]
            }) + "\n")

def convert_sharegpt_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "sharegpt_html_cleaned_and_split.json"), "r") as fin:
        examples.extend(json.load(fin))

    output_path = os.path.join(output_dir, "sharegpt_data.jsonl")
    with open(output_path, "w") as fout:
        invalid_cnt = 0
        for idx, example in enumerate(examples):
            messages = []
            valid = True
            for message in example["conversations"]:
                if message["from"] in ["human", "user"]:
                    messages.append({
                        "role": "user",
                        "content": message["value"]
                    })
                elif message["from"] in ["gpt", "chatgpt"]:
                    messages.append({
                        "role": "assistant",
                        "content": message["value"]
                    })
                elif message["from"] in ["system", "bing"]:
                    # drop conversations containing system or bing turns, which don't fit the user/assistant format
                    valid = False
                    invalid_cnt += 1
                    break
                else:
                    raise ValueError(f"Unknown message sender: {message['from']}")
            if messages and valid:
                fout.write(json.dumps({
                    "dataset": "sharegpt",
                    "id": f"sharegpt_{example['id']}",
                    "messages": messages
                }) + "\n")
    print(f"# of invalid examples in sharegpt data: {invalid_cnt}")

+
def convert_baize_data(data_dir, output_dir):
|
342 |
+
os.makedirs(output_dir, exist_ok=True)
|
343 |
+
examples = []
|
344 |
+
for source in ["alpaca", "medical", "quora", "stackoverflow"]:
|
345 |
+
with open(os.path.join(data_dir, f"{source}_chat_data.json"), "r") as fin:
|
346 |
+
examples.extend(json.load(fin))
|
347 |
+
|
348 |
+
output_path = os.path.join(output_dir, "baize_data.jsonl")
|
349 |
+
with open(output_path, "w") as fout:
|
350 |
+
for idx, example in enumerate(examples):
|
351 |
+
# split example["input"] by [|Human|] and [|AI|]
|
352 |
+
messages = []
|
353 |
+
rounds = example["input"].split("[|Human|]")[1:]
|
354 |
+
for round in rounds:
|
355 |
+
if not round.strip() or "[|AI|]" not in round:
|
356 |
+
continue
|
357 |
+
human, assistant = round.split("[|AI|]")
|
358 |
+
messages.append({
|
359 |
+
"role": "user",
|
360 |
+
"content": human.strip()
|
361 |
+
})
|
362 |
+
messages.append({
|
363 |
+
"role": "assistant",
|
364 |
+
"content": assistant.strip()
|
365 |
+
})
|
366 |
+
fout.write(json.dumps({
|
367 |
+
"dataset": "baize",
|
368 |
+
"id": f"baize_{idx}",
|
369 |
+
"messages": messages
|
370 |
+
}) + "\n")
|
371 |
+
|
372 |
+
|
def convert_oasst1_data(data_dir, output_dir):
    '''
    For OASST1, because it's in a tree structure, where every user input might get multiple replies,
    we have to save every path from the root node to an assistant leaf reply.
    This results in some of the messages being duplicated among different paths (instances).
    Be careful when using this dataset for training. Ideally, you should only minimize the loss of the last message in each path.
    '''
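    # A hypothetical tree, to make the flattening concrete:
    #   user "Hi" -> assistant "Hello" -> user "Bye" -> assistant "Goodbye"
    #             -> assistant "Hey there"
    # dfs() below emits one instance per assistant leaf: ["Hi", "Hello", "Bye", "Goodbye"]
    # and ["Hi", "Hey there"]; the shared "Hi" turn is duplicated across both instances.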
    os.makedirs(output_dir, exist_ok=True)
    conversations = []
    with open(os.path.join(data_dir, "2023-04-12_oasst_ready.trees.jsonl"), "r") as fin:
        for line in fin:
            conversations.append(json.loads(line))

    output_path = os.path.join(output_dir, "oasst1_data.jsonl")

    # we filter out the sequences that mention the creator information
    filter_strings = [
        "LAION",
        "Open Assistant",
        "OpenAssistant",
    ]

    # traverse the conversation tree and collect all valid sequences
    def dfs(reply, messages, valid_sequences):
        if any(filter_string in reply["text"] for filter_string in filter_strings):
            return
        if reply["role"] == "assistant":
            messages.append(
                {"role": "assistant", "content": reply["text"]}
            )
            if not reply["replies"]:  # leaf node
                valid_sequences.append(messages[:])
            else:
                for child in reply["replies"]:
                    dfs(child, messages, valid_sequences)
            messages.pop()
        elif reply["role"] == "prompter":
            messages.append(
                {"role": "user", "content": reply["text"]}
            )
            for child in reply["replies"]:
                dfs(child, messages, valid_sequences)
            messages.pop()
        else:
            raise ValueError(f"Unknown role: {reply['role']}")

    with open(output_path, "w") as fout:
        example_cnt = 0
        for conversation in conversations:
            valid_sequences = []
            dfs(conversation["prompt"], [], valid_sequences)
            for sequence in valid_sequences:
                fout.write(json.dumps({
                    "dataset": "oasst1",
                    "id": f"oasst1_{example_cnt}",
                    "messages": sequence
                }) + "\n")
                example_cnt += 1

def convert_lima_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "train.jsonl"), "r") as fin:
        for line in fin:
            examples.append(json.loads(line))
    output_path = os.path.join(output_dir, "lima_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            messages = []
            if len(example["conversations"]) % 2 != 0:
                print(f"Warning: example {idx} in LIMA has an odd number of messages. Cutting off the last message.")
                example["conversations"] = example["conversations"][:-1]

            for i in range(0, len(example["conversations"]), 2):
                messages.append({
                    "role": "user",
                    "content": example["conversations"][i]
                })
                messages.append({
                    "role": "assistant",
                    "content": example["conversations"][i+1]
                })
            fout.write(json.dumps({
                "dataset": "lima",
                "id": f"lima_{idx}",
                "messages": messages,
            }) + "\n")

def convert_wizardlm_data(data_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(data_dir, "WizardLM_evol_instruct_V2_143k.json"), "r") as fin:
        examples = json.load(fin)

    output_path = os.path.join(output_dir, "wizardlm_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            messages = []
            assert len(example["conversations"]) % 2 == 0
            for i in range(0, len(example["conversations"]), 2):
                assert example["conversations"][i]["from"] == "human"
                assert example["conversations"][i+1]["from"] == "gpt"
                messages.append({
                    "role": "user",
                    "content": example["conversations"][i]["value"]
                })
                messages.append({
                    "role": "assistant",
                    "content": example["conversations"][i+1]["value"]
                })
            fout.write(json.dumps({
                "dataset": "wizardlm",
                "id": f"wizardlm_{example['idx']}",
                "messages": messages,
            }) + "\n")


def convert_open_orca_data(data_dir, output_dir, num_gpt4_examples=100000, num_gpt35_examples=0):
    os.makedirs(output_dir, exist_ok=True)
    examples = []

    df = pd.read_parquet(os.path.join(data_dir, "1M-GPT4-Augmented.parquet"))
    gpt4_examples = [row.to_dict() for _, row in df.iterrows()]
    random.shuffle(gpt4_examples)
    examples.extend(gpt4_examples[:num_gpt4_examples])

    df = pd.read_parquet(os.path.join(data_dir, "3_5M-GPT3_5-Augmented.parquet"))
    gpt35_examples = [row.to_dict() for _, row in df.iterrows()]
    random.shuffle(gpt35_examples)
    examples.extend(gpt35_examples[:num_gpt35_examples])

    output_path = os.path.join(output_dir, "open_orca_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            messages = [
                {"role": "system", "content": example["system_prompt"]},
                {"role": "user", "content": example["question"]},
                {"role": "assistant", "content": example["response"]}
            ]
            fout.write(json.dumps({
                "dataset": "open_orca",
                "id": f"open_orca_{example['id']}",
                "messages": messages,
            }) + "\n")

if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--raw_data_dir", type=str, default="data/downloads")
    arg_parser.add_argument("--output_dir", type=str, default="data/processed")
    arg_parser.add_argument("--seed", type=int, default=42)
    args = arg_parser.parse_args()
    random.seed(args.seed)

    # get the subfolder names in raw_data_dir
    subfolders = [f for f in os.listdir(args.raw_data_dir) if os.path.isdir(os.path.join(args.raw_data_dir, f))]

    # all supported datasets, derived from the convert_*_data function names above
    supported_datasets = []
    all_funcs = [func_name for func_name in globals() if callable(globals()[func_name])]
    for func_name in all_funcs:
        if re.match(r"convert_.+_data", func_name):
            supported_datasets.append(func_name[len("convert_"):-len("_data")])

    # check if the subfolder names are supported datasets
    valid_subfolders = []
    for subfolder in subfolders:
        if subfolder not in supported_datasets:
            print(f"Warning: {subfolder} in the raw data folder is not a supported dataset. We will skip it.")
        else:
            valid_subfolders.append(subfolder)

    # prepare data for each dataset
    for subfolder in valid_subfolders:
        print(f"Processing {subfolder} data...")
        globals()[f"convert_{subfolder}_data"](os.path.join(args.raw_data_dir, subfolder), os.path.join(args.output_dir, subfolder))
requirements.txt
ADDED
@@ -0,0 +1,6 @@
streamlit
pdfplumber
docx2txt
justext
openai
langchain
web-app.py
ADDED
@@ -0,0 +1,67 @@
import streamlit as st
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI as OpenAI_llm
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
# from langchain.chains.qa_with_sources import load_qa_with_sources_chain, BaseCombineDocumentsChain
import os
import chromadb
import tempfile
import requests
import openai
from bs4 import BeautifulSoup
from urllib.parse import urlparse

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


def assistant(url):
    question = st.text_input("Ask your Question")

    if st.button("Submit", type="primary"):
        ABS_PATH: str = os.path.dirname(os.path.abspath(__file__))
        DB_DIR: str = os.path.join(ABS_PATH, "db")

        # load the web page and split it into chunks
        loader = WebBaseLoader(url)
        data = loader.load()

        text_splitter = CharacterTextSplitter(separator='\n', chunk_size=1000, chunk_overlap=0)
        docs = text_splitter.split_documents(data)

        # embed the chunks and build an in-memory FAISS index
        openai_embeddings = OpenAIEmbeddings()
        # client = chromadb.PersistentClient(path=DB_DIR)
        vectordb = FAISS.from_documents(documents=docs, embedding=openai_embeddings)
        # vectordb.persist()

        retriever = vectordb.as_retriever()

        # answer the question with retrieval-augmented QA over the indexed page
        llm = ChatOpenAI(model_name='gpt-3.5-turbo')
        qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

        response = qa(question)
        st.write(response)


st.title('Chat with Website')

url = st.text_input('Enter Your URL here:')

if url:
    assistant(url)
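The app reads OPENAI_API_KEY from the environment, so a local run of this file would be along the lines of (the key value is a placeholder):

    OPENAI_API_KEY=<your-key> streamlit run web-app.py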